diff --git a/example/resnet101_imagenet/eval.py b/example/resnet101_imagenet/eval.py
index 00fe825e916289e64d74b60e978e16345c7d62d8..979c6ca949f4f68ace388fd9903f9d0254e03aa0 100755
--- a/example/resnet101_imagenet/eval.py
+++ b/example/resnet101_imagenet/eval.py
@@ -51,17 +51,11 @@ context.set_context(enable_loop_sink=True)
 context.set_context(enable_mem_reuse=True)
 
 if __name__ == '__main__':
-    if args_opt.do_eval:
-        context.set_context(enable_hccl=False)
-    else:
-        if args_opt.run_distribute:
-            context.set_context(enable_hccl=True)
-            context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
-                                              mirror_mean=True, parameter_broadcast=True)
-            auto_parallel_context().set_all_reduce_fusion_split_indices([140])
-            init()
-        else:
-            context.set_context(enable_hccl=False)
+    if not args_opt.do_eval and args_opt.run_distribute:
+        context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
+                                          mirror_mean=True, parameter_broadcast=True)
+        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
+        init()
 
     epoch_size = config.epoch_size
     net = resnet101(class_num=config.class_num)
diff --git a/example/resnet101_imagenet/train.py b/example/resnet101_imagenet/train.py
index 0f2063759596ca1b9726b99f8e6eceab45d1a023..c2de3e8d98b6fc52b823b2724ea8064286fafe1e 100755
--- a/example/resnet101_imagenet/train.py
+++ b/example/resnet101_imagenet/train.py
@@ -56,17 +56,11 @@ context.set_context(enable_loop_sink=True)
 context.set_context(enable_mem_reuse=True)
 
 if __name__ == '__main__':
-    if args_opt.do_eval:
-        context.set_context(enable_hccl=False)
-    else:
-        if args_opt.run_distribute:
-            context.set_context(enable_hccl=True)
-            context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
-                                              mirror_mean=True, parameter_broadcast=True)
-            auto_parallel_context().set_all_reduce_fusion_split_indices([140])
-            init()
-        else:
-            context.set_context(enable_hccl=False)
+    if not args_opt.do_eval and args_opt.run_distribute:
+        context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
+                                          mirror_mean=True, parameter_broadcast=True)
+        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
+        init()
 
     epoch_size = config.epoch_size
     net = resnet101(class_num=config.class_num)
diff --git a/example/resnet50_cifar10/eval.py b/example/resnet50_cifar10/eval.py
index 243dc2a332e2bb8e6a60606f077c64623e383940..1134d0bd2ed4b1ce30872751d3a39b70bc6aeab0 100755
--- a/example/resnet50_cifar10/eval.py
+++ b/example/resnet50_cifar10/eval.py
@@ -51,17 +51,11 @@ context.set_context(enable_loop_sink=True)
 context.set_context(enable_mem_reuse=True)
 
 if __name__ == '__main__':
-    if args_opt.do_eval:
-        context.set_context(enable_hccl=False)
-    else:
-        if args_opt.run_distribute:
-            context.set_context(enable_hccl=True)
-            context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
-                                              mirror_mean=True)
-            auto_parallel_context().set_all_reduce_fusion_split_indices([140])
-            init()
-        else:
-            context.set_context(enable_hccl=False)
+    if not args_opt.do_eval and args_opt.run_distribute:
+        context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
+                                          mirror_mean=True)
+        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
+        init()
 
     epoch_size = config.epoch_size
     net = resnet50(class_num=config.class_num)
diff --git a/example/resnet50_cifar10/train.py b/example/resnet50_cifar10/train.py
index b18c3778de013e77bac8a0f0386d00b1930ee5d5..0a3ad9dc5a7e73c3c06e2b89d4225b5001da6bbf 100755
--- a/example/resnet50_cifar10/train.py
+++ b/example/resnet50_cifar10/train.py
@@ -54,17 +54,11 @@ context.set_context(enable_loop_sink=True)
 context.set_context(enable_mem_reuse=True)
 
 if __name__ == '__main__':
-    if args_opt.do_eval:
-        context.set_context(enable_hccl=False)
-    else:
-        if args_opt.run_distribute:
-            context.set_context(enable_hccl=True)
-            context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
-                                              mirror_mean=True)
-            auto_parallel_context().set_all_reduce_fusion_split_indices([140])
-            init()
-        else:
-            context.set_context(enable_hccl=False)
+    if not args_opt.do_eval and args_opt.run_distribute:
+        context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
+                                          mirror_mean=True)
+        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
+        init()
 
     epoch_size = config.epoch_size
     net = resnet50(class_num=config.class_num)
diff --git a/example/vgg16_cifar10/eval.py b/example/vgg16_cifar10/eval.py
index ca2bbd12eb07ec95379d534acd49a038b96c2361..68c23d250ff4063ba19ec875a3ff21310b209524 100644
--- a/example/vgg16_cifar10/eval.py
+++ b/example/vgg16_cifar10/eval.py
@@ -37,7 +37,7 @@
 if __name__ == '__main__':
     context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
     context.set_context(device_id=args_opt.device_id)
-    context.set_context(enable_mem_reuse=True, enable_hccl=False)
+    context.set_context(enable_mem_reuse=True)
 
     net = vgg16(num_classes=cfg.num_classes)
     opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, cfg.momentum,
diff --git a/example/vgg16_cifar10/train.py b/example/vgg16_cifar10/train.py
index 234e3f7c7e573a5fe6120071b434e26ff94a16ed..52ba0ecdf4620e24bccf42f51ad6c929bf23ae18 100644
--- a/example/vgg16_cifar10/train.py
+++ b/example/vgg16_cifar10/train.py
@@ -66,7 +66,7 @@ if __name__ == '__main__':
     context.set_context(device_id=args_opt.device_id)
     context.set_context(enable_task_sink=True)
     context.set_context(enable_loop_sink=True)
-    context.set_context(enable_mem_reuse=True, enable_hccl=False)
+    context.set_context(enable_mem_reuse=True)
 
     device_num = int(os.environ.get("DEVICE_NUM", 1))
     if device_num > 1:
diff --git a/example/yolov3_coco2017/train.py b/example/yolov3_coco2017/train.py
index c7d28a8350f35a5369a573804f7de8501e32caf8..bccc66d996f74d5020587d6b1ccaa11664c6b8a3 100644
--- a/example/yolov3_coco2017/train.py
+++ b/example/yolov3_coco2017/train.py
@@ -90,13 +90,11 @@ if __name__ == '__main__':
     if args_opt.distribute:
         device_num = args_opt.device_num
         context.reset_auto_parallel_context()
-        context.set_context(enable_hccl=True)
         context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
                                           device_num=device_num)
         init()
         rank = args_opt.device_id % device_num
     else:
-        context.set_context(enable_hccl=False)
         rank = 0
         device_num = 1
 
diff --git a/mindspore/ccsrc/pipeline/init.cc b/mindspore/ccsrc/pipeline/init.cc
index f5cacc7ed5cba087b46c41b15f82ca7b77671340..9bf2aedde468aedb23d24868c9cdcd35d0ce05a2 100644
--- a/mindspore/ccsrc/pipeline/init.cc
+++ b/mindspore/ccsrc/pipeline/init.cc
@@ -115,8 +115,6 @@ PYBIND11_MODULE(_c_expression, m) {
     .def("set_device_id", &mindspore::MsContext::set_device_id, "Set device id.")
     .def("open_tsd", &mindspore::MsContext::OpenTsd, "Open tdt dataset client.")
     .def("close_tsd", &mindspore::MsContext::CloseTsd, "Close tdt dataset client.")
-    .def("set_hccl_flag", &mindspore::MsContext::set_enable_hccl, "Set enable hccl.")
-    .def("get_hccl_flag", &mindspore::MsContext::enable_hccl, "Get whether to enable hccl.")
     .def("set_task_sink_flag", &mindspore::MsContext::set_enable_task_sink, "Set enable task sink.")
     .def("get_task_sink_flag", &mindspore::MsContext::enable_task_sink, "Get whether to enable task sink.")
     .def("get_save_graphs_flag", &mindspore::MsContext::save_graphs_flag, "Get whether to save graphs.")
diff --git a/mindspore/ccsrc/pipeline/pipeline.cc b/mindspore/ccsrc/pipeline/pipeline.cc
index 7a1830e89a892e5ba7257dae93ae790fc967d499..63920cac13308f87652e75b961cdc9171782bc70 100644
--- a/mindspore/ccsrc/pipeline/pipeline.cc
+++ b/mindspore/ccsrc/pipeline/pipeline.cc
@@ -773,7 +773,7 @@ void InitHccl() {
   (void)ms_context->OpenTsd();
   uint32_t device_id = ms_context->device_id();
   std::string device_name = ms_context->device_target();
-
+  ms_context->set_enable_hccl(true);
   if (ms_context->backend_policy() == "ms" && ms_context->device_target() == kAscendDevice) {
     auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(device_name, device_id);
     MS_EXCEPTION_IF_NULL(runtime_instance);
diff --git a/mindspore/context.py b/mindspore/context.py
index 237b2143ed30908c262a202526966649c19f766a..9b6842e4b6243a7be909b734ed77ad3a9d7e2c4e 100644
--- a/mindspore/context.py
+++ b/mindspore/context.py
@@ -225,14 +225,6 @@ class _Context:
         if not success:
             raise RuntimeError("Device id set failed!!!")
 
-    @property
-    def enable_hccl(self):
-        return self._context_handle.get_hccl_flag()
-
-    @enable_hccl.setter
-    def enable_hccl(self, hccl):
-        self._context_handle.set_hccl_flag(hccl)
-
     @property
     def enable_ir_fusion(self):
         return self._context_handle.get_ir_fusion_flag()
@@ -482,7 +474,7 @@ def reset_auto_parallel_context():
 
 
 @args_type_check(mode=int, precompile_only=bool, device_target=str,
-                 device_id=int, enable_ir_fusion=bool, save_graphs=bool, enable_hccl=bool,
+                 device_id=int, enable_ir_fusion=bool, save_graphs=bool,
                  enable_task_sink=bool, save_graphs_path=str, enable_loop_sink=bool,
                  enable_mem_reuse=bool, save_ms_model=bool, save_ms_model_path=str,
                  enable_gpu_summary=bool, enable_auto_mixed_precision=bool, enable_dump=bool, save_dump_path=str,
@@ -515,7 +507,6 @@ def set_context(**kwargs):
             while device_num_per_host should no more than 4096. Default: 0.
         enable_ir_fusion (bool): Whether to enable ir fusion. Default: True.
         save_graphs (bool): Whether to save graphs. Default: False.
-        enable_hccl (bool): Whether to enable hccl. Default: False.
         enable_loop_sink (bool): Whether to enable loop sink. Default: True.
         enable_task_sink (bool): Whether to enable task sink. Default: True.
         enable_mem_reuse (bool): Whether to enable memory reuse. Default: True.
diff --git a/mindspore/nn/wrap/grad_reducer.py b/mindspore/nn/wrap/grad_reducer.py
index 01346698eece321fce19717e97941df02eebab3a..ee57297fe0f9f5b5f3f2c754b153212cd335ab5e 100644
--- a/mindspore/nn/wrap/grad_reducer.py
+++ b/mindspore/nn/wrap/grad_reducer.py
@@ -130,7 +130,7 @@ class DistributedGradReducer(Cell):
         >>>
         >>> device_id = int(os.environ["DEVICE_ID"])
         >>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True,
-        >>>                     device_id=int(device_id), enable_hccl=True)
+        >>>                     device_id=int(device_id))
         >>> init()
         >>> context.reset_auto_parallel_context()
         >>> context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL)
diff --git a/tests/st/auto_parallel/onehot_model_parallel.py b/tests/st/auto_parallel/onehot_model_parallel.py
index 1f35ac1f80bae3a89c8f3f252f84f7d2ac9a4d94..3c41e2975e8b54af944a80a7bcef07da9dfce11c 100644
--- a/tests/st/auto_parallel/onehot_model_parallel.py
+++ b/tests/st/auto_parallel/onehot_model_parallel.py
@@ -33,7 +33,6 @@ def setup_module():
     global rank_id
     np.random.seed(0)
     context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
-    context.set_context(enable_hccl=True)
    context.set_context(enable_task_sink=True, device_id=device_id)
     context.set_context(enable_ir_fusion=True)
 
diff --git a/tests/st/auto_parallel/soft_entropy_loss_expand_parallel.py b/tests/st/auto_parallel/soft_entropy_loss_expand_parallel.py
index 86a8b89521f653dbc8eb200e86275f0dccc313a4..767094c044f6df4277822a1e7aab0b26a9f50ca5 100644
--- a/tests/st/auto_parallel/soft_entropy_loss_expand_parallel.py
+++ b/tests/st/auto_parallel/soft_entropy_loss_expand_parallel.py
@@ -46,7 +46,6 @@ def setup_module():
     global rank_id
     np.random.seed(0)
     context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
-    context.set_context(enable_hccl=True)
     context.set_context(enable_task_sink=True, device_id=device_id)
     context.set_context(enable_ir_fusion=True)
 
diff --git a/tests/st/auto_parallel/test_resnet50_expand_loss_2p.py b/tests/st/auto_parallel/test_resnet50_expand_loss_2p.py
index b28ad510e398d7ffa590eaf0593544d32401094a..41f08f54eec06c342d4032a3b45550ff78a5ea35 100644
--- a/tests/st/auto_parallel/test_resnet50_expand_loss_2p.py
+++ b/tests/st/auto_parallel/test_resnet50_expand_loss_2p.py
@@ -31,7 +31,6 @@ from mindspore.train.callback import Callback
 from mindspore.parallel import set_algo_parameters
 
 context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
-context.set_context(enable_hccl=True)
 context.set_context(enable_task_sink=True, device_id=int(os.getenv('DEVICE_ID')))
 context.set_context(enable_ir_fusion=True)
 context.set_context(enable_loop_sink=False)
diff --git a/tests/st/mem_reuse/resnet_cifar_memreuse.py b/tests/st/mem_reuse/resnet_cifar_memreuse.py
index d6310612b65c815391e1b6ad16e329b4ac819add..2604fe58505c27bed9af93cfbc914780b82dbc7d 100644
--- a/tests/st/mem_reuse/resnet_cifar_memreuse.py
+++ b/tests/st/mem_reuse/resnet_cifar_memreuse.py
@@ -122,16 +122,10 @@ class CrossEntropyLoss(nn.Cell):
 
 
 if __name__ == '__main__':
-    if args_opt.do_eval:
-        context.set_context(enable_hccl=False)
-    else:
-        if args_opt.run_distribute:
-            context.set_context(enable_hccl=True)
-            context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
-            context.set_auto_parallel_context(all_reduce_fusion_split_indices=[140])
-            init()
-        else:
-            context.set_context(enable_hccl=False)
+    if not args_opt.do_eval and args_opt.run_distribute:
+        context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
+        context.set_auto_parallel_context(all_reduce_fusion_split_indices=[140])
+        init()
 
     context.set_context(mode=context.GRAPH_MODE)
     epoch_size = args_opt.epoch_size
diff --git a/tests/st/mem_reuse/resnet_cifar_normal.py b/tests/st/mem_reuse/resnet_cifar_normal.py
index 2b6741e57afabbb659a26c797f018fc6a80e6bc5..8e037212d04ed22a258089c3f51260b5a435e1d9 100644
--- a/tests/st/mem_reuse/resnet_cifar_normal.py
+++ b/tests/st/mem_reuse/resnet_cifar_normal.py
@@ -123,16 +123,10 @@ class CrossEntropyLoss(nn.Cell):
 
 
 if __name__ == '__main__':
-    if args_opt.do_eval:
-        context.set_context(enable_hccl=False)
-    else:
-        if args_opt.run_distribute:
-            context.set_context(enable_hccl=True)
-            context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
-            context.set_auto_parallel_context(all_reduce_fusion_split_indices=[140])
-            init()
-        else:
-            context.set_context(enable_hccl=False)
+    if not args_opt.do_eval and args_opt.run_distribute:
+        context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
+        context.set_auto_parallel_context(all_reduce_fusion_split_indices=[140])
+        init()
 
     context.set_context(mode=context.GRAPH_MODE)
     epoch_size = args_opt.epoch_size
diff --git a/tests/st/tbe_networks/resnet_cifar.py b/tests/st/tbe_networks/resnet_cifar.py
index 7bd03f5d810116bda576255f1d61cf1b70c941d0..4709b3ac70e8695e1e45e4a1346ca1df7ae0b38e 100644
--- a/tests/st/tbe_networks/resnet_cifar.py
+++ b/tests/st/tbe_networks/resnet_cifar.py
@@ -122,16 +122,10 @@ class CrossEntropyLoss(nn.Cell):
 
 
 if __name__ == '__main__':
-    if args_opt.do_eval:
-        context.set_context(enable_hccl=False)
-    else:
-        if args_opt.run_distribute:
-            context.set_context(enable_hccl=True)
-            context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
-            auto_parallel_context().set_all_reduce_fusion_split_indices([140])
-            init()
-        else:
-            context.set_context(enable_hccl=False)
+    if not args_opt.do_eval and args_opt.run_distribute:
+        context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL)
+        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
+        init()
 
     context.set_context(mode=context.GRAPH_MODE)
     epoch_size = args_opt.epoch_size
diff --git a/tests/st/tbe_networks/test_resnet_cifar_8p.py b/tests/st/tbe_networks/test_resnet_cifar_8p.py
index 69f0a80d127c7b8f099c5bb71535dadc267c1fe0..1e4372925279c1380edbfa0c7ae8b12c63a311b9 100644
--- a/tests/st/tbe_networks/test_resnet_cifar_8p.py
+++ b/tests/st/tbe_networks/test_resnet_cifar_8p.py
@@ -153,7 +153,6 @@ def train_process(q, device_id, epoch_size, num_classes, device_num, batch_size,
     context.set_context(enable_task_sink=True, device_id=device_id)
     context.set_context(enable_loop_sink=True)
     context.set_context(enable_mem_reuse=True)
-    context.set_context(enable_hccl=enable_hccl)
     os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = MINDSPORE_HCCL_CONFIG_PATH
     os.environ['RANK_ID'] = str(device_id)
    os.environ['RANK_SIZE'] = str(device_num)
diff --git a/tests/ut/python/parallel/test_auto_parallel_resnet.py b/tests/ut/python/parallel/test_auto_parallel_resnet.py
index a563efb06d2df79b79f920ec96df3faa9aea8fa0..1088ad736d4c97e3d5f12bb60e09745372fd05fd 100644
--- a/tests/ut/python/parallel/test_auto_parallel_resnet.py
+++ b/tests/ut/python/parallel/test_auto_parallel_resnet.py
@@ -19,6 +19,7 @@ from mindspore import Tensor
 from mindspore.ops import operations as P
 from mindspore.nn.optim.momentum import Momentum
 from mindspore.common.initializer import TruncatedNormal
+from mindspore.communication.management import init
 from mindspore.train.model import Model, ParallelMode
 from mindspore import context
 import os
@@ -31,10 +32,10 @@ from mindspore.parallel import set_algo_parameters
 from mindspore.parallel import _cost_model_context as cost_model_context
 
 context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
-context.set_context(enable_hccl=True)
 context.set_context(enable_task_sink=True, device_id= 0)
 context.set_context(enable_ir_fusion=True)
 context.set_context(enable_loop_sink=False)
+init()
 
 def weight_variable(shape, factor=0.1):
     return TruncatedNormal(0.02)
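
Usage note: after this change, scripts no longer touch enable_hccl from Python. Calling init() runs InitHccl() in pipeline.cc, which now sets the flag internally before HCCL initialization (see the pipeline.cc hunk above). A minimal sketch of the resulting setup, assuming the Ascend backend and this 0.x-era API; run_distribute and device_num=8 are illustrative stand-ins for the command-line arguments used by the example scripts:

    # Post-change pattern: no context.set_context(enable_hccl=...) anywhere.
    from mindspore import context
    from mindspore.communication.management import init
    from mindspore.train.model import ParallelMode

    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=0)

    run_distribute = True  # stand-in for args_opt.run_distribute in the examples
    if run_distribute:
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True, device_num=8)
        init()  # opens TSD and enables HCCL internally

Single-device runs simply skip the auto-parallel setup and init() call; the removed else-branches that set enable_hccl=False are no longer needed because the flag defaults to off until init() is called.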