diff --git a/model_zoo/utils/hccl_tools/README.md b/model_zoo/utils/hccl_tools/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b73a99e592c85ed260f39b5cb82f84ac3d32282a --- /dev/null +++ b/model_zoo/utils/hccl_tools/README.md @@ -0,0 +1,14 @@ +# description + +mindspore distributed training launch helper utilty that will generate hccl config file. + +# use + +``` +python hccl_tools.py --device_num [1,8] +``` + +output: +``` +hccl_[device_num]p_[which device]_[server_ip].json +``` \ No newline at end of file diff --git a/model_zoo/utils/hccl_tools/hccl_tools.py b/model_zoo/utils/hccl_tools/hccl_tools.py new file mode 100644 index 0000000000000000000000000000000000000000..ac4114c0a8d5ea4174d4b6e2a4eff80b6455b2ab --- /dev/null +++ b/model_zoo/utils/hccl_tools/hccl_tools.py @@ -0,0 +1,165 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""generate hccl config file script""" +import os +import sys +import json +import socket +import platform +from argparse import ArgumentParser +from typing import Dict, Any + + +def parse_args(): + """ + parse args . + + Args: + + Returns: + args. + + Examples: + >>> parse_args() + """ + parser = ArgumentParser(description="mindspore distributed training launch " + "helper utilty that will generate hccl" + " config file") + parser.add_argument("--device_num", type=str, default="[0,8]", + help="The number of the D chip used. please note that the D chips" + "used must be continuous, such [0,4] means to use four chips " + "0,1,2,3; [0,1] means to use chip 0; The first four chips are" + "a group, and the last four chips are a group. In addition to" + "the [0,8] chips are allowed, other cross-group such as [3,6]" + "are prohibited.") + parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7", + help="will use the visible devices sequentially") + parser.add_argument("--server_ip", type=str, default="", + help="server ip") + args = parser.parse_args() + return args + + +def get_host_ip(): + """ + get host ip + """ + ip = None + + try: + hostname = socket.gethostname() + ip = socket.gethostbyname(hostname) + except EOFError: + pass + + return ip + + +def main(): + print("start", __file__) + args = parse_args() + + # visible_devices + visible_devices = args.visible_devices.split(',') + print('visible_devices:{}'.format(visible_devices)) + + # server_id + ip = get_host_ip() + if args.server_ip: + server_id = args.server_ip + elif ip: + server_id = ip + else: + raise ValueError("please input server ip!") + print('server_id:{}'.format(server_id)) + + # device_num + first_num = int(args.device_num[1]) + last_num = int(args.device_num[3]) + if first_num < 0 or last_num > 8: + raise ValueError("device num {} must be in range [0,8] !".format(args.device_num)) + if first_num > last_num: + raise ValueError("First num {} of device num {} must less than last num {} !".format(first_num, args.device_num, + last_num)) + if first_num < 4: + if last_num > 4: + if first_num == 0 and last_num == 8: + pass + else: + raise ValueError("device num {} must be in the same group of [0,4] or [4,8] !".format(args.device_num)) + + device_num_list = list(range(first_num, last_num)) + print("device_num_list:", device_num_list) + + assert len(visible_devices) >= len(device_num_list) + + # construct hccn_table + device_ips: Dict[Any, Any] = {} + with open('/etc/hccn.conf', 'r') as fin: + for hccn_item in fin.readlines(): + if hccn_item.strip().startswith('address_'): + device_id, device_ip = hccn_item.split('=') + device_id = device_id.split('_')[1] + device_ips[device_id] = device_ip.strip() + + arch = platform.processor() + hccn_table = {'board_id': {'aarch64': '0x002f', 'x86_64': '0x0000'}[arch], + 'chip_info': '910', + 'deploy_mode': 'lab', + 'group_count': '1', + 'group_list': []} + instance_list = [] + rank_id = 0 + for instance_id in device_num_list: + instance = {'devices': []} + device_id = visible_devices[instance_id] + device_ip = device_ips[device_id] + instance['devices'].append({ + 'device_id': device_id, + 'device_ip': device_ip, + }) + print('rank_id:{}, device_id:{}, device_ip:{}'.format(rank_id, device_id, device_ip)) + instance['rank_id'] = str(rank_id) + rank_id += 1 + instance['server_id'] = server_id + instance_list.append(instance) + hccn_table['group_list'].append({ + 'device_num': str(len(device_num_list)), + 'server_num': '1', + 'group_name': '', + 'instance_count': str(len(device_num_list)), + 'instance_list': instance_list, + }) + hccn_table['para_plane_nic_location'] = 'device' + hccn_table['para_plane_nic_name'] = [] + for instance_id in device_num_list: + eth_id = visible_devices[instance_id] + hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id)) + hccn_table['para_plane_nic_num'] = str(len(device_num_list)) + hccn_table['status'] = 'completed' + + # save hccn_table to file + table_path = os.getcwd() + table_fn = os.path.join(table_path, + 'hccl_{}p_{}_{}.json'.format(len(device_num_list), "".join(map(str, device_num_list)), + server_id)) + with open(table_fn, 'w') as table_fp: + json.dump(hccn_table, table_fp, indent=4) + sys.stdout.flush() + print("Completed: hccl file was save in :", table_fn) + + +if __name__ == "__main__": + main()