提交 40cea1e4 编写于 作者: S ShawnXuan

Support multiple nodes

上级 519bbc50
......@@ -13,7 +13,6 @@ args = parser.parse_args()
configs.print_args(args)
from util import Snapshot, Summary, InitNodes, Metric
#from dali_util import get_rec_iter
import ofrecord_util
from job_function_util import get_train_config, get_val_config
import oneflow as flow
......@@ -99,9 +98,6 @@ def main():
for i in range(num_val_steps):
InferenceNet().async_get(metric.metric_cb(epoch, i))
#summary.save()
#snapshot.save('epoch_{}'.format(epoch+1))
if __name__ == "__main__":
main()
......@@ -13,10 +13,11 @@ import oneflow as flow
# InitNodes(args): when more than one node is requested, builds a list of
# {"addr": ...} dicts and passes it to flow.env.machine.
# NOTE(review): this is a diff view whose +/- markers and indentation were
# stripped, so pre-change and post-change lines appear side by side. The hunk
# header (-13,10 +13,11) is consistent with 2 removed and 3 added lines;
# the comments below mark the presumed ones -- confirm against the real file.
def InitNodes(args):
# Guard for the multi-node case. How many of the following statements it
# covers cannot be seen here because the indentation is missing.
if args.num_nodes > 1:
# Presumably ADDED by this commit: requires at least num_nodes entries in
# args.node_ips. NOTE(review): an assert is skipped under `python -O`.
assert args.num_nodes <= len(args.node_ips)
# The control port is hard-coded to 12138.
flow.env.ctrl_port(12138)
# Accumulates one {"addr": ...} dict per node.
nodes = []
# Presumably REMOVED by this commit: the old loop split a comma-separated
# string held in args.node_list.
for n in args.node_list.strip().split(","):
# Presumably ADDED by this commit: iterates args.node_ips directly.
# Assumes node_ips is already a sequence of address strings -- TODO confirm
# how the argument parser turns --node_ips into that sequence.
for ip in args.node_ips:
addr_dict = {}
# Presumably REMOVED (pairs with the node_list loop variable `n`).
addr_dict["addr"] = n
# Presumably ADDED (pairs with the node_ips loop variable `ip`).
addr_dict["addr"] = ip
nodes.append(addr_dict)
# Hands the collected address dicts to flow.env.machine.
flow.env.machine(nodes)
......
......@@ -9,6 +9,8 @@ DATA_ROOT=/dataset/ImageNet/ofrecord
--train_data_part_num=256 \
--val_data_dir=$DATA_ROOT/validation \
--val_data_part_num=256 \
--num_nodes=2 \
--node_ips='11.11.1.13,11.11.1.14' \
--gpu_num_per_node=4 \
--optimizer="momentum-cosine-decay" \
--learning_rate=0.256 \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册