提交 40f9d6fc 编写于 作者: M mindspore-ci-bot 提交者: Gitee

!665 add distribute train for vgg16

Merge pull request !665 from caojian05/mindspore_master_dev
......@@ -28,7 +28,11 @@ def create_dataset(data_home, repeat_num=1, training=True):
data_dir = os.path.join(data_home, "cifar-10-batches-bin")
if not training:
data_dir = os.path.join(data_home, "cifar-10-verify-bin")
data_set = ds.Cifar10Dataset(data_dir)
rank_size = int(os.environ.get("RANK_SIZE")) if os.environ.get("RANK_SIZE") else None
rank_id = int(os.environ.get("RANK_ID")) if os.environ.get("RANK_ID") else None
data_set = ds.Cifar10Dataset(data_dir, num_shards=rank_size, shard_id=rank_id)
resize_height = cfg.image_height
resize_width = cfg.image_width
rescale = 1.0 / 255.0
......
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 2 ]
then
echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH]"
exit 1
fi
if [ ! -f $1 ]
then
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
exit 1
fi
if [ ! -d $2 ]
then
echo "error: DATA_PATH=$2 is not a directory"
exit 1
fi
ulimit -u unlimited
export DEVICE_NUM=8
export RANK_SIZE=8
export MINDSPORE_HCCL_CONFIG_PATH=$1
for((i=0; i<${DEVICE_NUM}; i++))
do
export DEVICE_ID=$i
export RANK_ID=$i
rm -rf ./train_parallel$i
mkdir ./train_parallel$i
cp *.py ./train_parallel$i
cp *.sh ./train_parallel$i
cd ./train_parallel$i || exit
echo "start training for rank $RANK_ID, device $DEVICE_ID"
env > env.log
python train.py --data_path=$2 --device_id=$i &> log &
cd ..
done
......@@ -17,16 +17,18 @@
python train.py --data_path=$DATA_HOME --device_id=$DEVICE_ID
"""
import argparse
import os
import random
import numpy as np
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.communication.management import init
from mindspore.nn.optim.momentum import Momentum
from mindspore.train.model import Model
from mindspore.train.model import Model, ParallelMode
from mindspore import context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.model_zoo.vgg import vgg16
import dataset
from dataset import create_dataset
from config import cifar_cfg as cfg
random.seed(1)
np.random.seed(1)
......@@ -62,18 +64,31 @@ if __name__ == '__main__':
context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
context.set_context(device_id=args_opt.device_id)
context.set_context(enable_task_sink=True)
context.set_context(enable_loop_sink=True)
context.set_context(enable_mem_reuse=True, enable_hccl=False)
device_num = int(os.environ.get("DEVICE_NUM", 1))
if device_num > 1:
context.reset_auto_parallel_context()
context.set_context(enable_hccl=True)
context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True)
init()
dataset = create_dataset(args_opt.data_path, cfg.epoch_size)
batch_num = dataset.get_dataset_size()
net = vgg16(num_classes=cfg.num_classes)
lr = lr_steps(0, lr_max=cfg.lr_init, total_epochs=cfg.epoch_size, steps_per_epoch=50000 // cfg.batch_size)
lr = lr_steps(0, lr_max=cfg.lr_init, total_epochs=cfg.epoch_size, steps_per_epoch=batch_num)
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), Tensor(lr), cfg.momentum, weight_decay=cfg.weight_decay)
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False)
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
amp_level="O2", keep_batchnorm_fp32=False, loss_scale_manager=None)
dataset = dataset.create_dataset(args_opt.data_path, cfg.epoch_size)
batch_num = dataset.get_dataset_size()
config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 5, keep_checkpoint_max=cfg.keep_checkpoint_max)
time_cb = TimeMonitor(data_size=batch_num)
ckpoint_cb = ModelCheckpoint(prefix="train_vgg_cifar10", directory="./", config=config_ck)
loss_cb = LossMonitor()
model.train(cfg.epoch_size, dataset, callbacks=[ckpoint_cb, loss_cb])
model.train(cfg.epoch_size, dataset, callbacks=[time_cb, ckpoint_cb, loss_cb])
print("train success")
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册