提交 1289c3e4 编写于 作者: M mindspore-ci-bot 提交者: Gitee

!1592 bug fix while evaluation

Merge pull request !1592 from SanjayChan/r0.3
......@@ -133,7 +133,7 @@ result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.
#### Inference Performance
| Parameters | GoogLeNet | | |
| Parameters | | | |
| -------------------------- | ----------------------------- | ------------------------- | -------------------- |
| Model Version | V1 | | |
| Resource | Huawei 910 | NV SMX2 V100-32G | Huawei 310 |
......
......@@ -36,21 +36,23 @@ args_opt = parser.parse_args()
if __name__ == '__main__':
config_platform = None
net = None
if args_opt.platform == "Ascend":
config_platform = config_ascend
device_id = int(os.getenv('DEVICE_ID'))
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
device_id=device_id, save_graphs=False)
net = mobilenet_v2(num_classes=config_platform.num_classes, platform="Ascend")
elif args_opt.platform == "GPU":
config_platform = config_gpu
context.set_context(mode=context.GRAPH_MODE,
device_target="GPU", save_graphs=False)
net = mobilenet_v2(num_classes=config_platform.num_classes, platform="GPU")
else:
raise ValueError("Unsupport platform.")
loss = nn.SoftmaxCrossEntropyWithLogits(
is_grad=False, sparse=True, reduction='mean')
net = mobilenet_v2(num_classes=config_platform.num_classes)
if args_opt.platform == "Ascend":
net.to_float(mstype.float16)
......
......@@ -52,4 +52,4 @@ python ${BASEPATH}/../eval.py \
--platform=$1 \
--dataset_path=$2 \
--checkpoint_path=$3 \
&> infer.log & # dataset val folder path
&> ../infer.log & # dataset val folder path
......@@ -36,13 +36,13 @@ run_ascend()
fi
mkdir ../train
cd ../train || exit
python ${BASEPATH}/../launch.py \
python ${BASEPATH}/../src/launch.py \
--nproc_per_node=$2 \
--visible_devices=$4 \
--server_id=$3 \
--training_script=${BASEPATH}/train.py \
--training_script=${BASEPATH}/../train.py \
--dataset_path=$5 \
--platform=$1 &> train.log & # dataset train folder
--platform=$1 &> ../train.log & # dataset train folder
}
run_gpu()
......@@ -73,7 +73,7 @@ run_gpu()
python ${BASEPATH}/../train.py \
--dataset_path=$4 \
--platform=$1 \
&> train.log & # dataset train folder
&> ../train.log & # dataset train folder
}
if [ $# -gt 5 ] || [ $# -lt 4 ]
......
......@@ -21,7 +21,6 @@ import mindspore.dataset.engine as de
import mindspore.dataset.transforms.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2
def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch_size=32):
"""
create a train or eval dataset
......@@ -29,8 +28,8 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
Args:
dataset_path(string): the path of dataset.
do_train(bool): whether dataset is used for train or eval.
repeat_num(int): the repeat times of dataset. Default: 1
batch_size(int): the batch size of dataset. Default: 32
repeat_num(int): the repeat times of dataset. Default: 1.
batch_size(int): the batch size of dataset. Default: 32.
Returns:
dataset
......@@ -44,7 +43,12 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=rank_size, shard_id=rank_id)
elif platform == "GPU":
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True)
if do_train:
from mindspore.communication.management import get_rank, get_group_size
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=get_group_size(), shard_id=get_rank())
else:
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True)
else:
raise ValueError("Unsupport platform.")
......
......@@ -32,7 +32,7 @@ from mindspore.train.model import Model, ParallelMode
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, Callback
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.communication.management import init
from mindspore.communication.management import init, get_group_size
import mindspore.dataset.engine as de
from src.dataset import create_dataset
from src.lr_generator import get_lr
......@@ -146,7 +146,7 @@ class Monitor(Callback):
self.losses.append(step_loss)
cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num
print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.3f}/{:5.3f}], time:[{:5.3f}], lr:[{:5.3f}]".format(
print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.3f}/{:5.3f}], time:[{:5.3f}], lr:[{:5.5f}]".format(
cb_params.cur_epoch_num -
1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num, step_loss,
np.mean(self.losses), step_mseconds, self.lr_init[cb_params.cur_step_num - 1]))
......@@ -157,6 +157,11 @@ if __name__ == '__main__':
# train on gpu
print("train args: ", args_opt, "\ncfg: ", config_gpu)
init('nccl')
context.set_auto_parallel_context(parallel_mode="data_parallel",
mirror_mean=True,
device_num=get_group_size())
# define net
net = mobilenet_v2(num_classes=config_gpu.num_classes, platform="GPU")
# define loss
......@@ -216,14 +221,14 @@ if __name__ == '__main__':
init()
epoch_size = config_ascend.epoch_size
net = mobilenet_v2(num_classes=config_ascend.num_classes)
net = mobilenet_v2(num_classes=config_ascend.num_classes, platform="Ascend")
net.to_float(mstype.float16)
for _, cell in net.cells_and_names():
if isinstance(cell, nn.Dense):
cell.to_float(mstype.float32)
if config_ascend.label_smooth > 0:
loss = CrossEntropyWithLabelSmooth(
smooth_factor=config_ascend.label_smooth, num_classes=config.num_classes)
smooth_factor=config_ascend.label_smooth, num_classes=config_ascend.num_classes)
else:
loss = SoftmaxCrossEntropyWithLogits(
is_grad=False, sparse=True, reduction='mean')
......
......@@ -133,7 +133,7 @@ result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.
#### Inference Performance
| Parameters | GoogLeNet | | |
| Parameters | | | |
| -------------------------- | ----------------------------- | ------------------------- | -------------------- |
| Model Version | V1 | | |
| Resource | Huawei 910 | NV SMX2 V100-32G | Huawei 310 |
......
......@@ -24,7 +24,8 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.common import dtype as mstype
from src.dataset import create_dataset
from src.config import config_ascend, config_gpu
from src.mobilenetV2 import mobilenet_v2
from src.mobilenetV3 import mobilenet_v3_large
parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
......@@ -49,7 +50,7 @@ if __name__ == '__main__':
loss = nn.SoftmaxCrossEntropyWithLogits(
is_grad=False, sparse=True, reduction='mean')
net = mobilenet_v2(num_classes=config_platform.num_classes)
net = mobilenet_v3_large(num_classes=config_platform.num_classes)
if args_opt.platform == "Ascend":
net.to_float(mstype.float16)
......
......@@ -42,14 +42,14 @@ export RANK_ID=0
export RANK_SIZE=1
if [ -d "eval" ];
then
rm -rf ./eval
rm -rf ../eval
fi
mkdir ./eval
cd ./eval || exit
mkdir ../eval
cd ../eval || exit
# launch
python ${BASEPATH}/eval.py \
python ${BASEPATH}/../eval.py \
--platform=$1 \
--dataset_path=$2 \
--checkpoint_path=$3 \
&> infer.log & # dataset val folder path
&> ../infer.log & # dataset val folder path
......@@ -31,17 +31,17 @@ run_ascend()
export PYTHONPATH=${BASEPATH}:$PYTHONPATH
if [ -d "train" ];
then
rm -rf ./train
rm -rf ../train
fi
mkdir ./train
cd ./train || exit
python ${BASEPATH}/launch.py \
mkdir ../train
cd ../train || exit
python ${BASEPATH}/../src/launch.py \
--nproc_per_node=$2 \
--visible_devices=$4 \
--server_id=$3 \
--training_script=${BASEPATH}/train.py \
--training_script=${BASEPATH}/../train.py \
--dataset_path=$5 \
--platform=$1 &> train.log & # dataset train folder
--platform=$1 &> ../train.log & # dataset train folder
}
run_gpu()
......@@ -62,17 +62,17 @@ run_gpu()
export PYTHONPATH=${BASEPATH}:$PYTHONPATH
if [ -d "train" ];
then
rm -rf ./train
rm -rf ../train
fi
mkdir ./train
cd ./train || exit
mkdir ../train
cd ../train || exit
export CUDA_VISIBLE_DEVICES="$3"
mpirun -n $2 --allow-run-as-root \
python ${BASEPATH}/train.py \
python ${BASEPATH}/../train.py \
--dataset_path=$4 \
--platform=$1 \
&> train.log & # dataset train folder
&> ../train.log & # dataset train folder
}
if [ $# -gt 5 ] || [ $# -lt 4 ]
......
......@@ -44,7 +44,12 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=rank_size, shard_id=rank_id)
elif platform == "GPU":
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True)
if do_train:
from mindspore.communication.management import get_rank, get_group_size
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=get_group_size(), shard_id=get_rank())
else:
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True)
else:
raise ValueError("Unsupport platform.")
......
......@@ -33,7 +33,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, Callback
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.serialization import load_checkpoint, load_param_into_net
import mindspore.dataset.engine as de
from mindspore.communication.management import init
from mindspore.communication.management import init, get_group_size
from src.dataset import create_dataset
from src.lr_generator import get_lr
from src.config import config_gpu, config_ascend
......@@ -157,6 +157,11 @@ if __name__ == '__main__':
# train on gpu
print("train args: ", args_opt, "\ncfg: ", config_gpu)
init('nccl')
context.set_auto_parallel_context(parallel_mode="data_parallel",
mirror_mean=True,
device_num=get_group_size())
# define net
net = mobilenet_v3_large(num_classes=config_gpu.num_classes)
# define loss
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册