提交 97610885 编写于 作者: C chenzomi

bug fix while evaluation

上级 771a88d4
...@@ -133,7 +133,7 @@ result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625. ...@@ -133,7 +133,7 @@ result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.
#### Inference Performance #### Inference Performance
| Parameters | GoogLeNet | | | | Parameters | | | |
| -------------------------- | ----------------------------- | ------------------------- | -------------------- | | -------------------------- | ----------------------------- | ------------------------- | -------------------- |
| Model Version | V1 | | | | Model Version | V1 | | |
| Resource | Huawei 910 | NV SMX2 V100-32G | Huawei 310 | | Resource | Huawei 910 | NV SMX2 V100-32G | Huawei 310 |
......
...@@ -36,21 +36,23 @@ args_opt = parser.parse_args() ...@@ -36,21 +36,23 @@ args_opt = parser.parse_args()
if __name__ == '__main__': if __name__ == '__main__':
config_platform = None config_platform = None
net = None
if args_opt.platform == "Ascend": if args_opt.platform == "Ascend":
config_platform = config_ascend config_platform = config_ascend
device_id = int(os.getenv('DEVICE_ID')) device_id = int(os.getenv('DEVICE_ID'))
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
device_id=device_id, save_graphs=False) device_id=device_id, save_graphs=False)
net = mobilenet_v2(num_classes=config_platform.num_classes, platform="Ascend")
elif args_opt.platform == "GPU": elif args_opt.platform == "GPU":
config_platform = config_gpu config_platform = config_gpu
context.set_context(mode=context.GRAPH_MODE, context.set_context(mode=context.GRAPH_MODE,
device_target="GPU", save_graphs=False) device_target="GPU", save_graphs=False)
net = mobilenet_v2(num_classes=config_platform.num_classes, platform="GPU")
else: else:
raise ValueError("Unsupport platform.") raise ValueError("Unsupport platform.")
loss = nn.SoftmaxCrossEntropyWithLogits( loss = nn.SoftmaxCrossEntropyWithLogits(
is_grad=False, sparse=True, reduction='mean') is_grad=False, sparse=True, reduction='mean')
net = mobilenet_v2(num_classes=config_platform.num_classes)
if args_opt.platform == "Ascend": if args_opt.platform == "Ascend":
net.to_float(mstype.float16) net.to_float(mstype.float16)
......
...@@ -52,4 +52,4 @@ python ${BASEPATH}/../eval.py \ ...@@ -52,4 +52,4 @@ python ${BASEPATH}/../eval.py \
--platform=$1 \ --platform=$1 \
--dataset_path=$2 \ --dataset_path=$2 \
--checkpoint_path=$3 \ --checkpoint_path=$3 \
&> infer.log & # dataset val folder path &> ../infer.log & # dataset val folder path
...@@ -36,13 +36,13 @@ run_ascend() ...@@ -36,13 +36,13 @@ run_ascend()
fi fi
mkdir ../train mkdir ../train
cd ../train || exit cd ../train || exit
python ${BASEPATH}/../launch.py \ python ${BASEPATH}/../src/launch.py \
--nproc_per_node=$2 \ --nproc_per_node=$2 \
--visible_devices=$4 \ --visible_devices=$4 \
--server_id=$3 \ --server_id=$3 \
--training_script=${BASEPATH}/train.py \ --training_script=${BASEPATH}/../train.py \
--dataset_path=$5 \ --dataset_path=$5 \
--platform=$1 &> train.log & # dataset train folder --platform=$1 &> ../train.log & # dataset train folder
} }
run_gpu() run_gpu()
...@@ -73,7 +73,7 @@ run_gpu() ...@@ -73,7 +73,7 @@ run_gpu()
python ${BASEPATH}/../train.py \ python ${BASEPATH}/../train.py \
--dataset_path=$4 \ --dataset_path=$4 \
--platform=$1 \ --platform=$1 \
&> train.log & # dataset train folder &> ../train.log & # dataset train folder
} }
if [ $# -gt 5 ] || [ $# -lt 4 ] if [ $# -gt 5 ] || [ $# -lt 4 ]
......
...@@ -21,7 +21,6 @@ import mindspore.dataset.engine as de ...@@ -21,7 +21,6 @@ import mindspore.dataset.engine as de
import mindspore.dataset.transforms.vision.c_transforms as C import mindspore.dataset.transforms.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2 import mindspore.dataset.transforms.c_transforms as C2
def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch_size=32): def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch_size=32):
""" """
create a train or eval dataset create a train or eval dataset
...@@ -29,8 +28,8 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch ...@@ -29,8 +28,8 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
Args: Args:
dataset_path(string): the path of dataset. dataset_path(string): the path of dataset.
do_train(bool): whether dataset is used for train or eval. do_train(bool): whether dataset is used for train or eval.
repeat_num(int): the repeat times of dataset. Default: 1 repeat_num(int): the repeat times of dataset. Default: 1.
batch_size(int): the batch size of dataset. Default: 32 batch_size(int): the batch size of dataset. Default: 32.
Returns: Returns:
dataset dataset
...@@ -44,7 +43,12 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch ...@@ -44,7 +43,12 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True, ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=rank_size, shard_id=rank_id) num_shards=rank_size, shard_id=rank_id)
elif platform == "GPU": elif platform == "GPU":
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True) if do_train:
from mindspore.communication.management import get_rank, get_group_size
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=get_group_size(), shard_id=get_rank())
else:
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True)
else: else:
raise ValueError("Unsupport platform.") raise ValueError("Unsupport platform.")
......
...@@ -32,7 +32,7 @@ from mindspore.train.model import Model, ParallelMode ...@@ -32,7 +32,7 @@ from mindspore.train.model import Model, ParallelMode
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, Callback from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, Callback
from mindspore.train.loss_scale_manager import FixedLossScaleManager from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.communication.management import init from mindspore.communication.management import init, get_group_size
import mindspore.dataset.engine as de import mindspore.dataset.engine as de
from src.dataset import create_dataset from src.dataset import create_dataset
from src.lr_generator import get_lr from src.lr_generator import get_lr
...@@ -146,7 +146,7 @@ class Monitor(Callback): ...@@ -146,7 +146,7 @@ class Monitor(Callback):
self.losses.append(step_loss) self.losses.append(step_loss)
cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num
print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.3f}/{:5.3f}], time:[{:5.3f}], lr:[{:5.3f}]".format( print("epoch: [{:3d}/{:3d}], step:[{:5d}/{:5d}], loss:[{:5.3f}/{:5.3f}], time:[{:5.3f}], lr:[{:5.5f}]".format(
cb_params.cur_epoch_num - cb_params.cur_epoch_num -
1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num, step_loss, 1, cb_params.epoch_num, cur_step_in_epoch, cb_params.batch_num, step_loss,
np.mean(self.losses), step_mseconds, self.lr_init[cb_params.cur_step_num - 1])) np.mean(self.losses), step_mseconds, self.lr_init[cb_params.cur_step_num - 1]))
...@@ -157,6 +157,11 @@ if __name__ == '__main__': ...@@ -157,6 +157,11 @@ if __name__ == '__main__':
# train on gpu # train on gpu
print("train args: ", args_opt, "\ncfg: ", config_gpu) print("train args: ", args_opt, "\ncfg: ", config_gpu)
init('nccl')
context.set_auto_parallel_context(parallel_mode="data_parallel",
mirror_mean=True,
device_num=get_group_size())
# define net # define net
net = mobilenet_v2(num_classes=config_gpu.num_classes, platform="GPU") net = mobilenet_v2(num_classes=config_gpu.num_classes, platform="GPU")
# define loss # define loss
...@@ -216,14 +221,14 @@ if __name__ == '__main__': ...@@ -216,14 +221,14 @@ if __name__ == '__main__':
init() init()
epoch_size = config_ascend.epoch_size epoch_size = config_ascend.epoch_size
net = mobilenet_v2(num_classes=config_ascend.num_classes) net = mobilenet_v2(num_classes=config_ascend.num_classes, platform="Ascend")
net.to_float(mstype.float16) net.to_float(mstype.float16)
for _, cell in net.cells_and_names(): for _, cell in net.cells_and_names():
if isinstance(cell, nn.Dense): if isinstance(cell, nn.Dense):
cell.to_float(mstype.float32) cell.to_float(mstype.float32)
if config_ascend.label_smooth > 0: if config_ascend.label_smooth > 0:
loss = CrossEntropyWithLabelSmooth( loss = CrossEntropyWithLabelSmooth(
smooth_factor=config_ascend.label_smooth, num_classes=config.num_classes) smooth_factor=config_ascend.label_smooth, num_classes=config_ascend.num_classes)
else: else:
loss = SoftmaxCrossEntropyWithLogits( loss = SoftmaxCrossEntropyWithLogits(
is_grad=False, sparse=True, reduction='mean') is_grad=False, sparse=True, reduction='mean')
......
...@@ -133,7 +133,7 @@ result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625. ...@@ -133,7 +133,7 @@ result: {'acc': 0.71976314102564111} ckpt=/path/to/checkpoint/mobilenet-200_625.
#### Inference Performance #### Inference Performance
| Parameters | GoogLeNet | | | | Parameters | | | |
| -------------------------- | ----------------------------- | ------------------------- | -------------------- | | -------------------------- | ----------------------------- | ------------------------- | -------------------- |
| Model Version | V1 | | | | Model Version | V1 | | |
| Resource | Huawei 910 | NV SMX2 V100-32G | Huawei 310 | | Resource | Huawei 910 | NV SMX2 V100-32G | Huawei 310 |
......
...@@ -24,7 +24,8 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net ...@@ -24,7 +24,8 @@ from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.common import dtype as mstype from mindspore.common import dtype as mstype
from src.dataset import create_dataset from src.dataset import create_dataset
from src.config import config_ascend, config_gpu from src.config import config_ascend, config_gpu
from src.mobilenetV2 import mobilenet_v2 from src.mobilenetV3 import mobilenet_v3_large
parser = argparse.ArgumentParser(description='Image classification') parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path') parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
...@@ -49,7 +50,7 @@ if __name__ == '__main__': ...@@ -49,7 +50,7 @@ if __name__ == '__main__':
loss = nn.SoftmaxCrossEntropyWithLogits( loss = nn.SoftmaxCrossEntropyWithLogits(
is_grad=False, sparse=True, reduction='mean') is_grad=False, sparse=True, reduction='mean')
net = mobilenet_v2(num_classes=config_platform.num_classes) net = mobilenet_v3_large(num_classes=config_platform.num_classes)
if args_opt.platform == "Ascend": if args_opt.platform == "Ascend":
net.to_float(mstype.float16) net.to_float(mstype.float16)
......
...@@ -42,14 +42,14 @@ export RANK_ID=0 ...@@ -42,14 +42,14 @@ export RANK_ID=0
export RANK_SIZE=1 export RANK_SIZE=1
if [ -d "eval" ]; if [ -d "eval" ];
then then
rm -rf ./eval rm -rf ../eval
fi fi
mkdir ./eval mkdir ../eval
cd ./eval || exit cd ../eval || exit
# launch # launch
python ${BASEPATH}/eval.py \ python ${BASEPATH}/../eval.py \
--platform=$1 \ --platform=$1 \
--dataset_path=$2 \ --dataset_path=$2 \
--checkpoint_path=$3 \ --checkpoint_path=$3 \
&> infer.log & # dataset val folder path &> ../infer.log & # dataset val folder path
...@@ -31,17 +31,17 @@ run_ascend() ...@@ -31,17 +31,17 @@ run_ascend()
export PYTHONPATH=${BASEPATH}:$PYTHONPATH export PYTHONPATH=${BASEPATH}:$PYTHONPATH
if [ -d "train" ]; if [ -d "train" ];
then then
rm -rf ./train rm -rf ../train
fi fi
mkdir ./train mkdir ../train
cd ./train || exit cd ../train || exit
python ${BASEPATH}/launch.py \ python ${BASEPATH}/../src/launch.py \
--nproc_per_node=$2 \ --nproc_per_node=$2 \
--visible_devices=$4 \ --visible_devices=$4 \
--server_id=$3 \ --server_id=$3 \
--training_script=${BASEPATH}/train.py \ --training_script=${BASEPATH}/../train.py \
--dataset_path=$5 \ --dataset_path=$5 \
--platform=$1 &> train.log & # dataset train folder --platform=$1 &> ../train.log & # dataset train folder
} }
run_gpu() run_gpu()
...@@ -62,17 +62,17 @@ run_gpu() ...@@ -62,17 +62,17 @@ run_gpu()
export PYTHONPATH=${BASEPATH}:$PYTHONPATH export PYTHONPATH=${BASEPATH}:$PYTHONPATH
if [ -d "train" ]; if [ -d "train" ];
then then
rm -rf ./train rm -rf ../train
fi fi
mkdir ./train mkdir ../train
cd ./train || exit cd ../train || exit
export CUDA_VISIBLE_DEVICES="$3" export CUDA_VISIBLE_DEVICES="$3"
mpirun -n $2 --allow-run-as-root \ mpirun -n $2 --allow-run-as-root \
python ${BASEPATH}/train.py \ python ${BASEPATH}/../train.py \
--dataset_path=$4 \ --dataset_path=$4 \
--platform=$1 \ --platform=$1 \
&> train.log & # dataset train folder &> ../train.log & # dataset train folder
} }
if [ $# -gt 5 ] || [ $# -lt 4 ] if [ $# -gt 5 ] || [ $# -lt 4 ]
......
...@@ -44,7 +44,12 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch ...@@ -44,7 +44,12 @@ def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True, ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=rank_size, shard_id=rank_id) num_shards=rank_size, shard_id=rank_id)
elif platform == "GPU": elif platform == "GPU":
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True) if do_train:
from mindspore.communication.management import get_rank, get_group_size
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True,
num_shards=get_group_size(), shard_id=get_rank())
else:
ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True)
else: else:
raise ValueError("Unsupport platform.") raise ValueError("Unsupport platform.")
......
...@@ -33,7 +33,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, Callback ...@@ -33,7 +33,7 @@ from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, Callback
from mindspore.train.loss_scale_manager import FixedLossScaleManager from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.train.serialization import load_checkpoint, load_param_into_net
import mindspore.dataset.engine as de import mindspore.dataset.engine as de
from mindspore.communication.management import init from mindspore.communication.management import init, get_group_size
from src.dataset import create_dataset from src.dataset import create_dataset
from src.lr_generator import get_lr from src.lr_generator import get_lr
from src.config import config_gpu, config_ascend from src.config import config_gpu, config_ascend
...@@ -157,6 +157,11 @@ if __name__ == '__main__': ...@@ -157,6 +157,11 @@ if __name__ == '__main__':
# train on gpu # train on gpu
print("train args: ", args_opt, "\ncfg: ", config_gpu) print("train args: ", args_opt, "\ncfg: ", config_gpu)
init('nccl')
context.set_auto_parallel_context(parallel_mode="data_parallel",
mirror_mean=True,
device_num=get_group_size())
# define net # define net
net = mobilenet_v3_large(num_classes=config_gpu.num_classes) net = mobilenet_v3_large(num_classes=config_gpu.num_classes)
# define loss # define loss
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册