Commit b92b4ded authored by: M moran

fix alexnet and rename .sh

Parent 3bb78a2e
......@@ -29,7 +29,7 @@ def render_template(template_file_path, context):
class TemplateManager:
"""BaseNetwork code generator."""
replace_template_suffixes = [('.py-tpl', '.py')]
replace_template_suffixes = [('.py-tpl', '.py'), ('.sh-tpl', '.sh'), ('.md-tpl', '.md')]
def __init__(self, template_base_dir, exclude_dirs=None, exclude_files=None):
self.template_base_dir = template_base_dir
......@@ -70,7 +70,7 @@ class TemplateManager:
"""Generate the network files."""
source_files = []
template_files = self.get_template_files()
extensions = tuple(options.get('extensions', '.py'))
extensions = tuple([new_extension for _, new_extension in self.replace_template_suffixes])
for template_file in template_files:
new_file_path = template_file
template_file_path = template_file
......
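For context, a minimal sketch (not part of the commit) of how the widened `replace_template_suffixes` table now drives both the generated-file extensions and the suffix renaming:

```python
# Illustrative sketch only; it mirrors the TemplateManager logic shown above.
replace_template_suffixes = [('.py-tpl', '.py'), ('.sh-tpl', '.sh'), ('.md-tpl', '.md')]

# The extension filter is now derived from the suffix table instead of a
# hard-coded '.py', so .sh and .md templates are generated as well.
extensions = tuple(new_extension for _, new_extension in replace_template_suffixes)
print(extensions)  # ('.py', '.sh', '.md')

def rename_template_file(template_file):
    """Map a template file name to its generated file name (hypothetical helper)."""
    for old_suffix, new_suffix in replace_template_suffixes:
        if template_file.endswith(old_suffix):
            return template_file[:-len(old_suffix)] + new_suffix
    return template_file

print(rename_template_file('run_standalone_train.sh-tpl'))  # run_standalone_train.sh
```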
# AlexNet Example
## Description
This example shows how to train AlexNet with the CIFAR-10 or ImageNet dataset in MindSpore.
## Requirements
- Install [MindSpore](https://www.mindspore.cn/install/en).
- Download the dataset; the directory structure is as follows:
{% if dataset=='Cifar10' %}
CIFAR-10
```
└─Data
├─test
│ cifar-10-verify-bin
└─train
cifar-10-batches-bin
```
{% elif dataset=='ImageNet' %}
ImageNet
```
└─Data
├─test
│ validation_preprocess
└─train
ilsvrc
```
{% endif %}
## Structure
```shell
.
└──alexnet
├── README.md
├── script
├── run_distribute_train.sh # launch distributed training(8 pcs)
├── run_eval.sh # launch evaluation
├── run_standalone_train.sh # launch standalone training(1 pcs)
├── run_distribute_train_gpu.sh # launch gpu distributed training(4 pcs)
├── run_eval_gpu.sh # launch gpu evaluation
└── run_standalone_train_gpu.sh # launch gpu standalone training(1 pcs)
├── src
├── config.py # parameter configuration
├── dataset.py # data preprocessing
├── generator_lr.py # generate learning rate for each step
└── alexnet.py # alexnet network definition
├── eval.py # eval net
└── train.py # train net
```
## Parameter configuration
Parameters for both training and evaluation can be set in src/config.py.
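For example, a few of the fields in the generated src/config.py look like the following (a sketch using the template defaults for CIFAR-10 with the Momentum optimizer):
```python
from easydict import EasyDict as edict

cfg = edict({
    'num_classes': 10,       # 1001 for ImageNet
    'lr': 0.002,
    'momentum': 0.9,
    'epoch_size': 1,
    'batch_size': 32,
    'image_height': 227,
    'image_width': 227,
    'save_checkpoint': True,
    'save_checkpoint_epochs': 5,
    'keep_checkpoint_max': 10,
})
```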
## Running the example
### Train
#### Usage
```
# distributed training
Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
# standalone training
Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
```
#### Launch
```
# distribute training example
./run_distribute_train.sh rank_table.json ~/dataset_path
# standalone training example
./run_standalone_train.sh ~/dataset_path
```
> For details about rank_table.json, see the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
#### Result
Training results are stored in the example path, in a folder whose name begins with "train" or "train_parallel". There you can find the checkpoint files together with results like the following in the log.
```
epoch: 1 step: 1, loss is 2.3041954
epoch: 1 step: 2, loss is 2.3079312
...
epoch: 1 step: 601, loss is 2.314184
epoch: 1 step: 603, loss is 2.305666
...
```
### Evaluation
#### Usage
```
# evaluation
Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
```
#### Launch
```
# evaluation example
./run_eval.sh ~/cifar-10-batches-bin ~/alexnet/train/alexnet-1.591.ckpt
```
> The checkpoint file is produced during the training process.
### Running on GPU
```
# distributed training example
./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
# standalone training example
./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
# infer example
./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
```
......@@ -24,6 +24,7 @@ from mindspore.common import dtype as mstype
from .config import cfg
from mindspore.communication.management import init, get_rank, get_group_size
def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target="Ascend"):
"""
create dataset for train or test
......@@ -66,6 +67,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, targe
cifar_ds = cifar_ds.repeat(repeat_size)
return cifar_ds
def _get_rank_info():
"""
get rank size and rank id
......
......@@ -24,6 +24,7 @@ from mindspore.communication.management import init, get_rank, get_group_size
from .config import cfg
def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target="Ascend"):
"""
create a train or eval imagenet dataset
......@@ -88,6 +89,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, targe
return ds
def _get_rank_info():
"""
get rank size and rank id
......
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Produce the dataset
"""
import os
import mindspore.dataset as ds
import mindspore.dataset.transforms.vision.c_transforms as CV
import mindspore.dataset.transforms.c_transforms as C
from mindspore.dataset.transforms.vision import Inter
from mindspore.common import dtype as mstype
from mindspore.communication.management import init, get_rank, get_group_size
from .config import cfg
def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target="Ascend"):
"""
create dataset for train or test
"""
if do_train:
data_path = os.path.join(data_path, "train")
else:
data_path = os.path.join(data_path, "test")
if target == 'Ascend':
device_num, rank_id = _get_rank_info()
elif target == 'GPU':
init("nccl")
rank_id = get_rank()
device_num = get_group_size()
else:
device_num = 1
# define dataset
if device_num == 1:
mnist_ds = ds.MnistDataset(data_path)
else:
mnist_ds = ds.MnistDataset(data_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
resize_height, resize_width = cfg.image_height, cfg.image_width
rescale = 1.0 / 255.0
shift = 0.0
rescale_nml = 1 / 0.3081
shift_nml = -1 * 0.1307 / 0.3081
# define map operations
resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode
rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)
rescale_op = CV.Rescale(rescale, shift)
hwc2chw_op = CV.HWC2CHW()
type_cast_op = C.TypeCast(mstype.int32)
# apply map operations on images
mnist_ds = mnist_ds.map(input_columns="label", operations=type_cast_op)
mnist_ds = mnist_ds.map(input_columns="image", operations=resize_op)
mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_op)
mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_nml_op)
mnist_ds = mnist_ds.map(input_columns="image", operations=hwc2chw_op)
# apply DatasetOps
buffer_size = 10000
mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size)
mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
mnist_ds = mnist_ds.repeat(repeat_size)
return mnist_ds
def _get_rank_info():
"""
get rank size and rank id
"""
rank_size = int(os.environ.get("RANK_SIZE", 1))
if rank_size > 1:
rank_size = get_group_size()
rank_id = get_rank()
else:
rank_size = 1
rank_id = 0
return rank_size, rank_id
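A minimal usage sketch of the `create_dataset` helper above on a single device (the dataset path is a placeholder; on Ascend or GPU the sharding branch is taken automatically):

```python
# Hypothetical single-device usage; "/path/to/MNIST_data" is a placeholder.
ds_train = create_dataset(data_path="/path/to/MNIST_data", batch_size=32,
                          repeat_size=1, do_train=True, target="CPU")
print("batches per epoch:", ds_train.get_dataset_size())
for batch in ds_train.create_dict_iterator():
    images, labels = batch["image"], batch["label"]
    break
```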
......@@ -18,6 +18,7 @@ eval alexnet according to model file:
python eval.py --data_path /YourDataPath --ckpt_path Your.ckpt
"""
import os
import argparse
from src.config import cfg
from src.dataset import create_dataset
......@@ -33,15 +34,16 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(description='MindSpore AlexNet Example')
parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU'],
help='device where the code will be implemented (default: Ascend)')
parser.add_argument('--data_path', type=str, default="./", help='path where the dataset is saved')
parser.add_argument('--ckpt_path', type=str, default="./ckpt", help='if is test, must provide\
parser.add_argument('--dataset_path', type=str, default="./", help='path where the dataset is saved')
parser.add_argument('--checkpoint_path', type=str, default="./ckpt", help='if mode is test, must provide\
path where the trained ckpt file is saved')
parser.add_argument('--dataset_sink_mode', type=str, default='True', choices = ['True', 'False'],
help='DataSet sink mode is True or False')
args = parser.parse_args()
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
data_path = args.data_path
device_id = int(os.getenv('DEVICE_ID'))
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False, device_id=device_id)
data_path = args.dataset_path
dataset_sink_mode = args.dataset_sink_mode=='True'
network = AlexNet(cfg.num_classes)
......@@ -50,15 +52,10 @@ if __name__ == "__main__":
{% elif loss=='SoftmaxCrossEntropyExpand' %}
net_loss = nn.SoftmaxCrossEntropyExpand(sparse=True)
{% endif %}
{% if optimizer=='Lamb' %}
net_opt = nn.Lamb(network.trainable_params(), learning_rate=cfg.lr)
{% elif optimizer=='Momentum' %}
net_opt = nn.Momentum(network.trainable_params(), learning_rate=cfg.lr, momentum=cfg.momentum)
{% endif %}
model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
model = Model(network, loss_fn=net_loss, metrics={"Accuracy": Accuracy()})
print("============== Starting Testing ==============")
param_dict = load_checkpoint(args.ckpt_path)
param_dict = load_checkpoint(args.checkpoint_path)
load_param_into_net(network, param_dict)
do_train = False
ds_eval = create_dataset(data_path=data_path, batch_size=cfg.batch_size, do_train=do_train,
......
......@@ -16,7 +16,7 @@
if [ $# != 2 ] && [ $# != 3 ]
then
echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
exit 1
fi
......@@ -31,15 +31,15 @@ get_real_path(){
PATH1=$(get_real_path $1)
PATH2=$(get_real_path $2)
if [ ! -d $PATH1 ]
if [ ! -f $PATH1 ]
then
echo "error: DATASET_PATH=$PATH1 is not a directory"
echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
exit 1
fi
if [ ! -f $PATH2 ]
if [ ! -d $PATH2 ]
then
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH2 is not a file"
echo "error: DATASET_PATH=$PATH2 is not a directory"
exit 1
fi
......@@ -56,15 +56,15 @@ fi
ulimit -u unlimited
export DEVICE_NUM=8
export RANK_SIZE=8
export MINDSPORE_HCCL_CONFIG_PATH=$PATH2
export RANK_TABLE_FILE=$PATH2
rank_start=$((DEVICE_NUM * SERVER_ID))
export RANK_SIZE=$DEVICE_NUM
export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
export RANK_TABLE_FILE=$PATH1
for((i=0; i<DEVICE_NUM; i++))
start_id=0
for((i=start_id; i<DEVICE_NUM + start_id; i++))
do
export DEVICE_ID=$i
export RANK_ID=$((rank_start + i))
export RANK_ID=$((i - start_id))
rm -rf ./train_parallel$i
mkdir ./train_parallel$i
cp ../*.py ./train_parallel$i
......@@ -75,12 +75,12 @@ do
env > env.log
if [ $# == 2 ]
then
python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH1 --dataset_sink_mode=False &> log &
python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --dataset_sink_mode=False &> log &
fi
if [ $# == 3 ]
then
python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH1 --pre_trained=$PATH2 --dataset_sink_mode=False &> log &
python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 --dataset_sink_mode=False &> log &
fi
cd ..
......
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 1 ]
then
echo "Usage: sh run_distribute_train_gpu.sh [DATASET_PATH]"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
PATH1=$(get_real_path $1)
if [ ! -d $PATH1 ]
then
echo "error: DATASET_PATH=$PATH1 is not a directory"
exit 1
fi
ulimit -u unlimited
export DEVICE_NUM=4
export RANK_SIZE=4
rm -rf ./train_parallel
mkdir ./train_parallel
cp ../*.py ./train_parallel
cp *.sh ./train_parallel
cp -r ../src ./train_parallel
cd ./train_parallel || exit
mpirun --allow-run-as-root -n $RANK_SIZE \
python train.py --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log &
\ No newline at end of file
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 1 ] && [ $# != 2 ]
then
echo "Usage: sh run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
PATH1=$(get_real_path $1)
if [ ! -d $PATH1 ]
then
echo "error: DATASET_PATH=$PATH1 is not a directory"
exit 1
fi
if [ $# == 2 ]
then
PATH2=$(get_real_path $2)
fi
if [ $# == 2 ] && [ ! -f $PATH2 ]
then
echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
exit 1
fi
ulimit -u unlimited
export DEVICE_NUM=4
export RANK_SIZE=$DEVICE_NUM
rm -rf ./train_parallel
mkdir ./train_parallel
cp ../*.py ./train_parallel
cp *.sh ./train_parallel
cp -r ../src ./train_parallel
cd ./train_parallel || exit
echo "start training"
env > env.log
if [ $# == 1 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE \
python train.py --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log &
fi
if [ $# == 2 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE \
python train.py --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
fi
......@@ -61,6 +61,6 @@ cp *.sh ./eval
cp -r ../src ./eval
cd ./eval || exit
env > env.log
echo "start evaluation for device $DEVICE_ID"
echo "start evaluation"
python eval.py --dataset_path=$PATH1 --checkpoint_path=$PATH2 --device_target="GPU" &> log &
cd ..
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 1 ]
then
echo "Usage: sh run_standalone_train_gpu.sh [DATASET_PATH]"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
PATH1=$(get_real_path $1)
if [ ! -d $PATH1 ]
then
echo "error: DATASET_PATH=$PATH1 is not a directory"
exit 1
fi
ulimit -u unlimited
export DEVICE_NUM=1
export DEVICE_ID=0
export RANK_ID=0
export RANK_SIZE=1
if [ -d "train" ];
then
rm -rf ./train
fi
mkdir ./train
cp ../*.py ./train
cp *.sh ./train
cp -r ../src ./train
cd ./train || exit
python train.py --device_target="GPU" --dataset_path=$PATH1 &> log &
cd ..
\ No newline at end of file
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 1 ] && [ $# != 2 ]
then
echo "Usage: sh run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
PATH1=$(get_real_path $1)
if [ ! -d $PATH1 ]
then
echo "error: DATASET_PATH=$PATH1 is not a directory"
exit 1
fi
if [ $# == 2 ]
then
PATH2=$(get_real_path $2)
fi
if [ $# == 2 ] && [ ! -f $PATH2 ]
then
echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
exit 1
fi
ulimit -u unlimited
export DEVICE_NUM=1
export DEVICE_ID=0
export RANK_ID=0
export RANK_SIZE=1
if [ -d "train" ];
then
rm -rf ./train
fi
mkdir ./train
cp ../*.py ./train
cp *.sh ./train
cp -r ../src ./train
cd ./train || exit
echo "start training"
env > env.log
if [ $# == 1 ]
then
python train.py --device_target="GPU" --dataset_path=$PATH1 &> log &
fi
if [ $# == 2 ]
then
python train.py --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
fi
cd ..
......@@ -17,17 +17,20 @@ import mindspore.nn as nn
from mindspore.common.initializer import TruncatedNormal
from mindspore.ops import operations as P
def conv(in_channels, out_channels, kernel_size, stride=1, padding=0, pad_mode="valid"):
weight = weight_variable()
return nn.Conv2d(in_channels, out_channels,
kernel_size=kernel_size, stride=stride, padding=padding,
weight_init=weight, has_bias=False, pad_mode=pad_mode)
def fc_with_initialize(input_channels, out_channels):
weight = weight_variable()
bias = weight_variable()
return nn.Dense(input_channels, out_channels, weight, bias)
def weight_variable():
return TruncatedNormal(0.02)
......
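As a quick illustration of how these helpers are used when assembling the network (channel sizes below are placeholders, not the actual AlexNet configuration):

```python
# Hypothetical usage of the helpers above; layer sizes are illustrative only.
conv1 = conv(in_channels=3, out_channels=96, kernel_size=11, stride=4)
fc1 = fc_with_initialize(input_channels=256 * 6 * 6, out_channels=4096)
```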
......@@ -26,15 +26,21 @@ cfg = edict({
{% elif dataset=='ImageNet' %}
'num_classes': 1001,
{% endif %}
'lr': 0.002,
{% if optimizer=='Momentum' %}
'lr': 0.002,
"momentum": 0.9,
{% elif optimizer=='SGD' %}
'lr': 0.1,
{% else %}
'lr': 0.001,
{% endif %}
'epoch_size': 1,
'batch_size': 32,
'loss_scale': 1024,
'buffer_size': 1000,
'image_height': 227,
'image_width': 227,
'weight_decay': 1e-4,
'save_checkpoint': True,
'save_checkpoint_epochs': 5,
'keep_checkpoint_max': 10,
......
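A quick way to check which branch of the new optimizer conditional renders is to feed the snippet to Jinja2 directly (a sketch; the wizard does this through render_template/TemplateManager rather than by hand):

```python
from jinja2 import Template

snippet = """{% if optimizer=='Momentum' %}
'lr': 0.002,
"momentum": 0.9,
{% elif optimizer=='SGD' %}
'lr': 0.1,
{% else %}
'lr': 0.001,
{% endif %}"""

print(Template(snippet).render(optimizer='SGD').strip())  # -> 'lr': 0.1,
```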
......@@ -18,6 +18,7 @@ train alexnet and get network model files(.ckpt) :
python train.py --data_path /YourDataPath
"""
import os
import argparse
from src.config import cfg
from src.dataset import create_dataset
......@@ -26,9 +27,10 @@ from src.alexnet import AlexNet
import mindspore.nn as nn
from mindspore import context
from mindspore import Tensor
from mindspore.train import Model
from mindspore.train import Model, ParallelMode
from mindspore.nn.metrics import Accuracy
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.communication.management import init, get_rank, get_group_size
......@@ -41,7 +43,7 @@ if __name__ == "__main__":
parser.add_argument('--device_num', type=int, default=1, help='Device num')
parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'],
help='device where the code will be implemented (default: Ascend)')
parser.add_argument('--data_path', type=str, default="./", help='path where the dataset is saved')
parser.add_argument('--dataset_path', type=str, default="./", help='path where the dataset is saved')
parser.add_argument('--pre_trained', type=str, default=None, help='Pre-trained checkpoint path')
parser.add_argument('--dataset_sink_mode', type=str, default='True', choices = ['True', 'False'],
help='DataSet sink mode is True or False')
......@@ -58,7 +60,6 @@ if __name__ == "__main__":
context.set_context(device_id=device_id, enable_auto_mixed_precision=True)
context.set_auto_parallel_context(device_num=args.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True)
auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160])
init()
# GPU target
......@@ -69,7 +70,7 @@ if __name__ == "__main__":
ckpt_save_dir = cfg.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"
data_path = args.data_path
data_path = args.dataset_path
do_train = True
ds_train = create_dataset(data_path=data_path, batch_size=cfg.batch_size, do_train=do_train,
......@@ -77,14 +78,14 @@ if __name__ == "__main__":
step_size = ds_train.get_dataset_size()
# define net
network = AlexNet(cfg.num_classes)
net = AlexNet(cfg.num_classes)
# init weight
if args.pre_trained:
param_dict = load_checkpoint(args.pre_trained)
load_param_into_net(network, param_dict)
load_param_into_net(net, param_dict)
else:
for _, cell in network.cells_and_names():
for _, cell in net.cells_and_names():
if isinstance(cell, nn.Conv2d):
cell.weight.default_input = weight_init.initializer(weight_init.XavierUniform(),
cell.weight.default_input.shape,
......@@ -93,20 +94,37 @@ if __name__ == "__main__":
cell.weight.default_input = weight_init.initializer(weight_init.TruncatedNormal(),
cell.weight.default_input.shape,
cell.weight.default_input.dtype).to_tensor()
{% if loss=='SoftmaxCrossEntropyWithLogits' %}
net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean")
{% elif loss=='SoftmaxCrossEntropyExpand' %}
net_loss = nn.SoftmaxCrossEntropyExpand(sparse=True)
{% endif %}
# define learning rate
lr = Tensor(get_lr(0, cfg.lr, cfg.epoch_size, ds_train.get_dataset_size()))
{% if optimizer=='Lamb' %}
net_opt = nn.Lamb(network.trainable_params(), learning_rate=lr)
{% elif optimizer=='Momentum' %}
net_opt = nn.Momentum(network.trainable_params(), learning_rate=lr, momentum=cfg.momentum)
{% endif %}
model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
# define loss, model
if target == "Ascend":
{% if loss=='SoftmaxCrossEntropyWithLogits' %}
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
{% elif loss=='SoftmaxCrossEntropyExpand' %}
loss = nn.SoftmaxCrossEntropyExpand(sparse=True)
{% endif %}
{% if optimizer=='Momentum' %}
opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr, momentum=cfg.momentum,
weight_decay=cfg.weight_decay, loss_scale=cfg.loss_scale)
{% else %}
opt = nn.{{ optimizer }}(net.trainable_params(), learning_rate=cfg.lr)
{% endif %}
loss_scale = FixedLossScaleManager(cfg.loss_scale, drop_overflow_update=False)
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'},
amp_level="O2", keep_batchnorm_fp32=False)
else:
{% if loss=='SoftmaxCrossEntropyWithLogits' %}
loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean")
{% elif loss=='SoftmaxCrossEntropyExpand' %}
loss = nn.SoftmaxCrossEntropyExpand(sparse=True)
{% endif %}
{% if optimizer=='Momentum' %}
opt = nn.Momentum(net.trainable_params(), learning_rate=lr, momentum=cfg.momentum)
{% else %}
opt = nn.{{ optimizer }}(net.trainable_params(), learning_rate=lr)
{% endif %}
model = Model(net, loss, opt, metrics={"Accuracy": Accuracy()})
# define callbacks
time_cb = TimeMonitor(data_size=step_size)
......@@ -114,7 +132,7 @@ if __name__ == "__main__":
cb = [time_cb, loss_cb]
if cfg.save_checkpoint:
cfg_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_epochs * step_size,
keep_checkpoint_max=cfg.keep_checkpoint_max)
keep_checkpoint_max=cfg.keep_checkpoint_max)
ckpt_cb = ModelCheckpoint(prefix="alexnet", directory=ckpt_save_dir, config=cfg_ck)
cb += [ckpt_cb]
......
# LeNet Example
## Description
This example shows how to train LeNet with the MNIST dataset in MindSpore.
## Requirements
- Install [MindSpore](https://www.mindspore.cn/install/en).
- Download the dataset; the directory structure is as follows:
```
└─Data
├─test
│ t10k-images.idx3-ubyte
│ t10k-labels.idx1-ubyte
└─train
train-images.idx3-ubyte
train-labels.idx1-ubyte
```
## Structure
```shell
.
└──lenet
├── README.md
├── script
├── run_distribute_train.sh # launch distributed training(8 pcs)
├── run_eval.sh # launch evaluation
├── run_standalone_train.sh # launch standalone training(1 pcs)
├── run_distribute_train_gpu.sh # launch gpu distributed training(8 pcs)
├── run_eval_gpu.sh # launch gpu evaluation
└── run_standalone_train_gpu.sh # launch gpu standalone training(1 pcs)
├── src
├── config.py # parameter configuration
├── dataset.py # data preprocessing
└── lenet.py # lenet network definition
├── eval.py # eval net
└── train.py # train net
```
## Parameter configuration
Parameters for both training and evaluation can be set in src/config.py.
## Running the example
### Train
#### Usage
```
# distributed training
Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
# standalone training
Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
```
#### Launch
```
# distribute training example
./run_distribute_train.sh rank_table.json ~/MNIST_data
# standalone training example
./run_standalone_train.sh ~/MNIST_data
```
> For details about rank_table.json, see the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
#### Result
Training results are stored in the example path, in a folder whose name begins with "train" or "train_parallel". There you can find the checkpoint files together with results like the following in the log.
```
epoch: 1 step: 1, loss is 2.3041954
epoch: 1 step: 2, loss is 2.3079312
...
epoch: 1 step: 601, loss is 2.314184
epoch: 1 step: 603, loss is 2.305666
...
```
### Evaluation
#### Usage
```
# evaluation
Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
```
#### Launch
```
# evaluation example
./run_eval.sh ~/MNIST_data ~/lenet/train_parallel0/ckpt_0/checkpoint_lenet-2_937.ckpt
```
> The checkpoint file is produced during the training process.
### Running on GPU
```
# distributed training example
./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
# standalone training example
./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
# infer example
./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
```
......@@ -17,6 +17,8 @@
eval lenet according to model file:
python eval.py --data_path /YourDataPath --ckpt_path Your.ckpt
"""
import os
import argparse
import mindspore.nn as nn
......@@ -37,11 +39,12 @@ if __name__ == "__main__":
help='path where the dataset is saved')
parser.add_argument('--checkpoint_path', type=str, default="", help='if mode is test, must provide\
path where the trained ckpt file is saved')
parser.add_argument('--dataset_sink_mode', type=bool, default=False, help='dataset_sink_mode is False or True')
parser.add_argument('--dataset_sink', action='store_true', help='enable dataset sink or not')
args = parser.parse_args()
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
device_id = int(os.getenv('DEVICE_ID'))
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=device_id)
network = LeNet5(cfg.num_classes)
{% if loss=='SoftmaxCrossEntropyWithLogits' %}
......@@ -49,12 +52,7 @@ if __name__ == "__main__":
{% elif loss=='SoftmaxCrossEntropyExpand' %}
net_loss = nn.SoftmaxCrossEntropyExpand(sparse=True)
{% endif %}
{% if optimizer=='Lamb' %}
net_opt = nn.Lamb(network.trainable_params(), learning_rate=cfg.lr)
{% elif optimizer=='Momentum' %}
net_opt = nn.Momentum(network.trainable_params(), learning_rate=cfg.lr, momentum=cfg.momentum)
{% endif %}
model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
model = Model(network, loss_fn=net_loss, metrics={"Accuracy": Accuracy()})
print("============== Starting Testing ==============")
param_dict = load_checkpoint(args.checkpoint_path)
......@@ -63,5 +61,5 @@ if __name__ == "__main__":
do_train = False
ds_eval = create_dataset(data_path=data_path, do_train=do_train, batch_size=cfg.batch_size,
target=args.device_target)
acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink_mode)
acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink)
print("============== {} ==============".format(acc))
......@@ -57,6 +57,9 @@ cp *.sh ./train_parallel
cp -r ../src ./train_parallel
cd ./train_parallel || exit
echo "start training"
env > env.log
if [ $# == 1 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE \
......
......@@ -61,6 +61,6 @@ cp *.sh ./eval
cp -r ../src ./eval
cd ./eval || exit
env > env.log
echo "start evaluation for device $DEVICE_ID"
echo "start evaluation"
python eval.py --dataset_path=$PATH1 --checkpoint_path=$PATH2 --device_target="GPU" &> log &
cd ..
......@@ -65,6 +65,9 @@ cp *.sh ./train
cp -r ../src ./train
cd ./train || exit
echo "start training"
env > env.log
if [ $# == 1 ]
then
python train.py --device_target="GPU" --dataset_path=$PATH1 &> log &
......
......@@ -18,21 +18,15 @@ network config setting, will be used in train.py
from easydict import EasyDict as edict
cfg = edict({
{% if dataset=='MNIST' %}
'num_classes': 10,
{% elif dataset=='Cifar10' %}
'num_classes': 10,
{% elif dataset=='ImageNet' %}
'num_classes': 1001,
{% endif %}
{% if dataset=='Momentum' %}
{% if optimizer=='Momentum' %}
'lr': 0.01,
"momentum": 0.9,
{% elif optimizer=='SGD' %}
'lr': 0.1,
{% else %}
'lr': 0.001,
{% endif %}
{% if optimizer=='Momentum' %}
"momentum": 0.9,
{% endif %}
'epoch_size': 1,
'batch_size': 32,
'buffer_size': 1000,
......
......@@ -48,6 +48,7 @@ if __name__ == "__main__":
if args.device_target == "CPU":
args.dataset_sink = False
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
ckpt_save_dir = './'
if args.run_distribute:
if args.device_target == 'Ascend':
......@@ -62,7 +63,6 @@ if __name__ == "__main__":
context.reset_auto_parallel_context()
context.set_auto_parallel_context(device_num=args.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True)
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
data_path = args.dataset_path
do_train = True
......@@ -79,10 +79,10 @@ if __name__ == "__main__":
{% elif loss=='SoftmaxCrossEntropyExpand' %}
net_loss = nn.SoftmaxCrossEntropyExpand(sparse=True)
{% endif %}
{% if optimizer=='Lamb' %}
net_opt = nn.Lamb(network.trainable_params(), learning_rate=cfg.lr)
{% elif optimizer=='Momentum' %}
{% if optimizer=='Momentum' %}
net_opt = nn.Momentum(network.trainable_params(), learning_rate=cfg.lr, momentum=cfg.momentum)
{% else %}
net_opt = nn.{{ optimizer }}(network.trainable_params(), learning_rate=cfg.lr)
{% endif %}
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
......
# ResNet50 Example
## Description
This example shows how to train ResNet50 with the CIFAR-10 or ImageNet dataset in MindSpore.
## Requirements
- Install [MindSpore](https://www.mindspore.cn/install/en).
- Download the dataset; the directory structure is as follows:
{% if dataset=='Cifar10' %}
CIFAR-10
```
└─Data
├─test
│ cifar-10-verify-bin
└─train
cifar-10-batches-bin
```
{% elif dataset=='ImageNet' %}
ImageNet
```
└─Data
├─test
│ validation_preprocess
└─train
ilsvrc
```
{% endif %}
## Structure
```shell
.
└──resnet50
├── README.md
├── script
├── run_distribute_train.sh # launch distributed training(8 pcs)
├── run_eval.sh # launch evaluation
├── run_standalone_train.sh # launch standalone training(1 pcs)
├── run_distribute_train_gpu.sh # launch gpu distributed training(4 pcs)
├── run_eval_gpu.sh # launch gpu evaluation
└── run_standalone_train_gpu.sh # launch gpu standalone training(1 pcs)
├── src
├── config.py # parameter configuration
├── crossentropy.py # loss definition for ImageNet2012 dataset
├── dataset.py # data preprocessing
├── lr_generator.py # generate learning rate for each step
└── resnet50.py # resNet50 network definition
├── eval.py # eval net
└── train.py # train net
```
## Parameter configuration
Parameters for both training and evaluation can be set in src/config.py.
## Running the example
### Train
#### Usage
```
# distributed training
Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
# standalone training
Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
```
#### Launch
```
# distribute training example
./run_distribute_train.sh rank_table.json ~/dataset_path
# standalone training example
./run_standalone_train.sh ~/dataset_path
```
> For details about rank_table.json, see the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
#### Result
Training results are stored in the example path, in a folder whose name begins with "train" or "train_parallel". There you can find the checkpoint files together with results like the following in the log.
```
epoch: 1 step: 1, loss is 2.3041954
epoch: 1 step: 2, loss is 2.3079312
...
epoch: 1 step: 601, loss is 2.314184
epoch: 1 step: 603, loss is 2.305666
...
```
### Evaluation
#### Usage
```
# evaluation
Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
```
#### Launch
```
# evaluation example
./run_eval.sh ~/cifar-10-batches-bin ~/resnet50/train/resnet-1.591.ckpt
```
> The checkpoint file is produced during the training process.
### Running on GPU
```
# distributed training example
./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
# standalone training example
./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
# infer example
./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
```
......@@ -71,6 +71,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, targe
cifar_ds = cifar_ds.repeat(repeat_size)
return cifar_ds
def _get_rank_info():
"""
get rank size and rank id
......
......@@ -24,6 +24,7 @@ from mindspore.communication.management import init, get_rank, get_group_size
from .config import cfg
def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target="Ascend"):
"""
create a train or eval imagenet dataset
......@@ -88,6 +89,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, targe
return ds
def _get_rank_info():
"""
get rank size and rank id
......
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Produce the dataset
"""
import os
import mindspore.dataset as ds
import mindspore.dataset.transforms.vision.c_transforms as CV
import mindspore.dataset.transforms.c_transforms as C
from mindspore.dataset.transforms.vision import Inter
from mindspore.common import dtype as mstype
from mindspore.communication.management import init, get_rank, get_group_size
from .config import cfg
def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target='Ascend'):
"""
create dataset for train or test
"""
if do_train:
data_path = os.path.join(data_path, "train")
else:
data_path = os.path.join(data_path, "test")
if target == 'Ascend':
device_num, rank_id = _get_rank_info()
elif target == 'GPU':
init("nccl")
rank_id = get_rank()
device_num = get_group_size()
else:
device_num = 1
# define dataset
if device_num == 1:
mnist_ds = ds.MnistDataset(data_path)
else:
mnist_ds = ds.MnistDataset(data_path, num_parallel_workers=8, shuffle=True,
num_shards=device_num, shard_id=rank_id)
resize_height, resize_width = cfg.image_height, cfg.image_width
rescale = 1.0 / 255.0
shift = 0.0
rescale_nml = 1 / 0.3081
shift_nml = -1 * 0.1307 / 0.3081
# define map operations
resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode
rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)
rescale_op = CV.Rescale(rescale, shift)
hwc2chw_op = CV.HWC2CHW()
type_cast_op = C.TypeCast(mstype.int32)
# apply map operations on images
mnist_ds = mnist_ds.map(input_columns="label", operations=type_cast_op)
mnist_ds = mnist_ds.map(input_columns="image", operations=resize_op)
mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_op)
mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_nml_op)
mnist_ds = mnist_ds.map(input_columns="image", operations=hwc2chw_op)
# apply DatasetOps
buffer_size = 10000
mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size)
mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
mnist_ds = mnist_ds.repeat(repeat_size)
return mnist_ds
def _get_rank_info():
"""
get rank size and rank id
"""
rank_size = int(os.environ.get("RANK_SIZE", 1))
if rank_size > 1:
rank_size = get_group_size()
rank_id = get_rank()
else:
rank_size = 1
rank_id = 0
return rank_size, rank_id
......@@ -16,7 +16,7 @@
if [ $# != 2 ] && [ $# != 3 ]
then
echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)"
echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
exit 1
fi
......@@ -31,15 +31,15 @@ get_real_path(){
PATH1=$(get_real_path $1)
PATH2=$(get_real_path $2)
if [ ! -d $PATH1 ]
if [ ! -f $PATH1 ]
then
echo "error: DATASET_PATH=$PATH1 is not a directory"
echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
exit 1
fi
if [ ! -f $PATH2 ]
if [ ! -d $PATH2 ]
then
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH2 is not a file"
echo "error: DATASET_PATH=$PATH2 is not a directory"
exit 1
fi
......@@ -56,16 +56,15 @@ fi
ulimit -u unlimited
export DEVICE_NUM=8
export RANK_SIZE=8
export MINDSPORE_HCCL_CONFIG_PATH=$PATH2
export RANK_TABLE_FILE=$PATH2
export SERVER_ID=0
rank_start=$((DEVICE_NUM * SERVER_ID))
export RANK_SIZE=$DEVICE_NUM
export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
export RANK_TABLE_FILE=$PATH1
for((i=0; i<DEVICE_NUM; i++))
start_id=0
for((i=start_id; i<DEVICE_NUM + start_id; i++))
do
export DEVICE_ID=$i
export RANK_ID=$((rank_start + i))
export RANK_ID=$((i - start_id))
rm -rf ./train_parallel$i
mkdir ./train_parallel$i
cp ../*.py ./train_parallel$i
......@@ -76,12 +75,12 @@ do
env > env.log
if [ $# == 2 ]
then
python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH1 --dataset_sink_mode=False &> log &
python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --dataset_sink_mode=False &> log &
fi
if [ $# == 3 ]
then
python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH1 --pre_trained=$PATH2 --dataset_sink_mode=False &> log &
python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 --dataset_sink_mode=False &> log &
fi
cd ..
......
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 1 ]
then
echo "Usage: sh run_distribute_train_gpu.sh [DATASET_PATH]"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
PATH1=$(get_real_path $1)
if [ ! -d $PATH1 ]
then
echo "error: DATASET_PATH=$PATH1 is not a directory"
exit 1
fi
ulimit -u unlimited
export DEVICE_NUM=4
export RANK_SIZE=4
rm -rf ./train_parallel
mkdir ./train_parallel
cp ../*.py ./train_parallel
cp *.sh ./train_parallel
cp -r ../src ./train_parallel
cd ./train_parallel || exit
mpirun --allow-run-as-root -n $RANK_SIZE \
python train.py --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log &
\ No newline at end of file
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 1 ] && [ $# != 2 ]
then
echo "Usage: sh run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
PATH1=$(get_real_path $1)
if [ ! -d $PATH1 ]
then
echo "error: DATASET_PATH=$PATH1 is not a directory"
exit 1
fi
if [ $# == 2 ]
then
PATH2=$(get_real_path $2)
fi
if [ $# == 2 ] && [ ! -f $PATH2 ]
then
echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
exit 1
fi
ulimit -u unlimited
export DEVICE_NUM=4
export RANK_SIZE=$DEVICE_NUM
rm -rf ./train_parallel
mkdir ./train_parallel
cp ../*.py ./train_parallel
cp *.sh ./train_parallel
cp -r ../src ./train_parallel
cd ./train_parallel || exit
echo "start training"
env > env.log
if [ $# == 1 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE \
python train.py --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log &
fi
if [ $# == 2 ]
then
mpirun --allow-run-as-root -n $RANK_SIZE \
python train.py --run_distribute=True \
--device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
fi
......@@ -61,6 +61,6 @@ cp *.sh ./eval
cp -r ../src ./eval
cd ./eval || exit
env > env.log
echo "start evaluation for device $DEVICE_ID"
echo "start evaluation"
python eval.py --dataset_path=$PATH1 --checkpoint_path=$PATH2 --device_target="GPU" &> log &
cd ..
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 1 ]
then
echo "Usage: sh run_standalone_train_gpu.sh [DATASET_PATH]"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
PATH1=$(get_real_path $1)
if [ ! -d $PATH1 ]
then
echo "error: DATASET_PATH=$PATH1 is not a directory"
exit 1
fi
ulimit -u unlimited
export DEVICE_NUM=1
export DEVICE_ID=0
export RANK_ID=0
export RANK_SIZE=1
if [ -d "train" ];
then
rm -rf ./train
fi
mkdir ./train
cp ../*.py ./train
cp *.sh ./train
cp -r ../src ./train
cd ./train || exit
python train.py --device_target="GPU" --dataset_path=$PATH1 &> log &
cd ..
\ No newline at end of file
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 1 ] && [ $# != 2 ]
then
echo "Usage: sh run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
exit 1
fi
get_real_path(){
if [ "${1:0:1}" == "/" ]; then
echo "$1"
else
echo "$(realpath -m $PWD/$1)"
fi
}
PATH1=$(get_real_path $1)
if [ ! -d $PATH1 ]
then
echo "error: DATASET_PATH=$PATH1 is not a directory"
exit 1
fi
if [ $# == 2 ]
then
PATH2=$(get_real_path $2)
fi
if [ $# == 2 ] && [ ! -f $PATH2 ]
then
echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
exit 1
fi
ulimit -u unlimited
export DEVICE_NUM=1
export DEVICE_ID=0
export RANK_ID=0
export RANK_SIZE=1
if [ -d "train" ];
then
rm -rf ./train
fi
mkdir ./train
cp ../*.py ./train
cp *.sh ./train
cp -r ../src ./train
cd ./train || exit
echo "start training"
env > env.log
if [ $# == 1 ]
then
python train.py --device_target="GPU" --dataset_path=$PATH1 &> log &
fi
if [ $# == 2 ]
then
python train.py --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
fi
cd ..
......@@ -28,7 +28,13 @@ cfg = ed({
"batch_size": 32,
"loss_scale": 1024,
{% if optimizer=='Momentum' %}
"lr": 0.01,
"momentum": 0.9,
"lr": 0.01,
{% elif optimizer=='SGD' %}
"lr": 0.1,
{% else %}
"lr": 0.001,
{% endif %}
"image_height": 224,
"image_width": 224,
......@@ -48,7 +54,6 @@ cfg = ed({
{% endif %}
"use_label_smooth": True,
"label_smooth_factor": 0.1,
"lr": 0.01,
"lr_init": 0.01,
"lr_end": 0.00001,
"lr_max": 0.1
......
......@@ -112,12 +112,11 @@ if __name__ == '__main__':
lr = Tensor(lr)
# define opt
{% if optimizer=='Lamb' %}
opt = nn.Lamb(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr,
weight_decay=cfg.weight_decay)
{% elif optimizer=='Momentum' %}
{% if optimizer=='Momentum' %}
opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr, momentum=cfg.momentum,
weight_decay=cfg.weight_decay, loss_scale=cfg.loss_scale)
{% else %}
opt = nn.{{optimizer}}(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=cfg.lr)
{% endif %}
# define loss, model
......@@ -125,7 +124,7 @@ if __name__ == '__main__':
{% if dataset=='ImageNet' %}
if not cfg.use_label_smooth:
cfg.label_smooth_factor = 0.0
loss = CrossEntropy(smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes)
loss = CrossEntropy(smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes)
{% else %}
{% if loss=='SoftmaxCrossEntropyWithLogits' %}
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
......@@ -143,10 +142,10 @@ if __name__ == '__main__':
{% elif loss=='SoftmaxCrossEntropyExpand' %}
loss = nn.SoftmaxCrossEntropyExpand(sparse=True)
{% endif %}
{% if optimizer=='Lamb' %}
opt = nn.Lamb(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr)
{% elif optimizer=='Momentum' %}
{% if optimizer=='Momentum' %}
opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr, momentum=cfg.momentum)
{% else %}
opt = nn.{{optimizer}}(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr)
{% endif %}
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
......
......@@ -15,4 +15,4 @@ class Network(GenericNetwork):
name = 'alexnet'
supported_datasets = ['Cifar10', 'ImageNet']
supported_loss_functions = ['SoftmaxCrossEntropyWithLogits', 'SoftmaxCrossEntropyExpand']
supported_optimizers = ['Momentum', 'Lamb']
supported_optimizers = ['Momentum', 'Adam', 'SGD']
......@@ -21,4 +21,4 @@ class Network(GenericNetwork):
name = 'lenet'
supported_datasets = ['MNIST']
supported_loss_functions = ['SoftmaxCrossEntropyWithLogits', 'SoftmaxCrossEntropyExpand']
supported_optimizers = ['Momentum', 'Lamb']
supported_optimizers = ['Momentum', 'Adam', 'SGD']
......@@ -6,7 +6,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""alexnet module."""
"""Resnet50 module."""
from mindinsight.wizard.network.generic_network import GenericNetwork
......@@ -15,4 +15,4 @@ class Network(GenericNetwork):
name = 'resnet50'
supported_datasets = ['Cifar10', 'ImageNet']
supported_loss_functions = ['SoftmaxCrossEntropyWithLogits', 'SoftmaxCrossEntropyExpand']
supported_optimizers = ['Momentum', 'Lamb']
supported_optimizers = ['Momentum', 'Adam', 'SGD']