diff --git a/model_zoo/resnet101/README.md b/model_zoo/resnet101/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d5576d64321e8a3cc23fae1df4708c49c00b5f76 --- /dev/null +++ b/model_zoo/resnet101/README.md @@ -0,0 +1,146 @@ +# ResNet101 Example + +## Description + +This is an example of training ResNet101 with ImageNet dataset in MindSpore. + +## Requirements + +- Install [MindSpore](https://www.mindspore.cn/install/en). + +- Download the dataset ImageNet2012. + +> Unzip the ImageNet2012 dataset to any path you want, the folder should include train and eval dataset as follows: + +``` +. +└─dataset + ├─ilsvrc + │ + └─validation_preprocess +``` + +## Structure + +```shell +. +└─resnet101 + ├─README.md + ├─scripts + ├─run_standalone_train.sh # launch standalone training(1p) + ├─run_distribute_train.sh # launch distributed training(8p) + └─run_eval.sh # launch evaluating + ├─src + ├─config.py # parameter configuration + ├─crossentropy.py # CrossEntropy loss function + ├─dataset.py # data preprocessin + ├─lr_generator.py # generate learning rate + ├─eval.py # eval net + └─train.py # train net +``` + +## Parameter configuration + +Parameters for both training and evaluating can be set in config.py. + +``` +"class_num": 1001, # dataset class number +"batch_size": 32, # batch size of input tensor +"loss_scale": 1024, # loss scale +"momentum": 0.9, # momentum optimizer +"weight_decay": 1e-4, # weight decay +"epoch_size": 120, # epoch sizes for training +"pretrain_epoch_size": 0, # epoch size of pretrain checkpoint +"buffer_size": 1000, # number of queue size in data preprocessing +"image_height": 224, # image height +"image_width": 224, # image width +"save_checkpoint": True, # whether save checkpoint or not +"save_checkpoint_epochs": 1, # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last epoch +"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint +"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path +"warmup_epochs": 0, # number of warmup epoch +"lr_decay_mode": "cosine" # decay mode for generating learning rate +"label_smooth": 1, # label_smooth +"label_smooth_factor": 0.1, # label_smooth_factor +"lr": 0.1 # base learning rate +``` + +## Running the example + +### Train + +#### Usage + +``` +# distributed training +sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_PATH](optional) + +# standalone training +sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_PATH](optional) +``` + +#### Launch + +```bash +# distributed training example(8p) +sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc + +If you want to load pretrained ckpt file, +sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc ./ckpt/pretrained.ckpt + +# standalone training example(1p) +sh run_standalone_train.sh dataset/ilsvrc + +If you want to load pretrained ckpt file, +sh run_standalone_train.sh dataset/ilsvrc ./ckpt/pretrained.ckpt +``` + +> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html). + +#### Result + +Training result will be stored in the scripts path, whose folder name begins with "train" or "train_parallel". You can find checkpoint file together with result like the followings in log. + + +``` +# distribute training result(8p) +epoch: 1 step: 5004, loss is 4.805483 +epoch: 2 step: 5004, loss is 3.2121816 +epoch: 3 step: 5004, loss is 3.429647 +epoch: 4 step: 5004, loss is 3.3667371 +epoch: 5 step: 5004, loss is 3.1718972 +... +epoch: 67 step: 5004, loss is 2.2768745 +epoch: 68 step: 5004, loss is 1.7223864 +epoch: 69 step: 5004, loss is 2.0665488 +epoch: 70 step: 5004, loss is 1.8717369 +... +``` + +### Infer + +#### Usage + +``` +# infer +sh run_eval.sh [VALIDATION_DATASET_PATH] [CHECKPOINT_PATH] +``` + +#### Launch + +```bash +# infer with checkpoint +sh run_eval.sh dataset/validation_preprocess/ train_parallel0/resnet-120_5004.ckpt + +``` + +> checkpoint can be produced in training process. + + +#### Result + +Inference result will be stored in the scripts path, whose folder name is "eval". Under this, you can find result like the followings in log. + +``` +result: {'top_5_accuracy': 0.9429417413572343, 'top_1_accuracy': 0.7853513124199744} ckpt=train_parallel0/resnet-120_5004.ckpt +``` diff --git a/model_zoo/resnet101/eval.py b/model_zoo/resnet101/eval.py new file mode 100755 index 0000000000000000000000000000000000000000..e60e4d73fcf60c84292f57ea6e5a961f290d1ca9 --- /dev/null +++ b/model_zoo/resnet101/eval.py @@ -0,0 +1,75 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +eval. +""" +import os +import argparse +import random +import numpy as np +from mindspore import context +from mindspore.model_zoo.resnet import resnet101 +from mindspore.parallel._auto_parallel_context import auto_parallel_context +from mindspore.train.model import Model, ParallelMode +from mindspore.train.serialization import load_checkpoint, load_param_into_net +import mindspore.dataset.engine as de +from mindspore.communication.management import init +from src.dataset import create_dataset +from src.config import config +from src.crossentropy import CrossEntropy + +random.seed(1) +np.random.seed(1) +de.config.set_seed(1) + +parser = argparse.ArgumentParser(description='Image classification') +parser.add_argument('--run_distribute', type=bool, default=False, help='Run distribute') +parser.add_argument('--device_num', type=int, default=1, help='Device num.') +parser.add_argument('--do_train', type=bool, default=False, help='Do train or not.') +parser.add_argument('--do_eval', type=bool, default=True, help='Do eval or not.') +parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path') +parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') +args_opt = parser.parse_args() + +device_id = int(os.getenv('DEVICE_ID')) + +context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=device_id) + +if __name__ == '__main__': + if not args_opt.do_eval and args_opt.run_distribute: + context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mirror_mean=True, parameter_broadcast=True) + auto_parallel_context().set_all_reduce_fusion_split_indices([180, 313]) + init() + + epoch_size = config.epoch_size + net = resnet101(class_num=config.class_num) + + if not config.label_smooth: + config.label_smooth_factor = 0.0 + loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num) + + if args_opt.do_eval: + dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, batch_size=config.batch_size) + step_size = dataset.get_dataset_size() + + if args_opt.checkpoint_path: + param_dict = load_checkpoint(args_opt.checkpoint_path) + load_param_into_net(net, param_dict) + net.set_train(False) + + model = Model(net, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'}) + res = model.eval(dataset) + print("result:", res, "ckpt=", args_opt.checkpoint_path) diff --git a/model_zoo/resnet101/scripts/run_distribute_train.sh b/model_zoo/resnet101/scripts/run_distribute_train.sh new file mode 100755 index 0000000000000000000000000000000000000000..65790b88c1fb1313b20282642c60f29b4dd16133 --- /dev/null +++ b/model_zoo/resnet101/scripts/run_distribute_train.sh @@ -0,0 +1,87 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# != 2 ] && [ $# != 3 ] +then + echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_PATH](optional)" +exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} +PATH1=$(get_real_path $1) +PATH2=$(get_real_path $2) +echo $PATH1 +echo $PATH2 +if [ $# == 3 ] +then + PATH3=$(get_real_path $3) + echo $PATH3 +fi + +if [ ! -f $PATH1 ] +then + echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file" +exit 1 +fi + +if [ ! -d $PATH2 ] +then + echo "error: DATASET_PATH=$PATH2 is not a directory" +exit 1 +fi + +if [ $# == 3 ] && [ ! -f $PATH3 ] +then + echo "error: PRETRAINED_PATH=$PATH3 is not a file" +exit 1 +fi + +ulimit -u unlimited +export DEVICE_NUM=8 +export RANK_SIZE=8 +export MINDSPORE_HCCL_CONFIG_PATH=$PATH1 +export RANK_TABLE_FILE=$PATH1 + +for((i=0; i<${DEVICE_NUM}; i++)) +do + export DEVICE_ID=$i + export RANK_ID=$i + rm -rf ./train_parallel$i + mkdir ./train_parallel$i + cp ../*.py ./train_parallel$i + cp *.sh ./train_parallel$i + cp -r ../src ./train_parallel$i + cd ./train_parallel$i || exit + echo "start training for rank $RANK_ID, device $DEVICE_ID" + env > env.log + if [ $# == 2 ] + then + python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log & + fi + + if [ $# == 3 ] + then + python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 &> log & + fi + + cd .. +done diff --git a/model_zoo/resnet101/scripts/run_eval.sh b/model_zoo/resnet101/scripts/run_eval.sh new file mode 100755 index 0000000000000000000000000000000000000000..88f5d364ce21036123827730c686b3be1e481ac1 --- /dev/null +++ b/model_zoo/resnet101/scripts/run_eval.sh @@ -0,0 +1,65 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# != 2 ] +then + echo "Usage: sh run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]" +exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} +PATH1=$(get_real_path $1) +PATH2=$(get_real_path $2) +echo $PATH1 +echo $PATH2 + +if [ ! -d $PATH1 ] +then + echo "error: DATASET_PATH=$PATH1 is not a directory" +exit 1 +fi + +if [ ! -f $PATH2 ] +then + echo "error: CHECKPOINT_PATH=$PATH2 is not a file" +exit 1 +fi + +ulimit -u unlimited +export DEVICE_NUM=1 +export DEVICE_ID=0 +export RANK_SIZE=$DEVICE_NUM +export RANK_ID=0 + +if [ -d "eval" ]; +then + rm -rf ./eval +fi +mkdir ./eval +cp ../*.py ./eval +cp *.sh ./eval +cp -r ../src ./eval +cd ./eval || exit +env > env.log +echo "start infering for device $DEVICE_ID" +python eval.py --do_eval=True --dataset_path=$PATH1 --checkpoint_path=$PATH2 &> log & +cd .. diff --git a/model_zoo/resnet101/scripts/run_standalone_train.sh b/model_zoo/resnet101/scripts/run_standalone_train.sh new file mode 100755 index 0000000000000000000000000000000000000000..7214d114d55ad7157c616e23a0dfc0fbc530341e --- /dev/null +++ b/model_zoo/resnet101/scripts/run_standalone_train.sh @@ -0,0 +1,76 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# != 1 ] && [ $# != 2 ] +then + echo "Usage: sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_PATH](optional)" +exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} +PATH1=$(get_real_path $1) +echo $PATH1 +if [ $# == 2 ] +then + PATH2=$(get_real_path $2) + echo $PATH2 +fi + +if [ ! -d $PATH1 ] +then + echo "error: DATASET_PATH=$PATH1 is not a directory" +exit 1 +fi + +if [ $# == 2 ] && [ ! -f $PATH2 ] +then + echo "error: PRETRAINED_PATH=$PATH2 is not a file" +exit 1 +fi + +ulimit -u unlimited +export DEVICE_NUM=1 +export DEVICE_ID=0 +export RANK_ID=0 +export RANK_SIZE=1 + +if [ -d "train" ]; +then + rm -rf ./train +fi +mkdir ./train +cp ../*.py ./train +cp *.sh ./train +cp -r ../src ./train +cd ./train || exit +echo "start training for device $DEVICE_ID" +env > env.log +if [ $# == 1 ] +then + python train.py --do_train=True --dataset_path=$PATH1 &> log & +fi + +if [ $# == 2 ] +then + python train.py --do_train=True --dataset_path=$PATH1 --pre_trained=$PATH2 &> log & +fi +cd .. diff --git a/model_zoo/resnet101/src/config.py b/model_zoo/resnet101/src/config.py new file mode 100755 index 0000000000000000000000000000000000000000..594b28522ac1c07f45f646773efd621e84a4d27b --- /dev/null +++ b/model_zoo/resnet101/src/config.py @@ -0,0 +1,40 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +network config setting, will be used in train.py and eval.py +""" +from easydict import EasyDict as ed + +config = ed({ + "class_num": 1001, + "batch_size": 32, + "loss_scale": 1024, + "momentum": 0.9, + "weight_decay": 1e-4, + "epoch_size": 120, + "pretrain_epoch_size": 0, + "buffer_size": 1000, + "image_height": 224, + "image_width": 224, + "save_checkpoint": True, + "save_checkpoint_epochs": 5, + "keep_checkpoint_max": 10, + "save_checkpoint_path": "./", + "warmup_epochs": 0, + "lr_decay_mode": "cosine", + "label_smooth": 1, + "label_smooth_factor": 0.1, + "lr": 0.1 +}) diff --git a/model_zoo/resnet101/src/crossentropy.py b/model_zoo/resnet101/src/crossentropy.py new file mode 100755 index 0000000000000000000000000000000000000000..1145a41804b956f90506a24c9628b23f3fb90d61 --- /dev/null +++ b/model_zoo/resnet101/src/crossentropy.py @@ -0,0 +1,36 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""define loss function for network""" +from mindspore.nn.loss.loss import _Loss +from mindspore.ops import operations as P +from mindspore.ops import functional as F +from mindspore import Tensor +from mindspore.common import dtype as mstype +import mindspore.nn as nn + +class CrossEntropy(_Loss): + """the redefined loss function with SoftmaxCrossEntropyWithLogits""" + def __init__(self, smooth_factor=0., num_classes=1001): + super(CrossEntropy, self).__init__() + self.onehot = P.OneHot() + self.on_value = Tensor(1.0 - smooth_factor, mstype.float32) + self.off_value = Tensor(1.0 * smooth_factor / (num_classes -1), mstype.float32) + self.ce = nn.SoftmaxCrossEntropyWithLogits() + self.mean = P.ReduceMean(False) + def construct(self, logit, label): + one_hot_label = self.onehot(label, F.shape(logit)[1], self.on_value, self.off_value) + loss = self.ce(logit, one_hot_label) + loss = self.mean(loss, 0) + return loss diff --git a/model_zoo/resnet101/src/dataset.py b/model_zoo/resnet101/src/dataset.py new file mode 100755 index 0000000000000000000000000000000000000000..b2a074a535ac756976c0075dc4b84bb80077139b --- /dev/null +++ b/model_zoo/resnet101/src/dataset.py @@ -0,0 +1,89 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +create train or eval dataset. +""" +import os +import mindspore.common.dtype as mstype +import mindspore.dataset.engine as de +import mindspore.dataset.transforms.vision.c_transforms as C +import mindspore.dataset.transforms.c_transforms as C2 +from src.config import config + +def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): + """ + create a train or evaluate dataset + Args: + dataset_path(string): the path of dataset. + do_train(bool): whether dataset is used for train or eval. + repeat_num(int): the repeat times of dataset. Default: 1 + batch_size(int): the batch size of dataset. Default: 32 + + Returns: + dataset + """ + device_num = int(os.getenv("RANK_SIZE")) + rank_id = int(os.getenv("RANK_ID")) + + if device_num == 1: + ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True) + else: + ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True, + num_shards=device_num, shard_id=rank_id) + resize_height = 224 + rescale = 1.0 / 255.0 + shift = 0.0 + + # define map operations + decode_op = C.Decode() + + random_resize_crop_op = C.RandomResizedCrop(resize_height, (0.08, 1.0), (0.75, 1.33), max_attempts=100) + horizontal_flip_op = C.RandomHorizontalFlip(rank_id / (rank_id + 1)) + resize_op_256 = C.Resize((256, 256)) + center_crop = C.CenterCrop(224) + rescale_op = C.Rescale(rescale, shift) + normalize_op = C.Normalize((0.475, 0.451, 0.392), (0.275, 0.267, 0.278)) + changeswap_op = C.HWC2CHW() + + trans = [] + if do_train: + trans = [decode_op, + random_resize_crop_op, + horizontal_flip_op, + rescale_op, + normalize_op, + changeswap_op] + + else: + trans = [decode_op, + resize_op_256, + center_crop, + rescale_op, + normalize_op, + changeswap_op] + + type_cast_op = C2.TypeCast(mstype.int32) + + ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=8) + ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=8) + + # apply shuffle operations + ds = ds.shuffle(buffer_size=config.buffer_size) + # apply batch operations + ds = ds.batch(batch_size, drop_remainder=True) + # apply dataset repeat operation + ds = ds.repeat(repeat_num) + + return ds diff --git a/model_zoo/resnet101/src/lr_generator.py b/model_zoo/resnet101/src/lr_generator.py new file mode 100755 index 0000000000000000000000000000000000000000..2392e7a7bf8816f28a0e01d6846d5326954a47b2 --- /dev/null +++ b/model_zoo/resnet101/src/lr_generator.py @@ -0,0 +1,56 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""learning rate generator""" +import math +import numpy as np + +def linear_warmup_lr(current_step, warmup_steps, base_lr, init_lr): + lr_inc = (float(base_lr) - float(init_lr)) / float(warmup_steps) + lr = float(init_lr) + lr_inc * current_step + return lr + +def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch=120, global_step=0): + """ + generate learning rate array with cosine + + Args: + lr(float): base learning rate + steps_per_epoch(int): steps size of one epoch + warmup_epochs(int): number of warmup epochs + max_epoch(int): total epochs of training + global_step(int): the current start index of lr array + Returns: + np.array, learning rate array + """ + base_lr = lr + warmup_init_lr = 0 + total_steps = int(max_epoch * steps_per_epoch) + warmup_steps = int(warmup_epochs * steps_per_epoch) + decay_steps = total_steps - warmup_steps + + lr_each_step = [] + for i in range(total_steps): + if i < warmup_steps: + lr = linear_warmup_lr(i + 1, warmup_steps, base_lr, warmup_init_lr) + else: + linear_decay = (total_steps - i) / decay_steps + cosine_decay = 0.5 * (1 + math.cos(math.pi * 2 * 0.47 * i / decay_steps)) + decayed = linear_decay * cosine_decay + 0.00001 + lr = base_lr * decayed + lr_each_step.append(lr) + + lr_each_step = np.array(lr_each_step).astype(np.float32) + learning_rate = lr_each_step[global_step:] + return learning_rate diff --git a/model_zoo/resnet101/train.py b/model_zoo/resnet101/train.py new file mode 100755 index 0000000000000000000000000000000000000000..6cb28e795406e21835939f3781100d8c615199b4 --- /dev/null +++ b/model_zoo/resnet101/train.py @@ -0,0 +1,102 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""train_imagenet.""" +import os +import argparse +import random +import numpy as np +from mindspore import context +from mindspore import Tensor +from mindspore.model_zoo.resnet import resnet101 +from mindspore.parallel._auto_parallel_context import auto_parallel_context +from mindspore.nn.optim.momentum import Momentum +from mindspore.train.model import Model, ParallelMode +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train.loss_scale_manager import FixedLossScaleManager +from mindspore.train.serialization import load_checkpoint, load_param_into_net +import mindspore.dataset.engine as de +from mindspore.communication.management import init +import mindspore.nn as nn +import mindspore.common.initializer as weight_init +from src.dataset import create_dataset +from src.lr_generator import warmup_cosine_annealing_lr +from src.config import config +from src.crossentropy import CrossEntropy + +random.seed(1) +np.random.seed(1) +de.config.set_seed(1) + +parser = argparse.ArgumentParser(description='Image classification') +parser.add_argument('--run_distribute', type=bool, default=False, help='Run distribute') +parser.add_argument('--device_num', type=int, default=1, help='Device num.') +parser.add_argument('--do_train', type=bool, default=True, help='Do train or not.') +parser.add_argument('--do_eval', type=bool, default=False, help='Do eval or not.') +parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path') +parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path') +args_opt = parser.parse_args() + +device_id = int(os.getenv('DEVICE_ID')) + +context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=device_id, + enable_auto_mixed_precision=True) + +if __name__ == '__main__': + if not args_opt.do_eval and args_opt.run_distribute: + context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + mirror_mean=True, parameter_broadcast=True) + auto_parallel_context().set_all_reduce_fusion_split_indices([180, 313]) + init() + + epoch_size = config.epoch_size + net = resnet101(class_num=config.class_num) + # weight init + for _, cell in net.cells_and_names(): + if isinstance(cell, nn.Conv2d): + cell.weight.default_input = weight_init.initializer(weight_init.XavierUniform(), + cell.weight.default_input.shape(), + cell.weight.default_input.dtype()).to_tensor() + if isinstance(cell, nn.Dense): + cell.weight.default_input = weight_init.initializer(weight_init.TruncatedNormal(), + cell.weight.default_input.shape(), + cell.weight.default_input.dtype()).to_tensor() + if not config.label_smooth: + config.label_smooth_factor = 0.0 + loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num) + if args_opt.do_train: + dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, + repeat_num=epoch_size, batch_size=config.batch_size) + step_size = dataset.get_dataset_size() + loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False) + if args_opt.pre_trained: + param_dict = load_checkpoint(args_opt.pre_trained) + load_param_into_net(net, param_dict) + + # learning rate strategy with cosine + lr = Tensor(warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, 120, + config.pretrain_epoch_size*step_size)) + opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, + config.weight_decay, config.loss_scale) + model = Model(net, loss_fn=loss, optimizer=opt, amp_level='O2', keep_batchnorm_fp32=False, + loss_scale_manager=loss_scale, metrics={'acc'}) + time_cb = TimeMonitor(data_size=step_size) + loss_cb = LossMonitor() + cb = [time_cb, loss_cb] + if config.save_checkpoint: + config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs*step_size, + keep_checkpoint_max=config.keep_checkpoint_max) + ckpt_cb = ModelCheckpoint(prefix="resnet", directory=config.save_checkpoint_path, config=config_ck) + cb += [ckpt_cb] + model.train(epoch_size, dataset, callbacks=cb)