diff --git a/example/resnet50_cifar10/README.md b/example/resnet50_cifar10/README.md index afa8519bee1267e07bb58c85099d8d4d6d0862e4..bbe7688fbf4a2ff4ba5aca325dd8455d2c2462da 100644 --- a/example/resnet50_cifar10/README.md +++ b/example/resnet50_cifar10/README.md @@ -8,9 +8,9 @@ This is an example of training ResNet-50 with CIFAR-10 dataset in MindSpore. - Install [MindSpore](https://www.mindspore.cn/install/en). -- Download the dataset [CIFAR-10](http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz). +- Download the dataset CIFAR-10 -> Unzip the CIFAR-10 dataset to any path you want and the folder structure should be as follows: +> Unzip the CIFAR-10 dataset to any path you want and the folder structure should include train and eval dataset as follows: > ``` > . > ├── cifar-10-batches-bin # train dataset @@ -26,9 +26,9 @@ This is an example of training ResNet-50 with CIFAR-10 dataset in MindSpore. ├── dataset.py # data preprocessing ├── eval.py # infer script ├── lr_generator.py # generate learning rate for each step -├── run_distribute_train.sh # launch distributed training +├── run_distribute_train.sh # launch distributed training(8 pcs) ├── run_infer.sh # launch infering -├── run_standalone_train.sh # launch standalone training +├── run_standalone_train.sh # launch standalone training(1 pcs) └── train.py # train script ``` @@ -51,11 +51,11 @@ Parameters for both training and inference can be set in config.py. "save_checkpoint_steps": 195, # the step interval between two checkpoints. 
By default, the last checkpoint will be saved after the last step "keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint "save_checkpoint_path": "./", # path to save checkpoint +"warmup_epochs": 5, # number of warmup epoch +"lr_decay_mode": "poly", # decay mode can be selected from steps, poly and default "lr_init": 0.01, # initial learning rate "lr_end": 0.00001, # final learning rate "lr_max": 0.1, # maximum learning rate -"warmup_epochs": 5, # number of warmup epoch -"lr_decay_mode": "poly" # decay mode can be selected in steps, ploy and default ``` ## Running the example @@ -65,7 +65,7 @@ Parameters for both training and inference can be set in config.py. #### Usage ``` -# distribute training +# distributed training Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] # standalone training @@ -90,7 +90,7 @@ sh run_standalone_train.sh ~/cifar-10-batches-bin Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the followings in log. 
``` -# distribute training result(8p) +# distribute training result(8 pcs) epoch: 1 step: 195, loss is 1.9601055 epoch: 2 step: 195, loss is 1.8555021 epoch: 3 step: 195, loss is 1.6707983 diff --git a/example/resnet50_cifar10/config.py b/example/resnet50_cifar10/config.py index 8e295537044b63c642281678656cd9f7f314e562..8b5e53542533403c4c503d5827ce070ef1fbc6f6 100755 --- a/example/resnet50_cifar10/config.py +++ b/example/resnet50_cifar10/config.py @@ -31,9 +31,9 @@ config = ed({ "save_checkpoint_steps": 195, "keep_checkpoint_max": 10, "save_checkpoint_path": "./", + "warmup_epochs": 5, + "lr_decay_mode": "poly", "lr_init": 0.01, "lr_end": 0.00001, - "lr_max": 0.1, - "warmup_epochs": 5, - "lr_decay_mode": "poly" + "lr_max": 0.1 }) diff --git a/example/resnet50_cifar10/dataset.py b/example/resnet50_cifar10/dataset.py index 9ed16f08b5555d2f6d73c28388a28282fb1dca8e..0a1f6eb3fe67e83707de84455c31702df9d078fd 100755 --- a/example/resnet50_cifar10/dataset.py +++ b/example/resnet50_cifar10/dataset.py @@ -40,39 +40,30 @@ def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32): rank_id = int(os.getenv("RANK_ID")) if device_num == 1: - ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=4, shuffle=True) + ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True) else: - ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=4, shuffle=True, + ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True, num_shards=device_num, shard_id=rank_id) - resize_height = config.image_height - resize_width = config.image_width - rescale = 1.0 / 255.0 - shift = 0.0 - # define map operations - random_crop_op = C.RandomCrop((32, 32), (4, 4, 4, 4)) - random_horizontal_flip_op = C.RandomHorizontalFlip(rank_id / (rank_id + 1)) - - resize_op = C.Resize((resize_height, resize_width)) - rescale_op = C.Rescale(rescale, shift) - normalize_op = C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]) - - change_swap_op = C.HWC2CHW() - 
trans = [] if do_train: - trans += [random_crop_op, random_horizontal_flip_op] - - trans += [resize_op, rescale_op, normalize_op, change_swap_op] + trans += [ + C.RandomCrop((32, 32), (4, 4, 4, 4)), + C.RandomHorizontalFlip(prob=0.5) + ] + + trans += [ + C.Resize((config.image_height, config.image_width)), + C.Rescale(1.0 / 255.0, 0.0), + C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]), + C.HWC2CHW() + ] type_cast_op = C2.TypeCast(mstype.int32) - ds = ds.map(input_columns="label", operations=type_cast_op) - ds = ds.map(input_columns="image", operations=trans) - - # apply shuffle operations - ds = ds.shuffle(buffer_size=config.buffer_size) + ds = ds.map(input_columns="label", num_parallel_workers=8, operations=type_cast_op) + ds = ds.map(input_columns="image", num_parallel_workers=8, operations=trans) # apply batch operations ds = ds.batch(batch_size, drop_remainder=True) diff --git a/example/resnet50_cifar10/eval.py b/example/resnet50_cifar10/eval.py index 1134d0bd2ed4b1ce30872751d3a39b70bc6aeab0..cedcf2eedc27ec7356e94e00ec7eb11f83d63356 100755 --- a/example/resnet50_cifar10/eval.py +++ b/example/resnet50_cifar10/eval.py @@ -17,8 +17,6 @@ eval. 
""" import os import argparse -import random -import numpy as np from dataset import create_dataset from config import config from mindspore import context @@ -27,13 +25,8 @@ from mindspore.parallel._auto_parallel_context import auto_parallel_context from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits from mindspore.train.model import Model, ParallelMode from mindspore.train.serialization import load_checkpoint, load_param_into_net -import mindspore.dataset.engine as de from mindspore.communication.management import init -random.seed(1) -np.random.seed(1) -de.config.set_seed(1) - parser = argparse.ArgumentParser(description='Image classification') parser.add_argument('--run_distribute', type=bool, default=False, help='Run distribute') parser.add_argument('--device_num', type=int, default=1, help='Device num.') diff --git a/example/resnet50_cifar10/train.py b/example/resnet50_cifar10/train.py index 4804fc452eb1f9407d2992c57658a7f714025f0c..db7f08b2b32a5cbc38741ce8ada76102c5e575b4 100755 --- a/example/resnet50_cifar10/train.py +++ b/example/resnet50_cifar10/train.py @@ -15,8 +15,6 @@ """train_imagenet.""" import os import argparse -import random -import numpy as np from dataset import create_dataset from lr_generator import get_lr from config import config @@ -31,13 +29,8 @@ from mindspore.train.model import Model, ParallelMode from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor from mindspore.train.loss_scale_manager import FixedLossScaleManager -import mindspore.dataset.engine as de from mindspore.communication.management import init -random.seed(1) -np.random.seed(1) -de.config.set_seed(1) - parser = argparse.ArgumentParser(description='Image classification') parser.add_argument('--run_distribute', type=bool, default=False, help='Run distribute') parser.add_argument('--device_num', type=int, default=1, help='Device num.') diff --git a/example/resnet50_imagenet2012/README.md 
b/example/resnet50_imagenet2012/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ba1742cdffd90cd3f043cd039f0930a81cc7c5d7 --- /dev/null +++ b/example/resnet50_imagenet2012/README.md @@ -0,0 +1,127 @@ +# ResNet-50 Example + +## Description + +This is an example of training ResNet-50 with ImageNet2012 dataset in MindSpore. + +## Requirements + +- Install [MindSpore](https://www.mindspore.cn/install/en). + +- Download the dataset ImageNet2012 + +> Unzip the ImageNet2012 dataset to any path you want and the folder structure should include train and eval dataset as follows: +> ``` +> . +> ├── ilsvrc # train dataset +> └── ilsvrc_eval # infer dataset +> ``` + + +## Example structure + +```shell +. +├── crossentropy.py # CrossEntropy loss function +├── config.py # parameter configuration +├── dataset.py # data preprocessing +├── eval.py # infer script +├── lr_generator.py # generate learning rate for each step +├── run_distribute_train.sh # launch distributed training(8 pcs) +├── run_infer.sh # launch inferring +├── run_standalone_train.sh # launch standalone training(1 pcs) +└── train.py # train script +``` + + +## Parameter configuration + +Parameters for both training and inference can be set in config.py. + +``` +"class_num": 1001, # dataset class number +"batch_size": 32, # batch size of input tensor +"loss_scale": 1024, # loss scale +"momentum": 0.9, # momentum optimizer +"weight_decay": 1e-4, # weight decay +"epoch_size": 90, # only valid for training, which is always 1 for inference +"buffer_size": 1000, # number of queue size in data preprocessing +"image_height": 224, # image height +"image_width": 224, # image width +"save_checkpoint": True, # whether to save checkpoints or not +"save_checkpoint_epochs": 1, # the epoch interval between two checkpoints. 
By default, the last checkpoint will be saved after the last epoch +"keep_checkpoint_max": 10, # only keep the last keep_checkpoint_max checkpoint +"save_checkpoint_path": "./", # path to save checkpoint relative to the executed path +"warmup_epochs": 0, # number of warmup epoch +"lr_decay_mode": "cosine", # decay mode for generating learning rate +"use_label_smooth": True, # whether to apply label smoothing +"label_smooth_factor": 0.1, # label smooth factor +"lr_init": 0, # initial learning rate +"lr_max": 0.1, # maximum learning rate +``` + +## Running the example + +### Train + +#### Usage + +``` +# distributed training +Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] + +# standalone training +Usage: sh run_standalone_train.sh [DATASET_PATH] +``` + + +#### Launch + +```bash +# distributed training example(8 pcs) +sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc + +# standalone training example(1 pcs) +sh run_standalone_train.sh dataset/ilsvrc +``` + +> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html). + +#### Result + +Training results will be stored in the example path, in a folder whose name begins with "train" or "train_parallel". There you can find the checkpoint files together with results like the following in the log. + +``` +# distributed training result(8 pcs) +epoch: 1 step: 5004, loss is 4.8995576 +epoch: 2 step: 5004, loss is 3.9235563 +epoch: 3 step: 5004, loss is 3.833077 +epoch: 4 step: 5004, loss is 3.2795618 +epoch: 5 step: 5004, loss is 3.1978393 +``` + +### Infer + +#### Usage + +``` +# infer +Usage: sh run_infer.sh [DATASET_PATH] [CHECKPOINT_PATH] +``` + +#### Launch + +```bash +# infer with checkpoint +sh run_infer.sh dataset/ilsvrc_eval train_parallel0/resnet-90_5004.ckpt +``` + +> The checkpoint is produced by the training process. 
+ +#### Result + +Inference result will be stored in the example path, whose folder name is "infer". Under this, you can find result like the followings in log. + +``` +result: {'acc': 0.7671054737516005} ckpt=train_parallel0/resnet-90_5004.ckpt +``` diff --git a/example/resnet50_imagenet2012/config.py b/example/resnet50_imagenet2012/config.py new file mode 100755 index 0000000000000000000000000000000000000000..4ba907436258d3e1b8bbf56b1bd1313fcb25a8ed --- /dev/null +++ b/example/resnet50_imagenet2012/config.py @@ -0,0 +1,41 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +""" +network config setting, will be used in train.py and eval.py +""" +from easydict import EasyDict as ed + +config = ed({ + "class_num": 1001, + "batch_size": 32, + "loss_scale": 1024, + "momentum": 0.9, + "weight_decay": 1e-4, + "epoch_size": 90, + "buffer_size": 1000, + "image_height": 224, + "image_width": 224, + "save_checkpoint": True, + "save_checkpoint_epochs": 1, + "keep_checkpoint_max": 10, + "save_checkpoint_path": "./", + "warmup_epochs": 0, + "lr_decay_mode": "cosine", + "use_label_smooth": True, + "label_smooth_factor": 0.1, + "lr_init": 0, + "lr_max": 0.1 + +}) diff --git a/example/resnet50_imagenet2012/crossentropy.py b/example/resnet50_imagenet2012/crossentropy.py new file mode 100644 index 0000000000000000000000000000000000000000..b078b29f6fedf3a2ab5e43772078b9161efc4251 --- /dev/null +++ b/example/resnet50_imagenet2012/crossentropy.py @@ -0,0 +1,39 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""define loss function for network"""
from mindspore.nn.loss.loss import _Loss
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore import Tensor
from mindspore.common import dtype as mstype
import mindspore.nn as nn


class CrossEntropy(_Loss):
    """
    Softmax cross entropy computed against (optionally smoothed) one-hot labels.

    The label of each sample is expanded with OneHot using ``1 - smooth_factor``
    for the target class and ``smooth_factor / (num_classes - 1)`` for every
    other class, fed to SoftmaxCrossEntropyWithLogits, and the result is
    averaged with ReduceMean over axis 0.

    Args:
        smooth_factor(float): label smoothing factor. Default: 0 (plain one-hot)
        num_classes(int): number of classes used to spread the smoothing mass.
            Default: 1001
    """

    def __init__(self, smooth_factor=0, num_classes=1001):
        super().__init__()
        target_weight = 1.0 - smooth_factor
        # the smoothing mass is shared evenly between the non-target classes
        other_weight = 1.0 * smooth_factor / (num_classes - 1)
        self.onehot = P.OneHot()
        self.on_value = Tensor(target_weight, mstype.float32)
        self.off_value = Tensor(other_weight, mstype.float32)
        self.ce = nn.SoftmaxCrossEntropyWithLogits()
        self.mean = P.ReduceMean(False)

    def construct(self, logit, label):
        # the one-hot depth is taken from the logits' second dimension,
        # not from the num_classes given to the constructor
        depth = F.shape(logit)[1]
        soft_label = self.onehot(label, depth, self.on_value, self.off_value)
        per_sample_loss = self.ce(logit, soft_label)
        return self.mean(per_sample_loss, 0)
"""
create train or eval dataset.
"""
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset.transforms.vision.c_transforms as C
import mindspore.dataset.transforms.c_transforms as C2


def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
    """
    create a train or eval dataset

    The sharding layout is read from the environment: DEVICE_NUM is the number
    of shards and RANK_ID the shard read by this process. When a variable is
    not set, single-device values (DEVICE_NUM=1, RANK_ID=0) are used instead of
    failing with a TypeError on ``int(None)``.

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32

    Returns:
        dataset
    """
    device_num = int(os.getenv("DEVICE_NUM", "1"))
    rank_id = int(os.getenv("RANK_ID", "0"))

    if device_num == 1:
        ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True)
    else:
        # every rank reads only its own shard of the dataset
        ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True,
                                     num_shards=device_num, shard_id=rank_id)

    image_size = 224
    # mean/std are scaled by 255 because no Rescale op is applied before Normalize
    # NOTE(review): this assumes decoded pixel values are in [0, 255] - confirm
    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    # define map operations
    if do_train:
        # random crop (fused with decoding) and flip are only used for training
        trans = [
            C.RandomCropDecodeResize(image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
            C.RandomHorizontalFlip(prob=0.5),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]
    else:
        # evaluation is deterministic: resize to 256x256, then center crop
        trans = [
            C.Decode(),
            C.Resize((256, 256)),
            C.CenterCrop(image_size),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]

    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(input_columns="image", num_parallel_workers=8, operations=trans)
    ds = ds.map(input_columns="label", num_parallel_workers=8, operations=type_cast_op)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)

    return ds
"""
eval.
"""
import os
import argparse
from dataset import create_dataset
from config import config
from mindspore import context
from mindspore.model_zoo.resnet import resnet50
from mindspore.train.model import Model
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from crossentropy import CrossEntropy


def _str2bool(value):
    """Parse a command line boolean; argparse's type=bool treats any non-empty string as True."""
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, but got %r" % value)


parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--run_distribute', type=_str2bool, default=False, help='Run distribute')
parser.add_argument('--device_num', type=int, default=1, help='Device num.')
parser.add_argument('--do_train', type=_str2bool, default=False, help='Do train or not.')
parser.add_argument('--do_eval', type=_str2bool, default=True, help='Do eval or not.')
parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
args_opt = parser.parse_args()

# fall back to device 0 when DEVICE_ID is not exported by a launch script
device_id = int(os.getenv('DEVICE_ID', '0'))

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
context.set_context(enable_task_sink=True, device_id=device_id)
context.set_context(enable_loop_sink=True)
context.set_context(enable_mem_reuse=True)

if __name__ == '__main__':
    if not args_opt.do_eval:
        # without an eval dataset there is nothing to run; exit with a clear
        # message instead of failing later with a NameError on `dataset`
        raise SystemExit("eval.py only runs evaluation, please launch it with --do_eval=True")

    net = resnet50(class_num=config.class_num)
    if not config.use_label_smooth:
        config.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num)

    dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, batch_size=config.batch_size)

    # when no checkpoint is given, the freshly built network is evaluated as is
    if args_opt.checkpoint_path:
        param_dict = load_checkpoint(args_opt.checkpoint_path)
        load_param_into_net(net, param_dict)
    net.set_train(False)

    model = Model(net, loss_fn=loss, metrics={'acc'})
    res = model.eval(dataset)
    print("result:", res, "ckpt=", args_opt.checkpoint_path)
"""learning rate generator"""
import math
import numpy as np


def get_lr(global_step, lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode):
    """
    generate learning rate array

    The learning rate of every step of the whole run is computed first, then
    the part of the schedule starting at ``global_step`` is returned.

    Args:
       global_step(int): index of the first step to return (steps already trained), must be >= 0
       lr_init(float): init learning rate, the starting point of warmup
       lr_end(float): end learning rate, only read by the default (linear) mode
       lr_max(float): max learning rate
       warmup_epochs(int): number of warmup epochs, not used by the 'steps' mode
       total_epochs(int): total epoch of training
       steps_per_epoch(int): steps of one epoch
       lr_decay_mode(string): learning rate decay mode, including steps, poly, cosine or default;
           any value other than 'steps', 'poly' and 'cosine' selects the default
           linear warmup + linear decay schedule

    Returns:
       np.array, float32 learning rate array of length ``total_steps - global_step``
       (empty when global_step >= total_steps)

    Raises:
       ValueError: if global_step is negative
    """
    if global_step < 0:
        # a negative start would silently wrap around and return only the schedule tail
        raise ValueError("global_step must be non-negative, but got {}".format(global_step))

    lr_each_step = []
    total_steps = steps_per_epoch * total_epochs
    warmup_steps = steps_per_epoch * warmup_epochs
    if lr_decay_mode == 'steps':
        # piecewise constant: x0.1 at 30%, 60% and 80% of the run
        decay_epoch_index = [0.3 * total_steps, 0.6 * total_steps, 0.8 * total_steps]
        for i in range(total_steps):
            if i < decay_epoch_index[0]:
                lr = lr_max
            elif i < decay_epoch_index[1]:
                lr = lr_max * 0.1
            elif i < decay_epoch_index[2]:
                lr = lr_max * 0.01
            else:
                lr = lr_max * 0.001
            lr_each_step.append(lr)
    elif lr_decay_mode == 'poly':
        if warmup_steps != 0:
            inc_each_step = (float(lr_max) - float(lr_init)) / float(warmup_steps)
        else:
            inc_each_step = 0
        for i in range(total_steps):
            if i < warmup_steps:
                lr = float(lr_init) + inc_each_step * float(i)
            else:
                # quadratic decay from lr_max towards 0 over the remaining steps
                base = (1.0 - (float(i) - float(warmup_steps)) / (float(total_steps) - float(warmup_steps)))
                lr = float(lr_max) * base * base
            if lr < 0.0:
                lr = 0.0
            lr_each_step.append(lr)
    elif lr_decay_mode == 'cosine':
        decay_steps = total_steps - warmup_steps
        # the warmup increment does not depend on the step, compute it once
        lr_inc = (float(lr_max) - float(lr_init)) / float(warmup_steps) if warmup_steps > 0 else 0.0
        for i in range(total_steps):
            if i < warmup_steps:
                # unlike the other modes, warmup here reaches lr_max on its last step
                lr = float(lr_init) + lr_inc * (i + 1)
            else:
                linear_decay = (total_steps - i) / decay_steps
                # NOTE(review): the cosine phase uses the absolute step i rather than
                # (i - warmup_steps), and 0.47 keeps the angle below pi at
                # i == decay_steps; kept as is because it defines the schedule
                cosine_decay = 0.5 * (1 + math.cos(math.pi * 2 * 0.47 * i / decay_steps))
                decayed = linear_decay * cosine_decay + 0.00001
                lr = lr_max * decayed
            lr_each_step.append(lr)
    else:
        # default: linear warmup to lr_max, then linear decay to lr_end
        for i in range(total_steps):
            if i < warmup_steps:
                lr = lr_init + (lr_max - lr_init) * i / warmup_steps
            else:
                lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / (total_steps - warmup_steps)
            lr_each_step.append(lr)

    lr_each_step = np.array(lr_each_step).astype(np.float32)
    # drop the steps that were already trained
    learning_rate = lr_each_step[global_step:]

    return learning_rate
#!/bin/bash
# Launch single-device inference in the background from a fresh ./infer folder.
# Usage: sh run_infer.sh [DATASET_PATH] [CHECKPOINT_PATH]
#
# The README invokes this script with `sh`, so only POSIX constructs are used:
# `case` instead of ${1:0:1} / ==, and `> log 2>&1` instead of `&>`.

if [ $# != 2 ]
then
    echo "Usage: sh run_infer.sh [DATASET_PATH] [CHECKPOINT_PATH]"
    exit 1
fi

# Print $1 unchanged when it is absolute, otherwise resolve it against $PWD.
get_real_path(){
    case "$1" in
        /*) echo "$1" ;;
        *) realpath -m "$PWD/$1" ;;
    esac
}

# quoted so that paths containing spaces survive word splitting
PATH1=$(get_real_path "$1")
PATH2=$(get_real_path "$2")

if [ ! -d "$PATH1" ]
then
    # report the resolved path, i.e. the one that was actually tested
    echo "error: DATASET_PATH=$PATH1 is not a directory"
    exit 1
fi

if [ ! -f "$PATH2" ]
then
    echo "error: CHECKPOINT_PATH=$PATH2 is not a file"
    exit 1
fi

ulimit -u unlimited
export DEVICE_NUM=1
export DEVICE_ID=0
export RANK_SIZE=$DEVICE_NUM
export RANK_ID=0

# start from a clean working folder holding a copy of the scripts
if [ -d "infer" ];
then
    rm -rf ./infer
fi
mkdir ./infer
cp *.py ./infer
cp *.sh ./infer
cd ./infer || exit
env > env.log
echo "start infering for device $DEVICE_ID"
python eval.py --do_eval=True --dataset_path="$PATH1" --checkpoint_path="$PATH2" > log 2>&1 &
cd ..
-d "$PATH1" ] +then + echo "error: DATASET_PATH=$PATH1 is not a directory" +exit 1 +fi + +ulimit -u unlimited +export DEVICE_NUM=1 +export DEVICE_ID=0 +export RANK_ID=0 + +if [ -d "train" ]; +then + rm -rf ./train +fi +mkdir ./train +cp *.py ./train +cp *.sh ./train +cd ./train || exit +echo "start training for device $DEVICE_ID" +env > env.log +python train.py --do_train=True --dataset_path=$PATH1 &> log & +cd .. diff --git a/example/resnet50_imagenet2012/train.py b/example/resnet50_imagenet2012/train.py new file mode 100755 index 0000000000000000000000000000000000000000..1992bfda95f875cb379e384c7c39a71011b60772 --- /dev/null +++ b/example/resnet50_imagenet2012/train.py @@ -0,0 +1,99 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""train_imagenet."""
import os
import argparse
from dataset import create_dataset
from lr_generator import get_lr
from config import config
from mindspore import context
from mindspore import Tensor
from mindspore.model_zoo.resnet import resnet50
from mindspore.parallel._auto_parallel_context import auto_parallel_context
from mindspore.nn.optim.momentum import Momentum

from mindspore.train.model import Model, ParallelMode

from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.communication.management import init
import mindspore.nn as nn
import mindspore.common.initializer as weight_init
from crossentropy import CrossEntropy


def _str2bool(value):
    """Parse a command line boolean; argparse's type=bool treats any non-empty string as True."""
    if isinstance(value, bool):
        return value
    text = value.strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, but got %r" % value)


parser = argparse.ArgumentParser(description='Image classification')
parser.add_argument('--run_distribute', type=_str2bool, default=False, help='Run distribute')
parser.add_argument('--device_num', type=int, default=1, help='Device num.')
parser.add_argument('--do_train', type=_str2bool, default=True, help='Do train or not.')
parser.add_argument('--do_eval', type=_str2bool, default=False, help='Do eval or not.')
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
args_opt = parser.parse_args()

# fall back to device 0 when DEVICE_ID is not exported by a launch script
device_id = int(os.getenv('DEVICE_ID', '0'))

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False)
context.set_context(enable_task_sink=True, device_id=device_id)
context.set_context(enable_loop_sink=True)
context.set_context(enable_mem_reuse=True)

if __name__ == '__main__':
    if not args_opt.do_train:
        # everything below needs the training dataset; exit with a clear
        # message instead of failing later with a NameError on `step_size`
        raise SystemExit("train.py only runs training, please launch it with --do_train=True")

    if not args_opt.do_eval and args_opt.run_distribute:
        context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
                                          mirror_mean=True, parameter_broadcast=True)
        auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160])
        init()

    epoch_size = config.epoch_size
    net = resnet50(class_num=config.class_num)

    # weight init
    for _, cell in net.cells_and_names():
        if isinstance(cell, nn.Conv2d):
            cell.weight.default_input = weight_init.initializer(weight_init.XavierUniform(),
                                                                cell.weight.default_input.shape(),
                                                                cell.weight.default_input.dtype())
        if isinstance(cell, nn.Dense):
            cell.weight.default_input = weight_init.initializer(weight_init.TruncatedNormal(),
                                                                cell.weight.default_input.shape(),
                                                                cell.weight.default_input.dtype())
    if not config.use_label_smooth:
        config.label_smooth_factor = 0.0

    loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num)

    dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True,
                             repeat_num=epoch_size, batch_size=config.batch_size)
    step_size = dataset.get_dataset_size()

    loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    # the decay mode comes from config.py (which sets "cosine") instead of being
    # hard-coded here; lr_end stays 0.0 because config.py defines no lr_end entry
    lr = Tensor(get_lr(global_step=0, lr_init=config.lr_init, lr_end=0.0, lr_max=config.lr_max,
                       warmup_epochs=config.warmup_epochs, total_epochs=epoch_size, steps_per_epoch=step_size,
                       lr_decay_mode=config.lr_decay_mode))

    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
                   config.weight_decay, config.loss_scale)

    model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})

    time_cb = TimeMonitor(data_size=step_size)
    loss_cb = LossMonitor()
    cb = [time_cb, loss_cb]
    if config.save_checkpoint:
        # the checkpoint interval is configured in epochs but the callback counts steps
        config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs*step_size,
                                     keep_checkpoint_max=config.keep_checkpoint_max)
        ckpt_cb = ModelCheckpoint(prefix="resnet", directory=config.save_checkpoint_path, config=config_ck)
        cb += [ckpt_cb]
    model.train(epoch_size, dataset, callbacks=cb)