From b92b4ded560a9b5f8180be90f8bbba914d1e3f87 Mon Sep 17 00:00:00 2001 From: moran Date: Tue, 18 Aug 2020 11:23:40 +0800 Subject: [PATCH] fix alexnet and rename .sh --- mindinsight/wizard/base/templates.py | 4 +- .../templates/network/alexnet/README.md-tpl | 135 +++++++++++++++++ .../alexnet/dataset/cifar10/dataset.py-tpl | 2 + .../alexnet/dataset/imagenet/dataset.py-tpl | 2 + .../alexnet/dataset/mnist/dataset.py-tpl | 98 ------------- .../templates/network/alexnet/eval.py-tpl | 19 ++- ...e_train.sh => run_distribute_train.sh-tpl} | 26 ++-- .../scripts/run_distribute_train_gpu.sh | 53 ------- .../scripts/run_distribute_train_gpu.sh-tpl | 75 ++++++++++ .../scripts/{run_eval.sh => run_eval.sh-tpl} | 0 .../{run_eval_gpu.sh => run_eval_gpu.sh-tpl} | 2 +- ...e_train.sh => run_standalone_train.sh-tpl} | 0 .../scripts/run_standalone_train_gpu.sh | 59 -------- .../scripts/run_standalone_train_gpu.sh-tpl | 77 ++++++++++ .../network/alexnet/src/alexnet.py-tpl | 3 + .../network/alexnet/src/config.py-tpl | 8 +- .../templates/network/alexnet/train.py-tpl | 60 +++++--- .../templates/network/lenet/README.md-tpl | 120 ++++++++++++++++ .../conf/templates/network/lenet/eval.py-tpl | 16 +-- ...e_train.sh => run_distribute_train.sh-tpl} | 0 ...gpu.sh => run_distribute_train_gpu.sh-tpl} | 3 + .../scripts/{run_eval.sh => run_eval.sh-tpl} | 0 .../{run_eval_gpu.sh => run_eval_gpu.sh-tpl} | 2 +- ...e_train.sh => run_standalone_train.sh-tpl} | 0 ...gpu.sh => run_standalone_train_gpu.sh-tpl} | 3 + .../templates/network/lenet/src/config.py-tpl | 14 +- .../conf/templates/network/lenet/train.py-tpl | 8 +- .../templates/network/resnet50/README.md-tpl | 136 ++++++++++++++++++ .../resnet50/dataset/cifar10/dataset.py-tpl | 1 + .../resnet50/dataset/imagenet/dataset.py-tpl | 2 + .../resnet50/dataset/mnist/dataset.py-tpl | 98 ------------- ...e_train.sh => run_distribute_train.sh-tpl} | 27 ++-- .../scripts/run_distribute_train_gpu.sh | 53 ------- .../scripts/run_distribute_train_gpu.sh-tpl | 76 ++++++++++ .../scripts/{run_eval.sh => run_eval.sh-tpl} | 0 .../{run_eval_gpu.sh => run_eval_gpu.sh-tpl} | 2 +- ...e_train.sh => run_standalone_train.sh-tpl} | 0 .../scripts/run_standalone_train_gpu.sh | 59 -------- .../scripts/run_standalone_train_gpu.sh-tpl | 77 ++++++++++ .../network/resnet50/src/config.py-tpl | 7 +- .../templates/network/resnet50/train.py-tpl | 15 +- mindinsight/wizard/network/alexnet.py | 2 +- mindinsight/wizard/network/lenet.py | 2 +- mindinsight/wizard/network/resnet50.py | 4 +- 44 files changed, 829 insertions(+), 521 deletions(-) create mode 100644 mindinsight/wizard/conf/templates/network/alexnet/README.md-tpl delete mode 100644 mindinsight/wizard/conf/templates/network/alexnet/dataset/mnist/dataset.py-tpl rename mindinsight/wizard/conf/templates/network/alexnet/scripts/{run_distribute_train.sh => run_distribute_train.sh-tpl} (73%) delete mode 100644 mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train_gpu.sh create mode 100644 mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train_gpu.sh-tpl rename mindinsight/wizard/conf/templates/network/alexnet/scripts/{run_eval.sh => run_eval.sh-tpl} (100%) rename mindinsight/wizard/conf/templates/network/alexnet/scripts/{run_eval_gpu.sh => run_eval_gpu.sh-tpl} (96%) rename mindinsight/wizard/conf/templates/network/alexnet/scripts/{run_standalone_train.sh => run_standalone_train.sh-tpl} (100%) delete mode 100644 mindinsight/wizard/conf/templates/network/alexnet/scripts/run_standalone_train_gpu.sh create mode 100644 
mindinsight/wizard/conf/templates/network/alexnet/scripts/run_standalone_train_gpu.sh-tpl
 create mode 100644 mindinsight/wizard/conf/templates/network/lenet/README.md-tpl
 rename mindinsight/wizard/conf/templates/network/lenet/scripts/{run_distribute_train.sh => run_distribute_train.sh-tpl} (100%)
 rename mindinsight/wizard/conf/templates/network/lenet/scripts/{run_distribute_train_gpu.sh => run_distribute_train_gpu.sh-tpl} (97%)
 rename mindinsight/wizard/conf/templates/network/lenet/scripts/{run_eval.sh => run_eval.sh-tpl} (100%)
 rename mindinsight/wizard/conf/templates/network/lenet/scripts/{run_eval_gpu.sh => run_eval_gpu.sh-tpl} (96%)
 rename mindinsight/wizard/conf/templates/network/lenet/scripts/{run_standalone_train.sh => run_standalone_train.sh-tpl} (100%)
 rename mindinsight/wizard/conf/templates/network/lenet/scripts/{run_standalone_train_gpu.sh => run_standalone_train_gpu.sh-tpl} (97%)
 create mode 100644 mindinsight/wizard/conf/templates/network/resnet50/README.md-tpl
 delete mode 100644 mindinsight/wizard/conf/templates/network/resnet50/dataset/mnist/dataset.py-tpl
 rename mindinsight/wizard/conf/templates/network/resnet50/scripts/{run_distribute_train.sh => run_distribute_train.sh-tpl} (72%)
 delete mode 100644 mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train_gpu.sh
 create mode 100644 mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train_gpu.sh-tpl
 rename mindinsight/wizard/conf/templates/network/resnet50/scripts/{run_eval.sh => run_eval.sh-tpl} (100%)
 rename mindinsight/wizard/conf/templates/network/resnet50/scripts/{run_eval_gpu.sh => run_eval_gpu.sh-tpl} (96%)
 rename mindinsight/wizard/conf/templates/network/resnet50/scripts/{run_standalone_train.sh => run_standalone_train.sh-tpl} (100%)
 delete mode 100644 mindinsight/wizard/conf/templates/network/resnet50/scripts/run_standalone_train_gpu.sh
 create mode 100644 mindinsight/wizard/conf/templates/network/resnet50/scripts/run_standalone_train_gpu.sh-tpl

diff --git a/mindinsight/wizard/base/templates.py b/mindinsight/wizard/base/templates.py
index ed2e542..7c6b800 100644
--- a/mindinsight/wizard/base/templates.py
+++ b/mindinsight/wizard/base/templates.py
@@ -29,7 +29,7 @@ def render_template(template_file_path, context):
 
 class TemplateManager:
     """BaseNetwork code generator."""
-    replace_template_suffixes = [('.py-tpl', '.py')]
+    replace_template_suffixes = [('.py-tpl', '.py'), ('.sh-tpl', '.sh'), ('.md-tpl', '.md')]
 
     def __init__(self, template_base_dir, exclude_dirs=None, exclude_files=None):
         self.template_base_dir = template_base_dir
@@ -70,7 +70,7 @@ class TemplateManager:
         """Generate the network files."""
         source_files = []
         template_files = self.get_template_files()
-        extensions = tuple(options.get('extensions', '.py'))
+        extensions = tuple([new_extension for _, new_extension in self.replace_template_suffixes])
         for template_file in template_files:
             new_file_path = template_file
             template_file_path = template_file
diff --git a/mindinsight/wizard/conf/templates/network/alexnet/README.md-tpl b/mindinsight/wizard/conf/templates/network/alexnet/README.md-tpl
new file mode 100644
index 0000000..05f93b9
--- /dev/null
+++ b/mindinsight/wizard/conf/templates/network/alexnet/README.md-tpl
@@ -0,0 +1,135 @@
+# AlexNet Example
+
+## Description
+
+This is an example of training AlexNet with the CIFAR-10 or ImageNet dataset in MindSpore.
+
+## Requirements
+
+- Install [MindSpore](https://www.mindspore.cn/install/en).
+
+- Download the dataset; the directory structure is as follows:
+
+{% if dataset=='Cifar10' %}
+CIFAR-10
+
+```
+└─Data
+    ├─test
+    │   cifar-10-verify-bin
+    │
+    └─train
+        cifar-10-batches-bin
+```
+
+{% elif dataset=='ImageNet' %}
+ImageNet
+
+```
+└─Data
+    ├─test
+    │   validation_preprocess
+    │
+    └─train
+        ilsvrc
+```
+{% endif %}
+
+## Structure
+
+```shell
+.
+└──alexnet
+  ├── README.md
+  ├── scripts
+    ├── run_distribute_train.sh         # launch distributed training (8 pcs)
+    ├── run_eval.sh                     # launch evaluation
+    ├── run_standalone_train.sh         # launch standalone training (1 pc)
+    ├── run_distribute_train_gpu.sh     # launch gpu distributed training (4 pcs)
+    ├── run_eval_gpu.sh                 # launch gpu evaluation
+    └── run_standalone_train_gpu.sh     # launch gpu standalone training (1 pc)
+  ├── src
+    ├── config.py                       # parameter configuration
+    ├── dataset.py                      # data preprocessing
+    ├── generator_lr.py                 # generate learning rate for each step
+    └── alexnet.py                      # alexnet network definition
+  ├── eval.py                           # eval net
+  └── train.py                          # train net
+```
+
+
+## Parameter configuration
+
+Parameters for both training and evaluation can be set in src/config.py.
+
+
+## Running the example
+
+### Train
+
+#### Usage
+
+```
+# distributed training
+Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+
+# standalone training
+Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+```
+
+
+#### Launch
+
+```
+# distributed training example
+./run_distribute_train.sh rank_table.json ~/dataset_path
+
+# standalone training example
+./run_standalone_train.sh ~/dataset_path
+```
+
+> For how to generate rank_table.json, refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
+
+#### Result
+
+Training results are stored in the example path, in a folder whose name begins with "train" or "train_parallel". There you can find the checkpoint files together with logs like the following.
+
+```
+epoch: 1 step: 1, loss is 2.3041954
+epoch: 1 step: 2, loss is 2.3079312
+...
+epoch: 1 step: 601, loss is 2.314184
+epoch: 1 step: 603, loss is 2.305666
+...
+```
+
+### Evaluation
+
+#### Usage
+
+```
+# evaluation
+Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
+```
+
+#### Launch
+
+```
+# evaluation example
+./run_eval.sh ~/cifar-10-batches-bin ~/alexnet/train/alexnet-1.591.ckpt
+```
+
+> The checkpoint file is produced during training.
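+
+If you prefer to call the Python API directly rather than run_eval.sh, evaluation reduces to the following minimal sketch (the paths are placeholders; the imports assume the generated project layout):
+
+```python
+import mindspore.nn as nn
+from mindspore.train import Model
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
+from mindspore.nn.metrics import Accuracy
+
+from src.config import cfg
+from src.dataset import create_dataset
+from src.alexnet import AlexNet
+
+# Build the network and restore the trained weights from the checkpoint.
+network = AlexNet(cfg.num_classes)
+load_param_into_net(network, load_checkpoint("/path/to/alexnet-1.591.ckpt"))
+
+# Wrap the network in a Model with a loss and the Accuracy metric, then evaluate.
+net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
+model = Model(network, loss_fn=net_loss, metrics={"Accuracy": Accuracy()})
+ds_eval = create_dataset(data_path="/path/to/dataset", batch_size=cfg.batch_size, do_train=False)
+print(model.eval(ds_eval))
+```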
+ + +### Running on GPU +``` +# distributed training example +./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) + +# standalone training example +./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) + +# infer example +./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] +``` diff --git a/mindinsight/wizard/conf/templates/network/alexnet/dataset/cifar10/dataset.py-tpl b/mindinsight/wizard/conf/templates/network/alexnet/dataset/cifar10/dataset.py-tpl index 62666b2..7f925c6 100644 --- a/mindinsight/wizard/conf/templates/network/alexnet/dataset/cifar10/dataset.py-tpl +++ b/mindinsight/wizard/conf/templates/network/alexnet/dataset/cifar10/dataset.py-tpl @@ -24,6 +24,7 @@ from mindspore.common import dtype as mstype from .config import cfg from mindspore.communication.management import init, get_rank, get_group_size + def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target="Ascend"): """ create dataset for train or test @@ -66,6 +67,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, targe cifar_ds = cifar_ds.repeat(repeat_size) return cifar_ds + def _get_rank_info(): """ get rank size and rank id diff --git a/mindinsight/wizard/conf/templates/network/alexnet/dataset/imagenet/dataset.py-tpl b/mindinsight/wizard/conf/templates/network/alexnet/dataset/imagenet/dataset.py-tpl index 160e532..a558c2e 100644 --- a/mindinsight/wizard/conf/templates/network/alexnet/dataset/imagenet/dataset.py-tpl +++ b/mindinsight/wizard/conf/templates/network/alexnet/dataset/imagenet/dataset.py-tpl @@ -24,6 +24,7 @@ from mindspore.communication.management import init, get_rank, get_group_size from .config import cfg + def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target="Ascend"): """ create a train or eval imagenet dataset @@ -88,6 +89,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, targe return ds + def _get_rank_info(): """ get rank size and rank id diff --git a/mindinsight/wizard/conf/templates/network/alexnet/dataset/mnist/dataset.py-tpl b/mindinsight/wizard/conf/templates/network/alexnet/dataset/mnist/dataset.py-tpl deleted file mode 100644 index 97dac66..0000000 --- a/mindinsight/wizard/conf/templates/network/alexnet/dataset/mnist/dataset.py-tpl +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================ -""" -Produce the dataset -""" -import os - -import mindspore.dataset as ds -import mindspore.dataset.transforms.vision.c_transforms as CV -import mindspore.dataset.transforms.c_transforms as C -from mindspore.dataset.transforms.vision import Inter -from mindspore.common import dtype as mstype -from mindspore.communication.management import init, get_rank, get_group_size - -from .config import cfg -from mindspore.communication.management import init, get_rank, get_group_size - -def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target="Ascend"): - """ - create dataset for train or test - """ - - if do_train: - data_path = os.path.join(data_path, "train") - else: - data_path = os.path.join(data_path, "test") - - if target == 'Ascend': - device_num, rank_id = _get_rank_info() - elif target == 'GPU': - init("nccl") - rank_id = get_rank() - device_num = get_group_size() - else: - device_num = 1 - - # define dataset - if device_num == 1: - mnist_ds = ds.MnistDataset(data_path) - else: - mnist_ds = ds.MnistDataset(data_path, num_parallel_workers=8, shuffle=True, - num_shards=device_num, shard_id=rank_id) - - resize_height, resize_width = cfg.image_height, cfg.image_width - rescale = 1.0 / 255.0 - shift = 0.0 - rescale_nml = 1 / 0.3081 - shift_nml = -1 * 0.1307 / 0.3081 - - # define map operations - resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR) # Bilinear mode - rescale_nml_op = CV.Rescale(rescale_nml, shift_nml) - rescale_op = CV.Rescale(rescale, shift) - hwc2chw_op = CV.HWC2CHW() - type_cast_op = C.TypeCast(mstype.int32) - - # apply map operations on images - mnist_ds = mnist_ds.map(input_columns="label", operations=type_cast_op) - mnist_ds = mnist_ds.map(input_columns="image", operations=resize_op) - mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_op) - mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_nml_op) - mnist_ds = mnist_ds.map(input_columns="image", operations=hwc2chw_op) - - # apply DatasetOps - buffer_size = 10000 - mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size) - mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True) - mnist_ds = mnist_ds.repeat(repeat_size) - - return mnist_ds - - -def _get_rank_info(): - """ - get rank size and rank id - """ - rank_size = int(os.environ.get("RANK_SIZE", 1)) - - if rank_size > 1: - rank_size = get_group_size() - rank_id = get_rank() - else: - rank_size = 1 - rank_id = 0 - - return rank_size, rank_id diff --git a/mindinsight/wizard/conf/templates/network/alexnet/eval.py-tpl b/mindinsight/wizard/conf/templates/network/alexnet/eval.py-tpl index fc676a7..b78c781 100644 --- a/mindinsight/wizard/conf/templates/network/alexnet/eval.py-tpl +++ b/mindinsight/wizard/conf/templates/network/alexnet/eval.py-tpl @@ -18,6 +18,7 @@ eval alexnet according to model file: python eval.py --data_path /YourDataPath --ckpt_path Your.ckpt """ +import os import argparse from src.config import cfg from src.dataset import create_dataset @@ -33,15 +34,16 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description='MindSpore AlexNet Example') parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU'], help='device where the code will be implemented (default: Ascend)') - parser.add_argument('--data_path', type=str, default="./", help='path where the dataset is saved') - parser.add_argument('--ckpt_path', type=str, default="./ckpt", help='if is test, 
must provide\ + parser.add_argument('--dataset_path', type=str, default="./", help='path where the dataset is saved') + parser.add_argument('--checkpoint_path', type=str, default="./ckpt", help='if is test, must provide\ path where the trained ckpt file') parser.add_argument('--dataset_sink_mode', type=str, default='True', choices = ['True', 'False'], help='DataSet sink mode is True or False') args = parser.parse_args() - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) - data_path = args.data_path + device_id = int(os.getenv('DEVICE_ID')) + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, save_graphs=False, device_id=device_id) + data_path = args.dataset_path dataset_sink_mode = args.dataset_sink_mode=='True' network = AlexNet(cfg.num_classes) @@ -50,15 +52,10 @@ if __name__ == "__main__": {% elif loss=='SoftmaxCrossEntropyExpand' %} net_loss = nn.SoftmaxCrossEntropyExpand(sparse=True) {% endif %} - {% if optimizer=='Lamb' %} - net_opt = nn.Lamb(network.trainable_params(), learning_rate=cfg.lr) - {% elif optimizer=='Momentum' %} - net_opt = nn.Momentum(network.trainable_params(), learning_rate=cfg.lr, momentum=cfg.momentum) - {% endif %} - model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) + model = Model(network, loss_fn=net_loss, metrics={"Accuracy": Accuracy()}) print("============== Starting Testing ==============") - param_dict = load_checkpoint(args.ckpt_path) + param_dict = load_checkpoint(args.checkpoint_path) load_param_into_net(network, param_dict) do_train = False ds_eval = create_dataset(data_path=data_path, batch_size=cfg.batch_size, do_train=do_train, diff --git a/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train.sh b/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train.sh-tpl similarity index 73% rename from mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train.sh rename to mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train.sh-tpl index 5095aa7..fe092fa 100644 --- a/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train.sh +++ b/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train.sh-tpl @@ -16,7 +16,7 @@ if [ $# != 2 ] && [ $# != 3 ] then - echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)" + echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" exit 1 fi @@ -31,15 +31,15 @@ get_real_path(){ PATH1=$(get_real_path $1) PATH2=$(get_real_path $2) -if [ ! -d $PATH1 ] +if [ ! -f $PATH1 ] then - echo "error: DATASET_PATH=$PATH1 is not a directory" + echo "error: RANK_TABLE_FILE=$PATH1 is not a file" exit 1 fi -if [ ! -f $PATH2 ] +if [ ! 
-d $PATH2 ] then - echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH2 is not a file" + echo "error: DATASET_PATH=$PATH2 is not a directory" exit 1 fi @@ -56,15 +56,15 @@ fi ulimit -u unlimited export DEVICE_NUM=8 -export RANK_SIZE=8 -export MINDSPORE_HCCL_CONFIG_PATH=$PATH2 -export RANK_TABLE_FILE=$PATH2 -rank_start=$((DEVICE_NUM * SERVER_ID)) +export RANK_SIZE=$DEVICE_NUM +export MINDSPORE_HCCL_CONFIG_PATH=$PATH1 +export RANK_TABLE_FILE=$PATH1 -for((i=0; i env.log if [ $# == 2 ] then - python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH1 --dataset_sink_mode=False &> log & + python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --dataset_sink_mode=False &> log & fi if [ $# == 3 ] then - python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH1 --pre_trained=$PATH2 --dataset_sink_mode=False &> log & + python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 --dataset_sink_mode=False &> log & fi cd .. diff --git a/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train_gpu.sh b/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train_gpu.sh deleted file mode 100644 index bf19622..0000000 --- a/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train_gpu.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -if [ $# != 1 ] -then - echo "Usage: sh run_distribute_train_gpu.sh [DATASET_PATH]" -exit 1 -fi - -get_real_path(){ - if [ "${1:0:1}" == "/" ]; then - echo "$1" - else - echo "$(realpath -m $PWD/$1)" - fi -} - -PATH1=$(get_real_path $1) - -if [ ! -d $PATH1 ] -then - echo "error: DATASET_PATH=$PATH1 is not a directory" -exit 1 -fi - - -ulimit -u unlimited -export DEVICE_NUM=4 -export RANK_SIZE=4 - -rm -rf ./train_parallel -mkdir ./train_parallel -cp ../*.py ./train_parallel -cp *.sh ./train_parallel -cp -r ../src ./train_parallel -cd ./train_parallel || exit - -mpirun --allow-run-as-root -n $RANK_SIZE \ -python train.py --run_distribute=True \ ---device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log & \ No newline at end of file diff --git a/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train_gpu.sh-tpl b/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train_gpu.sh-tpl new file mode 100644 index 0000000..434c128 --- /dev/null +++ b/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train_gpu.sh-tpl @@ -0,0 +1,75 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# != 1 ] && [ $# != 2 ] +then + echo "Usage: sh run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" +exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +PATH1=$(get_real_path $1) + +if [ ! -d $PATH1 ] +then + echo "error: DATASET_PATH=$PATH1 is not a directory" +exit 1 +fi + +if [ $# == 2 ] +then + PATH2=$(get_real_path $2) +fi + +if [ $# == 2 ] && [ ! -f $PATH2 ] +then + echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" +exit 1 +fi + +ulimit -u unlimited +export DEVICE_NUM=4 +export RANK_SIZE=$DEVICE_NUM + +rm -rf ./train_parallel +mkdir ./train_parallel +cp ../*.py ./train_parallel +cp *.sh ./train_parallel +cp -r ../src ./train_parallel +cd ./train_parallel || exit +echo "start training" +env > env.log + +if [ $# == 1 ] +then + mpirun --allow-run-as-root -n $RANK_SIZE \ + python train.py --run_distribute=True \ + --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log & +fi + +if [ $# == 2 ] +then + mpirun --allow-run-as-root -n $RANK_SIZE \ + python train.py --run_distribute=True \ + --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log & +fi diff --git a/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_eval.sh b/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_eval.sh-tpl similarity index 100% rename from mindinsight/wizard/conf/templates/network/alexnet/scripts/run_eval.sh rename to mindinsight/wizard/conf/templates/network/alexnet/scripts/run_eval.sh-tpl diff --git a/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_eval_gpu.sh b/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_eval_gpu.sh-tpl similarity index 96% rename from mindinsight/wizard/conf/templates/network/alexnet/scripts/run_eval_gpu.sh rename to mindinsight/wizard/conf/templates/network/alexnet/scripts/run_eval_gpu.sh-tpl index bc29508..6a9e379 100644 --- a/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_eval_gpu.sh +++ b/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_eval_gpu.sh-tpl @@ -61,6 +61,6 @@ cp *.sh ./eval cp -r ../src ./eval cd ./eval || exit env > env.log -echo "start evaluation for device $DEVICE_ID" +echo "start evaluation" python eval.py --dataset_path=$PATH1 --checkpoint_path=$PATH2 --device_target="GPU" &> log & cd .. 
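The eval.py hunks in this patch read the device id with `device_id = int(os.getenv('DEVICE_ID'))`, which raises a `TypeError` when DEVICE_ID is unset, for example when eval.py is launched directly instead of through a script that exports the variable. A defensive variant, not part of this patch, assuming device 0 is an acceptable fallback:

```python
import os

# os.getenv returns None when DEVICE_ID is unset, and int(None) raises a
# TypeError, so fall back to "0" for bare `python eval.py` runs.
device_id = int(os.getenv('DEVICE_ID', '0'))
```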
diff --git a/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_standalone_train.sh b/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_standalone_train.sh-tpl similarity index 100% rename from mindinsight/wizard/conf/templates/network/alexnet/scripts/run_standalone_train.sh rename to mindinsight/wizard/conf/templates/network/alexnet/scripts/run_standalone_train.sh-tpl diff --git a/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_standalone_train_gpu.sh b/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_standalone_train_gpu.sh deleted file mode 100644 index 5d18649..0000000 --- a/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_standalone_train_gpu.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -if [ $# != 1 ] -then - echo "Usage: sh run_standalone_train_gpu.sh [DATASET_PATH]" -exit 1 -fi - - -get_real_path(){ - if [ "${1:0:1}" == "/" ]; then - echo "$1" - else - echo "$(realpath -m $PWD/$1)" - fi -} - -PATH1=$(get_real_path $1) - - -if [ ! -d $PATH1 ] -then - echo "error: DATASET_PATH=$PATH1 is not a directory" -exit 1 -fi - - -ulimit -u unlimited -export DEVICE_NUM=1 -export DEVICE_ID=0 -export RANK_ID=0 -export RANK_SIZE=1 - -if [ -d "train" ]; -then - rm -rf ./train -fi -mkdir ./train -cp ../*.py ./train -cp *.sh ./train -cp -r ../src ./train -cd ./train || exit -python train.py --device_target="GPU" --dataset_path=$PATH1 &> log & - -cd .. \ No newline at end of file diff --git a/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_standalone_train_gpu.sh-tpl b/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_standalone_train_gpu.sh-tpl new file mode 100644 index 0000000..57e047f --- /dev/null +++ b/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_standalone_train_gpu.sh-tpl @@ -0,0 +1,77 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +if [ $# != 1 ] && [ $# != 2 ] +then + echo "Usage: sh run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)" +exit 1 +fi + +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +PATH1=$(get_real_path $1) + +if [ ! 
-d $PATH1 ] +then + echo "error: DATASET_PATH=$PATH1 is not a directory" +exit 1 +fi + +if [ $# == 2 ] +then + PATH2=$(get_real_path $2) +fi + +if [ $# == 2 ] && [ ! -f $PATH2 ] +then + echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file" +exit 1 +fi + +ulimit -u unlimited +export DEVICE_NUM=1 +export DEVICE_ID=0 +export RANK_ID=0 +export RANK_SIZE=1 + +if [ -d "train" ]; +then + rm -rf ./train +fi +mkdir ./train +cp ../*.py ./train +cp *.sh ./train +cp -r ../src ./train +cd ./train || exit +echo "start training" +env > env.log + +if [ $# == 1 ] +then + python train.py --device_target="GPU" --dataset_path=$PATH1 &> log & +fi + +if [ $# == 2 ] +then + python train.py --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log & +fi +cd .. diff --git a/mindinsight/wizard/conf/templates/network/alexnet/src/alexnet.py-tpl b/mindinsight/wizard/conf/templates/network/alexnet/src/alexnet.py-tpl index 2e333a0..cc1f1da 100644 --- a/mindinsight/wizard/conf/templates/network/alexnet/src/alexnet.py-tpl +++ b/mindinsight/wizard/conf/templates/network/alexnet/src/alexnet.py-tpl @@ -17,17 +17,20 @@ import mindspore.nn as nn from mindspore.common.initializer import TruncatedNormal from mindspore.ops import operations as P + def conv(in_channels, out_channels, kernel_size, stride=1, padding=0, pad_mode="valid"): weight = weight_variable() return nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, weight_init=weight, has_bias=False, pad_mode=pad_mode) + def fc_with_initialize(input_channels, out_channels): weight = weight_variable() bias = weight_variable() return nn.Dense(input_channels, out_channels, weight, bias) + def weight_variable(): return TruncatedNormal(0.02) diff --git a/mindinsight/wizard/conf/templates/network/alexnet/src/config.py-tpl b/mindinsight/wizard/conf/templates/network/alexnet/src/config.py-tpl index ed6f6b6..5eca324 100644 --- a/mindinsight/wizard/conf/templates/network/alexnet/src/config.py-tpl +++ b/mindinsight/wizard/conf/templates/network/alexnet/src/config.py-tpl @@ -26,15 +26,21 @@ cfg = edict({ {% elif dataset=='ImageNet' %} 'num_classes': 1001, {% endif %} - 'lr': 0.002, {% if optimizer=='Momentum' %} + 'lr': 0.002, "momentum": 0.9, + {% elif optimizer=='SGD' %} + 'lr': 0.1, + {% else %} + 'lr': 0.001, {% endif %} 'epoch_size': 1, 'batch_size': 32, + 'loss_scale': 1024, 'buffer_size': 1000, 'image_height': 227, 'image_width': 227, + 'weight_decay': 1e-4, 'save_checkpoint': True, 'save_checkpoint_epochs': 5, 'keep_checkpoint_max': 10, diff --git a/mindinsight/wizard/conf/templates/network/alexnet/train.py-tpl b/mindinsight/wizard/conf/templates/network/alexnet/train.py-tpl index 88073c0..ead7ddc 100644 --- a/mindinsight/wizard/conf/templates/network/alexnet/train.py-tpl +++ b/mindinsight/wizard/conf/templates/network/alexnet/train.py-tpl @@ -18,6 +18,7 @@ train alexnet and get network model files(.ckpt) : python train.py --data_path /YourDataPath """ +import os import argparse from src.config import cfg from src.dataset import create_dataset @@ -26,9 +27,10 @@ from src.alexnet import AlexNet import mindspore.nn as nn from mindspore import context from mindspore import Tensor -from mindspore.train import Model +from mindspore.train import Model, ParallelMode from mindspore.nn.metrics import Accuracy from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor +from mindspore.train.loss_scale_manager import FixedLossScaleManager from mindspore.parallel._auto_parallel_context import 
auto_parallel_context from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.communication.management import init, get_rank, get_group_size @@ -41,7 +43,7 @@ if __name__ == "__main__": parser.add_argument('--device_num', type=int, default=1, help='Device num') parser.add_argument('--device_target', type=str, default="Ascend", choices=['Ascend', 'GPU', 'CPU'], help='device where the code will be implemented (default: Ascend)') - parser.add_argument('--data_path', type=str, default="./", help='path where the dataset is saved') + parser.add_argument('--dataset_path', type=str, default="./", help='path where the dataset is saved') parser.add_argument('--pre_trained', type=str, default=None, help='Pre-trained checkpoint path') parser.add_argument('--dataset_sink_mode', type=str, default='True', choices = ['True', 'False'], help='DataSet sink mode is True or False') @@ -58,7 +60,6 @@ if __name__ == "__main__": context.set_context(device_id=device_id, enable_auto_mixed_precision=True) context.set_auto_parallel_context(device_num=args.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True) - auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160]) init() # GPU target @@ -69,7 +70,7 @@ if __name__ == "__main__": ckpt_save_dir = cfg.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/" - data_path = args.data_path + data_path = args.dataset_path do_train = True ds_train = create_dataset(data_path=data_path, batch_size=cfg.batch_size, do_train=do_train, @@ -77,14 +78,14 @@ if __name__ == "__main__": step_size = ds_train.get_dataset_size() # define net - network = AlexNet(cfg.num_classes) + net = AlexNet(cfg.num_classes) # init weight if args.pre_trained: param_dict = load_checkpoint(args.pre_trained) - load_param_into_net(network, param_dict) + load_param_into_net(net, param_dict) else: - for _, cell in network.cells_and_names(): + for _, cell in net.cells_and_names(): if isinstance(cell, nn.Conv2d): cell.weight.default_input = weight_init.initializer(weight_init.XavierUniform(), cell.weight.default_input.shape, @@ -93,20 +94,37 @@ if __name__ == "__main__": cell.weight.default_input = weight_init.initializer(weight_init.TruncatedNormal(), cell.weight.default_input.shape, cell.weight.default_input.dtype).to_tensor() - - - {% if loss=='SoftmaxCrossEntropyWithLogits' %} - net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean") - {% elif loss=='SoftmaxCrossEntropyExpand' %} - net_loss = nn.SoftmaxCrossEntropyExpand(sparse=True) - {% endif %} + # define learning rate lr = Tensor(get_lr(0, cfg.lr, cfg.epoch_size, ds_train.get_dataset_size())) - {% if optimizer=='Lamb' %} - net_opt = nn.Lamb(network.trainable_params(), learning_rate=lr) - {% elif optimizer=='Momentum' %} - net_opt = nn.Momentum(network.trainable_params(), learning_rate=lr, momentum=cfg.momentum) - {% endif %} - model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}) + + # define loss, model + if target == "Ascend": + {% if loss=='SoftmaxCrossEntropyWithLogits' %} + loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') + {% elif loss=='SoftmaxCrossEntropyExpand' %} + loss = nn.SoftmaxCrossEntropyExpand(sparse=True) + {% endif %} + {% if optimizer=='Momentum' %} + opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr, momentum=cfg.momentum, + weight_decay=cfg.weight_decay, loss_scale=cfg.loss_scale) + {% else %} + opt = nn.{{ optimizer }}(net.trainable_params(), 
learning_rate=cfg.lr)
+        {% endif %}
+        loss_scale = FixedLossScaleManager(cfg.loss_scale, drop_overflow_update=False)
+        model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'},
+                      amp_level="O2", keep_batchnorm_fp32=False)
+    else:
+        {% if loss=='SoftmaxCrossEntropyWithLogits' %}
+        loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean")
+        {% elif loss=='SoftmaxCrossEntropyExpand' %}
+        loss = nn.SoftmaxCrossEntropyExpand(sparse=True)
+        {% endif %}
+        {% if optimizer=='Momentum' %}
+        opt = nn.Momentum(net.trainable_params(), learning_rate=lr, momentum=cfg.momentum)
+        {% else %}
+        opt = nn.{{ optimizer }}(net.trainable_params(), learning_rate=lr)
+        {% endif %}
+        model = Model(net, loss, opt, metrics={"Accuracy": Accuracy()})
 
     # define callbacks
     time_cb = TimeMonitor(data_size=step_size)
@@ -114,7 +132,7 @@
     cb = [time_cb, loss_cb]
     if cfg.save_checkpoint:
         cfg_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_epochs * step_size,
-                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
+                                  keep_checkpoint_max=cfg.keep_checkpoint_max)
         ckpt_cb = ModelCheckpoint(prefix="alexnet", directory=ckpt_save_dir, config=cfg_ck)
         cb += [ckpt_cb]
diff --git a/mindinsight/wizard/conf/templates/network/lenet/README.md-tpl b/mindinsight/wizard/conf/templates/network/lenet/README.md-tpl
new file mode 100644
index 0000000..8e2abcb
--- /dev/null
+++ b/mindinsight/wizard/conf/templates/network/lenet/README.md-tpl
@@ -0,0 +1,120 @@
+# LeNet Example
+
+## Description
+
+This is an example of training LeNet with the MNIST dataset in MindSpore.
+
+## Requirements
+
+- Install [MindSpore](https://www.mindspore.cn/install/en).
+
+- Download the dataset; the directory structure is as follows:
+
+```
+└─Data
+    ├─test
+    │   t10k-images.idx3-ubyte
+    │   t10k-labels.idx1-ubyte
+    │
+    └─train
+        train-images.idx3-ubyte
+        train-labels.idx1-ubyte
+```
+
+## Structure
+
+```shell
+.
+└──lenet
+  ├── README.md
+  ├── scripts
+    ├── run_distribute_train.sh         # launch distributed training (8 pcs)
+    ├── run_eval.sh                     # launch evaluation
+    ├── run_standalone_train.sh         # launch standalone training (1 pc)
+    ├── run_distribute_train_gpu.sh     # launch gpu distributed training (8 pcs)
+    ├── run_eval_gpu.sh                 # launch gpu evaluation
+    └── run_standalone_train_gpu.sh     # launch gpu standalone training (1 pc)
+  ├── src
+    ├── config.py                       # parameter configuration
+    ├── dataset.py                      # data preprocessing
+    └── lenet.py                        # lenet network definition
+  ├── eval.py                           # eval net
+  └── train.py                          # train net
+```
+
+
+## Parameter configuration
+
+Parameters for both training and evaluation can be set in src/config.py.
+
+
+## Running the example
+
+### Train
+
+#### Usage
+
+```
+# distributed training
+Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+
+# standalone training
+Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+```
+
+
+#### Launch
+
+```
+# distributed training example
+./run_distribute_train.sh rank_table.json ~/MNIST_data
+
+# standalone training example
+./run_standalone_train.sh ~/MNIST_data
+```
+
+> For how to generate rank_table.json, refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
+
+#### Result
+
+Training results are stored in the example path, in a folder whose name begins with "train" or "train_parallel". There you can find the checkpoint files together with logs like the following.
+
+```
+epoch: 1 step: 1, loss is 2.3041954
+epoch: 1 step: 2, loss is 2.3079312
+...
+epoch: 1 step: 601, loss is 2.314184
+epoch: 1 step: 603, loss is 2.305666
+...
+```
+
+### Evaluation
+
+#### Usage
+
+```
+# evaluation
+Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
+```
+
+#### Launch
+
+```
+# evaluation example
+./run_eval.sh ~/MNIST_data ~/lenet/train_parallel0/ckpt_0/checkpoint_lenet-2_937.ckpt
+```
+
+> The checkpoint file is produced during training.
+
+
+### Running on GPU
+```
+# distributed training example
+./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+
+# standalone training example
+./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+
+# infer example
+./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
+```
diff --git a/mindinsight/wizard/conf/templates/network/lenet/eval.py-tpl b/mindinsight/wizard/conf/templates/network/lenet/eval.py-tpl
index c380b9e..2f77ebd 100644
--- a/mindinsight/wizard/conf/templates/network/lenet/eval.py-tpl
+++ b/mindinsight/wizard/conf/templates/network/lenet/eval.py-tpl
@@ -17,6 +17,8 @@ eval lenet according to model file:
 python eval.py --data_path /YourDataPath --ckpt_path Your.ckpt
 """
+
+import os
 import argparse
 
 import mindspore.nn as nn
@@ -37,11 +39,12 @@
                         help='path where the dataset is saved')
     parser.add_argument('--checkpoint_path', type=str, default="", help='if mode is test, must provide\
                         path where the trained ckpt file')
-    parser.add_argument('--dataset_sink_mode', type=bool, default=False, help='dataset_sink_mode is False or True')
+    parser.add_argument('--dataset_sink', action='store_true', help='enable dataset sink or not')
 
     args = parser.parse_args()
 
-    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
+    device_id = int(os.getenv('DEVICE_ID'))
+    context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=device_id)
 
     network = LeNet5(cfg.num_classes)
     {% if loss=='SoftmaxCrossEntropyWithLogits' %}
@@ -49,12 +52,7 @@
     {% elif loss=='SoftmaxCrossEntropyExpand' %}
     net_loss = nn.SoftmaxCrossEntropyExpand(sparse=True)
     {% endif %}
-    {% if optimizer=='Lamb' %}
-    net_opt = nn.Lamb(network.trainable_params(), learning_rate=cfg.lr)
-    {% elif optimizer=='Momentum' %}
-    net_opt = nn.Momentum(network.trainable_params(), learning_rate=cfg.lr, momentum=cfg.momentum)
-    {% endif %}
-    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
+    model = Model(network, loss_fn=net_loss, metrics={"Accuracy": Accuracy()})
 
     print("============== Starting Testing ==============")
     param_dict = load_checkpoint(args.checkpoint_path)
@@ -63,5 +61,5 @@
     do_train = False
     ds_eval = create_dataset(data_path=data_path, do_train=do_train, batch_size=cfg.batch_size,
                              target=args.device_target)
-    acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink_mode)
+    acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink)
     print("============== {} ==============".format(acc))
diff --git a/mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train.sh b/mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train.sh-tpl
similarity index 100%
rename from mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train.sh
rename to mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train.sh-tpl
diff --git
a/mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train_gpu.sh b/mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train_gpu.sh-tpl similarity index 97% rename from mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train_gpu.sh rename to mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train_gpu.sh-tpl index 8f9ccc4..b0954f5 100755 --- a/mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train_gpu.sh +++ b/mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train_gpu.sh-tpl @@ -57,6 +57,9 @@ cp *.sh ./train_parallel cp -r ../src ./train_parallel cd ./train_parallel || exit +echo "start training" +env > env.log + if [ $# == 1 ] then mpirun --allow-run-as-root -n $RANK_SIZE \ diff --git a/mindinsight/wizard/conf/templates/network/lenet/scripts/run_eval.sh b/mindinsight/wizard/conf/templates/network/lenet/scripts/run_eval.sh-tpl similarity index 100% rename from mindinsight/wizard/conf/templates/network/lenet/scripts/run_eval.sh rename to mindinsight/wizard/conf/templates/network/lenet/scripts/run_eval.sh-tpl diff --git a/mindinsight/wizard/conf/templates/network/lenet/scripts/run_eval_gpu.sh b/mindinsight/wizard/conf/templates/network/lenet/scripts/run_eval_gpu.sh-tpl similarity index 96% rename from mindinsight/wizard/conf/templates/network/lenet/scripts/run_eval_gpu.sh rename to mindinsight/wizard/conf/templates/network/lenet/scripts/run_eval_gpu.sh-tpl index 4302d02..5e8f43f 100755 --- a/mindinsight/wizard/conf/templates/network/lenet/scripts/run_eval_gpu.sh +++ b/mindinsight/wizard/conf/templates/network/lenet/scripts/run_eval_gpu.sh-tpl @@ -61,6 +61,6 @@ cp *.sh ./eval cp -r ../src ./eval cd ./eval || exit env > env.log -echo "start evaluation for device $DEVICE_ID" +echo "start evaluation" python eval.py --dataset_path=$PATH1 --checkpoint_path=$PATH2 --device_target="GPU" &> log & cd .. 
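Replacing `type=bool` with `action='store_true'` in lenet/eval.py fixes a classic argparse pitfall: argparse applies `bool()` to the raw string, and any non-empty string (including "False") is truthy. A minimal demonstration with hypothetical flag names:

```python
import argparse

parser = argparse.ArgumentParser()
# Pitfall: bool("False") is True, so `--sink_old False` still enables the flag.
parser.add_argument('--sink_old', type=bool, default=False)
# Fix: the flag stays False unless `--sink_new` appears on the command line.
parser.add_argument('--sink_new', action='store_true')

args = parser.parse_args(['--sink_old', 'False'])
print(args.sink_old)  # True, despite the user passing the string "False"
print(args.sink_new)  # False
```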
diff --git a/mindinsight/wizard/conf/templates/network/lenet/scripts/run_standalone_train.sh b/mindinsight/wizard/conf/templates/network/lenet/scripts/run_standalone_train.sh-tpl similarity index 100% rename from mindinsight/wizard/conf/templates/network/lenet/scripts/run_standalone_train.sh rename to mindinsight/wizard/conf/templates/network/lenet/scripts/run_standalone_train.sh-tpl diff --git a/mindinsight/wizard/conf/templates/network/lenet/scripts/run_standalone_train_gpu.sh b/mindinsight/wizard/conf/templates/network/lenet/scripts/run_standalone_train_gpu.sh-tpl similarity index 97% rename from mindinsight/wizard/conf/templates/network/lenet/scripts/run_standalone_train_gpu.sh rename to mindinsight/wizard/conf/templates/network/lenet/scripts/run_standalone_train_gpu.sh-tpl index e52eebe..82525e5 100755 --- a/mindinsight/wizard/conf/templates/network/lenet/scripts/run_standalone_train_gpu.sh +++ b/mindinsight/wizard/conf/templates/network/lenet/scripts/run_standalone_train_gpu.sh-tpl @@ -65,6 +65,9 @@ cp *.sh ./train cp -r ../src ./train cd ./train || exit +echo "start training" +env > env.log + if [ $# == 1 ] then python train.py --device_target="GPU" --dataset_path=$PATH1 &> log & diff --git a/mindinsight/wizard/conf/templates/network/lenet/src/config.py-tpl b/mindinsight/wizard/conf/templates/network/lenet/src/config.py-tpl index eb4a0e9..863e0c6 100644 --- a/mindinsight/wizard/conf/templates/network/lenet/src/config.py-tpl +++ b/mindinsight/wizard/conf/templates/network/lenet/src/config.py-tpl @@ -18,21 +18,15 @@ network config setting, will be used in train.py from easydict import EasyDict as edict cfg = edict({ - {% if dataset=='MNIST' %} 'num_classes': 10, - {% elif dataset=='Cifar10' %} - 'num_classes': 10, - {% elif dataset=='ImageNet' %} - 'num_classes': 1001, - {% endif %} - {% if dataset=='Momentum' %} + {% if optimizer=='Momentum' %} 'lr': 0.01, + "momentum": 0.9, + {% elif optimizer=='SGD' %} + 'lr': 0.1, {% else %} 'lr': 0.001, {% endif %} - {% if optimizer=='Momentum' %} - "momentum": 0.9, - {% endif %} 'epoch_size': 1, 'batch_size': 32, 'buffer_size': 1000, diff --git a/mindinsight/wizard/conf/templates/network/lenet/train.py-tpl b/mindinsight/wizard/conf/templates/network/lenet/train.py-tpl index 292795e..6e5a837 100644 --- a/mindinsight/wizard/conf/templates/network/lenet/train.py-tpl +++ b/mindinsight/wizard/conf/templates/network/lenet/train.py-tpl @@ -48,6 +48,7 @@ if __name__ == "__main__": if args.device_target == "CPU": args.dataset_sink = False + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) ckpt_save_dir = './' if args.run_distribute: if args.device_target == 'Ascend': @@ -62,7 +63,6 @@ if __name__ == "__main__": context.reset_auto_parallel_context() context.set_auto_parallel_context(device_num=args.device_num, parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True) - context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) data_path = args.dataset_path do_train = True @@ -79,10 +79,10 @@ if __name__ == "__main__": {% elif loss=='SoftmaxCrossEntropyExpand' %} net_loss = nn.SoftmaxCrossEntropyExpand(sparse=True) {% endif %} - {% if optimizer=='Lamb' %} - net_opt = nn.Lamb(network.trainable_params(), learning_rate=cfg.lr) - {% elif optimizer=='Momentum' %} + {% if optimizer=='Momentum' %} net_opt = nn.Momentum(network.trainable_params(), learning_rate=cfg.lr, momentum=cfg.momentum) + {% else %} + net_opt = nn.{{ optimizer }}(network.trainable_params(), learning_rate=cfg.lr) {% endif %} time_cb = 
TimeMonitor(data_size=ds_train.get_dataset_size())
     config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
diff --git a/mindinsight/wizard/conf/templates/network/resnet50/README.md-tpl b/mindinsight/wizard/conf/templates/network/resnet50/README.md-tpl
new file mode 100644
index 0000000..3d6b95e
--- /dev/null
+++ b/mindinsight/wizard/conf/templates/network/resnet50/README.md-tpl
@@ -0,0 +1,136 @@
+# ResNet50 Example
+
+## Description
+
+This is an example of training ResNet50 with the CIFAR-10 or ImageNet dataset in MindSpore.
+
+## Requirements
+
+- Install [MindSpore](https://www.mindspore.cn/install/en).
+
+- Download the dataset; the directory structure is as follows:
+
+{% if dataset=='Cifar10' %}
+CIFAR-10
+
+```
+└─Data
+    ├─test
+    │   cifar-10-verify-bin
+    │
+    └─train
+        cifar-10-batches-bin
+```
+
+{% elif dataset=='ImageNet' %}
+ImageNet
+
+```
+└─Data
+    ├─test
+    │   validation_preprocess
+    │
+    └─train
+        ilsvrc
+```
+{% endif %}
+
+## Structure
+
+```shell
+.
+└──resnet50
+  ├── README.md
+  ├── scripts
+    ├── run_distribute_train.sh         # launch distributed training (8 pcs)
+    ├── run_eval.sh                     # launch evaluation
+    ├── run_standalone_train.sh         # launch standalone training (1 pc)
+    ├── run_distribute_train_gpu.sh     # launch gpu distributed training (4 pcs)
+    ├── run_eval_gpu.sh                 # launch gpu evaluation
+    └── run_standalone_train_gpu.sh     # launch gpu standalone training (1 pc)
+  ├── src
+    ├── config.py                       # parameter configuration
+    ├── crossentropy.py                 # loss definition for ImageNet2012 dataset
+    ├── dataset.py                      # data preprocessing
+    ├── lr_generator.py                 # generate learning rate for each step
+    └── resnet50.py                     # resNet50 network definition
+  ├── eval.py                           # eval net
+  └── train.py                          # train net
+```
+
+
+## Parameter configuration
+
+Parameters for both training and evaluation can be set in src/config.py.
+
+
+## Running the example
+
+### Train
+
+#### Usage
+
+```
+# distributed training
+Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+
+# standalone training
+Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+```
+
+
+#### Launch
+
+```
+# distributed training example
+./run_distribute_train.sh rank_table.json ~/dataset_path
+
+# standalone training example
+./run_standalone_train.sh ~/dataset_path
+```
+
+> For how to generate rank_table.json, refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
+
+#### Result
+
+Training results are stored in the example path, in a folder whose name begins with "train" or "train_parallel". There you can find the checkpoint files together with logs like the following.
+
+```
+epoch: 1 step: 1, loss is 2.3041954
+epoch: 1 step: 2, loss is 2.3079312
+...
+epoch: 1 step: 601, loss is 2.314184
+epoch: 1 step: 603, loss is 2.305666
+...
+```
+
+### Evaluation
+
+#### Usage
+
+```
+# evaluation
+Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
+```
+
+#### Launch
+
+```
+# evaluation example
+./run_eval.sh ~/cifar-10-batches-bin ~/resnet50/train/resnet50-1.591.ckpt
+```
+
+> The checkpoint file is produced during training.
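+
+The optional [PRETRAINED_CKPT_PATH] argument warm-starts training from an existing checkpoint. Inside the generated train.py this reduces to the following sketch (the helper function is shown for illustration only):
+
+```python
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
+
+def warm_start(net, ckpt_path):
+    """Copy parameters from a saved checkpoint into the freshly built network
+    before training begins, mirroring the --pre_trained branch of train.py."""
+    param_dict = load_checkpoint(ckpt_path)
+    load_param_into_net(net, param_dict)
+```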
+ + +### Running on GPU +``` +# distributed training example +./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) + +# standalone training example +./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) + +# infer example +./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] +``` diff --git a/mindinsight/wizard/conf/templates/network/resnet50/dataset/cifar10/dataset.py-tpl b/mindinsight/wizard/conf/templates/network/resnet50/dataset/cifar10/dataset.py-tpl index 0338f4e..f7ec46e 100644 --- a/mindinsight/wizard/conf/templates/network/resnet50/dataset/cifar10/dataset.py-tpl +++ b/mindinsight/wizard/conf/templates/network/resnet50/dataset/cifar10/dataset.py-tpl @@ -71,6 +71,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, targe cifar_ds = cifar_ds.repeat(repeat_size) return cifar_ds + def _get_rank_info(): """ get rank size and rank id diff --git a/mindinsight/wizard/conf/templates/network/resnet50/dataset/imagenet/dataset.py-tpl b/mindinsight/wizard/conf/templates/network/resnet50/dataset/imagenet/dataset.py-tpl index 160e532..a558c2e 100644 --- a/mindinsight/wizard/conf/templates/network/resnet50/dataset/imagenet/dataset.py-tpl +++ b/mindinsight/wizard/conf/templates/network/resnet50/dataset/imagenet/dataset.py-tpl @@ -24,6 +24,7 @@ from mindspore.communication.management import init, get_rank, get_group_size from .config import cfg + def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target="Ascend"): """ create a train or eval imagenet dataset @@ -88,6 +89,7 @@ def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, targe return ds + def _get_rank_info(): """ get rank size and rank id diff --git a/mindinsight/wizard/conf/templates/network/resnet50/dataset/mnist/dataset.py-tpl b/mindinsight/wizard/conf/templates/network/resnet50/dataset/mnist/dataset.py-tpl deleted file mode 100644 index 6a8ca7e..0000000 --- a/mindinsight/wizard/conf/templates/network/resnet50/dataset/mnist/dataset.py-tpl +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright 2020 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================
-"""
-Produce the dataset
-"""
-import os
-
-import mindspore.dataset as ds
-import mindspore.dataset.transforms.vision.c_transforms as CV
-import mindspore.dataset.transforms.c_transforms as C
-from mindspore.dataset.transforms.vision import Inter
-from mindspore.common import dtype as mstype
-from mindspore.communication.management import init, get_rank, get_group_size
-
-from .config import cfg
-
-
-def create_dataset(data_path, batch_size=32, repeat_size=1, do_train=True, target='Ascend'):
-    """
-    create dataset for train or test
-    """
-
-    if do_train:
-        data_path = os.path.join(data_path, "train")
-    else:
-        data_path = os.path.join(data_path, "test")
-
-    if target == 'Ascend':
-        device_num, rank_id = _get_rank_info()
-    elif target == 'GPU':
-        init("nccl")
-        rank_id = get_rank()
-        device_num = get_group_size()
-    else:
-        device_num = 1
-
-    # define dataset
-    if device_num == 1:
-        mnist_ds = ds.MnistDataset(data_path)
-    else:
-        mnist_ds = ds.MnistDataset(data_path, num_parallel_workers=8, shuffle=True,
-                                   num_shards=device_num, shard_id=rank_id)
-
-    resize_height, resize_width = cfg.image_height, cfg.image_width
-    rescale = 1.0 / 255.0
-    shift = 0.0
-    rescale_nml = 1 / 0.3081
-    shift_nml = -1 * 0.1307 / 0.3081
-
-    # define map operations
-    resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR)  # Bilinear mode
-    rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)
-    rescale_op = CV.Rescale(rescale, shift)
-    hwc2chw_op = CV.HWC2CHW()
-    type_cast_op = C.TypeCast(mstype.int32)
-
-    # apply map operations on images
-    mnist_ds = mnist_ds.map(input_columns="label", operations=type_cast_op)
-    mnist_ds = mnist_ds.map(input_columns="image", operations=resize_op)
-    mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_op)
-    mnist_ds = mnist_ds.map(input_columns="image", operations=rescale_nml_op)
-    mnist_ds = mnist_ds.map(input_columns="image", operations=hwc2chw_op)
-
-    # apply DatasetOps
-    buffer_size = 10000
-    mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size)
-    mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
-    mnist_ds = mnist_ds.repeat(repeat_size)
-
-    return mnist_ds
-
-
-def _get_rank_info():
-    """
-    get rank size and rank id
-    """
-    rank_size = int(os.environ.get("RANK_SIZE", 1))
-
-    if rank_size > 1:
-        rank_size = get_group_size()
-        rank_id = get_rank()
-    else:
-        rank_size = 1
-        rank_id = 0
-
-    return rank_size, rank_id
diff --git a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train.sh b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train.sh-tpl
similarity index 72%
rename from mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train.sh
rename to mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train.sh-tpl
index 517ceb3..fe092fa 100644
--- a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train.sh
+++ b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train.sh-tpl
@@ -16,7 +16,7 @@
 
 if [ $# != 2 ] && [ $# != 3 ]
 then
-    echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_CKPT_PATH](optional)"
+    echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
 exit 1
 fi
 
@@ -31,15 +31,15 @@ get_real_path(){
 PATH1=$(get_real_path $1)
 PATH2=$(get_real_path $2)
 
-if [ ! -d $PATH1 ]
+if [ ! -f $PATH1 ]
 then
-    echo "error: DATASET_PATH=$PATH1 is not a directory"
+    echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
     exit 1
 fi
 
-if [ ! -f $PATH2 ]
+if [ ! -d $PATH2 ]
 then
-    echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH2 is not a file"
+    echo "error: DATASET_PATH=$PATH2 is not a directory"
     exit 1
 fi
 
@@ -56,16 +56,15 @@ fi
 
 ulimit -u unlimited
 export DEVICE_NUM=8
-export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH2
-export RANK_TABLE_FILE=$PATH2
-export SERVER_ID=0
-rank_start=$((DEVICE_NUM * SERVER_ID))
+export RANK_SIZE=$DEVICE_NUM
+export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
+export RANK_TABLE_FILE=$PATH1
 
-for((i=0; i<${DEVICE_NUM}; i++))
 env > env.log
 if [ $# == 2 ]
 then
-    python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH1 --dataset_sink_mode=False &> log &
+    python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --dataset_sink_mode=False &> log &
 fi
 if [ $# == 3 ]
 then
-    python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH1 --pre_trained=$PATH2 --dataset_sink_mode=False &> log &
+    python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 --dataset_sink_mode=False &> log &
 fi
 cd ..
diff --git a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train_gpu.sh b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train_gpu.sh
deleted file mode 100644
index bf19622..0000000
--- a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train_gpu.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-if [ $# != 1 ]
-then
-    echo "Usage: sh run_distribute_train_gpu.sh [DATASET_PATH]"
-exit 1
-fi
-
-get_real_path(){
-    if [ "${1:0:1}" == "/" ]; then
-        echo "$1"
-    else
-        echo "$(realpath -m $PWD/$1)"
-    fi
-}
-
-PATH1=$(get_real_path $1)
-
-if [ ! -d $PATH1 ]
-then
-    echo "error: DATASET_PATH=$PATH1 is not a directory"
-exit 1
-fi
-
-
-ulimit -u unlimited
-export DEVICE_NUM=4
-export RANK_SIZE=4
-
-rm -rf ./train_parallel
-mkdir ./train_parallel
-cp ../*.py ./train_parallel
-cp *.sh ./train_parallel
-cp -r ../src ./train_parallel
-cd ./train_parallel || exit
-
-mpirun --allow-run-as-root -n $RANK_SIZE \
-python train.py --run_distribute=True \
---device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log &
\ No newline at end of file
diff --git a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train_gpu.sh-tpl b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train_gpu.sh-tpl
new file mode 100644
index 0000000..2cfb5d8
--- /dev/null
+++ b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train_gpu.sh-tpl
@@ -0,0 +1,76 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 1 ] && [ $# != 2 ]
+then
+    echo "Usage: sh run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
+exit 1
+fi
+
+get_real_path(){
+    if [ "${1:0:1}" == "/" ]; then
+        echo "$1"
+    else
+        echo "$(realpath -m $PWD/$1)"
+    fi
+}
+
+PATH1=$(get_real_path $1)
+
+if [ ! -d $PATH1 ]
+then
+    echo "error: DATASET_PATH=$PATH1 is not a directory"
+exit 1
+fi
+
+if [ $# == 2 ]
+then
+    PATH2=$(get_real_path $2)
+fi
+
+if [ $# == 2 ] && [ ! -f $PATH2 ]
+then
+    echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
+exit 1
+fi
+
+ulimit -u unlimited
+export DEVICE_NUM=4
+export RANK_SIZE=$DEVICE_NUM
+
+rm -rf ./train_parallel
+mkdir ./train_parallel
+cp ../*.py ./train_parallel
+cp *.sh ./train_parallel
+cp -r ../src ./train_parallel
+cd ./train_parallel || exit
+
+echo "start training"
+env > env.log
+
+if [ $# == 1 ]
+then
+    mpirun --allow-run-as-root -n $RANK_SIZE \
+    python train.py --run_distribute=True \
+    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log &
+fi
+
+if [ $# == 2 ]
+then
+    mpirun --allow-run-as-root -n $RANK_SIZE \
+    python train.py --run_distribute=True \
+    --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
+fi
diff --git a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_eval.sh b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_eval.sh-tpl
similarity index 100%
rename from mindinsight/wizard/conf/templates/network/resnet50/scripts/run_eval.sh
rename to mindinsight/wizard/conf/templates/network/resnet50/scripts/run_eval.sh-tpl
diff --git a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_eval_gpu.sh b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_eval_gpu.sh-tpl
similarity index 96%
rename from mindinsight/wizard/conf/templates/network/resnet50/scripts/run_eval_gpu.sh
rename to mindinsight/wizard/conf/templates/network/resnet50/scripts/run_eval_gpu.sh-tpl
index bc29508..6a9e379 100644
--- a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_eval_gpu.sh
+++ b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_eval_gpu.sh-tpl
@@ -61,6 +61,6 @@ cp *.sh ./eval
 cp -r ../src ./eval
 cd ./eval || exit
 env > env.log
-echo "start evaluation for device $DEVICE_ID"
+echo "start evaluation"
 python eval.py --dataset_path=$PATH1 --checkpoint_path=$PATH2 --device_target="GPU" &> log &
 cd ..
diff --git a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_standalone_train.sh b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_standalone_train.sh-tpl
similarity index 100%
rename from mindinsight/wizard/conf/templates/network/resnet50/scripts/run_standalone_train.sh
rename to mindinsight/wizard/conf/templates/network/resnet50/scripts/run_standalone_train.sh-tpl
diff --git a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_standalone_train_gpu.sh b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_standalone_train_gpu.sh
deleted file mode 100644
index 5d18649..0000000
--- a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_standalone_train_gpu.sh
+++ /dev/null
@@ -1,59 +0,0 @@
-#!/bin/bash
-# Copyright 2020 Huawei Technologies Co., Ltd
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ============================================================================
-
-if [ $# != 1 ]
-then
-    echo "Usage: sh run_standalone_train_gpu.sh [DATASET_PATH]"
-exit 1
-fi
-
-
-get_real_path(){
-    if [ "${1:0:1}" == "/" ]; then
-        echo "$1"
-    else
-        echo "$(realpath -m $PWD/$1)"
-    fi
-}
-
-PATH1=$(get_real_path $1)
-
-
-if [ ! -d $PATH1 ]
-then
-    echo "error: DATASET_PATH=$PATH1 is not a directory"
-exit 1
-fi
-
-
-ulimit -u unlimited
-export DEVICE_NUM=1
-export DEVICE_ID=0
-export RANK_ID=0
-export RANK_SIZE=1
-
-if [ -d "train" ];
-then
-    rm -rf ./train
-fi
-mkdir ./train
-cp ../*.py ./train
-cp *.sh ./train
-cp -r ../src ./train
-cd ./train || exit
-python train.py --device_target="GPU" --dataset_path=$PATH1 &> log &
-
-cd ..
\ No newline at end of file
diff --git a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_standalone_train_gpu.sh-tpl b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_standalone_train_gpu.sh-tpl
new file mode 100644
index 0000000..57e047f
--- /dev/null
+++ b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_standalone_train_gpu.sh-tpl
@@ -0,0 +1,77 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 1 ] && [ $# != 2 ]
+then
+    echo "Usage: sh run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
+exit 1
+fi
+
+get_real_path(){
+    if [ "${1:0:1}" == "/" ]; then
+        echo "$1"
+    else
+        echo "$(realpath -m $PWD/$1)"
+    fi
+}
+
+PATH1=$(get_real_path $1)
+
+if [ ! -d $PATH1 ]
+then
+    echo "error: DATASET_PATH=$PATH1 is not a directory"
+exit 1
+fi
+
+if [ $# == 2 ]
+then
+    PATH2=$(get_real_path $2)
+fi
+
+if [ $# == 2 ] && [ ! -f $PATH2 ]
+then
+    echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
+exit 1
+fi
+
+ulimit -u unlimited
+export DEVICE_NUM=1
+export DEVICE_ID=0
+export RANK_ID=0
+export RANK_SIZE=1
+
+if [ -d "train" ];
+then
+    rm -rf ./train
+fi
+mkdir ./train
+cp ../*.py ./train
+cp *.sh ./train
+cp -r ../src ./train
+cd ./train || exit
+echo "start training"
+env > env.log
+
+if [ $# == 1 ]
+then
+    python train.py --device_target="GPU" --dataset_path=$PATH1 &> log &
+fi
+
+if [ $# == 2 ]
+then
+    python train.py --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
+fi
+cd ..
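For orientation, the wizard renders these `.sh-tpl` templates into plain `.sh` scripts under the generated project's `scripts/` directory. A typical invocation of the standalone GPU script above would then look like the following sketch; the dataset and checkpoint paths are illustrative placeholders, not values taken from this patch:

```
cd scripts
# train from scratch; /data/cifar10 is an example dataset location
bash run_standalone_train_gpu.sh /data/cifar10
# or resume from a pretrained checkpoint via the optional second argument
bash run_standalone_train_gpu.sh /data/cifar10 /ckpt/resnet50_pretrained.ckpt
```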
diff --git a/mindinsight/wizard/conf/templates/network/resnet50/src/config.py-tpl b/mindinsight/wizard/conf/templates/network/resnet50/src/config.py-tpl
index 5e2ce27..7113210 100644
--- a/mindinsight/wizard/conf/templates/network/resnet50/src/config.py-tpl
+++ b/mindinsight/wizard/conf/templates/network/resnet50/src/config.py-tpl
@@ -28,7 +28,12 @@ cfg = ed({
     "batch_size": 32,
     "loss_scale": 1024,
     {% if optimizer=='Momentum' %}
     "momentum": 0.9,
+    "lr": 0.01,
+    {% elif optimizer=='SGD' %}
+    "lr": 0.1,
+    {% else %}
+    "lr": 0.001,
     {% endif %}
     "image_height": 224,
     "image_width": 224,
@@ -48,7 +53,6 @@ cfg = ed({
     {% endif %}
     "use_label_smooth": True,
     "label_smooth_factor": 0.1,
-    "lr": 0.01,
     "lr_init": 0.01,
     "lr_end": 0.00001,
     "lr_max": 0.1
diff --git a/mindinsight/wizard/conf/templates/network/resnet50/train.py-tpl b/mindinsight/wizard/conf/templates/network/resnet50/train.py-tpl
index c2b8e12..770b596 100644
--- a/mindinsight/wizard/conf/templates/network/resnet50/train.py-tpl
+++ b/mindinsight/wizard/conf/templates/network/resnet50/train.py-tpl
@@ -112,12 +112,11 @@ if __name__ == '__main__':
     lr = Tensor(lr)
 
     # define opt
-    {% if optimizer=='Lamb' %}
-    opt = nn.Lamb(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr,
-                  weight_decay=cfg.weight_decay)
-    {% elif optimizer=='Momentum' %}
+    {% if optimizer=='Momentum' %}
     opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr,
                       momentum=cfg.momentum, weight_decay=cfg.weight_decay, loss_scale=cfg.loss_scale)
+    {% else %}
+    opt = nn.{{optimizer}}(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=cfg.lr)
     {% endif %}
 
     # define loss, model
@@ -125,7 +124,7 @@ if __name__ == '__main__':
     {% if dataset=='ImageNet' %}
     if not cfg.use_label_smooth:
         cfg.label_smooth_factor = 0.0
     loss = CrossEntropy(smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes)
     {% else %}
     {% if loss=='SoftmaxCrossEntropyWithLogits' %}
     loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
@@ -143,10 +142,10 @@ if __name__ == '__main__':
     {% elif loss=='SoftmaxCrossEntropyExpand' %}
     loss = nn.SoftmaxCrossEntropyExpand(sparse=True)
     {% endif %}
-    {% if optimizer=='Lamb' %}
-    opt = nn.Lamb(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr)
-    {% elif optimizer=='Momentum' %}
+    {% if optimizer=='Momentum' %}
     opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr, momentum=cfg.momentum)
+    {% else %}
+    opt = nn.{{optimizer}}(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr)
     {% endif %}
 
     model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
diff --git a/mindinsight/wizard/network/alexnet.py b/mindinsight/wizard/network/alexnet.py
index 21ba516..5a9cbb3 100644
--- a/mindinsight/wizard/network/alexnet.py
+++ b/mindinsight/wizard/network/alexnet.py
@@ -15,4 +15,4 @@ class Network(GenericNetwork):
     name = 'alexnet'
     supported_datasets = ['Cifar10', 'ImageNet']
     supported_loss_functions = ['SoftmaxCrossEntropyWithLogits', 'SoftmaxCrossEntropyExpand']
-    supported_optimizers = ['Momentum', 'Lamb']
+    supported_optimizers = ['Momentum', 'Adam', 'SGD']
diff --git a/mindinsight/wizard/network/lenet.py b/mindinsight/wizard/network/lenet.py
index ed854c4..414aac6 100644
--- a/mindinsight/wizard/network/lenet.py
+++ b/mindinsight/wizard/network/lenet.py
@@ -21,4 +21,4 @@ class Network(GenericNetwork):
     name = 'lenet'
     supported_datasets = ['MNIST']
     supported_loss_functions = ['SoftmaxCrossEntropyWithLogits', 'SoftmaxCrossEntropyExpand']
-    supported_optimizers = ['Momentum', 'Lamb']
+    supported_optimizers = ['Momentum', 'Adam', 'SGD']
diff --git a/mindinsight/wizard/network/resnet50.py b/mindinsight/wizard/network/resnet50.py
index 26eb455..4117184 100644
--- a/mindinsight/wizard/network/resnet50.py
+++ b/mindinsight/wizard/network/resnet50.py
@@ -6,7 +6,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-"""alexnet module."""
+"""Resnet50 module."""
 
 from mindinsight.wizard.network.generic_network import GenericNetwork
 
@@ -15,4 +15,4 @@ class Network(GenericNetwork):
     name = 'resnet50'
     supported_datasets = ['Cifar10', 'ImageNet']
     supported_loss_functions = ['SoftmaxCrossEntropyWithLogits', 'SoftmaxCrossEntropyExpand']
-    supported_optimizers = ['Momentum', 'Lamb']
+    supported_optimizers = ['Momentum', 'Adam', 'SGD']
-- 
GitLab
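As a closing illustration of the optimizer change: the templates now route any non-Momentum choice through a generic `{% else %}` branch instead of a dedicated Lamb branch. Assuming a user selects `Adam` from the updated `supported_optimizers` list (an example choice, not mandated by the patch), the branch in the loss-scale section of the rendered `train.py` would read roughly as follows, with `cfg.lr` supplied by the matching default added to `src/config.py-tpl`:

```python
# Hypothetical rendering of the {% else %} branch for optimizer='Adam';
# nn is mindspore.nn, net is the built network, cfg the generated config.
opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()),
              learning_rate=cfg.lr)
```

The same substitution covers `SGD`, which is why per-optimizer `lr` defaults (0.01 for Momentum, 0.1 for SGD, 0.001 otherwise) were added to the config template.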