From 408170d50537548009573d82b7408fcdb4987483 Mon Sep 17 00:00:00 2001 From: moran Date: Thu, 20 Aug 2020 14:30:23 +0800 Subject: [PATCH] fix bugs --- .../templates/network/alexnet/README.md-tpl | 36 ++++++++++--------- .../scripts/run_distribute_train.sh-tpl | 7 ++-- .../network/alexnet/src/config.py-tpl | 4 +-- .../templates/network/lenet/README.md-tpl | 36 ++++++++++--------- .../lenet/scripts/run_distribute_train.sh-tpl | 10 +++--- .../templates/network/resnet50/README.md-tpl | 36 ++++++++++--------- .../templates/network/resnet50/eval.py-tpl | 8 +++-- .../scripts/run_distribute_train.sh-tpl | 7 ++-- .../scripts/run_distribute_train_gpu.sh-tpl | 2 +- .../network/resnet50/src/config.py-tpl | 6 ++-- .../templates/network/resnet50/train.py-tpl | 19 ++++++++-- 11 files changed, 94 insertions(+), 77 deletions(-) diff --git a/mindinsight/wizard/conf/templates/network/alexnet/README.md-tpl b/mindinsight/wizard/conf/templates/network/alexnet/README.md-tpl index 05f93b9..2c55f7b 100644 --- a/mindinsight/wizard/conf/templates/network/alexnet/README.md-tpl +++ b/mindinsight/wizard/conf/templates/network/alexnet/README.md-tpl @@ -65,32 +65,34 @@ Parameters for both training and evaluation can be set in src/config.py. ## Running the example -### Train +### Running on Ascend -#### Usage +#### Train + +##### Usage ``` # distributed training -Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) +Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) # standalone training -Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) +Usage: bash run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) ``` -#### Launch +##### Launch ``` # distribute training example -./run_distribute_train.sh rank_table.json ~/dataset_path +bash run_distribute_train.sh rank_table.json ~/cifar-10-batches-bin # standalone training example -./run_standalone_train.sh ~/dataset_path +bash run_standalone_train.sh ~/cifar-10-batches-bin ``` -> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html). +> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/zh-CN/master/advanced_use/distributed_training_ascend.html). -#### Result +##### Result Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the followings in log. @@ -103,20 +105,20 @@ epoch: 1 step: 603, loss is 2.305666 ... ``` -### Evaluation +#### Evaluation -#### Usage +##### Usage ``` # evaluation -Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] +Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] ``` -#### Launch +##### Launch ``` # evaluation example -./run_eval.sh ~/cifar-10-batches-bin ~/alexnet/train/alexnet-1.591.ckpt +bash run_eval.sh ~/cifar-10-verify-bin ~/resnet50/train/alexnet-1.591.ckpt ``` > checkpoint can be produced in training process. @@ -125,11 +127,11 @@ Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] ### Running on GPU ``` # distributed training example -./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) +bash run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) # standalone training example -./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) +bash run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) # infer example -./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] +bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] ``` diff --git a/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train.sh-tpl b/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train.sh-tpl index fe092fa..1c2b2ea 100644 --- a/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train.sh-tpl +++ b/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train.sh-tpl @@ -57,14 +57,13 @@ fi ulimit -u unlimited export DEVICE_NUM=8 export RANK_SIZE=$DEVICE_NUM -export MINDSPORE_HCCL_CONFIG_PATH=$PATH1 export RANK_TABLE_FILE=$PATH1 start_id=0 -for((i=start_id; i About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html). +> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/zh-CN/master/advanced_use/distributed_training_ascend.html). -#### Result +##### Result Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the followings in log. @@ -88,20 +90,20 @@ epoch: 1 step: 603, loss is 2.305666 ... ``` -### Evaluation +#### Evaluation -#### Usage +##### Usage ``` # evaluation -Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] +Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] ``` -#### Launch +##### Launch ``` # evaluation example -./run_eval.sh ~/MNIST_data ~/lenet/train_parallel0/ckpt_0/checkpoint_lenet-2_937.ckpt +bash run_eval.sh ~/MNIST_data ~/lenet/train_parallel0/ckpt_0/checkpoint_lenet-2_937.ckpt ``` > checkpoint can be produced in training process. @@ -110,11 +112,11 @@ Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] ### Running on GPU ``` # distributed training example -./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) +bash run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) # standalone training example -./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) +bash run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) # infer example -./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] +bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] ``` diff --git a/mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train.sh-tpl b/mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train.sh-tpl index 1c99178..6eff374 100755 --- a/mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train.sh-tpl +++ b/mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train.sh-tpl @@ -60,13 +60,11 @@ export DEVICE_NUM=8 export RANK_SIZE=$DEVICE_NUM export RANK_TABLE_FILE=$PATH1 -export SERVER_ID=0 -rank_start=$((DEVICE_NUM * SERVER_ID)) - -for((i=0; i About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html). +> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/zh-CN/master/advanced_use/distributed_training_ascend.html). -#### Result +##### Result Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the followings in log. @@ -104,20 +106,20 @@ epoch: 1 step: 603, loss is 2.305666 ... ``` -### Evaluation +#### Evaluation -#### Usage +##### Usage ``` # evaluation -Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] +Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] ``` -#### Launch +##### Launch ``` # evaluation example -./run_eval.sh ~/cifar-10-batches-bin ~/resnet50/train/alexnet-1.591.ckpt +bash run_eval.sh ~/cifar-10-verify-bin ~/resnet50/train/alexnet-1.591.ckpt ``` > checkpoint can be produced in training process. @@ -126,11 +128,11 @@ Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH] ### Running on GPU ``` # distributed training example -./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) +bash run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) # standalone training example -./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) +bash run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional) # infer example -./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] +bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH] ``` diff --git a/mindinsight/wizard/conf/templates/network/resnet50/eval.py-tpl b/mindinsight/wizard/conf/templates/network/resnet50/eval.py-tpl index 1430449..3d0ccfc 100644 --- a/mindinsight/wizard/conf/templates/network/resnet50/eval.py-tpl +++ b/mindinsight/wizard/conf/templates/network/resnet50/eval.py-tpl @@ -22,7 +22,6 @@ from mindspore import context from mindspore import dataset as de from mindspore.train.model import Model from mindspore.train.serialization import load_checkpoint, load_param_into_net -from src.crossentropy import CrossEntropy parser = argparse.ArgumentParser(description='Image classification') @@ -67,9 +66,14 @@ if __name__ == '__main__': # define loss, model {% if dataset=='ImageNet' %} + {% if loss=='SoftmaxCrossEntropyWithLogits' %} if not cfg.use_label_smooth: cfg.label_smooth_factor = 0.0 - loss = CrossEntropy(smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes) + loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', + smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes) + {% elif loss=='SoftmaxCrossEntropyExpand' %} + loss = nn.SoftmaxCrossEntropyExpand(sparse=True) + {% endif %} {% else %} {% if loss=='SoftmaxCrossEntropyWithLogits' %} loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') diff --git a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train.sh-tpl b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train.sh-tpl index fe092fa..1c2b2ea 100644 --- a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train.sh-tpl +++ b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train.sh-tpl @@ -57,14 +57,13 @@ fi ulimit -u unlimited export DEVICE_NUM=8 export RANK_SIZE=$DEVICE_NUM -export MINDSPORE_HCCL_CONFIG_PATH=$PATH1 export RANK_TABLE_FILE=$PATH1 start_id=0 -for((i=start_id; i log & + --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log & fi diff --git a/mindinsight/wizard/conf/templates/network/resnet50/src/config.py-tpl b/mindinsight/wizard/conf/templates/network/resnet50/src/config.py-tpl index 7113210..04a00f7 100644 --- a/mindinsight/wizard/conf/templates/network/resnet50/src/config.py-tpl +++ b/mindinsight/wizard/conf/templates/network/resnet50/src/config.py-tpl @@ -18,9 +18,7 @@ network config setting, will be used in train.py and eval.py from easydict import EasyDict as ed cfg = ed({ - {% if dataset=='MNIST' %} - 'num_classes': 10, - {% elif dataset=='Cifar10' %} + {% if dataset=='Cifar10' %} 'num_classes': 10, {% elif dataset=='ImageNet' %} 'num_classes': 1001, @@ -48,7 +46,7 @@ cfg = ed({ {% if dataset=='ImageNet' %} "warmup_epochs": 0, "lr_decay_mode": "cosine", - {% else %} + {% elif dataset=='Cifar10' %} "warmup_epochs": 5, "lr_decay_mode": "poly", {% endif %} diff --git a/mindinsight/wizard/conf/templates/network/resnet50/train.py-tpl b/mindinsight/wizard/conf/templates/network/resnet50/train.py-tpl index 218e6cc..8e8bbb8 100644 --- a/mindinsight/wizard/conf/templates/network/resnet50/train.py-tpl +++ b/mindinsight/wizard/conf/templates/network/resnet50/train.py-tpl @@ -29,7 +29,6 @@ from mindspore.communication.management import init, get_rank, get_group_size import mindspore.nn as nn import mindspore.common.initializer as weight_init from src.lr_generator import get_lr, warmup_cosine_annealing_lr -from src.crossentropy import CrossEntropy parser = argparse.ArgumentParser(description='Image classification') parser.add_argument('--run_distribute', type=bool, default=False, help='Run distribute') @@ -122,9 +121,14 @@ if __name__ == '__main__': # define loss, model if target == "Ascend": {% if dataset=='ImageNet' %} + {% if loss=='SoftmaxCrossEntropyWithLogits' %} if not cfg.use_label_smooth: cfg.label_smooth_factor = 0.0 - loss = CrossEntropy(smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes) + loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', + smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes) + {% elif loss=='SoftmaxCrossEntropyExpand' %} + loss = nn.SoftmaxCrossEntropyExpand(sparse=True) + {% endif %} {% else %} {% if loss=='SoftmaxCrossEntropyWithLogits' %} loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean') @@ -137,6 +141,16 @@ if __name__ == '__main__': amp_level="O2", keep_batchnorm_fp32=False) else: # GPU target + {% if dataset=='ImageNet' %} + {% if loss=='SoftmaxCrossEntropyWithLogits' %} + if not cfg.use_label_smooth: + cfg.label_smooth_factor = 0.0 + loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, is_grad=False, reduction='mean', + smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes) + {% elif loss=='SoftmaxCrossEntropyExpand' %} + loss = nn.SoftmaxCrossEntropyExpand(sparse=True) + {% endif %} + {% else %} {% if loss=='SoftmaxCrossEntropyWithLogits' %} loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, is_grad=False, reduction='mean') {% elif loss=='SoftmaxCrossEntropyExpand' %} @@ -147,6 +161,7 @@ if __name__ == '__main__': {% else %} opt = nn.{{optimizer}}(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr) {% endif %} + {% endif %} model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}) # define callbacks -- GitLab