From 408170d50537548009573d82b7408fcdb4987483 Mon Sep 17 00:00:00 2001
From: moran <moran2huawei.com>
Date: Thu, 20 Aug 2020 14:30:23 +0800
Subject: [PATCH] Fix wizard network templates: README usage, distributed launch scripts, loss selection

---
 .../templates/network/alexnet/README.md-tpl   | 36 ++++++++++---------
 .../scripts/run_distribute_train.sh-tpl       |  7 ++--
 .../network/alexnet/src/config.py-tpl         |  4 +--
 .../templates/network/lenet/README.md-tpl     | 36 ++++++++++---------
 .../lenet/scripts/run_distribute_train.sh-tpl | 10 +++---
 .../templates/network/resnet50/README.md-tpl  | 36 ++++++++++---------
 .../templates/network/resnet50/eval.py-tpl    |  8 +++--
 .../scripts/run_distribute_train.sh-tpl       |  7 ++--
 .../scripts/run_distribute_train_gpu.sh-tpl   |  2 +-
 .../network/resnet50/src/config.py-tpl        |  6 ++--
 .../templates/network/resnet50/train.py-tpl   | 19 ++++++++--
 11 files changed, 94 insertions(+), 77 deletions(-)

diff --git a/mindinsight/wizard/conf/templates/network/alexnet/README.md-tpl b/mindinsight/wizard/conf/templates/network/alexnet/README.md-tpl
index 05f93b9..2c55f7b 100644
--- a/mindinsight/wizard/conf/templates/network/alexnet/README.md-tpl
+++ b/mindinsight/wizard/conf/templates/network/alexnet/README.md-tpl
@@ -65,32 +65,34 @@ Parameters for both training and evaluation can be set in src/config.py.
 
 ## Running the example
 
-### Train
+### Running on Ascend
 
-#### Usage
+#### Train
+
+##### Usage
 
 ```
 # distributed training
-Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 
 # standalone training
-Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+Usage: bash run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 ```
 
 
-#### Launch
+##### Launch
 
 ```
 # distribute training example
-./run_distribute_train.sh rank_table.json ~/dataset_path
+bash run_distribute_train.sh rank_table.json ~/cifar-10-batches-bin
 
 # standalone training example
-./run_standalone_train.sh ~/dataset_path
+bash run_standalone_train.sh ~/cifar-10-batches-bin
 ```
 
-> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
+> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training_ascend.html).
 
-#### Result
+##### Result
 
 Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the followings in log.
 
@@ -103,20 +105,20 @@ epoch: 1 step: 603, loss is 2.305666
 ...
 ```
 
-### Evaluation
+#### Evaluation
 
-#### Usage
+##### Usage
 
 ```
 # evaluation
-Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
+Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ```
 
-#### Launch
+##### Launch
 
 ```
 # evaluation example
-./run_eval.sh ~/cifar-10-batches-bin ~/alexnet/train/alexnet-1.591.ckpt
+bash run_eval.sh ~/cifar-10-verify-bin ~/alexnet/train/alexnet-1.591.ckpt
 ```
 
 > checkpoint can be produced in training process.
@@ -125,11 +127,11 @@ Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ### Running on GPU
 ```
 # distributed training example
-./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+bash run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 
 # standalone training example
-./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+bash run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 
 # infer example
-./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
+bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ```
diff --git a/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train.sh-tpl b/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train.sh-tpl
index fe092fa..1c2b2ea 100644
--- a/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train.sh-tpl
+++ b/mindinsight/wizard/conf/templates/network/alexnet/scripts/run_distribute_train.sh-tpl
@@ -57,14 +57,13 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=$DEVICE_NUM
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1
 
 start_id=0
-for((i=start_id; i<DEVICE_NUM + start_id; i++))
+for((i=0; i<${DEVICE_NUM}; i++))
 do
-    export DEVICE_ID=$i
-    export RANK_ID=$((i - start_id))
+    export DEVICE_ID=$((i + start_id))
+    export RANK_ID=$i
     rm -rf ./train_parallel$i
     mkdir ./train_parallel$i
     cp ../*.py ./train_parallel$i
diff --git a/mindinsight/wizard/conf/templates/network/alexnet/src/config.py-tpl b/mindinsight/wizard/conf/templates/network/alexnet/src/config.py-tpl
index 5eca324..c4e3403 100644
--- a/mindinsight/wizard/conf/templates/network/alexnet/src/config.py-tpl
+++ b/mindinsight/wizard/conf/templates/network/alexnet/src/config.py-tpl
@@ -19,9 +19,7 @@ network config setting, will be used in train.py
 from easydict import EasyDict as edict
 
 cfg = edict({
-    {% if dataset=='MNIST' %}
-    'num_classes': 10,
-    {% elif dataset=='Cifar10' %}
+    {% if dataset=='Cifar10' %}
     'num_classes': 10,
     {% elif dataset=='ImageNet' %}
     'num_classes': 1001,
diff --git a/mindinsight/wizard/conf/templates/network/lenet/README.md-tpl b/mindinsight/wizard/conf/templates/network/lenet/README.md-tpl
index 8e2abcb..1a82836 100644
--- a/mindinsight/wizard/conf/templates/network/lenet/README.md-tpl
+++ b/mindinsight/wizard/conf/templates/network/lenet/README.md-tpl
@@ -50,32 +50,34 @@ Parameters for both training and evaluation can be set in src/config.py.
 
 ## Running the example
 
-### Train
+### Running on Ascend
 
-#### Usage
+#### Train
+
+##### Usage
 
 ```
 # distributed training
-Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 
 # standalone training
-Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+Usage: bash run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 ```
 
 
-#### Launch
+##### Launch
 
 ```
 # distribute training example
-./run_distribute_train.sh rank_table.json ~/MNIST_data
+bash run_distribute_train.sh rank_table.json ~/MNIST_data
 
 # standalone training example
-./run_standalone_train.sh ~/MNIST_data
+bash run_standalone_train.sh ~/MNIST_data
 ```
 
-> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
+> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training_ascend.html).
 
-#### Result
+##### Result
 
 Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the followings in log.
 
@@ -88,20 +90,20 @@ epoch: 1 step: 603, loss is 2.305666
 ...
 ```
 
-### Evaluation
+#### Evaluation
 
-#### Usage
+##### Usage
 
 ```
 # evaluation
-Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
+Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ```
 
-#### Launch
+##### Launch
 
 ```
 # evaluation example
-./run_eval.sh ~/MNIST_data ~/lenet/train_parallel0/ckpt_0/checkpoint_lenet-2_937.ckpt
+bash run_eval.sh ~/MNIST_data ~/lenet/train_parallel0/ckpt_0/checkpoint_lenet-2_937.ckpt
 ```
 
 > checkpoint can be produced in training process.
@@ -110,11 +112,11 @@ Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ### Running on GPU
 ```
 # distributed training example
-./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+bash run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 
 # standalone training example
-./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+bash run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 
 # infer example
-./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
+bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ```
diff --git a/mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train.sh-tpl b/mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train.sh-tpl
index 1c99178..6eff374 100755
--- a/mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train.sh-tpl
+++ b/mindinsight/wizard/conf/templates/network/lenet/scripts/run_distribute_train.sh-tpl
@@ -60,13 +60,11 @@ export DEVICE_NUM=8
 export RANK_SIZE=$DEVICE_NUM
 export RANK_TABLE_FILE=$PATH1
 
-export SERVER_ID=0
-rank_start=$((DEVICE_NUM * SERVER_ID))
-
-for((i=0; i<DEVICE_NUM; i++))
+start_id=0
+for((i=0; i<${DEVICE_NUM}; i++))
 do
-    export DEVICE_ID=$i
-    export RANK_ID=$((rank_start + i))
+    export DEVICE_ID=$((i + start_id))
+    export RANK_ID=$i
     rm -rf ./train_parallel$i
     mkdir ./train_parallel$i
     cp ../*.py ./train_parallel$i
diff --git a/mindinsight/wizard/conf/templates/network/resnet50/README.md-tpl b/mindinsight/wizard/conf/templates/network/resnet50/README.md-tpl
index 3d6b95e..c426e02 100644
--- a/mindinsight/wizard/conf/templates/network/resnet50/README.md-tpl
+++ b/mindinsight/wizard/conf/templates/network/resnet50/README.md-tpl
@@ -66,32 +66,34 @@ Parameters for both training and evaluation can be set in src/config.py.
 
 ## Running the example
 
-### Train
+### Running on Ascend
 
-#### Usage
+#### Train
+
+##### Usage
 
 ```
 # distributed training
-Usage: ./run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 
 # standalone training
-Usage: ./run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+Usage: bash run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 ```
 
 
-#### Launch
+##### Launch
 
 ```
 # distribute training example
-./run_distribute_train.sh rank_table.json ~/dataset_path
+bash run_distribute_train.sh rank_table.json ~/cifar-10-batches-bin
 
 # standalone training example
-./run_standalone_train.sh ~/dataset_path
+bash run_standalone_train.sh ~/cifar-10-batches-bin
 ```
 
-> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
+> About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training_ascend.html).
 
-#### Result
+##### Result
 
 Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". Under this, you can find checkpoint file together with result like the followings in log.
 
@@ -104,20 +106,20 @@ epoch: 1 step: 603, loss is 2.305666
 ...
 ```
 
-### Evaluation
+#### Evaluation
 
-#### Usage
+##### Usage
 
 ```
 # evaluation
-Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
+Usage: bash run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ```
 
-#### Launch
+##### Launch
 
 ```
 # evaluation example
-./run_eval.sh ~/cifar-10-batches-bin ~/resnet50/train/alexnet-1.591.ckpt
+bash run_eval.sh ~/cifar-10-verify-bin ~/resnet50/train/alexnet-1.591.ckpt
 ```
 
 > checkpoint can be produced in training process.
@@ -126,11 +128,11 @@ Usage: ./run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ### Running on GPU
 ```
 # distributed training example
-./run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+bash run_distribute_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 
 # standalone training example
-./run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+bash run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 
 # infer example
-./run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
+bash run_eval_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
 ```
diff --git a/mindinsight/wizard/conf/templates/network/resnet50/eval.py-tpl b/mindinsight/wizard/conf/templates/network/resnet50/eval.py-tpl
index 1430449..3d0ccfc 100644
--- a/mindinsight/wizard/conf/templates/network/resnet50/eval.py-tpl
+++ b/mindinsight/wizard/conf/templates/network/resnet50/eval.py-tpl
@@ -22,7 +22,6 @@ from mindspore import context
 from mindspore import dataset as de
 from mindspore.train.model import Model
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
-from src.crossentropy import CrossEntropy
 
 parser = argparse.ArgumentParser(description='Image classification')
 
@@ -67,9 +66,14 @@ if __name__ == '__main__':
 
     # define loss, model
     {% if dataset=='ImageNet' %}
+    {% if loss=='SoftmaxCrossEntropyWithLogits' %}
     if not cfg.use_label_smooth:
         cfg.label_smooth_factor = 0.0
-    loss = CrossEntropy(smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes)
+    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean',
+                                            smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes)
+    {% elif loss=='SoftmaxCrossEntropyExpand' %}
+    loss = nn.SoftmaxCrossEntropyExpand(sparse=True)
+    {% endif %}
     {% else %}
     {% if loss=='SoftmaxCrossEntropyWithLogits' %}
     loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
diff --git a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train.sh-tpl b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train.sh-tpl
index fe092fa..1c2b2ea 100644
--- a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train.sh-tpl
+++ b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train.sh-tpl
@@ -57,14 +57,13 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=$DEVICE_NUM
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1
 
 start_id=0
-for((i=start_id; i<DEVICE_NUM + start_id; i++))
+for((i=0; i<${DEVICE_NUM}; i++))
 do
-    export DEVICE_ID=$i
-    export RANK_ID=$((i - start_id))
+    export DEVICE_ID=$((i + start_id))
+    export RANK_ID=$i
     rm -rf ./train_parallel$i
     mkdir ./train_parallel$i
     cp ../*.py ./train_parallel$i
diff --git a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train_gpu.sh-tpl b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train_gpu.sh-tpl
index 2cfb5d8..337f2ff 100644
--- a/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train_gpu.sh-tpl
+++ b/mindinsight/wizard/conf/templates/network/resnet50/scripts/run_distribute_train_gpu.sh-tpl
@@ -72,5 +72,5 @@ if [ $# == 2 ]
 then
   mpirun --allow-run-as-root -n $RANK_SIZE \
   python train.py --run_distribute=True \
-  --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH3 &> log &
+  --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
 fi
diff --git a/mindinsight/wizard/conf/templates/network/resnet50/src/config.py-tpl b/mindinsight/wizard/conf/templates/network/resnet50/src/config.py-tpl
index 7113210..04a00f7 100644
--- a/mindinsight/wizard/conf/templates/network/resnet50/src/config.py-tpl
+++ b/mindinsight/wizard/conf/templates/network/resnet50/src/config.py-tpl
@@ -18,9 +18,7 @@ network config setting, will be used in train.py and eval.py
 from easydict import EasyDict as ed
 
 cfg = ed({
-    {% if dataset=='MNIST' %}
-    'num_classes': 10,
-    {% elif dataset=='Cifar10' %}
+    {% if dataset=='Cifar10' %}
     'num_classes': 10,
     {% elif dataset=='ImageNet' %}
     'num_classes': 1001,
@@ -48,7 +46,7 @@ cfg = ed({
     {% if dataset=='ImageNet' %}
     "warmup_epochs": 0,
     "lr_decay_mode": "cosine",
-    {% else %}
+    {% elif dataset=='Cifar10' %}
     "warmup_epochs": 5,
     "lr_decay_mode": "poly",
     {% endif %}
diff --git a/mindinsight/wizard/conf/templates/network/resnet50/train.py-tpl b/mindinsight/wizard/conf/templates/network/resnet50/train.py-tpl
index 218e6cc..8e8bbb8 100644
--- a/mindinsight/wizard/conf/templates/network/resnet50/train.py-tpl
+++ b/mindinsight/wizard/conf/templates/network/resnet50/train.py-tpl
@@ -29,7 +29,6 @@ from mindspore.communication.management import init, get_rank, get_group_size
 import mindspore.nn as nn
 import mindspore.common.initializer as weight_init
 from src.lr_generator import get_lr, warmup_cosine_annealing_lr
-from src.crossentropy import CrossEntropy
 
 parser = argparse.ArgumentParser(description='Image classification')
 parser.add_argument('--run_distribute', type=bool, default=False, help='Run distribute')
@@ -122,9 +121,14 @@ if __name__ == '__main__':
     # define loss, model
     if target == "Ascend":
         {% if dataset=='ImageNet' %}
+        {% if loss=='SoftmaxCrossEntropyWithLogits' %}
         if not cfg.use_label_smooth:
             cfg.label_smooth_factor = 0.0
-        loss = CrossEntropy(smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes)
+        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean',
+                                                smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes)
+        {% elif loss=='SoftmaxCrossEntropyExpand' %}
+        loss = nn.SoftmaxCrossEntropyExpand(sparse=True)
+        {% endif %}
         {% else %}
         {% if loss=='SoftmaxCrossEntropyWithLogits' %}
         loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
@@ -137,6 +141,16 @@ if __name__ == '__main__':
                       amp_level="O2", keep_batchnorm_fp32=False)
     else:
         # GPU target
+        {% if dataset=='ImageNet' %}
+        {% if loss=='SoftmaxCrossEntropyWithLogits' %}
+        if not cfg.use_label_smooth:
+            cfg.label_smooth_factor = 0.0
+        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, is_grad=False, reduction='mean',
+                                                smooth_factor=cfg.label_smooth_factor, num_classes=cfg.num_classes)
+        {% elif loss=='SoftmaxCrossEntropyExpand' %}
+        loss = nn.SoftmaxCrossEntropyExpand(sparse=True)
+        {% endif %}
+        {% else %}
         {% if loss=='SoftmaxCrossEntropyWithLogits' %}
         loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, is_grad=False, reduction='mean')
         {% elif loss=='SoftmaxCrossEntropyExpand' %}
@@ -147,6 +161,7 @@ if __name__ == '__main__':
         {% else %}
         opt = nn.{{optimizer}}(filter(lambda x: x.requires_grad, net.get_parameters()), learning_rate=lr)
         {% endif %}
+        {% endif %}
         model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
 
     # define callbacks
-- 
GitLab