add resnet50_quant

df65f168 · wandongdong · b3f09b1d · df65f168 · df65f168 · df65f168
12 changed file
--- a/example/resnet50_quant/README.md
+++ b/example/resnet50_quant/README.md
+# ResNet-50_quant Example
+## Description
+This is an example of training ResNet-50_quant with ImageNet2012 dataset in MindSpore.
+## Requirements
+- Install [MindSpore](https://www.mindspore.cn/install/en).
+- Download the dataset ImageNet2012 
+> Unzip the ImageNet2012 dataset to any path you want and the folder structure should include train and eval dataset as follows:
+> ```
+> .  
+> ├── ilsvrc                  # train dataset
+> └── ilsvrc_eval             # infer dataset
+> ```
+## Example structure
+```shell
+.
+├── resnet50_quant
+  ├── README.md
+  ├── scripts
+  │   ├──run_train.sh
+  │   ├──run_infer.sh
+  ├── src                              
+  │   ├──config.py                     
+  │   ├──crossentropy.py                                 
+  │   ├──dataset.py
+  │   ├──launch.py
+  │   ├──lr_generator.py                                 
+  │   ├──utils.py       
+  ├── models                              
+  │   ├──resnet_quant.py
+  ├── train.py
+  ├── eval.py
+```
+## Parameter configuration
+Parameters for both training and inference can be set in config.py.
+```
+"class_num": 1001,                # dataset class number
+"batch_size": 32,                 # batch size of input tensor
+"loss_scale": 1024,               # loss scale
+"momentum": 0.9,                  # momentum optimizer
+"weight_decay": 1e-4,             # weight decay 
+"epoch_size": 110,                 # only valid for training, which is always 1 for inference
+"pretrained_epoch_size": 90,       # epoch size that model has been trained before load pretrained checkpoint
+"buffer_size": 1000,              # number of queue size in data preprocessing
+"image_height": 224,              # image height
+"image_width": 224,               # image width
+"save_checkpoint": True,          # whether save checkpoint or not
+"save_checkpoint_epochs": 1,      # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last epoch
+"keep_checkpoint_max": 50,        # only keep the last keep_checkpoint_max checkpoint
+"save_checkpoint_path": "./",     # path to save checkpoint relative to the executed path
+"warmup_epochs": 0,               # number of warmup epoch
+"lr_decay_mode": "cosine",        # decay mode for generating learning rate
+"use_label_smooth": True,         # whether to use label smoothing
+"label_smooth_factor": 0.1,       # label smooth factor
+"lr_init": 0,                     # initial learning rate
+"lr_max": 0.1,                    # maximum learning rate
+```
+## Running the example
+### Train
+### Usage
+- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]
+### Launch
+``` 
+# training example
+  Ascend: sh run_train.sh Ascend 8 192.168.0.1 0,1,2,3,4,5,6,7 ~/imagenet/train/
+```
+### Result
+Training results will be stored in the example path. Checkpoints will be stored at `./checkpoint` by default, and the training log will be redirected to `./train/train.log`, as shown below.
+``` 
+epoch: 1 step: 5004, loss is 4.8995576
+epoch: 2 step: 5004, loss is 3.9235563
+epoch: 3 step: 5004, loss is 3.833077
+epoch: 4 step: 5004, loss is 3.2795618
+epoch: 5 step: 5004, loss is 3.1978393
+```
+## Eval process
+### Usage
+- Ascend: sh run_infer.sh Ascend [DATASET_PATH] [CHECKPOINT_PATH]
+### Launch
+``` 
+# infer example
+    Ascend: sh run_infer.sh Ascend ~/imagenet/val/ ~/checkpoint/resnet50-110_5004.ckpt
+```
+> checkpoint can be produced in training process.
+#### Result
+Inference results will be stored in the example path, in a folder named "eval". There you can find a result like the following in `infer.log`.
+```
+result: {'acc': 0.75252054737516005} ckpt=train_parallel0/resnet-110_5004.ckpt
+```
--- a/example/resnet50_quant/eval.py
+++ b/example/resnet50_quant/eval.py
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+eval.
+"""
+import os
+import argparse
+from src.dataset import create_dataset
+from src.config import config
+from src.crossentropy import CrossEntropy
+from src.utils import _load_param_into_net
+from models.resnet_quant import resnet50_quant
+from mindspore import context
+from mindspore.train.model import Model
+from mindspore.train.serialization import load_checkpoint
+def _str2bool(value):
+    """Convert a command-line string to a bool.
+    ``type=bool`` turns every non-empty string (including "False") into True,
+    so boolean options are parsed explicitly instead.
+    Args:
+        value (Union[str, bool]): Raw option value.
+    Returns:
+        bool, the parsed value.
+    Raises:
+        argparse.ArgumentTypeError: If the text is not a recognised boolean.
+    """
+    if isinstance(value, bool):
+        return value
+    text = value.strip().lower()
+    if text in ('true', '1', 'yes', 'y', 't', 'on'):
+        return True
+    # The empty string is kept falsy, matching what bool('') used to produce.
+    if text in ('false', '0', 'no', 'n', 'f', 'off', ''):
+        return False
+    raise argparse.ArgumentTypeError('Boolean value expected, got %r' % (value,))
+parser = argparse.ArgumentParser(description='Image classification')
+parser.add_argument('--run_distribute', type=_str2bool, default=False, help='Run distribute')
+parser.add_argument('--device_num', type=int, default=1, help='Device num.')
+parser.add_argument('--do_train', type=_str2bool, default=False, help='Do train or not.')
+parser.add_argument('--do_eval', type=_str2bool, default=True, help='Do eval or not.')
+parser.add_argument('--checkpoint_path', type=str, default=None, help='Checkpoint file path')
+parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
+parser.add_argument('--device_target', type=str, default='Ascend', help='Device target')
+args_opt = parser.parse_args()
+target = args_opt.device_target
+context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
+if target == "Ascend":
+    # Fall back to device 0 when DEVICE_ID is not exported instead of failing on int(None).
+    device_id = int(os.getenv('DEVICE_ID', '0'))
+    context.set_context(device_id=device_id)
+if __name__ == '__main__':
+    net = resnet50_quant(class_num=config.class_num)
+    # Label smoothing is disabled by zeroing the factor handed to the loss.
+    if not config.use_label_smooth:
+        config.label_smooth_factor = 0.0
+    loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
+    if args_opt.do_eval:
+        dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, batch_size=config.batch_size,
+                                 target=target)
+        step_size = dataset.get_dataset_size()
+        # Weights are only restored when a checkpoint path was supplied.
+        if args_opt.checkpoint_path:
+            param_dict = load_checkpoint(args_opt.checkpoint_path)
+            _load_param_into_net(net, param_dict)
+        net.set_train(False)
+        model = Model(net, loss_fn=loss, metrics={'acc'})
+        res = model.eval(dataset)
+        print("result:", res, "ckpt=", args_opt.checkpoint_path)
--- a/example/resnet50_quant/models/resnet_quant.py
+++ b/example/resnet50_quant/models/resnet_quant.py
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""ResNet."""
+import numpy as np
+import mindspore.nn as nn
+from mindspore.ops import operations as P
+from mindspore import Tensor
+from mindspore.nn import FakeQuantWithMinMax, Conv2dBatchNormQuant
+# Passed as ``ema_decay`` to the FakeQuantWithMinMax layers built in this file.
+_ema_decay = 0.999
+# NOTE(review): not referenced in the visible code; the layers pass symmetric=False literally.
+_symmetric = False
+# When True, blocks are built with fake-quantization layers (ReLUQuant / FakeQuantWithMinMax).
+_fake = True
+def _weight_variable(shape, factor=0.01):
+    """
+    Build a weight Tensor from scaled standard-normal samples.
+    Args:
+        shape (tuple): Shape of the weight to create.
+        factor (float): Multiplier applied to the float32 samples. Default: 0.01.
+    Returns:
+        Tensor, the randomly initialised weight.
+    """
+    samples = np.random.randn(*shape).astype(np.float32)
+    return Tensor(samples * factor)
+def _conv3x3(in_channel, out_channel, stride=1):
+    """Return a 3x3 nn.Conv2d ('same' pad mode) with randomly initialised weights."""
+    kernel_weight = _weight_variable((out_channel, in_channel, 3, 3))
+    conv_kwargs = dict(kernel_size=3, stride=stride, padding=0, pad_mode='same', weight_init=kernel_weight)
+    return nn.Conv2d(in_channel, out_channel, **conv_kwargs)
+def _conv1x1(in_channel, out_channel, stride=1):
+    """Return a 1x1 nn.Conv2d ('same' pad mode) with randomly initialised weights."""
+    kernel_weight = _weight_variable((out_channel, in_channel, 1, 1))
+    conv_kwargs = dict(kernel_size=1, stride=stride, padding=0, pad_mode='same', weight_init=kernel_weight)
+    return nn.Conv2d(in_channel, out_channel, **conv_kwargs)
+def _conv7x7(in_channel, out_channel, stride=1):
+    """Return a 7x7 nn.Conv2d ('same' pad mode) with randomly initialised weights."""
+    kernel_weight = _weight_variable((out_channel, in_channel, 7, 7))
+    conv_kwargs = dict(kernel_size=7, stride=stride, padding=0, pad_mode='same', weight_init=kernel_weight)
+    return nn.Conv2d(in_channel, out_channel, **conv_kwargs)
+def _bn(channel):
+    """Return a BatchNorm2d over `channel` features whose gamma starts at 1."""
+    bn_kwargs = dict(eps=1e-4, momentum=0.9,
+                     gamma_init=1, beta_init=0, moving_mean_init=0, moving_var_init=1)
+    return nn.BatchNorm2d(channel, **bn_kwargs)
+def _bn_last(channel):
+    """Return a BatchNorm2d over `channel` features whose gamma starts at 0."""
+    bn_kwargs = dict(eps=1e-4, momentum=0.9,
+                     gamma_init=0, beta_init=0, moving_mean_init=0, moving_var_init=1)
+    return nn.BatchNorm2d(channel, **bn_kwargs)
+def _fc(in_channel, out_channel):
+    """Return a biased nn.Dense layer with random weights and a zero-initialised bias."""
+    dense_weight = _weight_variable((out_channel, in_channel))
+    return nn.Dense(in_channel, out_channel, has_bias=True, weight_init=dense_weight, bias_init=0)
+class ConvBNReLU(nn.Cell):
+    """
+    Conv2dBatchNormQuant followed by a ReLU activation.
+    The activation is nn.ReLUQuant when the module-level `_fake` flag is set,
+    otherwise a plain nn.ReLU.
+    Args:
+        in_planes (int): Input channel.
+        out_planes (int): Output channel.
+        kernel_size (int): Convolution kernel size. Default: 3.
+        stride (int): Convolution stride. Default: 1.
+        groups (int): Channel group count forwarded to the convolution. Default: 1.
+    Returns:
+        Tensor, output tensor.
+    Examples:
+        >>> ConvBNReLU(16, 256, kernel_size=1, stride=1, groups=1)
+    """
+    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
+        super(ConvBNReLU, self).__init__()
+        # Explicit padding of (k - 1) // 2 on each side, used with pad_mode='pad'.
+        pad = (kernel_size - 1) // 2
+        fused_conv = Conv2dBatchNormQuant(in_planes, out_planes, kernel_size, stride,
+                                          pad_mode='pad', padding=pad, group=groups, fake=_fake)
+        if _fake:
+            activation = nn.ReLUQuant()
+        else:
+            activation = nn.ReLU()
+        self.features = nn.SequentialCell([fused_conv, activation])
+    def construct(self, x):
+        return self.features(x)
+class ResidualBlock(nn.Cell):
+    """
+    ResNet V1 bottleneck residual block built from quantization-aware layers.
+    The main path is 1x1 -> 3x3 -> 1x1; the shortcut is projected with a 1x1
+    convolution whenever the stride or the channel count changes.
+    Args:
+        in_channel (int): Input channel.
+        out_channel (int): Output channel.
+        stride (int): Stride of the 3x3 convolution and of the shortcut projection. Default: 1.
+    Returns:
+        Tensor, output tensor.
+    Examples:
+        >>> ResidualBlock(3, 256, stride=2)
+    """
+    expansion = 4
+    def __init__(self,
+                 in_channel,
+                 out_channel,
+                 stride=1):
+        super(ResidualBlock, self).__init__()
+        channel = out_channel // self.expansion
+        self.conv1 = ConvBNReLU(in_channel, channel, kernel_size=1, stride=1)
+        self.conv2 = ConvBNReLU(channel, channel, kernel_size=3, stride=stride)
+        self.conv3 = self._quant_conv1x1(channel, out_channel, stride=1)
+        self.down_sample = stride != 1 or in_channel != out_channel
+        self.down_sample_layer = None
+        if self.down_sample:
+            self.down_sample_layer = self._quant_conv1x1(in_channel, out_channel, stride=stride)
+        self.add = P.TensorAdd()
+        # Created once here rather than on every construct() call.
+        self.relu = P.ReLU()
+        self.fake = FakeQuantWithMinMax(ema=True, ema_decay=_ema_decay, symmetric=False)
+    @staticmethod
+    def _quant_conv1x1(in_channel, out_channel, stride):
+        """Build a 1x1 Conv2dBatchNormQuant, followed by a fake-quant layer when `_fake` is set."""
+        # fake=_fake is passed in every case; the original shortcut omitted it in the
+        # _fake branch (presumably relying on the library default - confirm).
+        conv = Conv2dBatchNormQuant(in_channel, out_channel, fake=_fake,
+                                    kernel_size=1, stride=stride, pad_mode='same', padding=0)
+        if not _fake:
+            return conv
+        return nn.SequentialCell([conv,
+                                  FakeQuantWithMinMax(ema=True, ema_decay=_ema_decay, symmetric=False)])
+    def construct(self, x):
+        identity = x
+        out = self.conv1(x)
+        out = self.conv2(out)
+        out = self.conv3(out)
+        if self.down_sample:
+            identity = self.down_sample_layer(identity)
+        out = self.add(out, identity)
+        out = self.relu(out)
+        if _fake:
+            out = self.fake(out)
+        return out
+class ResNet(nn.Cell):
+    """
+    ResNet backbone with a dense classification head.
+    Args:
+        block (Cell): Block class used to build every stage.
+        layer_nums (list): Number of blocks in each of the four stages.
+        in_channels (list): Input channel of each stage.
+        out_channels (list): Output channel of each stage.
+        strides (list): Stride of the first block of each stage.
+        num_classes (int): Number of output classes.
+    Returns:
+        Tensor, output tensor.
+    Raises:
+        ValueError: If layer_nums, in_channels and out_channels are not all of length 4.
+    Examples:
+        >>> ResNet(ResidualBlock,
+        >>>        [3, 4, 6, 3],
+        >>>        [64, 256, 512, 1024],
+        >>>        [256, 512, 1024, 2048],
+        >>>        [1, 2, 2, 2],
+        >>>        10)
+    """
+    def __init__(self,
+                 block,
+                 layer_nums,
+                 in_channels,
+                 out_channels,
+                 strides,
+                 num_classes):
+        super(ResNet, self).__init__()
+        lengths_ok = len(layer_nums) == len(in_channels) == len(out_channels) == 4
+        if not lengths_ok:
+            raise ValueError("the length of layer_num, in_channels, out_channels list must be 4!")
+        # Stem: 7x7 stride-2 convolution followed by a 3x3 stride-2 max pool.
+        self.conv1 = ConvBNReLU(3, 64, kernel_size=7, stride=2)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode="same")
+        # The four stages are built in order, then bound to layer1..layer4.
+        stages = [self._make_layer(block,
+                                   layer_nums[idx],
+                                   in_channel=in_channels[idx],
+                                   out_channel=out_channels[idx],
+                                   stride=strides[idx])
+                  for idx in range(4)]
+        self.layer1, self.layer2, self.layer3, self.layer4 = stages
+        # Head: mean over axes (2, 3), flatten, dense classifier.
+        self.mean = P.ReduceMean(keep_dims=True)
+        self.flatten = nn.Flatten()
+        self.end_point = nn.Dense(out_channels[3], num_classes, has_bias=True)
+    def _make_layer(self, block, layer_num, in_channel, out_channel, stride):
+        """
+        Build one stage as a sequence of blocks.
+        Args:
+            block (Cell): Resnet block.
+            layer_num (int): Number of blocks in the stage.
+            in_channel (int): Input channel of the first block.
+            out_channel (int): Output channel of every block.
+            stride (int): Stride of the first block; the remaining blocks use stride 1.
+        Returns:
+            SequentialCell, the stage.
+        Examples:
+            >>> _make_layer(ResidualBlock, 3, 128, 256, 2)
+        """
+        head_block = block(in_channel, out_channel, stride=stride)
+        tail_blocks = [block(out_channel, out_channel, stride=1) for _ in range(1, layer_num)]
+        return nn.SequentialCell([head_block] + tail_blocks)
+    def construct(self, x):
+        x = self.conv1(x)
+        c1 = self.maxpool(x)
+        c2 = self.layer1(c1)
+        c3 = self.layer2(c2)
+        c4 = self.layer3(c3)
+        c5 = self.layer4(c4)
+        pooled = self.mean(c5, (2, 3))
+        flat = self.flatten(pooled)
+        return self.end_point(flat)
+def resnet50_quant(class_num=10):
+    """
+    Build the ResNet50 network from ResidualBlock stages of depth [3, 4, 6, 3].
+    Args:
+        class_num (int): Class number. Default: 10.
+    Returns:
+        Cell, cell instance of ResNet50 neural network.
+    Examples:
+        >>> net = resnet50_quant(10)
+    """
+    stage_depths = [3, 4, 6, 3]
+    stage_inputs = [64, 256, 512, 1024]
+    stage_outputs = [256, 512, 1024, 2048]
+    stage_strides = [1, 2, 2, 2]
+    return ResNet(ResidualBlock, stage_depths, stage_inputs, stage_outputs, stage_strides, class_num)
+def resnet101_quant(class_num=1001):
+    """
+    Build the ResNet101 network from ResidualBlock stages of depth [3, 4, 23, 3].
+    Args:
+        class_num (int): Class number. Default: 1001.
+    Returns:
+        Cell, cell instance of ResNet101 neural network.
+    Examples:
+        >>> net = resnet101_quant(1001)
+    """
+    stage_depths = [3, 4, 23, 3]
+    stage_inputs = [64, 256, 512, 1024]
+    stage_outputs = [256, 512, 1024, 2048]
+    stage_strides = [1, 2, 2, 2]
+    return ResNet(ResidualBlock, stage_depths, stage_inputs, stage_outputs, stage_strides, class_num)
--- a/example/resnet50_quant/scripts/run_infer.sh
+++ b/example/resnet50_quant/scripts/run_infer.sh
+#!/usr/bin/env bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Usage: run_infer.sh [PLATFORM] [DATASET_PATH] [CHECKPOINT_PATH]
+if [ $# -ne 3 ]
+then
+    echo "Ascend: sh run_infer.sh [PLATFORM] [DATASET_PATH] [CHECKPOINT_PATH] \
+          GPU: sh run_infer.sh [PLATFORM] [DATASET_PATH] [CHECKPOINT_PATH]"
+exit 1
+fi
+# check dataset path (quoted so empty values and paths with spaces are handled)
+if [ ! -d "$2" ]
+then
+    echo "error: DATASET_PATH=$2 is not a directory"
+exit 1
+fi
+# check checkpoint file
+if [ ! -f "$3" ]
+then
+    echo "error: CHECKPOINT_PATH=$3 is not a file"
+exit 1
+fi
+# set environment
+BASEPATH=$(cd "$(dirname "$0")" || exit; pwd)
+# Resolve to absolute paths now: the working directory changes to ../eval below,
+# which would otherwise invalidate relative arguments.
+DATASET_PATH=$(cd "$2" || exit; pwd)
+CHECKPOINT_PATH="$(cd "$(dirname "$3")" || exit; pwd)/$(basename "$3")"
+export PYTHONPATH=${BASEPATH}:$PYTHONPATH
+export DEVICE_ID=0
+export RANK_ID=0
+export RANK_SIZE=1
+# start from a fresh output directory
+if [ -d "../eval" ];
+then
+    rm -rf ../eval
+fi
+mkdir ../eval
+cd ../eval || exit
+# launch evaluation in the background; output goes to infer.log
+python "${BASEPATH}/../eval.py" \
+        --device_target="$1" \
+        --dataset_path="${DATASET_PATH}" \
+        --checkpoint_path="${CHECKPOINT_PATH}" \
+        &> infer.log &  # dataset val folder path
--- a/example/resnet50_quant/scripts/run_train.sh
+++ b/example/resnet50_quant/scripts/run_train.sh
+#!/usr/bin/env bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Launch distributed training on Ascend through src/launch.py.
+# Args: $1 platform, $2 device num, $3 server ip, $4 visible devices,
+#       $5 dataset path, $6 checkpoint path (optional).
+run_ascend()
+{
+    # Out of range means below 1 OR above 8; the former "&&" could never be true.
+    if [ "$2" -lt 1 ] || [ "$2" -gt 8 ]
+    then
+        echo "error: DEVICE_NUM=$2 is not in (1-8)"
+    exit 1
+    fi
+    # Quoted so that a missing argument is reported instead of passing the test.
+    if [ ! -d "$5" ]
+    then
+        echo "error: DATASET_PATH=$5 is not a directory"
+    exit 1
+    fi
+    BASEPATH=$(cd "$(dirname "$0")" || exit; pwd)
+    # Resolve paths before changing into ../train so relative arguments stay valid.
+    DATASET_PATH=$(cd "$5" || exit; pwd)
+    CKPT_PATH="$6"
+    if [ -f "$6" ]
+    then
+        CKPT_PATH="$(cd "$(dirname "$6")" || exit; pwd)/$(basename "$6")"
+    fi
+    export PYTHONPATH=${BASEPATH}:$PYTHONPATH
+    if [ -d "../train" ];
+    then
+        rm -rf ../train
+    fi
+    mkdir ../train
+    cd ../train || exit
+    python "${BASEPATH}/../src/launch.py" \
+            --nproc_per_node="$2" \
+            --visible_devices="$4" \
+            --server_id="$3" \
+            --training_script="${BASEPATH}/../train.py" \
+            --dataset_path="${DATASET_PATH}" \
+            --pre_trained="${CKPT_PATH}" \
+            --device_target="$1" &> train.log &  # dataset train folder
+}
+# Launch distributed training on GPU through mpirun.
+# Args: $1 platform, $2 device num, $3 visible devices,
+#       $4 dataset path, $5 checkpoint path (optional).
+run_gpu()
+{
+    # Out of range means below 1 OR above 8; the former "&&" could never be true.
+    if [ "$2" -lt 1 ] || [ "$2" -gt 8 ]
+    then
+        echo "error: DEVICE_NUM=$2 is not in (1-8)"
+    exit 1
+    fi
+    if [ ! -d "$4" ]
+    then
+        echo "error: DATASET_PATH=$4 is not a directory"
+    exit 1
+    fi
+    BASEPATH=$(cd "$(dirname "$0")" || exit; pwd)
+    # Resolve paths before changing into ../train so relative arguments stay valid.
+    DATASET_PATH=$(cd "$4" || exit; pwd)
+    CKPT_PATH="$5"
+    if [ -f "$5" ]
+    then
+        CKPT_PATH="$(cd "$(dirname "$5")" || exit; pwd)/$(basename "$5")"
+    fi
+    export PYTHONPATH=${BASEPATH}:$PYTHONPATH
+    if [ -d "../train" ];
+    then
+        rm -rf ../train
+    fi
+    mkdir ../train
+    cd ../train || exit
+    export CUDA_VISIBLE_DEVICES="$3"
+    # NOTE(review): this passes --platform while run_ascend passes --device_target; confirm against train.py.
+    mpirun -n "$2" --allow-run-as-root \
+    python "${BASEPATH}/../train.py" \
+        --dataset_path="${DATASET_PATH}" \
+        --platform="$1" \
+        --pre_trained="${CKPT_PATH}" \
+        &> train.log &  # dataset train folder
+}
+# Validate the argument count, then dispatch on the platform name in $1.
+if [ $# -gt 6 ] || [ $# -lt 4 ]
+then
+    # One echo per line: "\n" inside echo is not interpreted consistently across shells.
+    echo "Usage:"
+    echo "  Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]"
+    echo "  GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]"
+exit 1
+fi
+if [ "$1" = "Ascend" ] ; then
+    run_ascend "$@"
+elif [ "$1" = "GPU" ] ; then
+    run_gpu "$@"
+else
+    echo "not support platform"
+    # Report failure to the caller instead of exiting with status 0.
+    exit 1
+fi;
--- a/example/resnet50_quant/src/config.py
+++ b/example/resnet50_quant/src/config.py
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+network config setting, will be used in train.py and eval.py
+"""
+from easydict import EasyDict as ed
+# Hyper-parameters shared by train.py and eval.py, exposed as attributes.
+config = ed(dict(
+    # dataset / input
+    class_num=1001,
+    batch_size=32,
+    buffer_size=1000,
+    image_height=224,
+    image_width=224,
+    # optimisation
+    loss_scale=1024,
+    momentum=0.9,
+    weight_decay=1e-4,
+    epoch_size=110,
+    pretrained_epoch_size=90,
+    # learning-rate schedule
+    warmup_epochs=0,
+    lr_decay_mode="cosine",
+    lr_init=0,
+    lr_max=0.1,
+    # label smoothing (eval.py zeroes the factor when use_label_smooth is False)
+    use_label_smooth=True,
+    label_smooth_factor=0.1,
+    # checkpointing
+    save_checkpoint=True,
+    save_checkpoint_epochs=1,
+    keep_checkpoint_max=50,
+    save_checkpoint_path="./",
+))
--- a/example/resnet50_quant/src/crossentropy.py
+++ b/example/resnet50_quant/src/crossentropy.py
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""define loss function for network"""
+from mindspore.nn.loss.loss import _Loss
+from mindspore.ops import operations as P
+from mindspore.ops import functional as F
+from mindspore import Tensor
+from mindspore.common import dtype as mstype
+import mindspore.nn as nn
+class CrossEntropy(_Loss):
+    """
+    Softmax cross entropy computed against smoothed one-hot labels.
+    Args:
+        smooth_factor (float): Amount taken from the target class and spread
+            evenly over the other classes. Default: 0.
+        num_classes (int): Number of classes. Default: 1001.
+    """
+    def __init__(self, smooth_factor=0, num_classes=1001):
+        super(CrossEntropy, self).__init__()
+        target_value = 1.0 - smooth_factor
+        other_value = 1.0 * smooth_factor / (num_classes - 1)
+        self.onehot = P.OneHot()
+        self.on_value = Tensor(target_value, mstype.float32)
+        self.off_value = Tensor(other_value, mstype.float32)
+        self.ce = nn.SoftmaxCrossEntropyWithLogits()
+        self.mean = P.ReduceMean(keep_dims=False)
+    def construct(self, logit, label):
+        # The one-hot depth is taken from dimension 1 of the logits.
+        depth = F.shape(logit)[1]
+        smoothed_label = self.onehot(label, depth, self.on_value, self.off_value)
+        raw_loss = self.ce(logit, smoothed_label)
+        return self.mean(raw_loss, 0)
--- a/example/resnet50_quant/src/dataset.py
+++ b/example/resnet50_quant/src/dataset.py
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""
+create train or eval dataset.
+"""
+import os
+import mindspore.common.dtype as mstype
+import mindspore.dataset.engine as de
+import mindspore.dataset.transforms.vision.c_transforms as C
+import mindspore.dataset.transforms.c_transforms as C2
+from mindspore.communication.management import init, get_rank, get_group_size
+def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"):
+    """
+    create a train or eval dataset
+    Args:
+        dataset_path(string): the path of dataset.
+        do_train(bool): whether dataset is used for train or eval.
+        repeat_num(int): the repeat times of dataset. Default: 1
+        batch_size(int): the batch size of dataset. Default: 32
+        target(str): the device target. Default: Ascend
+    Returns:
+        dataset
+    """
+    if target == "Ascend":
+        # Default to a single device (rank 0) when the variables are not exported,
+        # instead of failing on int(None).
+        device_num = int(os.getenv("RANK_SIZE", "1"))
+        rank_id = int(os.getenv("RANK_ID", "0"))
+    else:
+        init("nccl")
+        rank_id = get_rank()
+        device_num = get_group_size()
+    # Shard the dataset only when more than one device takes part.
+    shard_kwargs = {}
+    if device_num != 1:
+        shard_kwargs = {"num_shards": device_num, "shard_id": rank_id}
+    ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True, **shard_kwargs)
+    image_size = 224
+    # Per-channel statistics scaled by 255, i.e. applied before any rescaling of pixel values.
+    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
+    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]
+    # define map operations
+    if do_train:
+        trans = [
+            C.RandomCropDecodeResize(image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
+            C.RandomHorizontalFlip(prob=0.5),
+            C.Normalize(mean=mean, std=std),
+            C.HWC2CHW()
+        ]
+    else:
+        trans = [
+            C.Decode(),
+            C.Resize((256, 256)),
+            C.CenterCrop(image_size),
+            C.Normalize(mean=mean, std=std),
+            C.HWC2CHW()
+        ]
+    type_cast_op = C2.TypeCast(mstype.int32)
+    ds = ds.map(input_columns="image", num_parallel_workers=8, operations=trans)
+    ds = ds.map(input_columns="label", num_parallel_workers=8, operations=type_cast_op)
+    # apply batch operations
+    ds = ds.batch(batch_size, drop_remainder=True)
+    # apply dataset repeat operation
+    ds = ds.repeat(repeat_num)
+    return ds
--- a/example/resnet50_quant/src/launch.py
+++ b/example/resnet50_quant/src/launch.py
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""launch train script"""
+import os
+import sys
+import json
+import subprocess
+import shutil
+import platform
+from argparse import ArgumentParser
+def parse_args():
+    """
+    Parse the launcher's own options from the command line.
+    Options the launcher does not recognise are collected unchanged in
+    `training_script_args`.
+    Args:
+    Returns:
+        args.
+    Examples:
+        >>> parse_args()
+    """
+    description = ("mindspore distributed training launch "
+                   "helper utilty that will spawn up "
+                   "multiple distributed processes")
+    nproc_help = ("The number of processes to launch on each node, "
+                  "for D training, this is recommended to be set "
+                  "to the number of D in your system so that "
+                  "each process can be bound to a single D.")
+    script_help = ("The full path to the single D training "
+                   "program/script to be launched in parallel, "
+                   "followed by all the arguments for the "
+                   "training script")
+    cli = ArgumentParser(description=description)
+    cli.add_argument("--nproc_per_node", type=int, default=1, help=nproc_help)
+    cli.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7",
+                     help="will use the visible devices sequentially")
+    cli.add_argument("--server_id", type=str, default="", help="server ip")
+    cli.add_argument("--training_script", type=str, help=script_help)
+    # rest from the training program
+    known, passthrough = cli.parse_known_args()
+    known.training_script_args = passthrough
+    return known
+def main():
+    print("start", __file__)
+    args = parse_args()
+    print(args)
+    visible_devices = args.visible_devices.split(',')
+    assert os.path.isfile(args.training_script)
+    assert len(visible_devices) >= args.nproc_per_node
+    print('visible_devices:{}'.format(visible_devices))
+    if not args.server_id:
+        print('pleaser input server ip!!!')
+        exit(0)
+    print('server_id:{}'.format(args.server_id))
+    # construct hccn_table
+    hccn_configs = open('/etc/hccn.conf', 'r').readlines()
+    device_ips = {}
+    for hccn_item in hccn_configs:
+        hccn_item = hccn_item.strip()
+        if hccn_item.startswith('address_'):
+            device_id, device_ip = hccn_item.split('=')
+            device_id = device_id.split('_')[1]
+            device_ips[device_id] = device_ip
+            print('device_id:{}, device_ip:{}'.format(device_id, device_ip))
+    hccn_table = {}
+    arch = platform.processor()
+    hccn_table['board_id'] = {'aarch64': '0x002f', 'x86_64': '0x0000'}[arch]
+    hccn_table['chip_info'] = '910'
+    hccn_table['deploy_mode'] = 'lab'
+    hccn_table['group_count'] = '1'
+    hccn_table['group_list'] = []
+    instance_list = []
+    usable_dev = ''
+    for instance_id in range(args.nproc_per_node):
+        instance = {}
+        instance['devices'] = []
+        device_id = visible_devices[instance_id]
+        device_ip = device_ips[device_id]
+        usable_dev += str(device_id)
+        instance['devices'].append({
+            'device_id': device_id,
+            'device_ip': device_ip,
+        })
+        instance['rank_id'] = str(instance_id)
+        instance['server_id'] = args.server_id
+        instance_list.append(instance)
+    hccn_table['group_list'].append({
+        'device_num': str(args.nproc_per_node),
+        'server_num': '1',
+        'group_name': '',
+        'instance_count': str(args.nproc_per_node),
+        'instance_list': instance_list,
+    })
+    hccn_table['para_plane_nic_location'] = 'device'
+    hccn_table['para_plane_nic_name'] = []
+    for instance_id in range(args.nproc_per_node):
+        eth_id = visible_devices[instance_id]
+        hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id))
+    hccn_table['para_plane_nic_num'] = str(args.nproc_per_node)
+    hccn_table['status'] = 'completed'
+    # save hccn_table to file
+    table_path = os.getcwd()
+    if not os.path.exists(table_path):
+        os.mkdir(table_path)
+    table_fn = os.path.join(table_path,
+                            'rank_table_{}p_{}_{}.json'.format(args.nproc_per_node, usable_dev, args.server_id))
+    with open(table_fn, 'w') as table_fp:
+        json.dump(hccn_table, table_fp, indent=4)
+    sys.stdout.flush()
+    # spawn the processes
+    processes = []
+    cmds = []
+    log_files = []
+    env = os.environ.copy()
+    env['RANK_SIZE'] = str(args.nproc_per_node)
+    cur_path = os.getcwd()
+    for rank_id in range(0, args.nproc_per_node):
+        os.chdir(cur_path)
+        device_id = visible_devices[rank_id]
+        device_dir = os.path.join(cur_path, 'device{}'.format(rank_id))
+        env['RANK_ID'] = str(rank_id)
+        env['DEVICE_ID'] = str(device_id)
+        if args.nproc_per_node > 1:
+            env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
+            env['RANK_TABLE_FILE'] = table_fn
+        if os.path.exists(device_dir):
+            shutil.rmtree(device_dir)
+        os.mkdir(device_dir)
+        os.chdir(device_dir)
+        cmd = [sys.executable, '-u']
+        cmd.append(args.training_script)
+        cmd.extend(args.training_script_args)
+        log_file = open('{dir}/log{id}.log'.format(dir=device_dir, id=rank_id), 'w')
+        process = subprocess.Popen(cmd, stdout=log_file, stderr=log_file, env=env)
+        processes.append(process)
+        cmds.append(cmd)
+        log_files.append(log_file)
+    for process, cmd, log_file in zip(processes, cmds, log_files):
+        process.wait()
+        if process.returncode != 0:
+            raise subprocess.CalledProcessError(returncode=process, cmd=cmd)
+        log_file.close()
+if __name__ == "__main__":
+    main()
--- a/example/resnet50_quant/src/lr_generator.py
+++ b/example/resnet50_quant/src/lr_generator.py
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""learning rate generator"""
+import math
+import numpy as np
+def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode):
+    """
+    generate learning rate array
+    Args:
+       lr_init(float): init learning rate
+       lr_end(float): end learning rate
+       lr_max(float): max learning rate
+       warmup_epochs(int): number of warmup epochs
+       total_epochs(int): total epoch of training
+       steps_per_epoch(int): steps of one epoch
+       lr_decay_mode(string): learning rate decay mode, including steps, poly, cosine or default
+    Returns:
+       np.array, learning rate array
+    """
+    lr_each_step = []
+    total_steps = steps_per_epoch * total_epochs
+    warmup_steps = steps_per_epoch * warmup_epochs
+    if lr_decay_mode == 'steps':
+        decay_epoch_index = [0.3 * total_steps, 0.6 * total_steps, 0.8 * total_steps]
+        for i in range(total_steps):
+            if i < decay_epoch_index[0]:
+                lr = lr_max
+            elif i < decay_epoch_index[1]:
+                lr = lr_max * 0.1
+            elif i < decay_epoch_index[2]:
+                lr = lr_max * 0.01
+            else:
+                lr = lr_max * 0.001
+            lr_each_step.append(lr)
+    elif lr_decay_mode == 'poly':
+        if warmup_steps != 0:
+            inc_each_step = (float(lr_max) - float(lr_init)) / float(warmup_steps)
+        else:
+            inc_each_step = 0
+        for i in range(total_steps):
+            if i < warmup_steps:
+                lr = float(lr_init) + inc_each_step * float(i)
+            else:
+                base = (1.0 - (float(i) - float(warmup_steps)) / (float(total_steps) - float(warmup_steps)))
+                lr = float(lr_max) * base * base
+                if lr < 0.0:
+                    lr = 0.0
+            lr_each_step.append(lr)
+    elif lr_decay_mode == 'cosine':
+        decay_steps = total_steps - warmup_steps
+        for i in range(total_steps):
+            if i < warmup_steps:
+                lr_inc = (float(lr_max) - float(lr_init)) / float(warmup_steps)
+                lr = float(lr_init) + lr_inc * (i + 1)
+            else:
+                linear_decay = (total_steps - i) / decay_steps
+                cosine_decay = 0.5 * (1 + math.cos(math.pi * 2 * 0.47 * i / decay_steps))
+                decayed = linear_decay * cosine_decay + 0.00001
+                lr = lr_max * decayed
+            lr_each_step.append(lr)
+    else:
+        for i in range(total_steps):
+            if i < warmup_steps:
+                lr = lr_init + (lr_max - lr_init) * i / warmup_steps
+            else:
+                lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / (total_steps - warmup_steps)
+            lr_each_step.append(lr)
+    learning_rate = np.array(lr_each_step).astype(np.float32)
+    return learning_rate
--- a/example/resnet50_quant/src/utils.py
+++ b/example/resnet50_quant/src/utils.py
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""utils script"""
+from mindspore.train.serialization import load_param_into_net
+def _load_param_into_net(model, params_dict):
+    """
+    load fp32 model parameters to quantization model.
+    Args:
+        model: quantization model
+        params_dict: f32 param
+    Returns:
+        None
+    """
+    model_param = list(model.parameters_and_names())
+    filter_keys = ['global_step', 'learning_rate', 'momentum', 'moments']
+    filt_param_dict = list(filter(lambda x: x.split('.')[0] not in filter_keys, params_dict))
+    if len(model_param) == len(filt_param_dict):
+        load_param_into_net(model, params_dict)
+        return
+    iterable_dict = {
+        'weight': iter([item for item in params_dict.items() if item[0].endswith('weight')]),
+        'bias': iter([item for item in params_dict.items() if item[0].endswith('bias')]),
+        'gamma': iter([item for item in params_dict.items() if item[0].endswith('gamma')]),
+        'beta': iter([item for item in params_dict.items() if item[0].endswith('beta')]),
+        'moving_mean': iter([item for item in params_dict.items() if item[0].endswith('moving_mean')]),
+        'moving_variance': iter(
+            [item for item in params_dict.items() if item[0].endswith('moving_variance')]),
+        'minq': iter([item for item in params_dict.items() if item[0].endswith('minq')]),
+        'maxq': iter([item for item in params_dict.items() if item[0].endswith('maxq')])
+    }
+    for name, param in model.parameters_and_names():
+        key_name = name.split(".")[-1]
+        if key_name not in iterable_dict.keys():
+            continue
+        value_param = next(iterable_dict[key_name], None)
+        if value_param is not None:
+            param.set_parameter_data(value_param[1].data)
+            print(f'init model param {name} with checkpoint param {value_param[0]}')
--- a/example/resnet50_quant/train.py
+++ b/example/resnet50_quant/train.py
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""train_imagenet."""
+import os
+import argparse
+from mindspore import context
+from mindspore import Tensor
+from mindspore.parallel._auto_parallel_context import auto_parallel_context
+from mindspore.nn.optim.momentum import Momentum
+from mindspore.train.model import Model, ParallelMode
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
+from mindspore.train.loss_scale_manager import FixedLossScaleManager
+from mindspore.train.serialization import load_checkpoint
+from mindspore.communication.management import init, get_rank, get_group_size
+import mindspore.nn as nn
+import mindspore.common.initializer as weight_init
+from models.resnet_quant import resnet50_quant
+from src.dataset import create_dataset
+from src.lr_generator import get_lr
+from src.config import config
+from src.crossentropy import CrossEntropy
+from src.utils import _load_param_into_net
+parser = argparse.ArgumentParser(description='Image classification')
+parser.add_argument('--run_distribute', type=bool, default=False, help='Run distribute')
+parser.add_argument('--device_num', type=int, default=1, help='Device num.')
+parser.add_argument('--do_train', type=bool, default=True, help='Do train or not.')
+parser.add_argument('--do_eval', type=bool, default=False, help='Do eval or not.')
+parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
+parser.add_argument('--device_target', type=str, default='Ascend', help='Device target')
+parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path')
+args_opt = parser.parse_args()
+# Script body: configure the execution context, build the quantization network,
+# initialise its weights and, when --do_train is set, run the training loop.
+if __name__ == '__main__':
+    target = args_opt.device_target
+    ckpt_save_dir = config.save_checkpoint_path
+    context.set_context(mode=context.GRAPH_MODE, device_target=target, save_graphs=False)
+    # Distributed setup is only done for training runs started with --run_distribute.
+    if not args_opt.do_eval and args_opt.run_distribute:
+        if target == "Ascend":
+            # DEVICE_ID must be present in the environment; int(None) raises otherwise.
+            device_id = int(os.getenv('DEVICE_ID'))
+            context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=device_id,
+                                enable_auto_mixed_precision=True)
+            init()
+            context.set_auto_parallel_context(device_num=args_opt.device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
+                                              mirror_mean=True)
+            auto_parallel_context().set_all_reduce_fusion_split_indices([107, 160])
+            ckpt_save_dir = config.save_checkpoint_path
+        elif target == "GPU":
+            context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False)
+            init("nccl")
+            context.set_auto_parallel_context(device_num=get_group_size(), parallel_mode=ParallelMode.DATA_PARALLEL,
+                                              mirror_mean=True)
+            # On GPU every rank writes its checkpoints to its own "ckpt_<rank>/" directory.
+            ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"
+    epoch_size = config.epoch_size
+    net = resnet50_quant(class_num=config.class_num)
+    net.set_train(True)
+    print("========resnet50:\r\n{}".format(net))
+    # Weight init: either load the given checkpoint into the network, or
+    # re-initialise the weights of every Conv2d and Dense cell.
+    if args_opt.pre_trained:
+        param_dict = load_checkpoint(args_opt.pre_trained)
+        _load_param_into_net(net, param_dict)
+        # Only the epochs that remain after the pretrained ones are trained.
+        epoch_size = config.epoch_size - config.pretrained_epoch_size
+    else:
+        for _, cell in net.cells_and_names():
+            if isinstance(cell, nn.Conv2d):
+                cell.weight.default_input = weight_init.initializer(weight_init.XavierUniform(),
+                                                                    cell.weight.default_input.shape(),
+                                                                    cell.weight.default_input.dtype()).to_tensor()
+            if isinstance(cell, nn.Dense):
+                cell.weight.default_input = weight_init.initializer(weight_init.TruncatedNormal(),
+                                                                    cell.weight.default_input.shape(),
+                                                                    cell.weight.default_input.dtype()).to_tensor()
+    # A smoothing factor of 0.0 is passed to the loss when label smoothing is switched off.
+    if not config.use_label_smooth:
+        config.label_smooth_factor = 0.0
+    loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.class_num)
+    if args_opt.do_train:
+        dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True,
+                                 repeat_num=epoch_size, batch_size=config.batch_size, target=target)
+        step_size = dataset.get_dataset_size()
+        loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
+        # The schedule is generated for all config.epoch_size epochs ...
+        lr = get_lr(lr_init=config.lr_init, lr_end=0.0, lr_max=config.lr_max, warmup_epochs=config.warmup_epochs,
+                    total_epochs=config.epoch_size, steps_per_epoch=step_size, lr_decay_mode='cosine')
+        # ... and the steps of the pretrained epochs are cut off when a checkpoint was loaded.
+        if args_opt.pre_trained:
+            lr = lr[config.pretrained_epoch_size * step_size:]
+        lr = Tensor(lr)
+        # Only parameters with requires_grad set are handed to the optimizer.
+        opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
+                       config.weight_decay, config.loss_scale)
+        model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'})
+        time_cb = TimeMonitor(data_size=step_size)
+        loss_cb = LossMonitor()
+        cb = [time_cb, loss_cb]
+        if config.save_checkpoint:
+            # Checkpoint interval in steps: save_checkpoint_epochs epochs of step_size steps each.
+            config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs*step_size,
+                                         keep_checkpoint_max=config.keep_checkpoint_max)
+            ckpt_cb = ModelCheckpoint(prefix="resnet", directory=ckpt_save_dir, config=config_ck)
+            cb += [ckpt_cb]
+        model.train(epoch_size, dataset, callbacks=cb)