Commit 26b1242b authored by chenzomi

MobileNetV2 changes for GPU

Parent f80e5796
......@@ -960,7 +960,7 @@ class ActQuant(_QuantActivation):
Tensor, with the same type and shape as the `x`.
Examples:
>>> act_quant = nn.ActQuant(nn.ReLU)
>>> act_quant = nn.ActQuant(nn.ReLU())
>>> input_x = Tensor(np.array([[1, 2, -1], [-2, 0, -1]]), mindspore.float32)
>>> result = act_quant(input_x)
"""
......@@ -1009,7 +1009,7 @@ class LeakyReLUQuant(_QuantActivation):
quant_delay (int): Quantization delay parameters according by global step. Default: 0.
Inputs:
- **x** (Tensor) - The input of HSwishQuant.
- **x** (Tensor) - The input of LeakyReLUQuant.
Outputs:
Tensor, with the same type and shape as the `x`.
......
......@@ -306,7 +306,7 @@ class ExportToQuantInferNetwork:
std_dev (int, float): Input data variance. Default: 127.5.
Returns:
Cell, GEIR backend Infer network.
Cell, the inference network.
"""
__quant_op_name__ = ["TensorAdd", "Sub", "Mul", "RealDiv"]
......
......@@ -91,6 +91,6 @@ if [ $1 = "Ascend" ] ; then
elif [ $1 = "GPU" ] ; then
run_gpu "$@"
else
echo "not support platform"
echo "Unsupported platform."
fi;
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# """MobileNetV2 Quant model define"""
import numpy as np
import mindspore.nn as nn
from mindspore.ops import operations as P
from mindspore import Tensor
__all__ = ['mobilenetV2']
def _make_divisible(v, divisor, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
# Make sure that rounding down does not reduce the channel count by more than 10%.
if new_v < 0.9 * v:
new_v += divisor
return new_v
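# Worked example of the rounding rule above (illustrative values only): with divisor=8,
# _make_divisible(18, 8) first rounds 18 to the nearest multiple of 8, giving 16; since
# 16 < 0.9 * 18, one divisor step is added back and the result is 24. By contrast,
# _make_divisible(32, 8) is already a multiple of 8 and stays at 32.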
class GlobalAvgPooling(nn.Cell):
"""
Global avg pooling definition.
Args:
Returns:
Tensor, output tensor.
Examples:
>>> GlobalAvgPooling()
"""
def __init__(self):
super(GlobalAvgPooling, self).__init__()
self.mean = P.ReduceMean(keep_dims=False)
def construct(self, x):
x = self.mean(x, (2, 3))
return x
class ConvBNReLU(nn.Cell):
"""
Convolution (standard or depthwise) fused with BatchNorm and ReLU block definition.
Args:
in_planes (int): Input channel.
out_planes (int): Output channel.
kernel_size (int): Input kernel size.
stride (int): Stride size for the first convolutional layer. Default: 1.
groups (int): Number of channel groups. Use 1 for a standard convolution and the input channel count for a depthwise convolution. Default: 1.
Returns:
Tensor, output tensor.
Examples:
>>> ConvBNReLU(16, 256, kernel_size=1, stride=1, groups=1)
"""
def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
super(ConvBNReLU, self).__init__()
padding = (kernel_size - 1) // 2
self.conv = nn.Conv2dBnAct(in_planes, out_planes, kernel_size,
stride=stride,
pad_mode='pad',
padding=padding,
group=groups,
has_bn=True,
activation='relu')
def construct(self, x):
x = self.conv(x)
return x
class InvertedResidual(nn.Cell):
"""
MobileNetV2 inverted residual block definition.
Args:
inp (int): Input channel.
oup (int): Output channel.
stride (int): Stride size for the first convolutional layer. Default: 1.
expand_ratio (int): Expansion ratio of the input channel.
Returns:
Tensor, output tensor.
Examples:
>>> InvertedResidual(3, 256, 1, 1)
"""
def __init__(self, inp, oup, stride, expand_ratio):
super(InvertedResidual, self).__init__()
assert stride in [1, 2]
hidden_dim = int(round(inp * expand_ratio))
self.use_res_connect = stride == 1 and inp == oup
layers = []
if expand_ratio != 1:
layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
layers.extend([
ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim),
nn.Conv2dBnAct(hidden_dim, oup, kernel_size=1, stride=1, pad_mode='pad', padding=0, group=1, has_bn=True)
])
self.conv = nn.SequentialCell(layers)
self.add = P.TensorAdd()
def construct(self, x):
out = self.conv(x)
if self.use_res_connect:
out = self.add(out, x)
return out
class mobilenetV2(nn.Cell):
"""
mobilenetV2 fusion architecture.
Args:
num_classes (int): Number of classes. Default: 1000.
width_mult (float): Channel width multiplier; channels are rounded to a multiple of `round_nearest`. Default: 1.0.
has_dropout (bool): Whether dropout is used. Default: False.
inverted_residual_setting (list): Inverted residual settings. Default: None.
round_nearest (int): Round the channel count to the nearest multiple of this value. Default: 8.
Returns:
Tensor, output tensor.
Examples:
>>> mobilenetV2(num_classes=1000)
"""
def __init__(self, num_classes=1000, width_mult=1.,
has_dropout=False, inverted_residual_setting=None, round_nearest=8):
super(mobilenetV2, self).__init__()
block = InvertedResidual
input_channel = 32
last_channel = 1280
# setting of inverted residual blocks
self.cfgs = inverted_residual_setting
if inverted_residual_setting is None:
self.cfgs = [
# t, c, n, s
[1, 16, 1, 1],
[6, 24, 2, 2],
[6, 32, 3, 2],
[6, 64, 4, 2],
[6, 96, 3, 1],
[6, 160, 3, 2],
[6, 320, 1, 1],
]
# building first layer
input_channel = _make_divisible(input_channel * width_mult, round_nearest)
self.out_channels = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
features = [ConvBNReLU(3, input_channel, stride=2)]
# building inverted residual blocks
for t, c, n, s in self.cfgs:
output_channel = _make_divisible(c * width_mult, round_nearest)
for i in range(n):
stride = s if i == 0 else 1
features.append(block(input_channel, output_channel, stride, expand_ratio=t))
input_channel = output_channel
# building last several layers
features.append(ConvBNReLU(input_channel, self.out_channels, kernel_size=1))
# wrap the feature layers in nn.SequentialCell
self.features = nn.SequentialCell(features)
# mobilenet head
head = ([GlobalAvgPooling(),
nn.DenseBnAct(self.out_channels, num_classes, has_bias=True, has_bn=False)
] if not has_dropout else
[GlobalAvgPooling(),
nn.Dropout(0.2),
nn.DenseBnAct(self.out_channels, num_classes, has_bias=True, has_bn=False)
])
self.head = nn.SequentialCell(head)
# init weights
self._initialize_weights()
def construct(self, x):
x = self.features(x)
x = self.head(x)
return x
def _initialize_weights(self):
"""
Initialize weights.
Args:
Returns:
None.
Examples:
>>> _initialize_weights()
"""
for _, m in self.cells_and_names():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
w = Tensor(np.random.normal(0, np.sqrt(2. / n), m.weight.data.shape).astype("float32"))
m.weight.set_parameter_data(w)
if m.bias is not None:
m.bias.set_parameter_data(Tensor(np.zeros(m.bias.data.shape, dtype="float32")))
elif isinstance(m, nn.Conv2dBnAct):
n = m.conv.kernel_size[0] * m.conv.kernel_size[1] * m.conv.out_channels
w = Tensor(np.random.normal(0, np.sqrt(2. / n), m.conv.weight.data.shape).astype("float32"))
m.conv.weight.set_parameter_data(w)
if m.conv.bias is not None:
m.conv.bias.set_parameter_data(Tensor(np.zeros(m.conv.bias.data.shape, dtype="float32")))
elif isinstance(m, nn.BatchNorm2d):
m.gamma.set_parameter_data(Tensor(np.ones(m.gamma.data.shape, dtype="float32")))
m.beta.set_parameter_data(Tensor(np.zeros(m.beta.data.shape, dtype="float32")))
elif isinstance(m, nn.Dense):
m.weight.set_parameter_data(Tensor(np.random.normal(0, 0.01, m.weight.data.shape).astype("float32")))
if m.bias is not None:
m.bias.set_parameter_data(Tensor(np.zeros(m.bias.data.shape, dtype="float32")))
elif isinstance(m, nn.DenseBnAct):
m.dense.weight.set_parameter_data(
Tensor(np.random.normal(0, 0.01, m.dense.weight.data.shape).astype("float32")))
if m.dense.bias is not None:
m.dense.bias.set_parameter_data(Tensor(np.zeros(m.dense.bias.data.shape, dtype="float32")))
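For reference, a minimal usage sketch of the fusion network defined above follows; the device target, input shape, and random dummy batch are illustrative assumptions and not part of this commit.

``` python
# Minimal sketch (assumptions: GPU target, 224x224 RGB input, batch size 1).
import numpy as np
from mindspore import Tensor, context

context.set_context(mode=context.GRAPH_MODE, device_target="GPU")

net = mobilenetV2(num_classes=1000)                   # fusion network defined above
dummy = Tensor(np.random.randn(1, 3, 224, 224).astype(np.float32))
logits = net(dummy)                                   # expected shape: (1, 1000)
print(logits.shape)
```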
......@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""train_imagenet."""
"""Train mobilenetV2 on ImageNet."""
import os
import time
import argparse
......@@ -165,15 +166,14 @@ if __name__ == '__main__':
print("train args: ", args_opt)
print("cfg: ", config_gpu)
# define net
# define network
net = mobilenet_v2(num_classes=config_gpu.num_classes, platform="GPU")
# define loss
if config_gpu.label_smooth > 0:
loss = CrossEntropyWithLabelSmooth(
smooth_factor=config_gpu.label_smooth, num_classes=config_gpu.num_classes)
loss = CrossEntropyWithLabelSmooth(smooth_factor=config_gpu.label_smooth,
num_classes=config_gpu.num_classes)
else:
loss = SoftmaxCrossEntropyWithLogits(
is_grad=False, sparse=True, reduction='mean')
loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')
# define dataset
epoch_size = config_gpu.epoch_size
dataset = create_dataset(dataset_path=args_opt.dataset_path,
......@@ -187,7 +187,8 @@ if __name__ == '__main__':
if args_opt.pre_trained:
param_dict = load_checkpoint(args_opt.pre_trained)
load_param_into_net(net, param_dict)
# define optimizer
# get learning rate
loss_scale = FixedLossScaleManager(
config_gpu.loss_scale, drop_overflow_update=False)
lr = Tensor(get_lr(global_step=0,
......@@ -197,12 +198,14 @@ if __name__ == '__main__':
warmup_epochs=config_gpu.warmup_epochs,
total_epochs=epoch_size,
steps_per_epoch=step_size))
# define optimizer
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config_gpu.momentum,
config_gpu.weight_decay, config_gpu.loss_scale)
# define model
model = Model(net, loss_fn=loss, optimizer=opt,
loss_scale_manager=loss_scale)
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale)
print("============== Starting Training ==============")
cb = [Monitor(lr_init=lr.asnumpy())]
ckpt_save_dir = config_gpu.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"
if config_gpu.save_checkpoint:
......@@ -212,6 +215,7 @@ if __name__ == '__main__':
cb += [ckpt_cb]
# begin train
model.train(epoch_size, dataset, callbacks=cb)
print("============== End Training ==============")
elif args_opt.platform == "Ascend":
# train on ascend
print("train args: ", args_opt, "\ncfg: ", config_ascend,
......
......@@ -64,12 +64,14 @@ Dataset use: ImageNet
Train a MindSpore fusion MobileNetV2 model on ImageNet as follows:
- sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]
- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]
- GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]
Alternatively, you can run the commands below directly.
``` bash
>>> sh run_train.sh Ascend 4 192.168.0.1 0,1,2,3 ~/imagenet/train/ ~/mobilenet.ckpt
>>> Ascend: sh run_train.sh Ascend 4 192.168.0.1 0,1,2,3 ~/imagenet/train/ ~/mobilenet.ckpt
>>> GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 ~/imagenet/train/
```
Training results will be stored in the example path. Checkpoints will be stored at `./checkpoint` by default, and the training log will be redirected to `./train/train.log`, as shown below.
......
......@@ -46,16 +46,50 @@ run_ascend()
--device_target=$1 &> train.log & # dataset train folder
}
run_gpu()
{
if [ $2 -lt 1 ] || [ $2 -gt 8 ]
then
echo "error: DEVICE_NUM=$2 is not in (1-8)"
exit 1
fi
if [ ! -d $4 ]
then
echo "error: DATASET_PATH=$4 is not a directory"
exit 1
fi
BASEPATH=$(cd "`dirname $0`" || exit; pwd)
export PYTHONPATH=${BASEPATH}:$PYTHONPATH
if [ -d "../train" ];
then
rm -rf ../train
fi
mkdir ../train
cd ../train || exit
export CUDA_VISIBLE_DEVICES="$3"
mpirun -n $2 --allow-run-as-root \
python ${BASEPATH}/../train.py \
--dataset_path=$4 \
--device_target=$1 \
&> ../train.log & # dataset train folder
}
if [ $# -gt 6 ] || [ $# -lt 4 ]
then
echo "Usage:\n \
Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \
GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \
"
exit 1
fi
if [ $1 = "Ascend" ] ; then
run_ascend "$@"
elif [ $1 = "GPU" ] ; then
run_gpu "$@"
else
echo "Unsupported device target."
fi;
......
......@@ -47,16 +47,51 @@ run_ascend()
--device_target=$1 &> train.log & # dataset train folder
}
run_gpu()
{
if [ $2 -lt 1 ] || [ $2 -gt 8 ]
then
echo "error: DEVICE_NUM=$2 is not in (1-8)"
exit 1
fi
if [ ! -d $4 ]
then
echo "error: DATASET_PATH=$4 is not a directory"
exit 1
fi
BASEPATH=$(cd "`dirname $0`" || exit; pwd)
export PYTHONPATH=${BASEPATH}:$PYTHONPATH
if [ -d "../train" ];
then
rm -rf ../train
fi
mkdir ../train
cd ../train || exit
export CUDA_VISIBLE_DEVICES="$3"
mpirun -n $2 --allow-run-as-root \
python ${BASEPATH}/../train.py \
--dataset_path=$4 \
--device_target=$1 \
--quantization_aware=True \
&> ../train.log & # dataset train folder
}
if [ $# -gt 6 ] || [ $# -lt 4 ]
then
echo "Usage:\n \
Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \
GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \
"
exit 1
fi
if [ $1 = "Ascend" ] ; then
run_ascend "$@"
elif [ $1 = "GPU" ] ; then
run_gpu "$@"
else
echo "Unsupported device target."
fi;
......
......@@ -33,7 +33,7 @@ config_ascend = ed({
"loss_scale": 1024,
"save_checkpoint": True,
"save_checkpoint_epochs": 1,
"keep_checkpoint_max": 200,
"keep_checkpoint_max": 300,
"save_checkpoint_path": "./checkpoint",
"quantization_aware": False,
})
......@@ -54,7 +54,45 @@ config_ascend_quant = ed({
"loss_scale": 1024,
"save_checkpoint": True,
"save_checkpoint_epochs": 1,
"keep_checkpoint_max": 200,
"keep_checkpoint_max": 300,
"save_checkpoint_path": "./checkpoint",
"quantization_aware": True,
})
config_gpu = ed({
"num_classes": 1000,
"image_height": 224,
"image_width": 224,
"batch_size": 150,
"epoch_size": 200,
"warmup_epochs": 4,
"lr": 0.8,
"momentum": 0.9,
"weight_decay": 4e-5,
"label_smooth": 0.1,
"loss_scale": 1024,
"save_checkpoint": True,
"save_checkpoint_epochs": 1,
"keep_checkpoint_max": 300,
"save_checkpoint_path": "./checkpoint",
})
config_gpu_quant = ed({
"num_classes": 1000,
"image_height": 224,
"image_width": 224,
"batch_size": 134,
"epoch_size": 60,
"start_epoch": 200,
"warmup_epochs": 1,
"lr": 0.3,
"momentum": 0.9,
"weight_decay": 4e-5,
"label_smooth": 0.1,
"loss_scale": 1024,
"save_checkpoint": True,
"save_checkpoint_epochs": 1,
"keep_checkpoint_max": 300,
"save_checkpoint_path": "./checkpoint",
"quantization_aware": True,
})
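The `ed({...})` blocks above are attribute-style dictionaries (the `ed` alias is imported outside this hunk, presumably `from easydict import EasyDict as ed`), so fields are read as attributes. A minimal sketch of how the GPU configs above might be selected and read, mirroring the selection rule used later in train.py:

``` python
# Minimal sketch (assumes src.config exposes the config objects defined above).
from src.config import config_gpu, config_gpu_quant

def pick_gpu_config(quantization_aware: bool):
    # Same rule as train.py: use the quant config when QAT is enabled.
    return config_gpu_quant if quantization_aware else config_gpu

cfg = pick_gpu_config(quantization_aware=True)
print(cfg.batch_size, cfg.epoch_size, cfg.lr)   # attribute access via EasyDict
```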
......@@ -222,6 +222,12 @@ class mobilenetV2(nn.Cell):
m.weight.set_parameter_data(w)
if m.bias is not None:
m.bias.set_parameter_data(Tensor(np.zeros(m.bias.data.shape, dtype="float32")))
elif isinstance(m, nn.Conv2dBnAct):
n = m.conv.kernel_size[0] * m.conv.kernel_size[1] * m.conv.out_channels
w = Tensor(np.random.normal(0, np.sqrt(2. / n), m.conv.weight.data.shape).astype("float32"))
m.conv.weight.set_parameter_data(w)
if m.conv.bias is not None:
m.conv.bias.set_parameter_data(Tensor(np.zeros(m.conv.bias.data.shape, dtype="float32")))
elif isinstance(m, nn.BatchNorm2d):
m.gamma.set_parameter_data(Tensor(np.ones(m.gamma.data.shape, dtype="float32")))
m.beta.set_parameter_data(Tensor(np.zeros(m.beta.data.shape, dtype="float32")))
......@@ -229,3 +235,8 @@ class mobilenetV2(nn.Cell):
m.weight.set_parameter_data(Tensor(np.random.normal(0, 0.01, m.weight.data.shape).astype("float32")))
if m.bias is not None:
m.bias.set_parameter_data(Tensor(np.zeros(m.bias.data.shape, dtype="float32")))
elif isinstance(m, nn.DenseBnAct):
m.dense.weight.set_parameter_data(
Tensor(np.random.normal(0, 0.01, m.dense.weight.data.shape).astype("float32")))
if m.dense.bias is not None:
m.dense.bias.set_parameter_data(Tensor(np.zeros(m.dense.bias.data.shape, dtype="float32")))
......@@ -23,16 +23,17 @@ from mindspore import context
from mindspore import Tensor
from mindspore import nn
from mindspore.train.model import Model, ParallelMode
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.communication.management import init
from mindspore.communication.management import init, get_group_size, get_rank
from mindspore.train.quant import quant
import mindspore.dataset.engine as de
from src.dataset import create_dataset
from src.lr_generator import get_lr
from src.utils import Monitor, CrossEntropyWithLabelSmooth
from src.config import config_ascend, config_ascend_quant
from src.config import config_ascend_quant, config_ascend, config_gpu_quant, config_gpu
from src.mobilenetV2 import mobilenetV2
random.seed(1)
......@@ -55,11 +56,19 @@ if args_opt.device_target == "Ascend":
context.set_context(mode=context.GRAPH_MODE,
device_target="Ascend",
device_id=device_id, save_graphs=False)
elif args_opt.device_target == "GPU":
init("nccl")
context.set_auto_parallel_context(device_num=get_group_size(),
parallel_mode=ParallelMode.DATA_PARALLEL,
mirror_mean=True)
context.set_context(mode=context.GRAPH_MODE,
device_target="GPU",
save_graphs=False)
else:
raise ValueError("Unsupported device target.")
if __name__ == '__main__':
# train on ascend
def train_on_ascend():
config = config_ascend_quant if args_opt.quantization_aware else config_ascend
print("training args: {}".format(args_opt))
print("training configure: {}".format(config))
......@@ -129,3 +138,72 @@ if __name__ == '__main__':
callback += [ckpt_cb]
model.train(epoch_size, dataset, callbacks=callback)
print("============== End Training ==============")
def train_on_gpu():
config = config_gpu_quant if args_opt.quantization_aware else config_gpu
print("training args: {}".format(args_opt))
print("training configure: {}".format(config))
# define network
network = mobilenetV2(num_classes=config.num_classes)
# define loss
if config.label_smooth > 0:
loss = CrossEntropyWithLabelSmooth(smooth_factor=config.label_smooth,
num_classes=config.num_classes)
else:
loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')
# define dataset
epoch_size = config.epoch_size
dataset = create_dataset(dataset_path=args_opt.dataset_path,
do_train=True,
config=config,
device_target=args_opt.device_target,
repeat_num=1,
batch_size=config.batch_size)
step_size = dataset.get_dataset_size()
# resume
if args_opt.pre_trained:
param_dict = load_checkpoint(args_opt.pre_trained)
load_param_into_net(network, param_dict)
# convert fusion network to quantization aware network
if config.quantization_aware:
network = quant.convert_quant_network(network,
bn_fold=True,
per_channel=[True, False],
symmetric=[True, True])
# get learning rate
loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
lr = Tensor(get_lr(global_step=config.start_epoch * step_size,
lr_init=0,
lr_end=0,
lr_max=config.lr,
warmup_epochs=config.warmup_epochs,
total_epochs=epoch_size + config.start_epoch,
steps_per_epoch=step_size))
# define optimizer
opt = nn.Momentum(filter(lambda x: x.requires_grad, network.get_parameters()), lr, config.momentum,
config.weight_decay, config.loss_scale)
# define model
model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale)
print("============== Starting Training ==============")
callback = [Monitor(lr_init=lr.asnumpy())]
ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(get_rank()) + "/"
if config.save_checkpoint:
config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
keep_checkpoint_max=config.keep_checkpoint_max)
ckpt_cb = ModelCheckpoint(prefix="mobilenetV2", directory=ckpt_save_dir, config=config_ck)
callback += [ckpt_cb]
model.train(epoch_size, dataset, callbacks=callback)
print("============== End Training ==============")
if __name__ == '__main__':
if args_opt.device_target == "Ascend":
train_on_ascend()
elif args_opt.device_target == "GPU":
train_on_gpu()