Unverified commit 46955a26, authored by Felix, committed by GitHub

Merge branch 'develop_reg' into develop_reg

@@ -37,7 +37,7 @@ def forward(self, inputs):
y = self.pool2d_max(y)
for bottleneck_block in self.bottleneck_block_list:
y = bottleneck_block(y)
y = self.pool2d_avg(y)
y = self.avg_pool(y)
y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output])
y = self.out(y)
return y, self.fm
@@ -12,8 +12,54 @@
#See the License for the specific language governing permissions and
#limitations under the License.
import copy
import importlib
import paddle.nn as nn
from . import backbone
from .backbone import *
from ppcls.arch.loss_metrics.loss import *
from .utils import *
def build_model(config):
config = copy.deepcopy(config)
model_type = config.pop("name")
mod = importlib.import_module(__name__)
arch = getattr(mod, model_type)(**config)
return arch
class RecModel(nn.Layer):
def __init__(self, **config):
super().__init__()
backbone_config = config["Backbone"]
backbone_name = backbone_config.pop("name")
mod = importlib.import_module(__name__)  # names come from the star imports above
self.backbone = getattr(mod, backbone_name)(**backbone_config)
if "backbone_stop_layer" in config:
backbone_stop_layer = config["backbone_stop_layer"]
self.backbone.stop_layer(backbone_stop_layer)
if "Neck" in config:
neck_config = config["Neck"]
neck_name = neck_config.pop("name")
self.neck = getattr(mod, neck_name)(**neck_config)
else:
self.neck = None
if "Head" in config:
head_config = config["Head"]
head_name = head_config.pop("name")
self.head = getattr(mod, head_name)(**head_config)
else:
self.head = None
def forward(self, x):
y = self.backbone(x)
if self.neck is not None:
y = self.neck(y)
if self.head is not None:
y = self.head(y)
return y
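For orientation, a minimal usage sketch (not part of this diff) of `build_model` driving `RecModel`; the nested dict is hypothetical but mirrors the keys read above, with `ResNet50` standing in for any backbone exported by the star imports, and Neck/Head omitted so they stay None:
import paddle

config = {
    "name": "RecModel",
    "Backbone": {"name": "ResNet50"},
}
model = build_model(config)                       # resolves "RecModel" by name
feat = model(paddle.randn([1, 3, 224, 224]))      # backbone output only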
from .resnet import ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, ResNet18_vd, ResNet34_vd, ResNet50_vd, ResNet101_vd, ResNet152_vd
from .hrnet import HRNet_W18_C, HRNet_W30_C, HRNet_W32_C, HRNet_W40_C, HRNet_W44_C, HRNet_W48_C, HRNet_W64_C
from .mobilenet_v1 import MobileNetV1_x0_25, MobileNetV1_x0_5, MobileNetV1_x0_75, MobileNetV1
from .mobilenet_v3 import MobileNetV3_small_x0_35, MobileNetV3_small_x0_5, MobileNetV3_small_x0_75, MobileNetV3_small_x1_0, MobileNetV3_small_x1_25, MobileNetV3_large_x0_35, MobileNetV3_large_x0_5, MobileNetV3_large_x0_75, MobileNetV3_large_x1_0, MobileNetV3_large_x1_25
from .inception_v3 import InceptionV3
from .vgg import VGG11, VGG13, VGG16, VGG19
@@ -17,34 +17,45 @@ from __future__ import division
from __future__ import print_function
import math
import numpy as np
import paddle
from paddle import nn
from paddle import ParamAttr
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D
from paddle.nn.functional import upsample
from paddle.nn.initializer import Uniform
from ppcls.arch.backbone.base.theseus_layer import TheseusLayer
__all__ = [
"HRNet_W18_C",
"HRNet_W30_C",
"HRNet_W32_C",
"HRNet_W40_C",
"HRNet_W44_C",
"HRNet_W48_C",
"HRNet_W60_C",
"HRNet_W64_C",
"SE_HRNet_W18_C",
"SE_HRNet_W30_C",
"SE_HRNet_W32_C",
"SE_HRNet_W40_C",
"SE_HRNet_W44_C",
"SE_HRNet_W48_C",
"SE_HRNet_W60_C",
"SE_HRNet_W64_C",
]
from ppcls.arch.backbone.base.theseus_layer import TheseusLayer, Identity
from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url
MODEL_URLS = {
"HRNet_W18_C":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W18_C_pretrained.pdparams",
"HRNet_W30_C":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W30_C_pretrained.pdparams",
"HRNet_W32_C":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W32_C_pretrained.pdparams",
"HRNet_W40_C":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W40_C_pretrained.pdparams",
"HRNet_W44_C":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W44_C_pretrained.pdparams",
"HRNet_W48_C":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W48_C_pretrained.pdparams",
"HRNet_W64_C":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/HRNet_W64_C_pretrained.pdparams"
}
__all__ = list(MODEL_URLS.keys())
def _create_act(act):
if act == "hardswish":
return nn.Hardswish()
elif act == "relu":
return nn.ReLU()
elif act is None:
return Identity()
else:
raise RuntimeError(
"The activation function is not supported: {}".format(act))
class ConvBNLayer(TheseusLayer):
@@ -54,136 +65,25 @@ class ConvBNLayer(TheseusLayer):
filter_size,
stride=1,
groups=1,
act="relu",
name=None):
super(ConvBNLayer, self).__init__()
act="relu"):
super().__init__()
self._conv = nn.Conv2D(
self.conv = nn.Conv2D(
in_channels=num_channels,
out_channels=num_filters,
kernel_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
weight_attr=ParamAttr(name=name + "_weights"),
bias_attr=False)
bn_name = name + '_bn'
self._batch_norm = nn.BatchNorm(
num_filters,
act=act,
param_attr=ParamAttr(name=bn_name + '_scale'),
bias_attr=ParamAttr(bn_name + '_offset'),
moving_mean_name=bn_name + '_mean',
moving_variance_name=bn_name + '_variance')
def forward(self, x, res_dict=None):
y = self._conv(x)
y = self._batch_norm(y)
return y
class Layer1(TheseusLayer):
def __init__(self, num_channels, has_se=False, name=None):
super(Layer1, self).__init__()
self.bottleneck_block_list = []
for i in range(4):
bottleneck_block = self.add_sublayer(
"bb_{}_{}".format(name, i + 1),
BottleneckBlock(
num_channels=num_channels if i == 0 else 256,
num_filters=64,
has_se=has_se,
stride=1,
downsample=True if i == 0 else False,
name=name + '_' + str(i + 1)))
self.bottleneck_block_list.append(bottleneck_block)
def forward(self, x, res_dict=None):
y = x
for block_func in self.bottleneck_block_list:
y = block_func(y)
return y
class TransitionLayer(TheseusLayer):
def __init__(self, in_channels, out_channels, name=None):
super(TransitionLayer, self).__init__()
num_in = len(in_channels)
num_out = len(out_channels)
out = []
self.conv_bn_func_list = []
for i in range(num_out):
residual = None
if i < num_in:
if in_channels[i] != out_channels[i]:
residual = self.add_sublayer(
"transition_{}_layer_{}".format(name, i + 1),
ConvBNLayer(
num_channels=in_channels[i],
num_filters=out_channels[i],
filter_size=3,
name=name + '_layer_' + str(i + 1)))
else:
residual = self.add_sublayer(
"transition_{}_layer_{}".format(name, i + 1),
ConvBNLayer(
num_channels=in_channels[-1],
num_filters=out_channels[i],
filter_size=3,
stride=2,
name=name + '_layer_' + str(i + 1)))
self.conv_bn_func_list.append(residual)
def forward(self, x, res_dict=None):
outs = []
for idx, conv_bn_func in enumerate(self.conv_bn_func_list):
if conv_bn_func is None:
outs.append(x[idx])
else:
if idx < len(x):
outs.append(conv_bn_func(x[idx]))
else:
outs.append(conv_bn_func(x[-1]))
return outs
class Branches(TheseusLayer):
def __init__(self,
block_num,
in_channels,
out_channels,
has_se=False,
name=None):
super(Branches, self).__init__()
self.basic_block_list = []
for i in range(len(out_channels)):
self.basic_block_list.append([])
for j in range(block_num):
in_ch = in_channels[i] if j == 0 else out_channels[i]
basic_block_func = self.add_sublayer(
"bb_{}_branch_layer_{}_{}".format(name, i + 1, j + 1),
BasicBlock(
num_channels=in_ch,
num_filters=out_channels[i],
has_se=has_se,
name=name + '_branch_layer_' + str(i + 1) + '_' +
str(j + 1)))
self.basic_block_list[i].append(basic_block_func)
self.bn = nn.BatchNorm(num_filters, act=None)
self.act = _create_act(act)
def forward(self, x, res_dict=None):
outs = []
for idx, xi in enumerate(x):
conv = xi
basic_block_list = self.basic_block_list[idx]
for basic_block_func in basic_block_list:
conv = basic_block_func(conv)
outs.append(conv)
return outs
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.act(x)
return x
class BottleneckBlock(TheseusLayer):
@@ -192,9 +92,8 @@ class BottleneckBlock(TheseusLayer):
num_filters,
has_se,
stride=1,
downsample=False,
name=None):
super(BottleneckBlock, self).__init__()
downsample=False):
super().__init__()
self.has_se = has_se
self.downsample = downsample
@@ -203,540 +102,640 @@ class BottleneckBlock(TheseusLayer):
num_channels=num_channels,
num_filters=num_filters,
filter_size=1,
act="relu",
name=name + "_conv1", )
act="relu")
self.conv2 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters,
filter_size=3,
stride=stride,
act="relu",
name=name + "_conv2")
act="relu")
self.conv3 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters * 4,
filter_size=1,
act=None,
name=name + "_conv3")
act=None)
if self.downsample:
self.conv_down = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters * 4,
filter_size=1,
act=None,
name=name + "_downsample")
act=None)
if self.has_se:
self.se = SELayer(
num_channels=num_filters * 4,
num_filters=num_filters * 4,
reduction_ratio=16,
name='fc' + name)
reduction_ratio=16)
self.relu = nn.ReLU()
def forward(self, x, res_dict=None):
residual = x
conv1 = self.conv1(x)
conv2 = self.conv2(conv1)
conv3 = self.conv3(conv2)
x = self.conv1(x)
x = self.conv2(x)
x = self.conv3(x)
if self.downsample:
residual = self.conv_down(x)
residual = self.conv_down(residual)
if self.has_se:
conv3 = self.se(conv3)
y = paddle.add(x=residual, y=conv3)
y = F.relu(y)
return y
x = self.se(x)
x = paddle.add(x=residual, y=x)
x = self.relu(x)
return x
class BasicBlock(TheseusLayer):
def __init__(self,
num_channels,
num_filters,
stride=1,
has_se=False,
downsample=False,
name=None):
super(BasicBlock, self).__init__()
class BasicBlock(nn.Layer):
def __init__(self, num_channels, num_filters, has_se=False):
super().__init__()
self.has_se = has_se
self.downsample = downsample
self.conv1 = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters,
filter_size=3,
stride=stride,
act="relu",
name=name + "_conv1")
stride=1,
act="relu")
self.conv2 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters,
filter_size=3,
stride=1,
act=None,
name=name + "_conv2")
if self.downsample:
self.conv_down = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters * 4,
filter_size=1,
act="relu",
name=name + "_downsample")
act=None)
if self.has_se:
self.se = SELayer(
num_channels=num_filters,
num_filters=num_filters,
reduction_ratio=16,
name='fc' + name)
reduction_ratio=16)
self.relu = nn.ReLU()
def forward(self, input, res_dict=None):
residual = input
conv1 = self.conv1(input)
conv2 = self.conv2(conv1)
if self.downsample:
residual = self.conv_down(input)
def forward(self, x):
residual = x
x = self.conv1(x)
x = self.conv2(x)
if self.has_se:
conv2 = self.se(conv2)
x = self.se(x)
y = paddle.add(x=residual, y=conv2)
y = F.relu(y)
return y
x = paddle.add(x=residual, y=x)
x = self.relu(x)
return x
class SELayer(TheseusLayer):
def __init__(self, num_channels, num_filters, reduction_ratio, name=None):
super(SELayer, self).__init__()
def __init__(self, num_channels, num_filters, reduction_ratio):
super().__init__()
self.pool2d_gap = AdaptiveAvgPool2D(1)
self.avg_pool = nn.AdaptiveAvgPool2D(1)
self._num_channels = num_channels
med_ch = int(num_channels / reduction_ratio)
stdv = 1.0 / math.sqrt(num_channels * 1.0)
self.squeeze = nn.Linear(
self.fc_squeeze = nn.Linear(
num_channels,
med_ch,
weight_attr=ParamAttr(
initializer=Uniform(-stdv, stdv), name=name + "_sqz_weights"),
bias_attr=ParamAttr(name=name + '_sqz_offset'))
weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)))
self.relu = nn.ReLU()
stdv = 1.0 / math.sqrt(med_ch * 1.0)
self.excitation = nn.Linear(
self.fc_excitation = nn.Linear(
med_ch,
num_filters,
weight_attr=ParamAttr(
initializer=Uniform(-stdv, stdv), name=name + "_exc_weights"),
bias_attr=ParamAttr(name=name + '_exc_offset'))
def forward(self, input, res_dict=None):
pool = self.pool2d_gap(input)
pool = paddle.squeeze(pool, axis=[2, 3])
squeeze = self.squeeze(pool)
squeeze = F.relu(squeeze)
excitation = self.excitation(squeeze)
excitation = F.sigmoid(excitation)
excitation = paddle.unsqueeze(excitation, axis=[2, 3])
out = input * excitation
return out
weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)))
self.sigmoid = nn.Sigmoid()
def forward(self, x, res_dict=None):
residual = x
x = self.avg_pool(x)
x = paddle.squeeze(x, axis=[2, 3])
x = self.fc_squeeze(x)
x = self.relu(x)
x = self.fc_excitation(x)
x = self.sigmoid(x)
x = paddle.unsqueeze(x, axis=[2, 3])
x = residual * x
return x
class Stage(TheseusLayer):
def __init__(self,
num_channels,
num_modules,
num_filters,
has_se=False,
multi_scale_output=True,
name=None):
super(Stage, self).__init__()
def __init__(self, num_modules, num_filters, has_se=False):
super().__init__()
self._num_modules = num_modules
self.stage_func_list = []
self.stage_func_list = nn.LayerList()
for i in range(num_modules):
if i == num_modules - 1 and not multi_scale_output:
stage_func = self.add_sublayer(
"stage_{}_{}".format(name, i + 1),
HighResolutionModule(
num_channels=num_channels,
num_filters=num_filters,
has_se=has_se,
multi_scale_output=False,
name=name + '_' + str(i + 1)))
else:
stage_func = self.add_sublayer(
"stage_{}_{}".format(name, i + 1),
HighResolutionModule(
num_channels=num_channels,
num_filters=num_filters,
has_se=has_se,
name=name + '_' + str(i + 1)))
self.stage_func_list.append(stage_func)
def forward(self, input, res_dict=None):
out = input
self.stage_func_list.append(
HighResolutionModule(
num_filters=num_filters, has_se=has_se))
def forward(self, x, res_dict=None):
for idx in range(self._num_modules):
out = self.stage_func_list[idx](out)
return out
x = self.stage_func_list[idx](x)
return x
class HighResolutionModule(TheseusLayer):
def __init__(self,
num_channels,
num_filters,
has_se=False,
multi_scale_output=True,
name=None):
super(HighResolutionModule, self).__init__()
def __init__(self, num_filters, has_se=False):
super().__init__()
self.branches_func = Branches(
block_num=4,
in_channels=num_channels,
out_channels=num_filters,
has_se=has_se,
name=name)
self.basic_block_list = nn.LayerList()
for i in range(len(num_filters)):
self.basic_block_list.append(
nn.Sequential(*[
BasicBlock(
num_channels=num_filters[i],
num_filters=num_filters[i],
has_se=has_se) for j in range(4)
]))
self.fuse_func = FuseLayers(
in_channels=num_filters,
out_channels=num_filters,
multi_scale_output=multi_scale_output,
name=name)
in_channels=num_filters, out_channels=num_filters)
def forward(self, input, res_dict=None):
out = self.branches_func(input)
def forward(self, x, res_dict=None):
out = []
for idx, xi in enumerate(x):
basic_block_list = self.basic_block_list[idx]
for basic_block_func in basic_block_list:
xi = basic_block_func(xi)
out.append(xi)
out = self.fuse_func(out)
return out
class FuseLayers(TheseusLayer):
def __init__(self,
in_channels,
out_channels,
multi_scale_output=True,
name=None):
super(FuseLayers, self).__init__()
def __init__(self, in_channels, out_channels):
super().__init__()
self._actual_ch = len(in_channels) if multi_scale_output else 1
self._actual_ch = len(in_channels)
self._in_channels = in_channels
self.residual_func_list = []
for i in range(self._actual_ch):
self.residual_func_list = nn.LayerList()
self.relu = nn.ReLU()
for i in range(len(in_channels)):
for j in range(len(in_channels)):
residual_func = None
if j > i:
residual_func = self.add_sublayer(
"residual_{}_layer_{}_{}".format(name, i + 1, j + 1),
self.residual_func_list.append(
ConvBNLayer(
num_channels=in_channels[j],
num_filters=out_channels[i],
filter_size=1,
stride=1,
act=None,
name=name + '_layer_' + str(i + 1) + '_' +
str(j + 1)))
self.residual_func_list.append(residual_func)
act=None))
elif j < i:
pre_num_filters = in_channels[j]
for k in range(i - j):
if k == i - j - 1:
residual_func = self.add_sublayer(
"residual_{}_layer_{}_{}_{}".format(
name, i + 1, j + 1, k + 1),
self.residual_func_list.append(
ConvBNLayer(
num_channels=pre_num_filters,
num_filters=out_channels[i],
filter_size=3,
stride=2,
act=None,
name=name + '_layer_' + str(i + 1) + '_' +
str(j + 1) + '_' + str(k + 1)))
act=None))
pre_num_filters = out_channels[i]
else:
residual_func = self.add_sublayer(
"residual_{}_layer_{}_{}_{}".format(
name, i + 1, j + 1, k + 1),
self.residual_func_list.append(
ConvBNLayer(
num_channels=pre_num_filters,
num_filters=out_channels[j],
filter_size=3,
stride=2,
act="relu",
name=name + '_layer_' + str(i + 1) + '_' +
str(j + 1) + '_' + str(k + 1)))
act="relu"))
pre_num_filters = out_channels[j]
self.residual_func_list.append(residual_func)
def forward(self, input, res_dict=None):
outs = []
def forward(self, x, res_dict=None):
out = []
residual_func_idx = 0
for i in range(self._actual_ch):
residual = input[i]
for i in range(len(self._in_channels)):
residual = x[i]
for j in range(len(self._in_channels)):
if j > i:
y = self.residual_func_list[residual_func_idx](input[j])
xj = self.residual_func_list[residual_func_idx](x[j])
residual_func_idx += 1
y = F.upsample(y, scale_factor=2**(j - i), mode="nearest")
residual = paddle.add(x=residual, y=y)
xj = upsample(xj, scale_factor=2**(j - i), mode="nearest")
residual = paddle.add(x=residual, y=xj)
elif j < i:
y = input[j]
xj = x[j]
for k in range(i - j):
y = self.residual_func_list[residual_func_idx](y)
xj = self.residual_func_list[residual_func_idx](xj)
residual_func_idx += 1
residual = paddle.add(x=residual, y=y)
residual = paddle.add(x=residual, y=xj)
residual = F.relu(residual)
outs.append(residual)
residual = self.relu(residual)
out.append(residual)
return outs
return out
class LastClsOut(TheseusLayer):
def __init__(self,
num_channel_list,
has_se,
num_filters_list=[32, 64, 128, 256],
name=None):
super(LastClsOut, self).__init__()
num_filters_list=[32, 64, 128, 256]):
super().__init__()
self.func_list = []
self.func_list = nn.LayerList()
for idx in range(len(num_channel_list)):
func = self.add_sublayer(
"conv_{}_conv_{}".format(name, idx + 1),
self.func_list.append(
BottleneckBlock(
num_channels=num_channel_list[idx],
num_filters=num_filters_list[idx],
has_se=has_se,
downsample=True,
name=name + 'conv_' + str(idx + 1)))
self.func_list.append(func)
downsample=True))
def forward(self, inputs, res_dict=None):
outs = []
for idx, input in enumerate(inputs):
out = self.func_list[idx](input)
outs.append(out)
return outs
def forward(self, x, res_dict=None):
out = []
for idx, xi in enumerate(x):
xi = self.func_list[idx](xi)
out.append(xi)
return out
class HRNet(TheseusLayer):
def __init__(self, width=18, has_se=False, class_dim=1000):
super(HRNet, self).__init__()
"""
HRNet
Args:
width: int=18. Base channel number of HRNet.
has_se: bool=False. If 'True', add se module to HRNet.
class_num: int=1000. Output num of last fc layer.
Returns:
model: nn.Layer. Specific HRNet model depends on args.
"""
def __init__(self, width=18, has_se=False, class_num=1000):
super().__init__()
self.width = width
self.has_se = has_se
self.channels = {
18: [[18, 36], [18, 36, 72], [18, 36, 72, 144]],
30: [[30, 60], [30, 60, 120], [30, 60, 120, 240]],
32: [[32, 64], [32, 64, 128], [32, 64, 128, 256]],
40: [[40, 80], [40, 80, 160], [40, 80, 160, 320]],
44: [[44, 88], [44, 88, 176], [44, 88, 176, 352]],
48: [[48, 96], [48, 96, 192], [48, 96, 192, 384]],
60: [[60, 120], [60, 120, 240], [60, 120, 240, 480]],
64: [[64, 128], [64, 128, 256], [64, 128, 256, 512]]
}
self._class_dim = class_dim
channels_2, channels_3, channels_4 = self.channels[width]
num_modules_2, num_modules_3, num_modules_4 = 1, 4, 3
self._class_num = class_num
channels_2 = [self.width, self.width * 2]
channels_3 = [self.width, self.width * 2, self.width * 4]
channels_4 = [
self.width, self.width * 2, self.width * 4, self.width * 8
]
self.conv_layer1_1 = ConvBNLayer(
num_channels=3,
num_filters=64,
filter_size=3,
stride=2,
act='relu',
name="layer1_1")
act="relu")
self.conv_layer1_2 = ConvBNLayer(
num_channels=64,
num_filters=64,
filter_size=3,
stride=2,
act='relu',
name="layer1_2")
self.la1 = Layer1(num_channels=64, has_se=has_se, name="layer2")
self.tr1 = TransitionLayer(
in_channels=[256], out_channels=channels_2, name="tr1")
act="relu")
self.layer1 = nn.Sequential(*[
BottleneckBlock(
num_channels=64 if i == 0 else 256,
num_filters=64,
has_se=has_se,
stride=1,
downsample=True if i == 0 else False) for i in range(4)
])
self.conv_tr1_1 = ConvBNLayer(
num_channels=256, num_filters=width, filter_size=3)
self.conv_tr1_2 = ConvBNLayer(
num_channels=256, num_filters=width * 2, filter_size=3, stride=2)
self.st2 = Stage(
num_channels=channels_2,
num_modules=num_modules_2,
num_filters=channels_2,
has_se=self.has_se,
name="st2")
num_modules=1, num_filters=channels_2, has_se=self.has_se)
self.tr2 = TransitionLayer(
in_channels=channels_2, out_channels=channels_3, name="tr2")
self.conv_tr2 = ConvBNLayer(
num_channels=width * 2,
num_filters=width * 4,
filter_size=3,
stride=2)
self.st3 = Stage(
num_channels=channels_3,
num_modules=num_modules_3,
num_filters=channels_3,
has_se=self.has_se,
name="st3")
num_modules=4, num_filters=channels_3, has_se=self.has_se)
self.conv_tr3 = ConvBNLayer(
num_channels=width * 4,
num_filters=width * 8,
filter_size=3,
stride=2)
self.tr3 = TransitionLayer(
in_channels=channels_3, out_channels=channels_4, name="tr3")
self.st4 = Stage(
num_channels=channels_4,
num_modules=num_modules_4,
num_filters=channels_4,
has_se=self.has_se,
name="st4")
num_modules=3, num_filters=channels_4, has_se=self.has_se)
# classification
num_filters_list = [32, 64, 128, 256]
self.last_cls = LastClsOut(
num_channel_list=channels_4,
has_se=self.has_se,
num_filters_list=num_filters_list,
name="cls_head", )
num_filters_list=num_filters_list)
last_num_filters = [256, 512, 1024]
self.cls_head_conv_list = []
self.cls_head_conv_list = nn.LayerList()
for idx in range(3):
self.cls_head_conv_list.append(
self.add_sublayer(
"cls_head_add{}".format(idx + 1),
ConvBNLayer(
num_channels=num_filters_list[idx] * 4,
num_filters=last_num_filters[idx],
filter_size=3,
stride=2,
name="cls_head_add" + str(idx + 1))))
ConvBNLayer(
num_channels=num_filters_list[idx] * 4,
num_filters=last_num_filters[idx],
filter_size=3,
stride=2))
self.conv_last = ConvBNLayer(
num_channels=1024,
num_filters=2048,
filter_size=1,
stride=1,
name="cls_head_last_conv")
num_channels=1024, num_filters=2048, filter_size=1, stride=1)
self.pool2d_avg = AdaptiveAvgPool2D(1)
self.avg_pool = nn.AdaptiveAvgPool2D(1)
stdv = 1.0 / math.sqrt(2048 * 1.0)
self.out = nn.Linear(
self.fc = nn.Linear(
2048,
class_dim,
weight_attr=ParamAttr(
initializer=Uniform(-stdv, stdv), name="fc_weights"),
bias_attr=ParamAttr(name="fc_offset"))
class_num,
weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)))
def forward(self, input, res_dict=None):
conv1 = self.conv_layer1_1(input)
conv2 = self.conv_layer1_2(conv1)
def forward(self, x, res_dict=None):
x = self.conv_layer1_1(x)
x = self.conv_layer1_2(x)
la1 = self.la1(conv2)
x = self.layer1(x)
tr1 = self.tr1([la1])
st2 = self.st2(tr1)
tr1_1 = self.conv_tr1_1(x)
tr1_2 = self.conv_tr1_2(x)
x = self.st2([tr1_1, tr1_2])
tr2 = self.tr2(st2)
st3 = self.st3(tr2)
tr2 = self.conv_tr2(x[-1])
x.append(tr2)
x = self.st3(x)
tr3 = self.tr3(st3)
st4 = self.st4(tr3)
tr3 = self.conv_tr3(x[-1])
x.append(tr3)
x = self.st4(x)
last_cls = self.last_cls(st4)
x = self.last_cls(x)
y = last_cls[0]
y = x[0]
for idx in range(3):
y = paddle.add(last_cls[idx + 1], self.cls_head_conv_list[idx](y))
y = paddle.add(x[idx + 1], self.cls_head_conv_list[idx](y))
y = self.conv_last(y)
y = self.pool2d_avg(y)
y = self.avg_pool(y)
y = paddle.reshape(y, shape=[-1, y.shape[1]])
y = self.out(y)
y = self.fc(y)
return y
def HRNet_W18_C(**args):
model = HRNet(width=18, **args)
def _load_pretrained(pretrained, model, model_url, use_ssld):
if pretrained is False:
pass
elif pretrained is True:
load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld)
elif isinstance(pretrained, str):
load_dygraph_pretrain(model, pretrained)
else:
raise RuntimeError(
"pretrained type is not available. Please use `string` or `boolean` type."
)
def HRNet_W18_C(pretrained=False, use_ssld=False, **kwargs):
"""
HRNet_W18_C
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `HRNet_W18_C` model depends on args.
"""
model = HRNet(width=18, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["HRNet_W18_C"], use_ssld)
return model
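A hedged usage sketch for the factory above (random weights, random input; the output shape assumes the default class_num=1000):
import paddle

model = HRNet_W18_C(pretrained=False)
logits = model(paddle.randn([1, 3, 224, 224]))
print(logits.shape)  # [1, 1000]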
def HRNet_W30_C(**args):
model = HRNet(width=30, **args)
def HRNet_W30_C(pretrained=False, use_ssld=False, **kwargs):
"""
HRNet_W30_C
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `HRNet_W30_C` model depends on args.
"""
model = HRNet(width=30, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["HRNet_W30_C"], use_ssld)
return model
def HRNet_W32_C(**args):
model = HRNet(width=32, **args)
def HRNet_W32_C(pretrained=False, use_ssld=False, **kwargs):
"""
HRNet_W32_C
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `HRNet_W32_C` model depends on args.
"""
model = HRNet(width=32, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["HRNet_W32_C"], use_ssld)
return model
def HRNet_W40_C(**args):
model = HRNet(width=40, **args)
def HRNet_W40_C(pretrained=False, use_ssld=False, **kwargs):
"""
HRNet_W40_C
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `HRNet_W40_C` model depends on args.
"""
model = HRNet(width=40, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["HRNet_W40_C"], use_ssld)
return model
def HRNet_W44_C(**args):
model = HRNet(width=44, **args)
def HRNet_W44_C(pretrained=False, use_ssld=False, **kwargs):
"""
HRNet_W44_C
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `HRNet_W44_C` model depends on args.
"""
model = HRNet(width=44, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["HRNet_W44_C"], use_ssld)
return model
def HRNet_W48_C(**args):
model = HRNet(width=48, **args)
def HRNet_W48_C(pretrained=False, use_ssld=False, **kwargs):
"""
HRNet_W48_C
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `HRNet_W48_C` model depends on args.
"""
model = HRNet(width=48, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["HRNet_W48_C"], use_ssld)
return model
def HRNet_W60_C(**args):
model = HRNet(width=60, **args)
def HRNet_W60_C(pretrained=False, use_ssld=False, **kwargs):
"""
HRNet_W60_C
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `HRNet_W60_C` model depends on args.
"""
model = HRNet(width=60, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["HRNet_W60_C"], use_ssld)
return model
def HRNet_W64_C(**args):
model = HRNet(width=64, **args)
def HRNet_W64_C(pretrained=False, use_ssld=False, **kwargs):
"""
HRNet_W64_C
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `HRNet_W64_C` model depends on args.
"""
model = HRNet(width=64, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["HRNet_W64_C"], use_ssld)
return model
def SE_HRNet_W18_C(**args):
model = HRNet(width=18, has_se=True, **args)
def SE_HRNet_W18_C(pretrained=False, use_ssld=False, **kwargs):
"""
SE_HRNet_W18_C
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `SE_HRNet_W18_C` model depends on args.
"""
model = HRNet(width=18, has_se=True, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W18_C"], use_ssld)
return model
def SE_HRNet_W30_C(**args):
model = HRNet(width=30, has_se=True, **args)
def SE_HRNet_W30_C(pretrained=False, use_ssld=False, **kwargs):
"""
SE_HRNet_W30_C
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `SE_HRNet_W30_C` model depends on args.
"""
model = HRNet(width=30, has_se=True, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W30_C"], use_ssld)
return model
def SE_HRNet_W32_C(**args):
model = HRNet(width=32, has_se=True, **args)
def SE_HRNet_W32_C(pretrained=False, use_ssld=False, **kwargs):
"""
SE_HRNet_W32_C
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `SE_HRNet_W32_C` model depends on args.
"""
model = HRNet(width=32, has_se=True, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W32_C"], use_ssld)
return model
def SE_HRNet_W40_C(**args):
model = HRNet(width=40, has_se=True, **args)
def SE_HRNet_W40_C(pretrained=False, use_ssld=False, **kwargs):
"""
SE_HRNet_W40_C
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `SE_HRNet_W40_C` model depends on args.
"""
model = HRNet(width=40, has_se=True, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W40_C"], use_ssld)
return model
def SE_HRNet_W44_C(**args):
model = HRNet(width=44, has_se=True, **args)
def SE_HRNet_W44_C(pretrained=False, use_ssld=False, **kwargs):
"""
SE_HRNet_W44_C
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `SE_HRNet_W44_C` model depends on args.
"""
model = HRNet(width=44, has_se=True, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W44_C"], use_ssld)
return model
def SE_HRNet_W48_C(**args):
model = HRNet(width=48, has_se=True, **args)
def SE_HRNet_W48_C(pretrained=False, use_ssld=False, **kwargs):
"""
SE_HRNet_W48_C
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `SE_HRNet_W48_C` model depends on args.
"""
model = HRNet(width=48, has_se=True, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W48_C"], use_ssld)
return model
def SE_HRNet_W60_C(**args):
model = HRNet(width=60, has_se=True, **args)
def SE_HRNet_W60_C(pretrained=False, use_ssld=False, **kwargs):
"""
SE_HRNet_W60_C
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `SE_HRNet_W60_C` model depends on args.
"""
model = HRNet(width=60, has_se=True, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W60_C"], use_ssld)
return model
def SE_HRNet_W64_C(**args):
model = HRNet(width=64, has_se=True, **args)
def SE_HRNet_W64_C(pretrained=False, use_ssld=False, **kwargs):
"""
SE_HRNet_W64_C
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `SE_HRNet_W64_C` model depends on args.
"""
model = HRNet(width=64, has_se=True, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["SE_HRNet_W64_C"], use_ssld)
return model
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function
import math
import paddle
from paddle import ParamAttr
import paddle.nn as nn
from paddle.nn import Conv2D, BatchNorm, Linear, Dropout
from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D
from paddle.nn.initializer import Uniform
from ppcls.arch.backbone.base.theseus_layer import TheseusLayer
from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url
MODEL_URLS = {
"InceptionV3":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/InceptionV3_pretrained.pdparams"
}
__all__ = list(MODEL_URLS.keys())
'''
InceptionV3 config: dict.
key: inception blocks of InceptionV3.
values: conv num in different blocks.
'''
NET_CONFIG = {
"inception_a": [[192, 256, 288], [32, 64, 64]],
"inception_b": [288],
"inception_c": [[768, 768, 768, 768], [128, 160, 160, 192]],
"inception_d": [768],
"inception_e": [1280, 2048]
}
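The config above expands into 3 InceptionA, 1 InceptionB, 4 InceptionC, 1 InceptionD, and 2 InceptionE blocks; a short sketch that recovers those counts the same way Inception_V3.__init__ does:
counts = {
    "inception_a": len(NET_CONFIG["inception_a"][0]),  # 3
    "inception_b": len(NET_CONFIG["inception_b"]),     # 1
    "inception_c": len(NET_CONFIG["inception_c"][0]),  # 4
    "inception_d": len(NET_CONFIG["inception_d"]),     # 1
    "inception_e": len(NET_CONFIG["inception_e"]),     # 2
}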
class ConvBNLayer(TheseusLayer):
def __init__(self,
num_channels,
num_filters,
filter_size,
stride=1,
padding=0,
groups=1,
act="relu"):
super().__init__()
self.act = act
self.conv = Conv2D(
in_channels=num_channels,
out_channels=num_filters,
kernel_size=filter_size,
stride=stride,
padding=padding,
groups=groups,
bias_attr=False)
self.bn = BatchNorm(num_filters)
self.relu = nn.ReLU()
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
if self.act:
x = self.relu(x)
return x
class InceptionStem(TheseusLayer):
def __init__(self):
super().__init__()
self.conv_1a_3x3 = ConvBNLayer(
num_channels=3,
num_filters=32,
filter_size=3,
stride=2,
act="relu")
self.conv_2a_3x3 = ConvBNLayer(
num_channels=32,
num_filters=32,
filter_size=3,
stride=1,
act="relu")
self.conv_2b_3x3 = ConvBNLayer(
num_channels=32,
num_filters=64,
filter_size=3,
padding=1,
act="relu")
self.max_pool = MaxPool2D(kernel_size=3, stride=2, padding=0)
self.conv_3b_1x1 = ConvBNLayer(
num_channels=64, num_filters=80, filter_size=1, act="relu")
self.conv_4a_3x3 = ConvBNLayer(
num_channels=80, num_filters=192, filter_size=3, act="relu")
def forward(self, x):
x = self.conv_1a_3x3(x)
x = self.conv_2a_3x3(x)
x = self.conv_2b_3x3(x)
x = self.max_pool(x)
x = self.conv_3b_1x1(x)
x = self.conv_4a_3x3(x)
x = self.max_pool(x)
return x
class InceptionA(TheseusLayer):
def __init__(self, num_channels, pool_features):
super().__init__()
self.branch1x1 = ConvBNLayer(
num_channels=num_channels,
num_filters=64,
filter_size=1,
act="relu")
self.branch5x5_1 = ConvBNLayer(
num_channels=num_channels,
num_filters=48,
filter_size=1,
act="relu")
self.branch5x5_2 = ConvBNLayer(
num_channels=48,
num_filters=64,
filter_size=5,
padding=2,
act="relu")
self.branch3x3dbl_1 = ConvBNLayer(
num_channels=num_channels,
num_filters=64,
filter_size=1,
act="relu")
self.branch3x3dbl_2 = ConvBNLayer(
num_channels=64,
num_filters=96,
filter_size=3,
padding=1,
act="relu")
self.branch3x3dbl_3 = ConvBNLayer(
num_channels=96,
num_filters=96,
filter_size=3,
padding=1,
act="relu")
self.branch_pool = AvgPool2D(
kernel_size=3, stride=1, padding=1, exclusive=False)
self.branch_pool_conv = ConvBNLayer(
num_channels=num_channels,
num_filters=pool_features,
filter_size=1,
act="relu")
def forward(self, x):
branch1x1 = self.branch1x1(x)
branch5x5 = self.branch5x5_1(x)
branch5x5 = self.branch5x5_2(branch5x5)
branch3x3dbl = self.branch3x3dbl_1(x)
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
branch_pool = self.branch_pool(x)
branch_pool = self.branch_pool_conv(branch_pool)
x = paddle.concat(
[branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1)
return x
class InceptionB(TheseusLayer):
def __init__(self, num_channels):
super().__init__()
self.branch3x3 = ConvBNLayer(
num_channels=num_channels,
num_filters=384,
filter_size=3,
stride=2,
act="relu")
self.branch3x3dbl_1 = ConvBNLayer(
num_channels=num_channels,
num_filters=64,
filter_size=1,
act="relu")
self.branch3x3dbl_2 = ConvBNLayer(
num_channels=64,
num_filters=96,
filter_size=3,
padding=1,
act="relu")
self.branch3x3dbl_3 = ConvBNLayer(
num_channels=96,
num_filters=96,
filter_size=3,
stride=2,
act="relu")
self.branch_pool = MaxPool2D(kernel_size=3, stride=2)
def forward(self, x):
branch3x3 = self.branch3x3(x)
branch3x3dbl = self.branch3x3dbl_1(x)
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
branch_pool = self.branch_pool(x)
x = paddle.concat([branch3x3, branch3x3dbl, branch_pool], axis=1)
return x
class InceptionC(TheseusLayer):
def __init__(self, num_channels, channels_7x7):
super().__init__()
self.branch1x1 = ConvBNLayer(
num_channels=num_channels,
num_filters=192,
filter_size=1,
act="relu")
self.branch7x7_1 = ConvBNLayer(
num_channels=num_channels,
num_filters=channels_7x7,
filter_size=1,
stride=1,
act="relu")
self.branch7x7_2 = ConvBNLayer(
num_channels=channels_7x7,
num_filters=channels_7x7,
filter_size=(1, 7),
stride=1,
padding=(0, 3),
act="relu")
self.branch7x7_3 = ConvBNLayer(
num_channels=channels_7x7,
num_filters=192,
filter_size=(7, 1),
stride=1,
padding=(3, 0),
act="relu")
self.branch7x7dbl_1 = ConvBNLayer(
num_channels=num_channels,
num_filters=channels_7x7,
filter_size=1,
act="relu")
self.branch7x7dbl_2 = ConvBNLayer(
num_channels=channels_7x7,
num_filters=channels_7x7,
filter_size=(7, 1),
padding=(3, 0),
act="relu")
self.branch7x7dbl_3 = ConvBNLayer(
num_channels=channels_7x7,
num_filters=channels_7x7,
filter_size=(1, 7),
padding=(0, 3),
act="relu")
self.branch7x7dbl_4 = ConvBNLayer(
num_channels=channels_7x7,
num_filters=channels_7x7,
filter_size=(7, 1),
padding=(3, 0),
act="relu")
self.branch7x7dbl_5 = ConvBNLayer(
num_channels=channels_7x7,
num_filters=192,
filter_size=(1, 7),
padding=(0, 3),
act="relu")
self.branch_pool = AvgPool2D(
kernel_size=3, stride=1, padding=1, exclusive=False)
self.branch_pool_conv = ConvBNLayer(
num_channels=num_channels,
num_filters=192,
filter_size=1,
act="relu")
def forward(self, x):
branch1x1 = self.branch1x1(x)
branch7x7 = self.branch7x7_1(x)
branch7x7 = self.branch7x7_2(branch7x7)
branch7x7 = self.branch7x7_3(branch7x7)
branch7x7dbl = self.branch7x7dbl_1(x)
branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl)
branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl)
branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl)
branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl)
branch_pool = self.branch_pool(x)
branch_pool = self.branch_pool_conv(branch_pool)
x = paddle.concat(
[branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1)
return x
class InceptionD(TheseusLayer):
def __init__(self, num_channels):
super().__init__()
self.branch3x3_1 = ConvBNLayer(
num_channels=num_channels,
num_filters=192,
filter_size=1,
act="relu")
self.branch3x3_2 = ConvBNLayer(
num_channels=192,
num_filters=320,
filter_size=3,
stride=2,
act="relu")
self.branch7x7x3_1 = ConvBNLayer(
num_channels=num_channels,
num_filters=192,
filter_size=1,
act="relu")
self.branch7x7x3_2 = ConvBNLayer(
num_channels=192,
num_filters=192,
filter_size=(1, 7),
padding=(0, 3),
act="relu")
self.branch7x7x3_3 = ConvBNLayer(
num_channels=192,
num_filters=192,
filter_size=(7, 1),
padding=(3, 0),
act="relu")
self.branch7x7x3_4 = ConvBNLayer(
num_channels=192,
num_filters=192,
filter_size=3,
stride=2,
act="relu")
self.branch_pool = MaxPool2D(kernel_size=3, stride=2)
def forward(self, x):
branch3x3 = self.branch3x3_1(x)
branch3x3 = self.branch3x3_2(branch3x3)
branch7x7x3 = self.branch7x7x3_1(x)
branch7x7x3 = self.branch7x7x3_2(branch7x7x3)
branch7x7x3 = self.branch7x7x3_3(branch7x7x3)
branch7x7x3 = self.branch7x7x3_4(branch7x7x3)
branch_pool = self.branch_pool(x)
x = paddle.concat([branch3x3, branch7x7x3, branch_pool], axis=1)
return x
class InceptionE(TheseusLayer):
def __init__(self, num_channels):
super().__init__()
self.branch1x1 = ConvBNLayer(
num_channels=num_channels,
num_filters=320,
filter_size=1,
act="relu")
self.branch3x3_1 = ConvBNLayer(
num_channels=num_channels,
num_filters=384,
filter_size=1,
act="relu")
self.branch3x3_2a = ConvBNLayer(
num_channels=384,
num_filters=384,
filter_size=(1, 3),
padding=(0, 1),
act="relu")
self.branch3x3_2b = ConvBNLayer(
num_channels=384,
num_filters=384,
filter_size=(3, 1),
padding=(1, 0),
act="relu")
self.branch3x3dbl_1 = ConvBNLayer(
num_channels=num_channels,
num_filters=448,
filter_size=1,
act="relu")
self.branch3x3dbl_2 = ConvBNLayer(
num_channels=448,
num_filters=384,
filter_size=3,
padding=1,
act="relu")
self.branch3x3dbl_3a = ConvBNLayer(
num_channels=384,
num_filters=384,
filter_size=(1, 3),
padding=(0, 1),
act="relu")
self.branch3x3dbl_3b = ConvBNLayer(
num_channels=384,
num_filters=384,
filter_size=(3, 1),
padding=(1, 0),
act="relu")
self.branch_pool = AvgPool2D(
kernel_size=3, stride=1, padding=1, exclusive=False)
self.branch_pool_conv = ConvBNLayer(
num_channels=num_channels,
num_filters=192,
filter_size=1,
act="relu")
def forward(self, x):
branch1x1 = self.branch1x1(x)
branch3x3 = self.branch3x3_1(x)
branch3x3 = [
self.branch3x3_2a(branch3x3),
self.branch3x3_2b(branch3x3),
]
branch3x3 = paddle.concat(branch3x3, axis=1)
branch3x3dbl = self.branch3x3dbl_1(x)
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
branch3x3dbl = [
self.branch3x3dbl_3a(branch3x3dbl),
self.branch3x3dbl_3b(branch3x3dbl),
]
branch3x3dbl = paddle.concat(branch3x3dbl, axis=1)
branch_pool = self.branch_pool(x)
branch_pool = self.branch_pool_conv(branch_pool)
x = paddle.concat(
[branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1)
return x
class Inception_V3(TheseusLayer):
"""
Inception_V3
Args:
config: dict. config of Inception_V3.
class_num: int=1000. The number of classes.
Returns:
model: nn.Layer. Specific Inception_V3 model depends on args.
"""
def __init__(self, config, class_num=1000):
super().__init__()
self.inception_a_list = config["inception_a"]
self.inception_c_list = config["inception_c"]
self.inception_b_list = config["inception_b"]
self.inception_d_list = config["inception_d"]
self.inception_e_list = config["inception_e"]
self.inception_stem = InceptionStem()
self.inception_block_list = nn.LayerList()
for i in range(len(self.inception_a_list[0])):
inception_a = InceptionA(self.inception_a_list[0][i],
self.inception_a_list[1][i])
self.inception_block_list.append(inception_a)
for i in range(len(self.inception_b_list)):
inception_b = InceptionB(self.inception_b_list[i])
self.inception_block_list.append(inception_b)
for i in range(len(self.inception_c_list[0])):
inception_c = InceptionC(self.inception_c_list[0][i],
self.inception_c_list[1][i])
self.inception_block_list.append(inception_c)
for i in range(len(self.inception_d_list)):
inception_d = InceptionD(self.inception_d_list[i])
self.inception_block_list.append(inception_d)
for i in range(len(self.inception_e_list)):
inception_e = InceptionE(self.inception_e_list[i])
self.inception_block_list.append(inception_e)
self.avg_pool = AdaptiveAvgPool2D(1)
self.dropout = Dropout(p=0.2, mode="downscale_in_infer")
stdv = 1.0 / math.sqrt(2048 * 1.0)
self.fc = Linear(
2048,
class_num,
weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)),
bias_attr=ParamAttr())
def forward(self, x):
x = self.inception_stem(x)
for inception_block in self.inception_block_list:
x = inception_block(x)
x = self.avg_pool(x)
x = paddle.reshape(x, shape=[-1, 2048])
x = self.dropout(x)
x = self.fc(x)
return x
def _load_pretrained(pretrained, model, model_url, use_ssld):
if pretrained is False:
pass
elif pretrained is True:
load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld)
elif isinstance(pretrained, str):
load_dygraph_pretrain(model, pretrained)
else:
raise RuntimeError(
"pretrained type is not available. Please use `string` or `boolean` type."
)
def InceptionV3(pretrained=False, use_ssld=False, **kwargs):
"""
InceptionV3
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `InceptionV3` model
"""
model = Inception_V3(NET_CONFIG, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["InceptionV3"], use_ssld)
return model
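A minimal sketch of the factory in use; passing a local .pdparams path as `pretrained` would go through the load_dygraph_pretrain branch instead:
import paddle

model = InceptionV3(pretrained=False, class_num=1000)
logits = model(paddle.randn([1, 3, 299, 299]))  # 299x299 is the usual InceptionV3 input
print(logits.shape)  # [1, 1000]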
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function
from paddle import ParamAttr
import paddle.nn as nn
from paddle.nn import Conv2D, BatchNorm, Linear, ReLU, Flatten
from paddle.nn import AdaptiveAvgPool2D
from paddle.nn.initializer import KaimingNormal
from ppcls.arch.backbone.base.theseus_layer import TheseusLayer
from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url
MODEL_URLS = {
"MobileNetV1_x0_25":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_x0_25_pretrained.pdparams",
"MobileNetV1_x0_5":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_x0_5_pretrained.pdparams",
"MobileNetV1_x0_75":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_x0_75_pretrained.pdparams",
"MobileNetV1":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_pretrained.pdparams"
}
__all__ = list(MODEL_URLS.keys())
class ConvBNLayer(TheseusLayer):
def __init__(self,
num_channels,
filter_size,
num_filters,
stride,
padding,
num_groups=1):
super().__init__()
self.conv = Conv2D(
in_channels=num_channels,
out_channels=num_filters,
kernel_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
weight_attr=ParamAttr(initializer=KaimingNormal()),
bias_attr=False)
self.bn = BatchNorm(num_filters)
self.relu = ReLU()
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.relu(x)
return x
class DepthwiseSeparable(TheseusLayer):
def __init__(self, num_channels, num_filters1, num_filters2, num_groups,
stride, scale):
super().__init__()
self.depthwise_conv = ConvBNLayer(
num_channels=num_channels,
num_filters=int(num_filters1 * scale),
filter_size=3,
stride=stride,
padding=1,
num_groups=int(num_groups * scale))
self.pointwise_conv = ConvBNLayer(
num_channels=int(num_filters1 * scale),
filter_size=1,
num_filters=int(num_filters2 * scale),
stride=1,
padding=0)
def forward(self, x):
x = self.depthwise_conv(x)
x = self.pointwise_conv(x)
return x
class MobileNet(TheseusLayer):
"""
MobileNet
Args:
scale: float=1.0. The coefficient that controls the size of network parameters.
class_num: int=1000. The number of classes.
Returns:
model: nn.Layer. Specific MobileNet model depends on args.
"""
def __init__(self, scale=1.0, class_num=1000):
super().__init__()
self.scale = scale
self.conv = ConvBNLayer(
num_channels=3,
filter_size=3,
num_filters=int(32 * scale),
stride=2,
padding=1)
#num_channels, num_filters1, num_filters2, num_groups, stride
self.cfg = [[int(32 * scale), 32, 64, 32, 1],
[int(64 * scale), 64, 128, 64, 2],
[int(128 * scale), 128, 128, 128, 1],
[int(128 * scale), 128, 256, 128, 2],
[int(256 * scale), 256, 256, 256, 1],
[int(256 * scale), 256, 512, 256, 2],
[int(512 * scale), 512, 512, 512, 1],
[int(512 * scale), 512, 512, 512, 1],
[int(512 * scale), 512, 512, 512, 1],
[int(512 * scale), 512, 512, 512, 1],
[int(512 * scale), 512, 512, 512, 1],
[int(512 * scale), 512, 1024, 512, 2],
[int(1024 * scale), 1024, 1024, 1024, 1]]
self.blocks = nn.Sequential(*[
DepthwiseSeparable(
num_channels=params[0],
num_filters1=params[1],
num_filters2=params[2],
num_groups=params[3],
stride=params[4],
scale=scale) for params in self.cfg
])
self.avg_pool = AdaptiveAvgPool2D(1)
self.flatten = Flatten(start_axis=1, stop_axis=-1)
self.fc = Linear(
int(1024 * scale),
class_num,
weight_attr=ParamAttr(initializer=KaimingNormal()))
def forward(self, x):
x = self.conv(x)
x = self.blocks(x)
x = self.avg_pool(x)
x = self.flatten(x)
x = self.fc(x)
return x
def _load_pretrained(pretrained, model, model_url, use_ssld):
if pretrained is False:
pass
elif pretrained is True:
load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld)
elif isinstance(pretrained, str):
load_dygraph_pretrain(model, pretrained)
else:
raise RuntimeError(
"pretrained type is not available. Please use `string` or `boolean` type."
)
def MobileNetV1_x0_25(pretrained=False, use_ssld=False, **kwargs):
"""
MobileNetV1_x0_25
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `MobileNetV1_x0_25` model depends on args.
"""
model = MobileNet(scale=0.25, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1_x0_25"],
use_ssld)
return model
def MobileNetV1_x0_5(pretrained=False, use_ssld=False, **kwargs):
"""
MobileNetV1_x0_5
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `MobileNetV1_x0_5` model depends on args.
"""
model = MobileNet(scale=0.5, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1_x0_5"],
use_ssld)
return model
def MobileNetV1_x0_75(pretrained=False, use_ssld=False, **kwargs):
"""
MobileNetV1_x0_75
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `MobileNetV1_x0_75` model depends on args.
"""
model = MobileNet(scale=0.75, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1_x0_75"],
use_ssld)
return model
def MobileNetV1(pretrained=False, use_ssld=False, **kwargs):
"""
MobileNetV1
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `MobileNetV1` model depends on args.
"""
model = MobileNet(scale=1.0, **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1"], use_ssld)
return model
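A hedged sketch showing the scale factor at work; every channel count, including the fc input int(1024 * scale), shrinks with the multiplier:
import paddle

model = MobileNetV1_x0_25(pretrained=False)
out = model(paddle.randn([1, 3, 224, 224]))
print(out.shape)  # [1, 1000]; the fc consumes int(1024 * 0.25) = 256 features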
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function
import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear
from paddle.regularizer import L2Decay
from ppcls.arch.backbone.base.theseus_layer import TheseusLayer
from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url
MODEL_URLS = {
"MobileNetV3_small_x0_35":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x0_35_pretrained.pdparams",
"MobileNetV3_small_x0_5":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x0_5_pretrained.pdparams",
"MobileNetV3_small_x0_75":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x0_75_pretrained.pdparams",
"MobileNetV3_small_x1_0":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x1_0_pretrained.pdparams",
"MobileNetV3_small_x1_25":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x1_25_pretrained.pdparams",
"MobileNetV3_large_x0_35":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x0_35_pretrained.pdparams",
"MobileNetV3_large_x0_5":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x0_5_pretrained.pdparams",
"MobileNetV3_large_x0_75":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x0_75_pretrained.pdparams",
"MobileNetV3_large_x1_0":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x1_0_pretrained.pdparams",
"MobileNetV3_large_x1_25":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x1_25_pretrained.pdparams",
}
__all__ = list(MODEL_URLS.keys())
# "large", "small" is just for MobinetV3_large, MobileNetV3_small respectively.
# The type of "large" or "small" config is a list. Each element(list) represents a depthwise block, which is composed of k, exp, se, act, s.
# k: kernel_size
# exp: middle channel number in depthwise block
# c: output channel number in depthwise block
# se: whether to use SE block
# act: which activation to use
# s: stride in depthwise block
NET_CONFIG = {
"large": [
# k, exp, c, se, act, s
[3, 16, 16, False, "relu", 1],
[3, 64, 24, False, "relu", 2],
[3, 72, 24, False, "relu", 1],
[5, 72, 40, True, "relu", 2],
[5, 120, 40, True, "relu", 1],
[5, 120, 40, True, "relu", 1],
[3, 240, 80, False, "hardswish", 2],
[3, 200, 80, False, "hardswish", 1],
[3, 184, 80, False, "hardswish", 1],
[3, 184, 80, False, "hardswish", 1],
[3, 480, 112, True, "hardswish", 1],
[3, 672, 112, True, "hardswish", 1],
[5, 672, 160, True, "hardswish", 2],
[5, 960, 160, True, "hardswish", 1],
[5, 960, 160, True, "hardswish", 1],
],
"small": [
# k, exp, c, se, act, s
[3, 16, 16, True, "relu", 2],
[3, 72, 24, False, "relu", 2],
[3, 88, 24, False, "relu", 1],
[5, 96, 40, True, "hardswish", 2],
[5, 240, 40, True, "hardswish", 1],
[5, 240, 40, True, "hardswish", 1],
[5, 120, 48, True, "hardswish", 1],
[5, 144, 48, True, "hardswish", 1],
[5, 288, 96, True, "hardswish", 2],
[5, 576, 96, True, "hardswish", 1],
[5, 576, 96, True, "hardswish", 1],
]
}
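Illustrative read-back of the fields named in the comment above (runs against the dict as defined):
for k, exp, c, se, act, s in NET_CONFIG["small"]:
    print(f"kernel={k} expand={exp} out={c} se={se} act={act} stride={s}")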
# first conv output channel number in MobileNetV3
STEM_CONV_NUMBER = 16
# last second conv output channel for "small"
LAST_SECOND_CONV_SMALL = 576
# last second conv output channel for "large"
LAST_SECOND_CONV_LARGE = 960
# last conv output channel number for "large" and "small"
LAST_CONV = 1280
def _make_divisible(v, divisor=8, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
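# Illustrative note (added; not in the original file): _make_divisible rounds
# a possibly fractional channel count to the nearest multiple of `divisor`,
# never going below `min_value` and never dropping more than ~10% below `v`.
# With the default divisor=8:
#   _make_divisible(16 * 0.35)   -> 8    (5.6 is clamped up to min_value)
#   _make_divisible(120 * 1.25)  -> 152  (150 rounds to the nearest multiple of 8)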
def _create_act(act):
if act == "hardswish":
return nn.Hardswish()
elif act == "relu":
return nn.ReLU()
elif act is None:
return None
else:
raise RuntimeError(
"The activation function is not supported: {}".format(act))
class MobileNetV3(TheseusLayer):
"""
MobileNetV3
Args:
config: list. MobileNetV3 depthwise blocks config.
scale: float=1.0. The coefficient that controls the size of network parameters.
class_num: int=1000. The number of classes.
inplanes: int=16. The output channel number of the first convolution layer.
class_squeeze: int=960. The output channel number of the penultimate convolution layer.
class_expand: int=1280. The output channel number of the last convolution layer.
dropout_prob: float=0.2. Probability of setting units to zero.
Returns:
model: nn.Layer. Specific MobileNetV3 model depends on args.
"""
def __init__(self,
config,
scale=1.0,
class_num=1000,
inplanes=STEM_CONV_NUMBER,
class_squeeze=LAST_SECOND_CONV_LARGE,
class_expand=LAST_CONV,
dropout_prob=0.2):
super().__init__()
self.cfg = config
self.scale = scale
self.inplanes = inplanes
self.class_squeeze = class_squeeze
self.class_expand = class_expand
self.class_num = class_num
self.conv = ConvBNLayer(
in_c=3,
out_c=_make_divisible(self.inplanes * self.scale),
filter_size=3,
stride=2,
padding=1,
num_groups=1,
if_act=True,
act="hardswish")
self.blocks = nn.Sequential(*[
ResidualUnit(
in_c=_make_divisible(self.inplanes * self.scale if i == 0 else
self.cfg[i - 1][2] * self.scale),
mid_c=_make_divisible(self.scale * exp),
out_c=_make_divisible(self.scale * c),
filter_size=k,
stride=s,
use_se=se,
act=act) for i, (k, exp, c, se, act, s) in enumerate(self.cfg)
])
self.last_second_conv = ConvBNLayer(
in_c=_make_divisible(self.cfg[-1][2] * self.scale),
out_c=_make_divisible(self.scale * self.class_squeeze),
filter_size=1,
stride=1,
padding=0,
num_groups=1,
if_act=True,
act="hardswish")
self.avg_pool = AdaptiveAvgPool2D(1)
self.last_conv = Conv2D(
in_channels=_make_divisible(self.scale * self.class_squeeze),
out_channels=self.class_expand,
kernel_size=1,
stride=1,
padding=0,
bias_attr=False)
self.hardswish = nn.Hardswish()
self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer")
self.flatten = nn.Flatten(start_axis=1, stop_axis=-1)
self.fc = Linear(self.class_expand, class_num)
def forward(self, x):
x = self.conv(x)
x = self.blocks(x)
x = self.last_second_conv(x)
x = self.avg_pool(x)
x = self.last_conv(x)
x = self.hardswish(x)
x = self.dropout(x)
x = self.flatten(x)
x = self.fc(x)
return x
class ConvBNLayer(TheseusLayer):
def __init__(self,
in_c,
out_c,
filter_size,
stride,
padding,
num_groups=1,
if_act=True,
act=None):
super().__init__()
self.conv = Conv2D(
in_channels=in_c,
out_channels=out_c,
kernel_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
bias_attr=False)
self.bn = BatchNorm(
num_channels=out_c,
act=None,
param_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
self.if_act = if_act
self.act = _create_act(act)
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
if self.if_act:
x = self.act(x)
return x
class ResidualUnit(TheseusLayer):
def __init__(self,
in_c,
mid_c,
out_c,
filter_size,
stride,
use_se,
act=None):
super().__init__()
self.if_shortcut = stride == 1 and in_c == out_c
self.if_se = use_se
self.expand_conv = ConvBNLayer(
in_c=in_c,
out_c=mid_c,
filter_size=1,
stride=1,
padding=0,
if_act=True,
act=act)
self.bottleneck_conv = ConvBNLayer(
in_c=mid_c,
out_c=mid_c,
filter_size=filter_size,
stride=stride,
padding=int((filter_size - 1) // 2),
num_groups=mid_c,
if_act=True,
act=act)
if self.if_se:
self.mid_se = SEModule(mid_c)
self.linear_conv = ConvBNLayer(
in_c=mid_c,
out_c=out_c,
filter_size=1,
stride=1,
padding=0,
if_act=False,
act=None)
def forward(self, x):
identity = x
x = self.expand_conv(x)
x = self.bottleneck_conv(x)
if self.if_se:
x = self.mid_se(x)
x = self.linear_conv(x)
if self.if_shortcut:
x = paddle.add(identity, x)
return x
# nn.Hardsigmoid does not expose the "slope" and "offset" arguments of
# nn.functional.hardsigmoid, so we wrap the functional API here.
class Hardsigmoid(TheseusLayer):
def __init__(self, slope=0.2, offset=0.5):
super().__init__()
self.slope = slope
self.offset = offset
def forward(self, x):
return nn.functional.hardsigmoid(
x, slope=self.slope, offset=self.offset)
class SEModule(TheseusLayer):
def __init__(self, channel, reduction=4):
super().__init__()
self.avg_pool = AdaptiveAvgPool2D(1)
self.conv1 = Conv2D(
in_channels=channel,
out_channels=channel // reduction,
kernel_size=1,
stride=1,
padding=0)
self.relu = nn.ReLU()
self.conv2 = Conv2D(
in_channels=channel // reduction,
out_channels=channel,
kernel_size=1,
stride=1,
padding=0)
self.hardsigmoid = Hardsigmoid(slope=0.2, offset=0.5)
def forward(self, x):
identity = x
x = self.avg_pool(x)
x = self.conv1(x)
x = self.relu(x)
x = self.conv2(x)
x = self.hardsigmoid(x)
return paddle.multiply(x=identity, y=x)
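# Note (added for clarity): SEModule implements squeeze-and-excitation:
# global average pooling squeezes each channel to a scalar, the two 1x1 convs
# produce a per-channel gate in (0, 1) via Hardsigmoid, and the gate rescales
# the input feature map channel-wise.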
def _load_pretrained(pretrained, model, model_url, use_ssld):
if pretrained is False:
pass
elif pretrained is True:
load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld)
elif isinstance(pretrained, str):
load_dygraph_pretrain(model, pretrained)
else:
raise RuntimeError(
"pretrained type is not available. Please use `string` or `boolean` type."
)
def MobileNetV3_small_x0_35(pretrained=False, use_ssld=False, **kwargs):
"""
MobileNetV3_small_x0_35
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `MobileNetV3_small_x0_35` model depends on args.
"""
model = MobileNetV3(
config=NET_CONFIG["small"],
scale=0.35,
class_squeeze=LAST_SECOND_CONV_SMALL,
**kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x0_35"],
use_ssld)
return model
def MobileNetV3_small_x0_5(pretrained=False, use_ssld=False, **kwargs):
"""
MobileNetV3_small_x0_5
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `MobileNetV3_small_x0_5` model depends on args.
"""
model = MobileNetV3(
config=NET_CONFIG["small"],
scale=0.5,
class_squeeze=LAST_SECOND_CONV_SMALL,
**kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x0_5"],
use_ssld)
return model
def MobileNetV3_small_x0_75(pretrained=False, use_ssld=False, **kwargs):
"""
MobileNetV3_small_x0_75
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `MobileNetV3_small_x0_75` model depends on args.
"""
model = MobileNetV3(
config=NET_CONFIG["small"],
scale=0.75,
class_squeeze=LAST_SECOND_CONV_SMALL,
**kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x0_75"],
use_ssld)
return model
def MobileNetV3_small_x1_0(pretrained=False, use_ssld=False, **kwargs):
"""
MobileNetV3_small_x1_0
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `MobileNetV3_small_x1_0` model depends on args.
"""
model = MobileNetV3(
config=NET_CONFIG["small"],
scale=1.0,
class_squeeze=LAST_SECOND_CONV_SMALL,
**kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x1_0"],
use_ssld)
return model
def MobileNetV3_small_x1_25(pretrained=False, use_ssld=False, **kwargs):
"""
MobileNetV3_small_x1_25
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `MobileNetV3_small_x1_25` model depends on args.
"""
model = MobileNetV3(
config=NET_CONFIG["small"],
scale=1.25,
class_squeeze=LAST_SECOND_CONV_SMALL,
**kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_small_x1_25"],
use_ssld)
return model
def MobileNetV3_large_x0_35(pretrained=False, use_ssld=False, **kwargs):
"""
MobileNetV3_large_x0_35
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `MobileNetV3_large_x0_35` model depends on args.
"""
model = MobileNetV3(
config=NET_CONFIG["large"],
scale=0.35,
class_squeeze=LAST_SECOND_CONV_LARGE,
**kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x0_35"],
use_ssld)
return model
def MobileNetV3_large_x0_5(pretrained=False, use_ssld=False, **kwargs):
"""
MobileNetV3_large_x0_5
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `MobileNetV3_large_x0_5` model depends on args.
"""
model = MobileNetV3(
config=NET_CONFIG["large"],
scale=0.5,
class_squeeze=LAST_SECOND_CONV_LARGE,
**kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x0_5"],
use_ssld)
return model
def MobileNetV3_large_x0_75(pretrained=False, use_ssld=False, **kwargs):
"""
MobileNetV3_large_x0_75
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `MobileNetV3_large_x0_75` model depends on args.
"""
model = MobileNetV3(
config=NET_CONFIG["large"],
scale=0.75,
class_squeeze=LAST_SECOND_CONV_LARGE,
**kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x0_75"],
use_ssld)
return model
def MobileNetV3_large_x1_0(pretrained=False, use_ssld=False, **kwargs):
"""
MobileNetV3_large_x1_0
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `MobileNetV3_large_x1_0` model depends on args.
"""
model = MobileNetV3(
config=NET_CONFIG["large"],
scale=1.0,
class_squeeze=LAST_SECOND_CONV_LARGE,
**kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x1_0"],
use_ssld)
return model
def MobileNetV3_large_x1_25(pretrained=False, use_ssld=False, **kwargs):
"""
MobileNetV3_large_x1_25
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `MobileNetV3_large_x1_25` model depends on args.
"""
model = MobileNetV3(
config=NET_CONFIG["large"],
scale=1.25,
class_squeeze=LAST_SECOND_CONV_LARGE,
**kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["MobileNetV3_large_x1_25"],
use_ssld)
return model
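# Usage sketch (added; illustrative only, assuming a 224x224 ImageNet-style
# input):
#   import paddle
#   model = MobileNetV3_large_x1_0(pretrained=False)
#   x = paddle.rand([1, 3, 224, 224])
#   logits = model(x)  # shape: [1, 1000]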
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, division, print_function
import numpy as np
import paddle
from paddle import ParamAttr
import paddle.nn as nn
from paddle.nn import Conv2D, BatchNorm, Linear
from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D
from paddle.nn.initializer import Uniform
import math
from ppcls.arch.backbone.base.theseus_layer import TheseusLayer
from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url
MODEL_URLS = {
"ResNet18":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_pretrained.pdparams",
"ResNet18_vd":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_vd_pretrained.pdparams",
"ResNet34":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_pretrained.pdparams",
"ResNet34_vd":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_vd_pretrained.pdparams",
"ResNet50":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_pretrained.pdparams",
"ResNet50_vd":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_vd_pretrained.pdparams",
"ResNet101":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_pretrained.pdparams",
"ResNet101_vd":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_vd_pretrained.pdparams",
"ResNet152":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_pretrained.pdparams",
"ResNet152_vd":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_vd_pretrained.pdparams",
"ResNet200_vd":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet200_vd_pretrained.pdparams",
}
__all__ = list(MODEL_URLS.keys())
'''
ResNet config: dict.
key: depth of ResNet.
values: config dict of the specific model.
keys:
block_type: Two different blocks in ResNet, BasicBlock and BottleneckBlock are optional.
block_depth: The number of blocks in different stages in ResNet.
num_channels: The number of channels to enter the next stage.
'''
NET_CONFIG = {
"18": {
"block_type": "BasicBlock",
"block_depth": [2, 2, 2, 2],
"num_channels": [64, 64, 128, 256]
},
"34": {
"block_type": "BasicBlock",
"block_depth": [3, 4, 6, 3],
"num_channels": [64, 64, 128, 256]
},
"50": {
"block_type": "BottleneckBlock",
"block_depth": [3, 4, 6, 3],
"num_channels": [64, 256, 512, 1024]
},
"101": {
"block_type": "BottleneckBlock",
"block_depth": [3, 4, 23, 3],
"num_channels": [64, 256, 512, 1024]
},
"152": {
"block_type": "BottleneckBlock",
"block_depth": [3, 8, 36, 3],
"num_channels": [64, 256, 512, 1024]
},
"200": {
"block_type": "BottleneckBlock",
"block_depth": [3, 12, 48, 3],
"num_channels": [64, 256, 512, 1024]
},
}
class ConvBNLayer(TheseusLayer):
def __init__(self,
num_channels,
num_filters,
filter_size,
stride=1,
groups=1,
is_vd_mode=False,
act=None,
lr_mult=1.0):
super().__init__()
self.is_vd_mode = is_vd_mode
self.act = act
self.avg_pool = AvgPool2D(
kernel_size=2, stride=2, padding=0, ceil_mode=True)
self.conv = Conv2D(
in_channels=num_channels,
out_channels=num_filters,
kernel_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
weight_attr=ParamAttr(learning_rate=lr_mult),
bias_attr=False)
self.bn = BatchNorm(
num_filters,
param_attr=ParamAttr(learning_rate=lr_mult),
bias_attr=ParamAttr(learning_rate=lr_mult))
self.relu = nn.ReLU()
def forward(self, x):
if self.is_vd_mode:
x = self.avg_pool(x)
x = self.conv(x)
x = self.bn(x)
if self.act:
x = self.relu(x)
return x
class BottleneckBlock(TheseusLayer):
def __init__(
self,
num_channels,
num_filters,
stride,
shortcut=True,
if_first=False,
lr_mult=1.0, ):
super().__init__()
self.conv0 = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters,
filter_size=1,
act="relu",
lr_mult=lr_mult)
self.conv1 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters,
filter_size=3,
stride=stride,
act="relu",
lr_mult=lr_mult)
self.conv2 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters * 4,
filter_size=1,
act=None,
lr_mult=lr_mult)
if not shortcut:
self.short = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters * 4,
filter_size=1,
stride=stride if if_first else 1,
is_vd_mode=False if if_first else True,
lr_mult=lr_mult)
self.relu = nn.ReLU()
self.shortcut = shortcut
def forward(self, x):
identity = x
x = self.conv0(x)
x = self.conv1(x)
x = self.conv2(x)
if self.shortcut:
short = identity
else:
short = self.short(identity)
x = paddle.add(x=x, y=short)
x = self.relu(x)
return x
class BasicBlock(TheseusLayer):
def __init__(self,
num_channels,
num_filters,
stride,
shortcut=True,
if_first=False,
lr_mult=1.0):
super().__init__()
self.stride = stride
self.conv0 = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters,
filter_size=3,
stride=stride,
act="relu",
lr_mult=lr_mult)
self.conv1 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters,
filter_size=3,
act=None,
lr_mult=lr_mult)
if not shortcut:
self.short = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters,
filter_size=1,
stride=stride if if_first else 1,
is_vd_mode=False if if_first else True,
lr_mult=lr_mult)
self.shortcut = shortcut
self.relu = nn.ReLU()
def forward(self, x):
identity = x
x = self.conv0(x)
x = self.conv1(x)
if self.shortcut:
short = identity
else:
short = self.short(identity)
x = paddle.add(x=x, y=short)
x = self.relu(x)
return x
class ResNet(TheseusLayer):
"""
ResNet
Args:
config: dict. config of ResNet.
version: str="vb". Different version of ResNet, version vd can perform better.
class_num: int=1000. The number of classes.
lr_mult_list: list. Control the learning rate of different stages.
Returns:
model: nn.Layer. Specific ResNet model depends on args.
"""
def __init__(self,
config,
version="vb",
class_num=1000,
lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0]):
super().__init__()
self.cfg = config
self.lr_mult_list = lr_mult_list
self.is_vd_mode = version == "vd"
self.class_num = class_num
self.num_filters = [64, 128, 256, 512]
self.block_depth = self.cfg["block_depth"]
self.block_type = self.cfg["block_type"]
self.num_channels = self.cfg["num_channels"]
self.channels_mult = 1 if self.num_channels[-1] == 256 else 4
assert isinstance(self.lr_mult_list, (
list, tuple
)), "lr_mult_list should be in (list, tuple) but got {}".format(
type(self.lr_mult_list))
assert len(self.lr_mult_list
) == 5, "lr_mult_list length should be 5 but got {}".format(
len(self.lr_mult_list))
self.stem_cfg = {
#num_channels, num_filters, filter_size, stride
"vb": [[3, 64, 7, 2]],
"vd": [[3, 32, 3, 2], [32, 32, 3, 1], [32, 64, 3, 1]]
}
self.stem = nn.Sequential(*[
ConvBNLayer(
num_channels=in_c,
num_filters=out_c,
filter_size=k,
stride=s,
act="relu",
lr_mult=self.lr_mult_list[0])
for in_c, out_c, k, s in self.stem_cfg[version]
])
self.max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
block_list = []
for block_idx in range(len(self.block_depth)):
shortcut = False
for i in range(self.block_depth[block_idx]):
block_list.append(globals()[self.block_type](
num_channels=self.num_channels[block_idx] if i == 0 else
self.num_filters[block_idx] * self.channels_mult,
num_filters=self.num_filters[block_idx],
stride=2 if i == 0 and block_idx != 0 else 1,
shortcut=shortcut,
if_first=block_idx == i == 0 if version == "vd" else True,
lr_mult=self.lr_mult_list[block_idx + 1]))
shortcut = True
self.blocks = nn.Sequential(*block_list)
self.avg_pool = AdaptiveAvgPool2D(1)
self.flatten = nn.Flatten()
self.avg_pool_channels = self.num_channels[-1] * 2
stdv = 1.0 / math.sqrt(self.avg_pool_channels * 1.0)
self.fc = Linear(
self.avg_pool_channels,
self.class_num,
weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)))
def forward(self, x):
x = self.stem(x)
x = self.max_pool(x)
x = self.blocks(x)
x = self.avg_pool(x)
x = self.flatten(x)
x = self.fc(x)
return x
def _load_pretrained(pretrained, model, model_url, use_ssld):
if pretrained is False:
pass
elif pretrained is True:
load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld)
elif isinstance(pretrained, str):
load_dygraph_pretrain(model, pretrained)
else:
raise RuntimeError(
"pretrained type is not available. Please use `string` or `boolean` type."
)
def ResNet18(pretrained=False, use_ssld=False, **kwargs):
"""
ResNet18
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `ResNet18` model depends on args.
"""
model = ResNet(config=NET_CONFIG["18"], version="vb", **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["ResNet18"], use_ssld)
return model
def ResNet18_vd(pretrained=False, use_ssld=False, **kwargs):
"""
ResNet18_vd
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `ResNet18_vd` model depends on args.
"""
model = ResNet(config=NET_CONFIG["18"], version="vd", **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["ResNet18_vd"], use_ssld)
return model
def ResNet34(pretrained=False, use_ssld=False, **kwargs):
"""
ResNet34
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `ResNet34` model depends on args.
"""
model = ResNet(config=NET_CONFIG["34"], version="vb", **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["ResNet34"], use_ssld)
return model
def ResNet34_vd(pretrained=False, use_ssld=False, **kwargs):
"""
ResNet34_vd
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `ResNet34_vd` model depends on args.
"""
model = ResNet(config=NET_CONFIG["34"], version="vd", **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["ResNet34_vd"], use_ssld)
return model
def ResNet50(pretrained=False, use_ssld=False, **kwargs):
"""
ResNet50
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `ResNet50` model depends on args.
"""
model = ResNet(config=NET_CONFIG["50"], version="vb", **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["ResNet50"], use_ssld)
return model
def ResNet50_vd(pretrained=False, use_ssld=False, **kwargs):
"""
ResNet50_vd
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `ResNet50_vd` model depends on args.
"""
model = ResNet(config=NET_CONFIG["50"], version="vd", **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["ResNet50_vd"], use_ssld)
return model
def ResNet101(pretrained=False, use_ssld=False, **kwargs):
"""
ResNet101
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `ResNet101` model depends on args.
"""
model = ResNet(config=NET_CONFIG["101"], version="vb", **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["ResNet101"], use_ssld)
return model
def ResNet101_vd(pretrained=False, use_ssld=False, **kwargs):
"""
ResNet101_vd
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `ResNet101_vd` model depends on args.
"""
model = ResNet(config=NET_CONFIG["101"], version="vd", **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["ResNet101_vd"], use_ssld)
return model
def ResNet152(pretrained=False, use_ssld=False, **kwargs):
"""
ResNet152
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `ResNet152` model depends on args.
"""
model = ResNet(config=NET_CONFIG["152"], version="vb", **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["ResNet152"], use_ssld)
return model
def ResNet152_vd(pretrained=False, use_ssld=False, **kwargs):
"""
ResNet152_vd
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `ResNet152_vd` model depends on args.
"""
model = ResNet(config=NET_CONFIG["152"], version="vd", **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["ResNet152_vd"], use_ssld)
return model
def ResNet200_vd(pretrained=False, use_ssld=False, **kwargs):
"""
ResNet200_vd
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `ResNet200_vd` model depends on args.
"""
model = ResNet(config=NET_CONFIG["200"], version="vd", **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["ResNet200_vd"], use_ssld)
return model
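# Usage sketch (added; illustrative only). lr_mult_list scales the learning
# rate of [stem, stage1, stage2, stage3, stage4], a common fine-tuning knob:
#   import paddle
#   model = ResNet50_vd(pretrained=False,
#                       lr_mult_list=[0.1, 0.1, 0.2, 0.2, 0.3])
#   logits = model(paddle.rand([1, 3, 224, 224]))  # shape: [1, 1000]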
......@@ -14,16 +14,24 @@
from __future__ import absolute_import, division, print_function
import paddle
from paddle import ParamAttr
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import Conv2D, BatchNorm, Linear, Dropout
from paddle.nn import MaxPool2D
from ppcls.arch.backbone.base.theseus_layer import TheseusLayer
__all__ = ["VGG11", "VGG13", "VGG16", "VGG19"]
from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url
MODEL_URLS = {
"VGG11":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG11_pretrained.pdparams",
"VGG13":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG13_pretrained.pdparams",
"VGG16":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG16_pretrained.pdparams",
"VGG19":
"https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG19_pretrained.pdparams",
}
__all__ = list(MODEL_URLS.keys())
# VGG config
# key: VGG network depth
......@@ -36,68 +44,12 @@ NET_CONFIG = {
}
class ConvBlock(TheseusLayer):
def __init__(self, input_channels, output_channels, groups):
super().__init__()
self.groups = groups
self.conv1 = Conv2D(
in_channels=input_channels,
out_channels=output_channels,
kernel_size=3,
......@@ -105,7 +57,7 @@ class ConvBlock(TheseusLayer):
padding=1,
bias_attr=False)
if groups == 2 or groups == 3 or groups == 4:
self.conv2 = Conv2D(
in_channels=output_channels,
out_channels=output_channels,
kernel_size=3,
......@@ -113,7 +65,7 @@ class ConvBlock(TheseusLayer):
padding=1,
bias_attr=False)
if groups == 3 or groups == 4:
self.conv3 = Conv2D(
in_channels=output_channels,
out_channels=output_channels,
kernel_size=3,
......@@ -121,7 +73,7 @@ class ConvBlock(TheseusLayer):
padding=1,
bias_attr=False)
if groups == 4:
self.conv4 = Conv2D(
in_channels=output_channels,
out_channels=output_channels,
kernel_size=3,
......@@ -129,65 +81,148 @@ class ConvBlock(TheseusLayer):
padding=1,
bias_attr=False)
self.max_pool = MaxPool2D(kernel_size=2, stride=2, padding=0)
self.relu = nn.ReLU()
def forward(self, inputs):
x = self.conv1(inputs)
x = self.relu(x)
if self.groups == 2 or self.groups == 3 or self.groups == 4:
x = self.conv2(x)
x = self.relu(x)
if self.groups == 3 or self.groups == 4:
x = self.conv3(x)
x = self.relu(x)
if self.groups == 4:
x = self.conv4(x)
x = self.relu(x)
x = self.max_pool(x)
return x
class VGGNet(TheseusLayer):
"""
VGGNet
Args:
config: list. VGGNet config.
stop_grad_layers: int=0. The parameters in blocks which index larger than `stop_grad_layers`, will be set `param.trainable=False`
class_num: int=1000. The number of classes.
Returns:
model: nn.Layer. Specific VGG model depends on args.
"""
def __init__(self, config, stop_grad_layers=0, class_num=1000):
super().__init__()
self.stop_grad_layers = stop_grad_layers
self.conv_block_1 = ConvBlock(3, 64, config[0])
self.conv_block_2 = ConvBlock(64, 128, config[1])
self.conv_block_3 = ConvBlock(128, 256, config[2])
self.conv_block_4 = ConvBlock(256, 512, config[3])
self.conv_block_5 = ConvBlock(512, 512, config[4])
self.relu = nn.ReLU()
self.flatten = nn.Flatten(start_axis=1, stop_axis=-1)
for idx, block in enumerate([
self.conv_block_1, self.conv_block_2, self.conv_block_3,
self.conv_block_4, self.conv_block_5
]):
if self.stop_grad_layers >= idx + 1:
for param in block.parameters():
param.trainable = False
self._drop = Dropout(p=0.5, mode="downscale_in_infer")
self._fc1 = Linear(7 * 7 * 512, 4096)
self._fc2 = Linear(4096, 4096)
self._out = Linear(4096, class_num)
self.drop = Dropout(p=0.5, mode="downscale_in_infer")
self.fc1 = Linear(7 * 7 * 512, 4096)
self.fc2 = Linear(4096, 4096)
self.fc3 = Linear(4096, class_num)
def forward(self, inputs):
x = self.conv_block_1(inputs)
x = self.conv_block_2(x)
x = self.conv_block_3(x)
x = self.conv_block_4(x)
x = self.conv_block_5(x)
x = self.flatten(x)
x = self.fc1(x)
x = self.relu(x)
x = self.drop(x)
x = self.fc2(x)
x = self.relu(x)
x = self.drop(x)
x = self.fc3(x)
return x
def _load_pretrained(pretrained, model, model_url, use_ssld):
if pretrained is False:
pass
elif pretrained is True:
load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld)
elif isinstance(pretrained, str):
load_dygraph_pretrain(model, pretrained)
else:
raise RuntimeError(
"pretrained type is not available. Please use `string` or `boolean` type."
)
def VGG11(pretrained=False, use_ssld=False, **kwargs):
"""
VGG11
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `VGG11` model depends on args.
"""
model = VGGNet(config=NET_CONFIG[11], **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["VGG11"], use_ssld)
return model
def VGG13(pretrained=False, use_ssld=False, **kwargs):
"""
VGG13
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `VGG13` model depends on args.
"""
model = VGGNet(config=NET_CONFIG[13], **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["VGG13"], use_ssld)
return model
def VGG16(pretrained=False, use_ssld=False, **kwargs):
"""
VGG16
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `VGG16` model depends on args.
"""
model = VGGNet(config=NET_CONFIG[16], **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["VGG16"], use_ssld)
return model
def VGG19(pretrained=False, use_ssld=False, **kwargs):
"""
VGG19
Args:
pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
If str, means the path of the pretrained model.
use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
Returns:
model: nn.Layer. Specific `VGG19` model depends on args.
"""
model = VGGNet(config=NET_CONFIG[19], **kwargs)
_load_pretrained(pretrained, model, MODEL_URLS["VGG19"], use_ssld)
return model
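# Usage sketch (added; illustrative only). stop_grad_layers=2 sets
# param.trainable=False on the first two conv blocks, a typical transfer
# learning setup. The 7*7*512 fc1 input assumes 224x224 inputs:
#   import paddle
#   model = VGG16(pretrained=False, stop_grad_layers=2)
#   logits = model(paddle.rand([1, 3, 224, 224]))  # shape: [1, 1000]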
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .arcmargin import ArcMargin
from .cosmargin import CosMargin
from .circlemargin import CircleMargin
from .fc import FC
__all__ = ['build_head']
def build_head(config):
support_dict = ['ArcMargin', 'CosMargin', 'CircleMargin', 'FC']
module_name = config.pop('name')
assert module_name in support_dict, 'head only supports {}'.format(
support_dict)
module_class = eval(module_name)(**config)
return module_class
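# Usage sketch (added; illustrative only, the kwargs are examples): `config`
# carries the head name plus its constructor arguments, e.g.
#   head = build_head({"name": "ArcMargin", "embedding_size": 512,
#                      "class_num": 1000, "margin": 0.5, "scale": 80.0})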
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import math
class ArcMargin(nn.Layer):
def __init__(self, embedding_size,
class_num,
margin=0.5,
scale=80.0,
easy_margin=False):
super(ArcMargin, self).__init__()
self.embedding_size = embedding_size
self.class_num = class_num
self.margin = margin
self.scale = scale
self.easy_margin = easy_margin
weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierNormal())
self.fc = nn.Linear(self.embedding_size, self.class_num, weight_attr=weight_attr, bias_attr=False)
def forward(self, input, label):
input_norm = paddle.sqrt(paddle.sum(paddle.square(input), axis=1, keepdim=True))
input = paddle.divide(input, input_norm)
weight = self.fc.weight
weight_norm = paddle.sqrt(paddle.sum(paddle.square(weight), axis=0, keepdim=True))
weight = paddle.divide(weight, weight_norm)
cos = paddle.matmul(input, weight)
sin = paddle.sqrt(1.0 - paddle.square(cos) + 1e-6)
cos_m = math.cos(self.margin)
sin_m = math.sin(self.margin)
phi = cos * cos_m - sin * sin_m
th = math.cos(self.margin) * (-1)
mm = math.sin(self.margin) * self.margin
if self.easy_margin:
phi = self._paddle_where_more_than(cos, 0, phi, cos)
else:
phi = self._paddle_where_more_than(cos, th, phi, cos - mm)
one_hot = paddle.nn.functional.one_hot(label, self.class_num)
one_hot = paddle.squeeze(one_hot, axis=[1])
output = paddle.multiply(one_hot, phi) + paddle.multiply((1.0 - one_hot), cos)
output = output * self.scale
return output
def _paddle_where_more_than(self, target, limit, x, y):
mask = paddle.cast(x=(target > limit), dtype='float32')
output = paddle.multiply(mask, x) + paddle.multiply((1.0 - mask), y)
return output
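# Note (added for clarity): the forward pass above computes
# phi = cos(theta + margin) through the angle-addition identity
#   cos(theta + m) = cos(theta) * cos(m) - sin(theta) * sin(m),
# and once cos(theta) <= -cos(m) (i.e. theta + m would exceed pi) it falls
# back to cos(theta) - m * sin(m), keeping the target logit monotonic in theta.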
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
class CircleMargin(nn.Layer):
def __init__(self, embedding_size,
class_num,
margin,
scale):
super(CircleMargin, self).__init__()
self.scale = scale
self.margin = margin
self.embedding_size = embedding_size
self.class_num = class_num
weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierNormal())
self.fc0 = paddle.nn.Linear(self.embedding_size, self.class_num, weight_attr=weight_attr)
def forward(self, input, label):
feat_norm = paddle.sqrt(paddle.sum(paddle.square(input), axis=1, keepdim=True))
input = paddle.divide(input, feat_norm)
weight = self.fc0.weight
weight_norm = paddle.sqrt(paddle.sum(paddle.square(weight), axis=0, keepdim=True))
weight = paddle.divide(weight, weight_norm)
logits = paddle.matmul(input, weight)
alpha_p = paddle.clip(-logits.detach() + 1 + self.margin, min=0.)
alpha_n = paddle.clip(logits.detach() + self.margin, min=0.)
delta_p = 1 - self.margin
delta_n = self.margin
m_hot = F.one_hot(label.reshape([-1]), num_classes=logits.shape[1])
logits_p = alpha_p * (logits - delta_p)
logits_n = alpha_n * (logits - delta_n)
pre_logits = logits_p * m_hot + logits_n * (1 - m_hot)
pre_logits = self.scale * pre_logits
return pre_logits
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import math
import paddle.nn as nn
class CosMargin(paddle.nn.Layer):
def __init__(self, embedding_size,
class_num,
margin=0.35,
scale=64.0):
super(CosMargin, self).__init__()
self.scale = scale
self.margin = margin
self.embedding_size = embedding_size
self.class_num = class_num
weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierNormal())
self.fc = nn.Linear(self.embedding_size, self.class_num, weight_attr=weight_attr, bias_attr=False)
def forward(self, input, label):
label.stop_gradient = True
input_norm = paddle.sqrt(paddle.sum(paddle.square(input), axis=1, keepdim=True))
input = paddle.divide(input, input_norm)
weight = self.fc.weight
weight_norm = paddle.sqrt(paddle.sum(paddle.square(weight), axis=0, keepdim=True))
weight = paddle.divide(weight, weight_norm)
cos = paddle.matmul(input, weight)
cos_m = cos - self.margin
one_hot = paddle.nn.functional.one_hot(label, self.class_num)
one_hot = paddle.squeeze(one_hot, axis=[1])
output = paddle.multiply(one_hot, cos_m) + paddle.multiply((1.0 - one_hot), cos)
output = output * self.scale
return output
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
class FC(nn.Layer):
def __init__(self, embedding_size,
class_num):
super(FC, self).__init__()
self.embedding_size = embedding_size
self.class_num = class_num
weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierNormal())
self.fc = paddle.nn.Linear(self.embedding_size, self.class_num, weight_attr=weight_attr)
def forward(self, input, label):
out = self.fc(input)
return out
#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import sys
import copy
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
# TODO: fix the format
class CELoss(nn.Layer):
"""
"""
def __init__(self, name="loss", epsilon=None):
super().__init__()
self.name = name
if epsilon is not None and (epsilon <= 0 or epsilon >= 1):
epsilon = None
self.epsilon = epsilon
def _labelsmoothing(self, target, class_num):
if target.shape[-1] != class_num:
one_hot_target = F.one_hot(target, class_num)
else:
one_hot_target = target
soft_target = F.label_smooth(one_hot_target, epsilon=self.epsilon)
soft_target = paddle.reshape(soft_target, shape=[-1, class_num])
return soft_target
def forward(self, logits, label, mode="train"):
loss_dict = {}
if self.epsilon is not None:
class_num = logits.shape[-1]
label = self._labelsmoothing(label, class_num)
x = -F.log_softmax(logits, axis=-1)
loss = paddle.sum(x * label, axis=-1)
else:
if label.shape[-1] == logits.shape[-1]:
label = F.softmax(label, axis=-1)
soft_label = True
else:
soft_label = False
loss = F.cross_entropy(logits, label=label, soft_label=soft_label)
loss_dict[self.name] = paddle.mean(loss)
return loss_dict
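# Note (added for clarity): with smoothing enabled, F.label_smooth turns the
# one-hot target for K classes into (1 - epsilon) * one_hot + epsilon / K, so
# the branch above is cross entropy against that softened distribution, e.g.
#   loss_fn = CELoss(epsilon=0.1)
#   loss_dict = loss_fn(logits, label)  # {"loss": <scalar tensor>}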
# TODO: fix the format
class Topk(nn.Layer):
def __init__(self, topk=[1, 5]):
super().__init__()
assert isinstance(topk, (int, list))
if isinstance(topk, int):
topk = [topk]
self.topk = topk
def forward(self, x, label):
metric_dict = dict()
for k in self.topk:
metric_dict["top{}".format(k)] = paddle.metric.accuracy(
x, label, k=k)
return metric_dict
# TODO: fix the format
def build_loss(config):
loss_func = CELoss()
return loss_func
# TODO: fix the format
def build_metrics(config):
metrics_func = Topk()
return metrics_func
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ['build_neck"]
def build_neck(config):
support_dict = ['FPN', 'FC']
module_name = config.pop('name')
assert module_name in support_dict, 'neck only supports {}'.format(
support_dict)
module_class = eval(module_name)(**config)
return module_class
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
class FC(nn.Layer):
def __init__(self, input_dim,
embedding_size):
super(FC, self).__init__()
self.input_dim = input_dim
self.embedding_size = embedding_size
weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierNormal())
self.fc = paddle.nn.Linear(self.input_dim, self.embedding_size, weight_attr=weight_attr)
def forward(self, x):
x = self.fc(x)
return x
# global configs
Global:
checkpoints: null
pretrained_model: null
output_dir: "./output/"
device: "gpu"
class_num: 1000
save_interval: 1
eval_during_train: True
eval_interval: 1
epochs: 120
print_batch_step: 10
use_visualdl: False
image_shape: [3, 224, 224]
infer_imgs:
# model architecture
Arch:
name: "ResNet50"
# loss function config for training/eval process
Loss:
Train:
- CELoss:
weight: 1.0
Eval:
- CELoss:
weight: 1.0
Optimizer:
name: Momentum
momentum: 0.9
lr:
name: Piecewise
learning_rate: 0.1
decay_epochs: [30, 60, 90]
values: [0.1, 0.01, 0.001, 0.0001]
regularizer:
name: 'L2'
coeff: 0.0001
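# Note (added for clarity): the Piecewise schedule above uses lr=0.1 for
# epochs [0, 30), 0.01 for [30, 60), 0.001 for [60, 90), and 0.0001 afterwards.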
# data loader for train and eval
DataLoader:
Train:
# Dataset:
# Sampler:
# Loader:
batch_size: 256
num_workers: 4
file_list: "./dataset/ILSVRC2012/train_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
channel_first: False
- RandCropImage:
size: 224
- RandFlipImage:
flip_code: 1
- NormalizeImage:
scale: 1./255.
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
Eval:
# TODO: modify to the latest trainer
# Dataset:
# Sampler:
# Loader:
batch_size: 128
num_workers: 4
file_list: "./dataset/ILSVRC2012/val_list.txt"
data_dir: "./dataset/ILSVRC2012/"
shuffle_seed: 0
transforms:
- DecodeImage:
to_rgb: True
channel_first: False
- ResizeImage:
resize_short: 256
- CropImage:
size: 224
- NormalizeImage:
scale: 1.0/255.0
mean: [0.485, 0.456, 0.406]
std: [0.229, 0.224, 0.225]
order: ''
- ToCHWImage:
Metric:
Train:
- Topk:
k: [1, 5]
Eval:
- Topk:
k: [1, 5]
......@@ -115,5 +115,4 @@ def build_dataloader(config, mode, device, seed=None):
dataloader = Reader(config, mode=mode, places=device)()
return dataloader
'''
......@@ -250,13 +250,14 @@ class Reader:
def __init__(self, config, mode='train', places=None):
try:
self.params = config[mode.capitalize()]
except KeyError:
raise ModeException(mode=mode)
use_mix = config.get('use_mix')
self.params['mode'] = mode
self.shuffle = mode == "train"
self.is_train = mode == "train"
self.collate_fn = None
self.batch_ops = []
......@@ -298,7 +299,7 @@ class Reader:
shuffle=False,
num_workers=self.params["num_workers"])
else:
is_train = self.params['mode'] == "train"
is_train = self.is_train
batch_sampler = DistributedBatchSampler(
dataset,
batch_size=batch_size,
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import numpy as np
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.abspath(os.path.join(__dir__, '../../')))
import argparse
import paddle
import paddle.nn as nn
import paddle.distributed as dist
from ppcls.utils.check import check_gpu
from ppcls.utils.misc import AverageMeter
from ppcls.utils import logger
from ppcls.data import build_dataloader
from ppcls.arch import build_model
from ppcls.arch.loss_metrics import build_loss
from ppcls.arch.loss_metrics import build_metrics
from ppcls.optimizer import build_optimizer
from ppcls.utils.save_load import load_dygraph_pretrain
from ppcls.utils.save_load import init_model
from ppcls.utils import save_load
class Trainer(object):
def __init__(self, config, mode="train"):
self.mode = mode
self.config = config
self.output_dir = self.config['Global']['output_dir']
# set device
assert self.config["Global"]["device"] in ["cpu", "gpu", "xpu"]
self.device = paddle.set_device(self.config["Global"]["device"])
# set dist
self.config["Global"][
"distributed"] = paddle.distributed.get_world_size() != 1
if self.config["Global"]["distributed"]:
dist.init_parallel_env()
self.model = build_model(self.config["Arch"])
if self.config["Global"]["pretrained_model"] is not None:
load_dygraph_pretrain(self.model,
self.config["Global"]["pretrained_model"])
if self.config["Global"]["distributed"]:
self.model = paddle.DataParallel(self.model)
self.vdl_writer = None
if self.config['Global']['use_visualdl']:
from visualdl import LogWriter
vdl_writer_path = os.path.join(self.output_dir, "vdl")
if not os.path.exists(vdl_writer_path):
os.makedirs(vdl_writer_path)
self.vdl_writer = LogWriter(logdir=vdl_writer_path)
logger.info('train with paddle {} and device {}'.format(
paddle.__version__, self.device))
def _build_metric_info(self, metric_config, mode="train"):
"""
_build_metric_info: build metrics according to current mode
Return:
metric: dict of the metrics info
"""
metric = None
mode = mode.capitalize()
if mode in metric_config and metric_config[mode] is not None:
metric = build_metrics(metric_config[mode])
return metric
def _build_loss_info(self, loss_config, mode="train"):
"""
_build_loss_info: build loss according to current mode
Return:
loss_dict: dict of the loss info
"""
loss = None
mode = mode.capitalize()
if mode in loss_config and loss_config[mode] is not None:
loss = build_loss(loss_config[mode])
return loss
def train(self):
# build train loss and metric info
loss_func = self._build_loss_info(self.config["Loss"])
metric_func = self._build_metric_info(self.config["Metric"])
train_dataloader = build_dataloader(self.config["DataLoader"], "train",
self.device)
step_each_epoch = len(train_dataloader)
optimizer, lr_sch = build_optimizer(self.config["Optimizer"],
self.config["Global"]["epochs"],
step_each_epoch,
self.model.parameters())
print_batch_step = self.config['Global']['print_batch_step']
save_interval = self.config["Global"]["save_interval"]
best_metric = {
"metric": 0.0,
"epoch": 0,
}
# key: loss/metric name
# val: AverageMeter tracking that value
output_info = dict()
# global iter counter
global_step = 0
if self.config["Global"]["checkpoints"] is not None:
metric_info = init_model(self.config["Global"], self.model,
optimizer)
if metric_info is not None:
best_metric.update(metric_info)
for epoch_id in range(best_metric["epoch"] + 1,
self.config["Global"]["epochs"] + 1):
acc = 0.0
self.model.train()
for iter_id, batch in enumerate(train_dataloader()):
batch_size = batch[0].shape[0]
batch[1] = paddle.to_tensor(batch[1].numpy().astype("int64")
.reshape([-1, 1]))
global_step += 1
# image input
out = self.model(batch[0])
# calc loss
loss_dict = loss_func(out, batch[-1])
for key in loss_dict:
if not key in output_info:
output_info[key] = AverageMeter(key, '7.5f')
output_info[key].update(loss_dict[key].numpy()[0],
batch_size)
# calc metric
if metric_func is not None:
metric_dict = metric_func(out, batch[-1])
for key in metric_dict:
if not key in output_info:
output_info[key] = AverageMeter(key, '7.5f')
output_info[key].update(metric_dict[key].numpy()[0],
batch_size)
if iter_id % print_batch_step == 0:
lr_msg = "lr: {:.5f}".format(lr_sch.get_lr())
metric_msg = ", ".join([
"{}: {:.5f}".format(key, output_info[key].avg)
for key in output_info
])
logger.info("[Train][Epoch {}][Iter: {}/{}]{}, {}".format(
epoch_id, iter_id,
len(train_dataloader), lr_msg, metric_msg))
# step opt and lr
loss_dict["loss"].backward()
optimizer.step()
optimizer.clear_grad()
lr_sch.step()
metric_msg = ", ".join([
"{}: {:.5f}".format(key, output_info[key].avg)
for key in output_info
])
logger.info("[Train][Epoch {}][Avg]{}".format(epoch_id,
metric_msg))
output_info.clear()
# eval model and save model if possible
if self.config["Global"][
"eval_during_train"] and epoch_id % self.config["Global"][
"eval_during_train"] == 0:
acc = self.eval(epoch_id)
if acc > best_metric["metric"]:
best_metric["metric"] = acc
best_metric["epoch"] = epoch_id
save_load.save_model(
self.model,
optimizer,
best_metric,
self.output_dir,
model_name=self.config["Arch"]["name"],
prefix="best_model")
# save model
if epoch_id % save_interval == 0:
save_load.save_model(
self.model,
optimizer, {"metric": acc,
"epoch": epoch_id},
self.output_dir,
model_name=self.config["Arch"]["name"],
prefix="ppcls_epoch_{}".format(epoch_id))
def build_avg_metrics(self, info_dict):
return {key: AverageMeter(key, '7.5f') for key in info_dict}
@paddle.no_grad()
def eval(self, epoch_id=0):
output_info = dict()
eval_dataloader = build_dataloader(self.config["DataLoader"], "eval",
self.device)
self.model.eval()
print_batch_step = self.config["Global"]["print_batch_step"]
# build eval loss and metric info
loss_func = self._build_loss_info(self.config["Loss"], "eval")
metric_func = self._build_metric_info(self.config["Metric"], "eval")
metric_key = None
for iter_id, batch in enumerate(eval_dataloader()):
batch_size = batch[0].shape[0]
batch[0] = paddle.to_tensor(batch[0]).astype("float32")
batch[1] = paddle.to_tensor(batch[1]).reshape([-1, 1])
# image input
out = self.model(batch[0])
# calc loss
if loss_func is not None:
loss_dict = loss_func(out, batch[-1])
for key in loss_dict:
if not key in output_info:
output_info[key] = AverageMeter(key, '7.5f')
output_info[key].update(loss_dict[key].numpy()[0],
batch_size)
# calc metric
if metric_func is not None:
metric_dict = metric_func(out, batch[-1])
if paddle.distributed.get_world_size() > 1:
for key in metric_dict:
paddle.distributed.all_reduce(
metric_dict[key],
op=paddle.distributed.ReduceOp.SUM)
metric_dict[key] = metric_dict[
key] / paddle.distributed.get_world_size()
for key in metric_dict:
if metric_key is None:
metric_key = key
                    if key not in output_info:
output_info[key] = AverageMeter(key, '7.5f')
output_info[key].update(metric_dict[key].numpy()[0],
batch_size)
if iter_id % print_batch_step == 0:
metric_msg = ", ".join([
"{}: {:.5f}".format(key, output_info[key].val)
for key in output_info
])
logger.info("[Eval][Epoch {}][Iter: {}/{}]{}".format(
epoch_id, iter_id, len(eval_dataloader), metric_msg))
metric_msg = ", ".join([
"{}: {:.5f}".format(key, output_info[key].avg)
for key in output_info
])
logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg))
self.model.train()
# do not try to save best model
if metric_func is None:
return -1
# return 1st metric in the dict
return output_info[metric_key].avg
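# Illustrative sketch (not part of this diff): the Global config keys the
# train/eval loops above read, in parsed-YAML form (values are examples only):
#
#   Global:
#     checkpoints: null        # resume prefix, or null to start fresh
#     print_batch_step: 10
#     save_interval: 1
#     eval_during_train: True
#     eval_interval: 1         # assumed key for the eval period (see NOTE above)
#     epochs: 120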
import copy
import paddle
import paddle.nn as nn
from .celoss import CELoss
from .triplet import TripletLoss, TripletLossV2
from .msmloss import MSMLoss
from .emlloss import EmlLoss
from .npairsloss import NpairsLoss
from .trihardloss import TriHardLoss
from .centerloss import CenterLoss
from ppcls.utils import logger
class CombinedLoss(nn.Layer):
def __init__(self, config_list):
super().__init__()
self.loss_func = []
self.loss_weight = []
assert isinstance(config_list, list), (
'operator config should be a list')
        for config in config_list:
assert isinstance(config,
dict) and len(config) == 1, "yaml format error"
name = list(config)[0]
param = config[name]
assert "weight" in param, "weight must be in param, but param just contains {}".format(
param.keys())
self.loss_weight.append(param.pop("weight"))
self.loss_func.append(eval(name)(**param))
def __call__(self, input, batch):
loss_dict = {}
for idx, loss_func in enumerate(self.loss_func):
loss = loss_func(input, batch)
weight = self.loss_weight[idx]
loss = {key: loss[key] * weight for key in loss}
loss_dict.update(loss)
loss_dict["loss"] = paddle.add_n(list(loss_dict.values()))
return loss_dict
def build_loss(config):
module_class = CombinedLoss(config)
logger.info("build loss {} success.".format(module_class))
return module_class
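if __name__ == "__main__":
    # Illustrative sketch (not part of this diff): CombinedLoss consumes the
    # parsed form of a YAML loss list -- one single-key dict per loss, each
    # carrying a "weight" plus that loss's own constructor kwargs.
    loss_func = build_loss([
        {"CELoss": {"weight": 1.0}},
        {"TripletLossV2": {"weight": 1.0, "margin": 0.5}},
    ])
    # loss_dict = loss_func({"logits": ..., "features": ...}, labels)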
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn.functional as F
__all__ = ['CELoss', 'JSDivLoss', 'KLDivLoss']
class Loss(object):
"""
Loss
"""
def __init__(self, class_dim=1000, epsilon=None):
assert class_dim > 1, "class_dim=%d is not larger than 1" % (class_dim)
self._class_dim = class_dim
        if epsilon is not None and 0.0 <= epsilon <= 1.0:
            self._epsilon = epsilon
            self._label_smoothing = True  # use label smoothing (soft targets)
else:
self._epsilon = None
self._label_smoothing = False
    def _labelsmoothing(self, target):
        if target.shape[-1] != self._class_dim:
            # convert sparse labels to one-hot vectors of length class_dim
            one_hot_target = F.one_hot(target, self._class_dim)
        else:
            one_hot_target = target
        # soft_target = (1 - epsilon) * one_hot_target + epsilon / class_dim
        soft_target = F.label_smooth(one_hot_target, epsilon=self._epsilon)
        soft_target = paddle.reshape(soft_target, shape=[-1, self._class_dim])
        return soft_target
def _crossentropy(self, input, target, use_pure_fp16=False):
if self._label_smoothing:
target = self._labelsmoothing(target)
            input = -F.log_softmax(input, axis=-1)  # negative log-softmax
            cost = paddle.sum(target * input, axis=-1)  # soft-label cross entropy
else:
cost = F.cross_entropy(input=input, label=target)
if use_pure_fp16:
avg_cost = paddle.sum(cost)
else:
avg_cost = paddle.mean(cost)
return avg_cost
def _kldiv(self, input, target, name=None):
eps = 1.0e-10
cost = target * paddle.log(
(target + eps) / (input + eps)) * self._class_dim
return cost
    def _jsdiv(self, input, target):  # input and target are raw logits; softmax applied here
        input = F.softmax(input)
        target = F.softmax(target)
        # symmetrised KL divergence between the two distributions
        cost = self._kldiv(input, target) + self._kldiv(target, input)
cost = cost / 2
avg_cost = paddle.mean(cost)
return avg_cost
def __call__(self, input, target):
pass
class CELoss(Loss):
"""
Cross entropy loss
"""
def __init__(self, class_dim=1000, epsilon=None):
super(CELoss, self).__init__(class_dim, epsilon)
def __call__(self, input, target, use_pure_fp16=False):
logits = input["logits"]
cost = self._crossentropy(logits, target, use_pure_fp16)
return {"CELoss": cost}
class JSDivLoss(Loss):
"""
JSDiv loss
"""
def __init__(self, class_dim=1000, epsilon=None):
super(JSDivLoss, self).__init__(class_dim, epsilon)
def __call__(self, input, target):
cost = self._jsdiv(input, target)
return cost
class KLDivLoss(paddle.nn.Layer):
def __init__(self):
super(KLDivLoss, self).__init__()
def __call__(self, p, q, is_logit=True):
if is_logit:
p = paddle.nn.functional.softmax(p)
q = paddle.nn.functional.softmax(q)
return -(p * paddle.log(q + 1e-8)).sum(1).mean()
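if __name__ == "__main__":
    # Illustrative sketch (not part of this diff): with class_dim=4 and
    # epsilon=0.1, a one-hot target [0, 1, 0, 0] is smoothed to
    # (1 - 0.1) * one_hot + 0.1 / 4 = [0.025, 0.925, 0.025, 0.025].
    logits = paddle.randn([2, 4])
    labels = paddle.to_tensor([[1], [3]], dtype="int64")
    loss_fn = CELoss(class_dim=4, epsilon=0.1)
    print(loss_fn({"logits": logits}, labels))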
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
class CenterLoss(nn.Layer):
def __init__(self, num_classes=5013, feat_dim=2048):
super(CenterLoss, self).__init__()
self.num_classes = num_classes
self.feat_dim = feat_dim
        # randomly initialized, non-learnable class centers
        self.centers = paddle.randn(
            shape=[self.num_classes, self.feat_dim]).astype("float64")
def __call__(self, input, target):
"""
inputs: network output: {"features: xxx", "logits": xxxx}
target: image label
"""
feats = input["features"]
labels = target
batch_size = feats.shape[0]
        # squared L2 norm of each feature row, expanded to [batch, num_classes]
        dist1 = paddle.sum(paddle.square(feats), axis=1, keepdim=True)
        dist1 = paddle.expand(dist1, [batch_size, self.num_classes])
        # squared L2 norm of each center, expanded to [batch, num_classes]
        dist2 = paddle.sum(paddle.square(self.centers), axis=1, keepdim=True)
        dist2 = paddle.expand(dist2,
                              [self.num_classes, batch_size]).astype("float64")
        dist2 = paddle.transpose(dist2, [1, 0])
        # ||x||^2 + ||c||^2 - 2 * x . c gives squared euclidean distances
        distmat = paddle.add(dist1, dist2)
        tmp = paddle.matmul(feats, paddle.transpose(self.centers, [1, 0]))
        distmat = distmat - 2.0 * tmp
        # one-hot mask selecting, per row, the column of its own class center
        classes = paddle.arange(self.num_classes).astype("int64")
        labels = paddle.expand(
            paddle.unsqueeze(labels, 1), (batch_size, self.num_classes))
        mask = paddle.equal(
            paddle.expand(classes, [batch_size, self.num_classes]),
            labels).astype("float64")
        dist = paddle.multiply(distmat, mask)
        loss = paddle.sum(paddle.clip(dist, min=1e-12, max=1e+12)) / batch_size
return {'CenterLoss': loss}
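if __name__ == "__main__":
    # Illustrative sketch (not part of this diff): features are cast to
    # float64 because the random centers above are created as float64.
    feats = paddle.randn([4, 8]).astype("float64")
    labels = paddle.to_tensor([0, 1, 0, 1], dtype="int64")
    loss_fn = CenterLoss(num_classes=3, feat_dim=8)
    print(loss_fn({"features": feats}, labels))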
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def rerange_index(batch_size, samples_each_class):
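    """Build a flat index that reorders each row of the batch_size x
    batch_size distance matrix as [self, other positives..., negatives...],
    so the metric losses can split positives from negatives with one slice.
    """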
tmp = np.arange(0, batch_size * batch_size)
tmp = tmp.reshape(-1, batch_size)
rerange_index = []
for i in range(batch_size):
step = i // samples_each_class
start = step * samples_each_class
end = (step + 1) * samples_each_class
pos_idx = []
neg_idx = []
for j, k in enumerate(tmp[i]):
if j >= start and j < end:
if j == i:
pos_idx.insert(0, k)
else:
pos_idx.append(k)
else:
neg_idx.append(k)
rerange_index += (pos_idx + neg_idx)
rerange_index = np.array(rerange_index).astype(np.int32)
return rerange_index
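if __name__ == "__main__":
    # Illustrative check (not part of this diff): with batch_size=4 and
    # samples_each_class=2, each anchor row is reordered so that its own
    # position comes first, then the remaining positive, then the negatives:
    # [ 0  1  2  3  5  4  6  7 10 11  8  9 15 14 12 13]
    print(rerange_index(4, 2))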
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import numpy as np
from .comfunc import rerange_index
class EmlLoss(paddle.nn.Layer):
def __init__(self, batch_size = 40, samples_each_class = 2):
super(EmlLoss, self).__init__()
assert(batch_size % samples_each_class == 0)
self.samples_each_class = samples_each_class
self.batch_size = batch_size
self.rerange_index = rerange_index(batch_size, samples_each_class)
self.thresh = 20.0
self.beta = 100000
def surrogate_function(self, beta, theta, bias):
x = theta * paddle.exp(bias)
output = paddle.log(1 + beta * x) / math.log(1 + beta)
return output
def surrogate_function_approximate(self, beta, theta, bias):
output = (paddle.log(theta) + bias + math.log(beta)) / math.log(1+beta)
return output
def surrogate_function_stable(self, beta, theta, target, thresh):
max_gap = paddle.to_tensor(thresh, dtype='float32')
max_gap.stop_gradient = True
target_max = paddle.maximum(target, max_gap)
target_min = paddle.minimum(target, max_gap)
loss1 = self.surrogate_function(beta, theta, target_min)
loss2 = self.surrogate_function_approximate(beta, theta, target_max)
bias = self.surrogate_function(beta, theta, max_gap)
loss = loss1 + loss2 - bias
return loss
def forward(self, input, target=None):
features = input["features"]
samples_each_class = self.samples_each_class
batch_size = self.batch_size
rerange_index = self.rerange_index
        # pairwise squared euclidean distances
        diffs = paddle.unsqueeze(features, axis=1) - paddle.unsqueeze(
            features, axis=0)
        similarity_matrix = paddle.sum(paddle.square(diffs), axis=-1)
        tmp = paddle.reshape(similarity_matrix, shape=[-1, 1])
        rerange_index = paddle.to_tensor(rerange_index)
        tmp = paddle.gather(tmp, index=rerange_index)
        similarity_matrix = paddle.reshape(tmp, shape=[-1, batch_size])
        ignore, pos, neg = paddle.split(
            similarity_matrix,
            num_or_sections=[
                1, samples_each_class - 1, batch_size - samples_each_class
            ],
            axis=1)
ignore.stop_gradient = True
pos_max = paddle.max(pos, axis=1, keepdim=True)
pos = paddle.exp(pos - pos_max)
pos_mean = paddle.mean(pos, axis=1, keepdim=True)
neg_min = paddle.min(neg, axis=1, keepdim=True)
neg = paddle.exp(neg_min - neg)
neg_mean = paddle.mean(neg, axis=1, keepdim=True)
bias = pos_max - neg_min
theta = paddle.multiply(neg_mean, pos_mean)
loss = self.surrogate_function_stable(self.beta, theta, bias, self.thresh)
loss = paddle.mean(loss)
return {"emlloss": loss}
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from .comfunc import rerange_index
class MSMLoss(paddle.nn.Layer):
"""
MSMLoss Loss, based on triplet loss. USE P * K samples.
the batch size is fixed. Batch_size = P * K; but the K may vary between batches.
same label gather together
supported_metrics = [
'euclidean',
'sqeuclidean',
'cityblock',
]
only consider samples_each_class = 2
"""
def __init__(self, batch_size = 120, samples_each_class=2, margin=0.1):
super(MSMLoss, self).__init__()
self.margin = margin
self.samples_each_class = samples_each_class
self.batch_size = batch_size
self.rerange_index = rerange_index(batch_size, samples_each_class)
def forward(self, input, target=None):
#normalization
features = input["features"]
        features = self._normalize(features)
samples_each_class = self.samples_each_class
rerange_index = paddle.to_tensor(self.rerange_index)
        # pairwise squared euclidean distances
        diffs = paddle.unsqueeze(features, axis=1) - paddle.unsqueeze(
            features, axis=0)
        similarity_matrix = paddle.sum(paddle.square(diffs), axis=-1)
        # rerange so each row reads [self, positives..., negatives...]
        tmp = paddle.reshape(similarity_matrix, shape=[-1, 1])
        tmp = paddle.gather(tmp, index=rerange_index)
        similarity_matrix = paddle.reshape(tmp, shape=[-1, self.batch_size])
        # split into self-distance, positives, negatives
        ignore, pos, neg = paddle.split(
            similarity_matrix,
            num_or_sections=[1, samples_each_class - 1, -1],
            axis=1)
ignore.stop_gradient = True
hard_pos = paddle.max(pos)
hard_neg = paddle.min(neg)
loss = hard_pos + self.margin - hard_neg
loss = paddle.nn.ReLU()(loss)
return {"msmloss": loss}
    def _normalize(self, input):
        input_norm = paddle.sqrt(
            paddle.sum(paddle.square(input), axis=1, keepdim=True))
        return paddle.divide(input, input_norm)
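if __name__ == "__main__":
    # Illustrative sketch (not part of this diff): MSML keeps only the single
    # hardest positive and the single hardest negative over the whole batch.
    feats = paddle.randn([4, 16])
    loss_fn = MSMLoss(batch_size=4, samples_each_class=2)
    print(loss_fn({"features": feats}))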
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
class NpairsLoss(paddle.nn.Layer):
def __init__(self, reg_lambda=0.01):
super(NpairsLoss, self).__init__()
self.reg_lambda = reg_lambda
def forward(self, input, target=None):
"""
anchor and positive(should include label)
"""
features = input["features"]
reg_lambda = self.reg_lambda
batch_size = features.shape[0]
fea_dim = features.shape[1]
num_class = batch_size // 2
        #reshape into (anchor, positive) pairs
        out_feas = paddle.reshape(features, shape=[-1, 2, fea_dim])
        anc_feas, pos_feas = paddle.split(out_feas, num_or_sections=2, axis=1)
        anc_feas = paddle.squeeze(anc_feas, axis=1)
        pos_feas = paddle.squeeze(pos_feas, axis=1)
#get simi matrix
similarity_matrix = paddle.matmul(anc_feas, pos_feas, transpose_y=True) #get similarity matrix
sparse_labels = paddle.arange(0, num_class, dtype='int64')
xentloss = paddle.nn.CrossEntropyLoss()(similarity_matrix, sparse_labels) #by default: mean
#l2 norm
reg = paddle.mean(paddle.sum(paddle.square(features), axis=1))
l2loss = 0.5 * reg_lambda * reg
return {"npairsloss": xentloss + l2loss}
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from .comfunc import rerange_index
class TriHardLoss(paddle.nn.Layer):
"""
TriHard Loss, based on triplet loss. USE P * K samples.
the batch size is fixed. Batch_size = P * K; but the K may vary between batches.
same label gather together
supported_metrics = [
'euclidean',
'sqeuclidean',
'cityblock',
]
only consider samples_each_class = 2
"""
def __init__(self, batch_size = 120, samples_each_class=2, margin=0.1):
super(TriHardLoss, self).__init__()
self.margin = margin
self.samples_each_class = samples_each_class
self.batch_size = batch_size
self.rerange_index = rerange_index(batch_size, samples_each_class)
def forward(self, input, target=None):
features = input["features"]
assert (self.batch_size == features.shape[0])
#normalization
        features = self._normalize(features)
samples_each_class = self.samples_each_class
rerange_index = paddle.to_tensor(self.rerange_index)
        # pairwise squared euclidean distances
        diffs = paddle.unsqueeze(features, axis=1) - paddle.unsqueeze(
            features, axis=0)
        similarity_matrix = paddle.sum(paddle.square(diffs), axis=-1)
        # rerange so each row reads [self, positives..., negatives...]
        tmp = paddle.reshape(similarity_matrix, shape=[-1, 1])
        tmp = paddle.gather(tmp, index=rerange_index)
        similarity_matrix = paddle.reshape(tmp, shape=[-1, self.batch_size])
        # split into self-distance, positives, negatives
        ignore, pos, neg = paddle.split(
            similarity_matrix,
            num_or_sections=[1, samples_each_class - 1, -1],
            axis=1)
ignore.stop_gradient = True
hard_pos = paddle.max(pos, axis=1)
hard_neg = paddle.min(neg, axis=1)
loss = hard_pos + self.margin - hard_neg
loss = paddle.nn.ReLU()(loss)
loss = paddle.mean(loss)
return {"trihardloss": loss}
    def _normalize(self, input):
        input_norm = paddle.sqrt(
            paddle.sum(paddle.square(input), axis=1, keepdim=True))
        return paddle.divide(input, input_norm)
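if __name__ == "__main__":
    # Illustrative sketch (not part of this diff): unlike MSML, TriHard mines
    # the hardest positive/negative per anchor row, then averages over rows.
    feats = paddle.randn([4, 16])
    loss_fn = TriHardLoss(batch_size=4, samples_each_class=2)
    print(loss_fn({"features": feats}))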
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
class TripletLossV2(nn.Layer):
"""Triplet loss with hard positive/negative mining.
Args:
margin (float): margin for triplet.
"""
def __init__(self, margin=0.5):
super(TripletLossV2, self).__init__()
self.margin = margin
self.ranking_loss = paddle.nn.loss.MarginRankingLoss(margin=margin)
    def forward(self, input, target, normalize_feature=True):
        """
        Args:
            input: dict holding "features", a matrix with shape (batch_size, feat_dim)
            target: ground truth labels with shape (batch_size,)
        """
inputs = input["features"]
if normalize_feature:
inputs = 1. * inputs / (paddle.expand_as(
paddle.norm(inputs, p=2, axis=-1, keepdim=True), inputs) +
1e-12)
bs = inputs.shape[0]
# compute distance
dist = paddle.pow(inputs, 2).sum(axis=1, keepdim=True).expand([bs, bs])
dist = dist + dist.t()
dist = paddle.addmm(input=dist,
x=inputs,
y=inputs.t(),
alpha=-2.0,
beta=1.0)
dist = paddle.clip(dist, min=1e-12).sqrt()
# hard negative mining
is_pos = paddle.expand(target, (bs, bs)).equal(
paddle.expand(target, (bs, bs)).t())
is_neg = paddle.expand(target, (bs, bs)).not_equal(
paddle.expand(target, (bs, bs)).t())
# `dist_ap` means distance(anchor, positive)
        # both `dist_ap` and `dist_an` have shape [N, 1]
dist_ap = paddle.max(paddle.reshape(paddle.masked_select(dist, is_pos),
(bs, -1)),
axis=1,
keepdim=True)
# `dist_an` means distance(anchor, negative)
# both `dist_an` and `relative_n_inds` with shape [N, 1]
dist_an = paddle.min(paddle.reshape(paddle.masked_select(dist, is_neg),
(bs, -1)),
axis=1,
keepdim=True)
# shape [N]
dist_ap = paddle.squeeze(dist_ap, axis=1)
dist_an = paddle.squeeze(dist_an, axis=1)
# Compute ranking hinge loss
y = paddle.ones_like(dist_an)
loss = self.ranking_loss(dist_an, dist_ap, y)
return {"TripletLossV2": loss}
class TripletLoss(nn.Layer):
"""Triplet loss with hard positive/negative mining.
Reference:
Hermans et al. In Defense of the Triplet Loss for Person Re-Identification. arXiv:1703.07737.
Code imported from https://github.com/Cysu/open-reid/blob/master/reid/loss/triplet.py.
Args:
margin (float): margin for triplet.
"""
def __init__(self, margin=1.0):
super(TripletLoss, self).__init__()
self.margin = margin
self.ranking_loss = paddle.nn.loss.MarginRankingLoss(margin=margin)
    def forward(self, input, target):
        """
        Args:
            input: dict holding "features", a matrix with shape (batch_size, feat_dim)
            target: ground truth labels with shape (batch_size,)
        """
        inputs = input["features"]
bs = inputs.shape[0]
# Compute pairwise distance, replace by the official when merged
dist = paddle.pow(inputs, 2).sum(axis=1, keepdim=True).expand([bs, bs])
dist = dist + dist.t()
dist = paddle.addmm(input=dist,
x=inputs,
y=inputs.t(),
alpha=-2.0,
beta=1.0)
dist = paddle.clip(dist, min=1e-12).sqrt()
mask = paddle.equal(target.expand([bs, bs]),
target.expand([bs, bs]).t())
mask_numpy_idx = mask.numpy()
dist_ap, dist_an = [], []
        for i in range(bs):
            # hardest positive: max distance among same-label columns
            dist_ap.append(
                max([
                    dist[i][j] if mask_numpy_idx[i][j] else float("-inf")
                    for j in range(bs)
                ]).unsqueeze(0))
            # hardest negative: min distance among different-label columns
            dist_an.append(
                min([
                    dist[i][k] if not mask_numpy_idx[i][k] else float("inf")
                    for k in range(bs)
                ]).unsqueeze(0))
dist_ap = paddle.concat(dist_ap, axis=0)
dist_an = paddle.concat(dist_an, axis=0)
# Compute ranking hinge loss
y = paddle.ones_like(dist_an)
loss = self.ranking_loss(dist_an, dist_ap, y)
return {"TripletLoss": loss}
......@@ -12,8 +12,54 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import paddle
from ppcls.utils import logger
from . import optimizer
from . import learning_rate
__all__ = ['build_optimizer']
def build_lr_scheduler(lr_config, epochs, step_each_epoch):
    lr_config.update({'epochs': epochs, 'step_each_epoch': step_each_epoch})
if 'name' in lr_config:
lr_name = lr_config.pop('name')
lr = getattr(learning_rate, lr_name)(**lr_config)()
else:
lr = lr_config['learning_rate']
return lr
def build_optimizer(config, epochs, step_each_epoch, parameters):
config = copy.deepcopy(config)
# step1 build lr
lr = build_lr_scheduler(config.pop('lr'), epochs, step_each_epoch)
logger.info("build lr ({}) success..".format(lr))
# step2 build regularization
if 'regularizer' in config and config['regularizer'] is not None:
reg_config = config.pop('regularizer')
reg_name = reg_config.pop('name') + 'Decay'
reg = getattr(paddle.regularizer, reg_name)(**reg_config)
else:
reg = None
logger.info("build regularizer ({}) success..".format(reg))
# step3 build optimizer
optim_name = config.pop('name')
if 'clip_norm' in config:
clip_norm = config.pop('clip_norm')
grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm)
else:
grad_clip = None
optim = getattr(optimizer, optim_name)(learning_rate=lr,
weight_decay=reg,
grad_clip=grad_clip,
**config)(parameters=parameters)
logger.info("build optimizer ({}) success..".format(optim))
return optim, lr
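if __name__ == "__main__":
    # Illustrative sketch (not part of this diff): the parsed form of a YAML
    # Optimizer section consumed by build_optimizer, assuming the Momentum
    # and Piecewise wrappers defined in this package.
    optim_config = {
        'name': 'Momentum',
        'momentum': 0.9,
        'lr': {
            'name': 'Piecewise',
            'decay_epochs': [30, 60, 90],
            'values': [0.1, 0.01, 0.001, 0.0001],
        },
        'regularizer': {'name': 'L2', 'coeff': 1e-4},
    }
    model = paddle.nn.Linear(8, 2)
    optim, lr = build_optimizer(
        optim_config, epochs=120, step_each_epoch=100,
        parameters=model.parameters())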
......@@ -11,149 +11,173 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from paddle.optimizer import lr
class Linear(object):
    """
    Linear learning rate decay
    Args:
        lr (float): The initial learning rate. It is a python float number.
        epochs(int): The decay step size. It determines the decay cycle.
        end_lr(float, optional): The minimum final learning rate. Default: 0.0001.
        power(float, optional): Power of polynomial. Default: 1.0.
        last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
    """

    def __init__(self,
                 learning_rate,
                 epochs,
                 step_each_epoch,
                 end_lr=0.0,
                 power=1.0,
                 warmup_epoch=0,
                 last_epoch=-1,
                 **kwargs):
        super(Linear, self).__init__()
        self.learning_rate = learning_rate
        self.epochs = epochs * step_each_epoch
        self.end_lr = end_lr
        self.power = power
        self.last_epoch = last_epoch
        self.warmup_epoch = round(warmup_epoch * step_each_epoch)

    def __call__(self):
        learning_rate = lr.PolynomialDecay(
            learning_rate=self.learning_rate,
            decay_steps=self.epochs,
            end_lr=self.end_lr,
            power=self.power,
            last_epoch=self.last_epoch)
        if self.warmup_epoch > 0:
            learning_rate = lr.LinearWarmup(
                learning_rate=learning_rate,
                warmup_steps=self.warmup_epoch,
                start_lr=0.0,
                end_lr=self.learning_rate,
                last_epoch=self.last_epoch)
        return learning_rate
class Cosine(object):
"""
Cosine learning rate decay with warmup
[0, warmup_epoch): linear warmup
[warmup_epoch, epochs): cosine decay
Cosine learning rate decay
lr = 0.05 * (math.cos(epoch * (math.pi / epochs)) + 1)
Args:
lr(float): initial learning rate
step_each_epoch(int): steps each epoch
epochs(int): total training epochs
warmup_epoch(int): epoch num of warmup
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
"""
def __init__(self, lr, step_each_epoch, epochs, warmup_epoch=5, **kwargs):
assert epochs > warmup_epoch, "total epoch({}) should be larger than warmup_epoch({}) in CosineWarmup.".format(
epochs, warmup_epoch)
warmup_step = warmup_epoch * step_each_epoch
start_lr = 0.0
end_lr = lr
lr_sch = Cosine(lr, step_each_epoch, epochs - warmup_epoch)
super(CosineWarmup, self).__init__(
learning_rate=lr_sch,
warmup_steps=warmup_step,
start_lr=start_lr,
end_lr=end_lr)
self.update_specified = False
def __init__(self,
learning_rate,
step_each_epoch,
epochs,
warmup_epoch=0,
last_epoch=-1,
**kwargs):
super(Cosine, self).__init__()
self.learning_rate = learning_rate
self.T_max = step_each_epoch * epochs
self.last_epoch = last_epoch
self.warmup_epoch = round(warmup_epoch * step_each_epoch)
class ExponentialWarmup(LinearWarmup):
def __call__(self):
learning_rate = lr.CosineAnnealingDecay(
learning_rate=self.learning_rate,
T_max=self.T_max,
last_epoch=self.last_epoch)
if self.warmup_epoch > 0:
learning_rate = lr.LinearWarmup(
learning_rate=learning_rate,
warmup_steps=self.warmup_epoch,
start_lr=0.0,
end_lr=self.learning_rate,
last_epoch=self.last_epoch)
return learning_rate
class Step(object):
"""
Exponential learning rate decay with warmup
[0, warmup_epoch): linear warmup
[warmup_epoch, epochs): Exponential decay
Piecewise learning rate decay
Args:
lr(float): initial learning rate
step_each_epoch(int): steps each epoch
decay_epochs(float): decay epochs
decay_rate(float): decay rate
warmup_epoch(int): epoch num of warmup
learning_rate (float): The initial learning rate. It is a python float number.
step_size (int): the interval to update.
gamma (float, optional): The Ratio that the learning rate will be reduced. ``new_lr = origin_lr * gamma`` .
It should be less than 1.0. Default: 0.1.
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
"""
def __init__(self,
lr,
learning_rate,
step_size,
step_each_epoch,
decay_epochs=2.4,
decay_rate=0.97,
warmup_epoch=5,
gamma,
warmup_epoch=0,
last_epoch=-1,
**kwargs):
warmup_step = warmup_epoch * step_each_epoch
start_lr = 0.0
end_lr = lr
lr_sch = ExponentialDecay(lr, decay_rate)
super(ExponentialWarmup, self).__init__(
learning_rate=lr_sch,
warmup_steps=warmup_step,
start_lr=start_lr,
end_lr=end_lr)
# NOTE: hac method to update exponential lr scheduler
self.update_specified = True
self.update_start_step = warmup_step
self.update_step_interval = int(decay_epochs * step_each_epoch)
self.step_each_epoch = step_each_epoch
super(Step, self).__init__()
self.step_size = step_each_epoch * step_size
self.learning_rate = learning_rate
self.gamma = gamma
self.last_epoch = last_epoch
self.warmup_epoch = round(warmup_epoch * step_each_epoch)
class LearningRateBuilder():
def __call__(self):
learning_rate = lr.StepDecay(
learning_rate=self.learning_rate,
step_size=self.step_size,
gamma=self.gamma,
last_epoch=self.last_epoch)
if self.warmup_epoch > 0:
learning_rate = lr.LinearWarmup(
learning_rate=learning_rate,
warmup_steps=self.warmup_epoch,
start_lr=0.0,
end_lr=self.learning_rate,
last_epoch=self.last_epoch)
return learning_rate
class Piecewise(object):
"""
Build learning rate variable
https://www.paddlepaddle.org.cn/documentation/docs/zh/api_cn/layers_cn.html
Piecewise learning rate decay
Args:
function(str): class name of learning rate
params(dict): parameters used for init the class
boundaries(list): A list of steps numbers. The type of element in the list is python int.
values(list): A list of learning rate values that will be picked during different epoch boundaries.
The type of element in the list is python float.
last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate.
"""
def __init__(self,
function='Linear',
params={'lr': 0.1,
'steps': 100,
'end_lr': 0.0}):
self.function = function
self.params = params
step_each_epoch,
decay_epochs,
values,
warmup_epoch=0,
last_epoch=-1,
**kwargs):
super(Piecewise, self).__init__()
self.boundaries = [step_each_epoch * e for e in decay_epochs]
self.values = values
self.last_epoch = last_epoch
self.warmup_epoch = round(warmup_epoch * step_each_epoch)
def __call__(self):
mod = sys.modules[__name__]
lr = getattr(mod, self.function)(**self.params)
return lr
learning_rate = lr.PiecewiseDecay(
boundaries=self.boundaries,
values=self.values,
last_epoch=self.last_epoch)
if self.warmup_epoch > 0:
learning_rate = lr.LinearWarmup(
learning_rate=learning_rate,
warmup_steps=self.warmup_epoch,
start_lr=0.0,
end_lr=self.values[0],
last_epoch=self.last_epoch)
return learning_rate
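if __name__ == "__main__":
    # Illustrative sketch (not part of this diff): each class here is a
    # factory -- calling the instance builds the actual paddle LRScheduler,
    # wrapped in LinearWarmup when warmup_epoch > 0.
    sched = Cosine(
        learning_rate=0.1, step_each_epoch=100, epochs=90, warmup_epoch=5)()
    for _ in range(3):
        sched.step()
    print(sched.get_lr())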
......@@ -16,52 +16,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddle import optimizer as optim
class Momentum(object):
"""
Simple Momentum optimizer with velocity state.
Args:
learning_rate (float|Variable) - The learning rate used to update parameters.
Can be a float value or a Variable with one float value as data element.
......@@ -72,31 +32,63 @@ class Momentum(object):
    def __init__(self,
                 learning_rate,
                 momentum,
                 weight_decay=None,
                 grad_clip=None):
        super(Momentum, self).__init__()
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip

    def __call__(self, parameters):
        opt = optim.Momentum(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            parameters=parameters)
        return opt
class Adam(object):
def __init__(self,
learning_rate=0.001,
beta1=0.9,
beta2=0.999,
epsilon=1e-08,
parameter_list=None,
weight_decay=None,
grad_clip=None,
name=None,
lazy_mode=False):
self.learning_rate = learning_rate
self.beta1 = beta1
self.beta2 = beta2
self.epsilon = epsilon
        self.parameter_list = parameter_list
self.weight_decay = weight_decay
self.grad_clip = grad_clip
self.name = name
self.lazy_mode = lazy_mode
def __call__(self, parameters):
opt = optim.Adam(
learning_rate=self.learning_rate,
beta1=self.beta1,
beta2=self.beta2,
epsilon=self.epsilon,
weight_decay=self.weight_decay,
grad_clip=self.grad_clip,
name=self.name,
lazy_mode=self.lazy_mode,
parameters=parameters)
return opt
class RMSProp(object):
"""
Root Mean Squared Propagation (RMSProp) is an unpublished, adaptive learning rate method.
Args:
learning_rate (float|Variable) - The learning rate used to update parameters.
Can be a float value or a Variable with one float value as data element.
......@@ -108,58 +100,26 @@ class RMSProp(object):
    def __init__(self,
                 learning_rate,
                 momentum=0.0,
                 rho=0.95,
                 epsilon=1e-6,
                 weight_decay=None,
                 grad_clip=None):
        super(RMSProp, self).__init__()
        self.learning_rate = learning_rate
        self.momentum = momentum
        self.rho = rho
        self.epsilon = epsilon
        self.weight_decay = weight_decay
        self.grad_clip = grad_clip

    def __call__(self, parameters):
        opt = optim.RMSProp(
            learning_rate=self.learning_rate,
            momentum=self.momentum,
            rho=self.rho,
            epsilon=self.epsilon,
            weight_decay=self.weight_decay,
            grad_clip=self.grad_clip,
            parameters=parameters)
        return opt
\ No newline at end of file
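# Illustrative sketch (not part of this diff): every optimizer wrapper above
# is a two-step factory -- configure first, then bind parameters, e.g.:
#
#   opt = Momentum(learning_rate=0.1, momentum=0.9)(parameters=model.parameters())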
......@@ -11,13 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import copy
import argparse
import yaml
from ppcls.utils import check
from ppcls.utils import logger
__all__ = ['get_config']
......@@ -31,6 +31,9 @@ class AttrDict(dict):
else:
self[key] = value
def __deepcopy__(self, content):
return copy.deepcopy(dict(self))
def create_attr_dict(yaml_config):
from ast import literal_eval
......@@ -76,7 +79,6 @@ def print_dict(d, delimiter=0):
logger.info("{}{} : {}".format(delimiter * " ",
logger.coloring(k, "HEADER"),
logger.coloring(v, "OKGREEN")))
if k.isupper():
logger.info(placeholder)
......@@ -84,7 +86,6 @@ def print_dict(d, delimiter=0):
def print_config(config):
"""
visualize configs
Arguments:
config: configs
"""
......@@ -97,21 +98,15 @@ def check_config(config):
Check config
"""
check.check_version()
use_gpu = config.get('use_gpu', True)
if use_gpu:
check.check_gpu()
    architecture = config.get('ARCHITECTURE')
    #check.check_architecture(architecture)
    check.check_model_with_running_mode(architecture)
use_mix = config.get('use_mix', False)
check.check_mix(architecture, use_mix)
classes_num = config.get('classes_num')
check.check_classes_num(classes_num)
mode = config.get('mode', 'train')
if mode.lower() == 'train':
check.check_function_params(config, 'LEARNING_RATE')
......@@ -121,7 +116,6 @@ def check_config(config):
def override(dl, ks, v):
"""
Recursively replace dict of list
Args:
dl(dict or list): dict or list to be replaced
ks(list): list of keys
......@@ -147,19 +141,15 @@ def override(dl, ks, v):
    if len(ks) == 1:
        # assert ks[0] in dl, ('{} is not exist in {}'.format(ks[0], dl))
        if ks[0] not in dl:
            logger.warning('A new field ({}) detected!'.format(ks[0]))
        dl[ks[0]] = str2num(v)
    else:
        if ks[0] not in dl:
            logger.warning('A new field ({}) detected!'.format(ks[0]))
            dl[ks[0]] = {}
        override(dl[ks[0]], ks[1:], v)
def override_config(config, options=None):
"""
Recursively override the config
Args:
config(dict): dict to be replaced
options(list): list of pairs(key0.key1.idx.key2=value)
......@@ -167,7 +157,6 @@ def override_config(config, options=None):
'topk=2',
'VALID.transforms.1.ResizeImage.resize_short=300'
]
Returns:
config(dict): replaced config
"""
......@@ -183,7 +172,6 @@ def override_config(config, options=None):
key, value = pair
keys = key.split('.')
override(config, keys, value)
return config
......@@ -197,5 +185,23 @@ def get_config(fname, overrides=None, show=True):
override_config(config, overrides)
if show:
print_config(config)
    # check_config(config)
return config
def parse_args():
parser = argparse.ArgumentParser("generic-image-rec train script")
parser.add_argument(
'-c',
'--config',
type=str,
default='configs/config.yaml',
help='config file path')
parser.add_argument(
'-o',
'--override',
action='append',
default=[],
help='config options to be overridden')
args = parser.parse_args()
return args
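if __name__ == "__main__":
    # Illustrative sketch (not part of this diff, and assuming the str2num
    # helper elided above): '-o' pairs are split on '=' and '.' and applied
    # recursively, so the call below sets
    # cfg["Optimizer"]["lr"]["learning_rate"] = 0.05.
    cfg = {"Optimizer": {"lr": {"learning_rate": 0.1}}}
    override_config(cfg, ["Optimizer.lr.learning_rate=0.05"])
    print(cfg)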
......@@ -24,6 +24,7 @@ import tempfile
import paddle
from paddle.static import load_program_state
from paddle.utils.download import get_weights_path_from_url
from ppcls.utils import logger
......@@ -70,6 +71,20 @@ def load_dygraph_pretrain(model, path=None, load_static_weights=False):
return
def load_dygraph_pretrain_from_url(model,
pretrained_url,
use_ssld,
load_static_weights=False):
if use_ssld:
pretrained_url = pretrained_url.replace("_pretrained",
"_ssld_pretrained")
local_weight_path = get_weights_path_from_url(pretrained_url).replace(
".pdparams", "")
load_dygraph_pretrain(
model, path=local_weight_path, load_static_weights=load_static_weights)
return
def load_distillation_model(model, pretrained_model, load_static_weights):
logger.info("In distillation mode, teacher model will be "
"loaded firstly before student model.")
......@@ -112,10 +127,11 @@ def init_model(config, net, optimizer=None):
"Given dir {}.pdopt not exist.".format(checkpoints)
para_dict = paddle.load(checkpoints + ".pdparams")
opti_dict = paddle.load(checkpoints + ".pdopt")
metric_dict = paddle.load(checkpoints + ".pdstates")
net.set_dict(para_dict)
optimizer.set_state_dict(opti_dict)
logger.info("Finish load checkpoints from {}".format(checkpoints))
        return metric_dict
pretrained_model = config.get('pretrained_model')
load_static_weights = config.get('load_static_weights', False)
......@@ -146,13 +162,18 @@ def _save_student_model(net, model_prefix):
student_model_prefix))
def save_model(net,
optimizer,
metric_info,
model_path,
model_name="",
prefix='ppcls'):
"""
save model to the target path
"""
if paddle.distributed.get_rank() != 0:
return
    model_path = os.path.join(model_path, model_name)
_mkdir_if_not_exist(model_path)
model_prefix = os.path.join(model_path, prefix)
......@@ -160,4 +181,5 @@ def save_model(net, optimizer, model_path, epoch_id, prefix='ppcls'):
paddle.save(net.state_dict(), model_prefix + ".pdparams")
paddle.save(optimizer.state_dict(), model_prefix + ".pdopt")
paddle.save(metric_info, model_prefix + ".pdstates")
logger.info("Already save model in {}".format(model_path))
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
......@@ -12,105 +12,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse

import paddle
import paddle.nn.functional as F
import os
import sys
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
from ppcls.utils import logger
from ppcls.utils.save_load import init_model
from ppcls.utils.config import get_config
from ppcls.utils import multi_hot_encode
from ppcls.utils import accuracy_score
from ppcls.utils import mean_average_precision
from ppcls.utils import precision_recall_fscore
from ppcls.data import Reader
import program
import numpy as np
def parse_args():
parser = argparse.ArgumentParser("PaddleClas eval script")
parser.add_argument(
'-c',
'--config',
type=str,
default='./configs/eval.yaml',
help='config file path')
parser.add_argument(
'-o',
'--override',
action='append',
default=[],
help='config options to be overridden')
args = parser.parse_args()
return args
def main(args, return_dict={}):
config = get_config(args.config, overrides=args.override, show=True)
config.mode = "valid"
# assign place
use_gpu = config.get("use_gpu", True)
place = paddle.set_device('gpu' if use_gpu else 'cpu')
multilabel = config.get("multilabel", False)
trainer_num = paddle.distributed.get_world_size()
use_data_parallel = trainer_num != 1
config["use_data_parallel"] = use_data_parallel
if config["use_data_parallel"]:
paddle.distributed.init_parallel_env()
net = program.create_model(config.ARCHITECTURE, config.classes_num)
init_model(config, net, optimizer=None)
valid_dataloader = Reader(config, 'valid', places=place)()
if len(valid_dataloader) <= 0:
logger.error(
"valid dataloader is empty, please check your data config again!")
sys.exit(-1)
net.eval()
with paddle.no_grad():
if not multilabel:
top1_acc = program.run(valid_dataloader, config, net, None, None,
0, 'valid')
return_dict["top1_acc"] = top1_acc
return top1_acc
else:
all_outs = []
targets = []
for _, batch in enumerate(valid_dataloader()):
feeds = program.create_feeds(batch, False, config.classes_num,
multilabel)
out = net(feeds["image"])
out = F.sigmoid(out)
use_distillation = config.get("use_distillation", False)
if use_distillation:
out = out[1]
all_outs.extend(list(out.numpy()))
targets.extend(list(feeds["label"].numpy()))
all_outs = np.array(all_outs)
targets = np.array(targets)
mAP = mean_average_precision(all_outs, targets)
return_dict["mean average precision"] = mAP
return mAP
sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
from ppcls.utils import config
from ppcls.engine.trainer import Trainer
if __name__ == '__main__':
args = parse_args()
return_dict = {}
main(args, return_dict)
print(return_dict)
if __name__ == "__main__":
args = config.parse_args()
config = config.get_config(args.config, overrides=args.override, show=True)
trainer = Trainer(config, mode="eval")
trainer.eval()
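# Illustrative usage (not part of this diff; the script path assumes the
# usual tools/ layout):
#   python tools/eval.py -c configs/config.yaml \
#       -o Global.pretrained_model=./output/RecModel/best_model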
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
......@@ -15,144 +15,16 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os
import sys
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
import paddle
from ppcls.data import Reader
from ppcls.utils.config import get_config
from ppcls.utils.save_load import init_model, save_model
from ppcls.utils import logger
import program
def parse_args():
parser = argparse.ArgumentParser("PaddleClas train script")
parser.add_argument(
'-c',
'--config',
type=str,
default='configs/ResNet/ResNet50.yaml',
help='config file path')
parser.add_argument(
'-p',
'--profiler_options',
type=str,
default=None,
help='The option of profiler, which should be in format \"key1=value1;key2=value2;key3=value3\".'
)
parser.add_argument(
'-o',
'--override',
action='append',
default=[],
help='config options to be overridden')
args = parser.parse_args()
return args
def main(args):
paddle.seed(12345)
config = get_config(args.config, overrides=args.override, show=True)
# assign the place
use_gpu = config.get("use_gpu", True)
use_xpu = config.get("use_xpu", False)
    assert not (use_gpu and use_xpu), \
        "gpu and xpu cannot both be true at the same time in static mode!"
if use_gpu:
place = paddle.set_device('gpu')
elif use_xpu:
place = paddle.set_device('xpu')
else:
place = paddle.set_device('cpu')
trainer_num = paddle.distributed.get_world_size()
use_data_parallel = trainer_num != 1
config["use_data_parallel"] = use_data_parallel
if config["use_data_parallel"]:
paddle.distributed.init_parallel_env()
net = program.create_model(config.ARCHITECTURE, config.classes_num)
optimizer, lr_scheduler = program.create_optimizer(
config, parameter_list=net.parameters())
dp_net = net
if config["use_data_parallel"]:
find_unused_parameters = config.get("find_unused_parameters", False)
dp_net = paddle.DataParallel(
net, find_unused_parameters=find_unused_parameters)
# load model from checkpoint or pretrained model
init_model(config, net, optimizer)
train_dataloader = Reader(config, 'train', places=place)()
if len(train_dataloader) <= 0:
logger.error(
"train dataloader is empty, please check your data config again!")
sys.exit(-1)
if config.validate:
valid_dataloader = Reader(config, 'valid', places=place)()
if len(valid_dataloader) <= 0:
logger.error(
"valid dataloader is empty, please check your data config again!"
)
sys.exit(-1)
last_epoch_id = config.get("last_epoch", -1)
best_top1_acc = 0.0 # best top1 acc record
best_top1_epoch = last_epoch_id
vdl_writer_path = config.get("vdl_dir", None)
vdl_writer = None
if vdl_writer_path:
from visualdl import LogWriter
vdl_writer = LogWriter(vdl_writer_path)
# Ensure that the vdl log file can be closed normally
try:
for epoch_id in range(last_epoch_id + 1, config.epochs):
net.train()
# 1. train with train dataset
program.run(train_dataloader, config, dp_net, optimizer,
lr_scheduler, epoch_id, 'train', vdl_writer,
args.profiler_options)
# 2. validate with validate dataset
if config.validate and epoch_id % config.valid_interval == 0:
net.eval()
with paddle.no_grad():
top1_acc = program.run(valid_dataloader, config, net, None,
None, epoch_id, 'valid', vdl_writer)
if top1_acc > best_top1_acc:
best_top1_acc = top1_acc
best_top1_epoch = epoch_id
model_path = os.path.join(config.model_save_dir,
config.ARCHITECTURE["name"])
save_model(net, optimizer, model_path, "best_model")
message = "The best top1 acc {:.5f}, in epoch: {:d}".format(
best_top1_acc, best_top1_epoch)
logger.info(message)
# 3. save the persistable model
if epoch_id % config.save_interval == 0:
model_path = os.path.join(config.model_save_dir,
config.ARCHITECTURE["name"])
save_model(net, optimizer, model_path, epoch_id)
except Exception as e:
logger.error(e)
finally:
        if vdl_writer:
            vdl_writer.close()
sys.path.append(os.path.abspath(os.path.join(__dir__, '../')))
from ppcls.utils import config
from ppcls.engine.trainer import Trainer
if __name__ == '__main__':
args = parse_args()
main(args)
if __name__ == "__main__":
args = config.parse_args()
config = config.get_config(args.config, overrides=args.override, show=True)
trainer = Trainer(config, mode="train")
trainer.train()
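# Illustrative usage (not part of this diff; the script path assumes the
# usual tools/ layout):
#   python tools/train.py -c configs/config.yaml -o Global.epochs=120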