Merge branch 'develop_reg' into develop_reg

46955a26 · Felix · GitHub · 1b904319 · 82ed9470 · 46955a26
37 changed file
--- a/docs/zh_CN/feature_visiualization/get_started.md
+++ b/docs/zh_CN/feature_visiualization/get_started.md
@@ -37,7 +37,7 @@ def forward(self, inputs):
    y = self.pool2d_max(y)
    for bottleneck_block in self.bottleneck_block_list:
        y = bottleneck_block(y)
-    y = self.pool2d_avg(y)
+    y = self.avg_pool(y)
    y = fluid.layers.reshape(y, shape=[-1, self.pool2d_avg_output])
    y = self.out(y)
    return y, self.fm

--- a/ppcls/arch/__init__.py
+++ b/ppcls/arch/__init__.py
@@ -12,8 +12,54 @@
 #See the License for the specific language governing permissions and
 #limitations under the License.

+import copy
+import importlib
+
+import paddle.nn as nn
+
 from . import backbone

 from .backbone import *
 from ppcls.arch.loss_metrics.loss import *
 from .utils import *
+
+
+def build_model(config):
+    config = copy.deepcopy(config)
+    model_type = config.pop("name")
+    mod = importlib.import_module(__name__)
+    arch = getattr(mod, model_type)(**config)
+    return arch
+
+
+class RecModel(nn.Layer):
+    def __init__(self, **config):
+        super().__init__()
+        backbone_config = config["Backbone"]
+        backbone_name = backbone_config.pop("name")
+        self.backbone = getattr(backbone_name)(**backbone_config)
+        if "backbone_stop_layer" in config:
+            backbone_stop_layer = config["backbone_stop_layer"]
+            self.backbone.stop_layer(backbone_stop_layer)
+
+        if "Neck" in config:
+            neck_config = config["Neck"]
+            neck_name = neck_config.pop("name")
+            self.neck = getattr(neck_name)(**neck_config)
+        else:
+            self.neck = None
+
+        if "Head" in config:
+            head_config = config["Head"]
+            head_name = head_config.pop("name")
+            self.head = getattr(head_name)(**head_config)
+        else:
+            self.head = None
+
+    def forward(self, x):
+        y = self.backbone(x)
+        if self.neck is not None:
+            y = self.neck(y)
+        if self.head is not None:
+            y = self.head(y)
+        return y
--- a/ppcls/arch/backbone/legendary_models/__init__.py
+++ b/ppcls/arch/backbone/legendary_models/__init__.py
+from .resnet import ResNet18, ResNet34, ResNet50, ResNet101, ResNet152, ResNet18_vd, ResNet34_vd, ResNet50_vd, ResNet101_vd, ResNet152_vd
+from .hrnet import HRNet_W18_C, HRNet_W30_C, HRNet_W32_C, HRNet_W40_C, HRNet_W44_C, HRNet_W48_C, HRNet_W64_C
+from .mobilenet_v1 import MobileNetV1_x0_25, MobileNetV1_x0_5, MobileNetV1_x0_75, MobileNetV1
+from .mobilenet_v3 import MobileNetV3_small_x0_35, MobileNetV3_small_x0_5, MobileNetV3_small_x0_75, MobileNetV3_small_x1_0, MobileNetV3_small_x1_25, MobileNetV3_large_x0_35, MobileNetV3_large_x0_5, MobileNetV3_large_x0_75, MobileNetV3_large_x1_0, MobileNetV3_large_x1_25
+from .inception_v3 import InceptionV3
+from .vgg import VGG11, VGG13, VGG16, VGG19
--- a/ppcls/arch/backbone/legendary_models/hrnet.py
+++ b/ppcls/arch/backbone/legendary_models/hrnet.py
--- a/ppcls/arch/backbone/legendary_models/inception_v3.py
+++ b/ppcls/arch/backbone/legendary_models/inception_v3.py
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import, division, print_function
+import math
+import paddle
+from paddle import ParamAttr
+import paddle.nn as nn
+from paddle.nn import Conv2D, BatchNorm, Linear, Dropout
+from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D
+from paddle.nn.initializer import Uniform
+
+from ppcls.arch.backbone.base.theseus_layer import TheseusLayer
+from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url
+
+MODEL_URLS = {
+    "InceptionV3":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/InceptionV3_pretrained.pdparams"
+}
+
+__all__ = MODEL_URLS.keys()
+'''
+InceptionV3 config: dict.
+    key: inception blocks of InceptionV3.
+    values: conv num in different blocks.
+'''
+NET_CONFIG = {
+    "inception_a": [[192, 256, 288], [32, 64, 64]],
+    "inception_b": [288],
+    "inception_c": [[768, 768, 768, 768], [128, 160, 160, 192]],
+    "inception_d": [768],
+    "inception_e": [1280, 2048]
+}
+
+
+class ConvBNLayer(TheseusLayer):
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 padding=0,
+                 groups=1,
+                 act="relu"):
+        super().__init__()
+        self.act = act
+        self.conv = Conv2D(
+            in_channels=num_channels,
+            out_channels=num_filters,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            bias_attr=False)
+        self.bn = BatchNorm(num_filters)
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        if self.act:
+            x = self.relu(x)
+        return x
+
+
+class InceptionStem(TheseusLayer):
+    def __init__(self):
+        super().__init__()
+        self.conv_1a_3x3 = ConvBNLayer(
+            num_channels=3,
+            num_filters=32,
+            filter_size=3,
+            stride=2,
+            act="relu")
+        self.conv_2a_3x3 = ConvBNLayer(
+            num_channels=32,
+            num_filters=32,
+            filter_size=3,
+            stride=1,
+            act="relu")
+        self.conv_2b_3x3 = ConvBNLayer(
+            num_channels=32,
+            num_filters=64,
+            filter_size=3,
+            padding=1,
+            act="relu")
+
+        self.max_pool = MaxPool2D(kernel_size=3, stride=2, padding=0)
+        self.conv_3b_1x1 = ConvBNLayer(
+            num_channels=64, num_filters=80, filter_size=1, act="relu")
+        self.conv_4a_3x3 = ConvBNLayer(
+            num_channels=80, num_filters=192, filter_size=3, act="relu")
+
+    def forward(self, x):
+        x = self.conv_1a_3x3(x)
+        x = self.conv_2a_3x3(x)
+        x = self.conv_2b_3x3(x)
+        x = self.max_pool(x)
+        x = self.conv_3b_1x1(x)
+        x = self.conv_4a_3x3(x)
+        x = self.max_pool(x)
+        return x
+
+
+class InceptionA(TheseusLayer):
+    def __init__(self, num_channels, pool_features):
+        super().__init__()
+        self.branch1x1 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=64,
+            filter_size=1,
+            act="relu")
+        self.branch5x5_1 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=48,
+            filter_size=1,
+            act="relu")
+        self.branch5x5_2 = ConvBNLayer(
+            num_channels=48,
+            num_filters=64,
+            filter_size=5,
+            padding=2,
+            act="relu")
+
+        self.branch3x3dbl_1 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=64,
+            filter_size=1,
+            act="relu")
+        self.branch3x3dbl_2 = ConvBNLayer(
+            num_channels=64,
+            num_filters=96,
+            filter_size=3,
+            padding=1,
+            act="relu")
+        self.branch3x3dbl_3 = ConvBNLayer(
+            num_channels=96,
+            num_filters=96,
+            filter_size=3,
+            padding=1,
+            act="relu")
+        self.branch_pool = AvgPool2D(
+            kernel_size=3, stride=1, padding=1, exclusive=False)
+        self.branch_pool_conv = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=pool_features,
+            filter_size=1,
+            act="relu")
+
+    def forward(self, x):
+        branch1x1 = self.branch1x1(x)
+        branch5x5 = self.branch5x5_1(x)
+        branch5x5 = self.branch5x5_2(branch5x5)
+
+        branch3x3dbl = self.branch3x3dbl_1(x)
+        branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
+        branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
+
+        branch_pool = self.branch_pool(x)
+        branch_pool = self.branch_pool_conv(branch_pool)
+        x = paddle.concat(
+            [branch1x1, branch5x5, branch3x3dbl, branch_pool], axis=1)
+        return x
+
+
+class InceptionB(TheseusLayer):
+    def __init__(self, num_channels):
+        super().__init__()
+        self.branch3x3 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=384,
+            filter_size=3,
+            stride=2,
+            act="relu")
+        self.branch3x3dbl_1 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=64,
+            filter_size=1,
+            act="relu")
+        self.branch3x3dbl_2 = ConvBNLayer(
+            num_channels=64,
+            num_filters=96,
+            filter_size=3,
+            padding=1,
+            act="relu")
+        self.branch3x3dbl_3 = ConvBNLayer(
+            num_channels=96,
+            num_filters=96,
+            filter_size=3,
+            stride=2,
+            act="relu")
+        self.branch_pool = MaxPool2D(kernel_size=3, stride=2)
+
+    def forward(self, x):
+        branch3x3 = self.branch3x3(x)
+
+        branch3x3dbl = self.branch3x3dbl_1(x)
+        branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
+        branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
+
+        branch_pool = self.branch_pool(x)
+
+        x = paddle.concat([branch3x3, branch3x3dbl, branch_pool], axis=1)
+
+        return x
+
+
+class InceptionC(TheseusLayer):
+    def __init__(self, num_channels, channels_7x7):
+        super().__init__()
+        self.branch1x1 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=192,
+            filter_size=1,
+            act="relu")
+
+        self.branch7x7_1 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=channels_7x7,
+            filter_size=1,
+            stride=1,
+            act="relu")
+        self.branch7x7_2 = ConvBNLayer(
+            num_channels=channels_7x7,
+            num_filters=channels_7x7,
+            filter_size=(1, 7),
+            stride=1,
+            padding=(0, 3),
+            act="relu")
+        self.branch7x7_3 = ConvBNLayer(
+            num_channels=channels_7x7,
+            num_filters=192,
+            filter_size=(7, 1),
+            stride=1,
+            padding=(3, 0),
+            act="relu")
+
+        self.branch7x7dbl_1 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=channels_7x7,
+            filter_size=1,
+            act="relu")
+        self.branch7x7dbl_2 = ConvBNLayer(
+            num_channels=channels_7x7,
+            num_filters=channels_7x7,
+            filter_size=(7, 1),
+            padding=(3, 0),
+            act="relu")
+        self.branch7x7dbl_3 = ConvBNLayer(
+            num_channels=channels_7x7,
+            num_filters=channels_7x7,
+            filter_size=(1, 7),
+            padding=(0, 3),
+            act="relu")
+        self.branch7x7dbl_4 = ConvBNLayer(
+            num_channels=channels_7x7,
+            num_filters=channels_7x7,
+            filter_size=(7, 1),
+            padding=(3, 0),
+            act="relu")
+        self.branch7x7dbl_5 = ConvBNLayer(
+            num_channels=channels_7x7,
+            num_filters=192,
+            filter_size=(1, 7),
+            padding=(0, 3),
+            act="relu")
+
+        self.branch_pool = AvgPool2D(
+            kernel_size=3, stride=1, padding=1, exclusive=False)
+        self.branch_pool_conv = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=192,
+            filter_size=1,
+            act="relu")
+
+    def forward(self, x):
+        branch1x1 = self.branch1x1(x)
+
+        branch7x7 = self.branch7x7_1(x)
+        branch7x7 = self.branch7x7_2(branch7x7)
+        branch7x7 = self.branch7x7_3(branch7x7)
+
+        branch7x7dbl = self.branch7x7dbl_1(x)
+        branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl)
+        branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl)
+        branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl)
+        branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl)
+
+        branch_pool = self.branch_pool(x)
+        branch_pool = self.branch_pool_conv(branch_pool)
+
+        x = paddle.concat(
+            [branch1x1, branch7x7, branch7x7dbl, branch_pool], axis=1)
+
+        return x
+
+
+class InceptionD(TheseusLayer):
+    def __init__(self, num_channels):
+        super().__init__()
+        self.branch3x3_1 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=192,
+            filter_size=1,
+            act="relu")
+        self.branch3x3_2 = ConvBNLayer(
+            num_channels=192,
+            num_filters=320,
+            filter_size=3,
+            stride=2,
+            act="relu")
+        self.branch7x7x3_1 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=192,
+            filter_size=1,
+            act="relu")
+        self.branch7x7x3_2 = ConvBNLayer(
+            num_channels=192,
+            num_filters=192,
+            filter_size=(1, 7),
+            padding=(0, 3),
+            act="relu")
+        self.branch7x7x3_3 = ConvBNLayer(
+            num_channels=192,
+            num_filters=192,
+            filter_size=(7, 1),
+            padding=(3, 0),
+            act="relu")
+        self.branch7x7x3_4 = ConvBNLayer(
+            num_channels=192,
+            num_filters=192,
+            filter_size=3,
+            stride=2,
+            act="relu")
+        self.branch_pool = MaxPool2D(kernel_size=3, stride=2)
+
+    def forward(self, x):
+        branch3x3 = self.branch3x3_1(x)
+        branch3x3 = self.branch3x3_2(branch3x3)
+
+        branch7x7x3 = self.branch7x7x3_1(x)
+        branch7x7x3 = self.branch7x7x3_2(branch7x7x3)
+        branch7x7x3 = self.branch7x7x3_3(branch7x7x3)
+        branch7x7x3 = self.branch7x7x3_4(branch7x7x3)
+
+        branch_pool = self.branch_pool(x)
+
+        x = paddle.concat([branch3x3, branch7x7x3, branch_pool], axis=1)
+        return x
+
+
+class InceptionE(TheseusLayer):
+    def __init__(self, num_channels):
+        super().__init__()
+        self.branch1x1 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=320,
+            filter_size=1,
+            act="relu")
+        self.branch3x3_1 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=384,
+            filter_size=1,
+            act="relu")
+        self.branch3x3_2a = ConvBNLayer(
+            num_channels=384,
+            num_filters=384,
+            filter_size=(1, 3),
+            padding=(0, 1),
+            act="relu")
+        self.branch3x3_2b = ConvBNLayer(
+            num_channels=384,
+            num_filters=384,
+            filter_size=(3, 1),
+            padding=(1, 0),
+            act="relu")
+
+        self.branch3x3dbl_1 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=448,
+            filter_size=1,
+            act="relu")
+        self.branch3x3dbl_2 = ConvBNLayer(
+            num_channels=448,
+            num_filters=384,
+            filter_size=3,
+            padding=1,
+            act="relu")
+        self.branch3x3dbl_3a = ConvBNLayer(
+            num_channels=384,
+            num_filters=384,
+            filter_size=(1, 3),
+            padding=(0, 1),
+            act="relu")
+        self.branch3x3dbl_3b = ConvBNLayer(
+            num_channels=384,
+            num_filters=384,
+            filter_size=(3, 1),
+            padding=(1, 0),
+            act="relu")
+        self.branch_pool = AvgPool2D(
+            kernel_size=3, stride=1, padding=1, exclusive=False)
+        self.branch_pool_conv = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=192,
+            filter_size=1,
+            act="relu")
+
+    def forward(self, x):
+        branch1x1 = self.branch1x1(x)
+
+        branch3x3 = self.branch3x3_1(x)
+        branch3x3 = [
+            self.branch3x3_2a(branch3x3),
+            self.branch3x3_2b(branch3x3),
+        ]
+        branch3x3 = paddle.concat(branch3x3, axis=1)
+
+        branch3x3dbl = self.branch3x3dbl_1(x)
+        branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
+        branch3x3dbl = [
+            self.branch3x3dbl_3a(branch3x3dbl),
+            self.branch3x3dbl_3b(branch3x3dbl),
+        ]
+        branch3x3dbl = paddle.concat(branch3x3dbl, axis=1)
+
+        branch_pool = self.branch_pool(x)
+        branch_pool = self.branch_pool_conv(branch_pool)
+
+        x = paddle.concat(
+            [branch1x1, branch3x3, branch3x3dbl, branch_pool], axis=1)
+        return x
+
+
+class Inception_V3(TheseusLayer):
+    """
+    Inception_V3
+    Args:
+        config: dict. config of Inception_V3.
+        class_num: int=1000. The number of classes.
+        pretrained: (True or False) or path of pretrained_model. Whether to load the pretrained model.
+    Returns:
+        model: nn.Layer. Specific Inception_V3 model depends on args.
+    """
+
+    def __init__(self, config, class_num=1000):
+        super().__init__()
+
+        self.inception_a_list = config["inception_a"]
+        self.inception_c_list = config["inception_c"]
+        self.inception_b_list = config["inception_b"]
+        self.inception_d_list = config["inception_d"]
+        self.inception_e_list = config["inception_e"]
+
+        self.inception_stem = InceptionStem()
+
+        self.inception_block_list = nn.LayerList()
+        for i in range(len(self.inception_a_list[0])):
+            inception_a = InceptionA(self.inception_a_list[0][i],
+                                     self.inception_a_list[1][i])
+            self.inception_block_list.append(inception_a)
+
+        for i in range(len(self.inception_b_list)):
+            inception_b = InceptionB(self.inception_b_list[i])
+            self.inception_block_list.append(inception_b)
+
+        for i in range(len(self.inception_c_list[0])):
+            inception_c = InceptionC(self.inception_c_list[0][i],
+                                     self.inception_c_list[1][i])
+            self.inception_block_list.append(inception_c)
+
+        for i in range(len(self.inception_d_list)):
+            inception_d = InceptionD(self.inception_d_list[i])
+            self.inception_block_list.append(inception_d)
+
+        for i in range(len(self.inception_e_list)):
+            inception_e = InceptionE(self.inception_e_list[i])
+            self.inception_block_list.append(inception_e)
+
+        self.avg_pool = AdaptiveAvgPool2D(1)
+        self.dropout = Dropout(p=0.2, mode="downscale_in_infer")
+        stdv = 1.0 / math.sqrt(2048 * 1.0)
+        self.fc = Linear(
+            2048,
+            class_num,
+            weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)),
+            bias_attr=ParamAttr())
+
+    def forward(self, x):
+        x = self.inception_stem(x)
+        for inception_block in self.inception_block_list:
+            x = inception_block(x)
+        x = self.avg_pool(x)
+        x = paddle.reshape(x, shape=[-1, 2048])
+        x = self.dropout(x)
+        x = self.fc(x)
+        return x
+
+
+def _load_pretrained(pretrained, model, model_url, use_ssld):
+    if pretrained is False:
+        pass
+    elif pretrained is True:
+        load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld)
+    elif isinstance(pretrained, str):
+        load_dygraph_pretrain(model, pretrained)
+    else:
+        raise RuntimeError(
+            "pretrained type is not available. Please use `string` or `boolean` type."
+        )
+
+
+def InceptionV3(pretrained=False, use_ssld=False, **kwargs):
+    """
+    InceptionV3
+    Args:
+        pretrained: bool=false or str. if `true` load pretrained parameters, `false` otherwise.
+                    if str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+    Returns:
+        model: nn.Layer. Specific `InceptionV3` model 
+    """
+    model = Inception_V3(NET_CONFIG, **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["InceptionV3"], use_ssld)
+    return model
--- a/ppcls/arch/backbone/legendary_models/mobilenet_v1.py
+++ b/ppcls/arch/backbone/legendary_models/mobilenet_v1.py
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import, division, print_function
+
+from paddle import ParamAttr
+import paddle.nn as nn
+from paddle.nn import Conv2D, BatchNorm, Linear, ReLU, Flatten
+from paddle.nn import AdaptiveAvgPool2D
+from paddle.nn.initializer import KaimingNormal
+
+from ppcls.arch.backbone.base.theseus_layer import TheseusLayer
+from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url
+
+MODEL_URLS = {
+    "MobileNetV1_x0_25":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_x0_25_pretrained.pdparams",
+    "MobileNetV1_x0_5":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_x0_5_pretrained.pdparams",
+    "MobileNetV1_x0_75":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_x0_75_pretrained.pdparams",
+    "MobileNetV1":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV1_pretrained.pdparams"
+}
+
+__all__ = MODEL_URLS.keys()
+
+
+class ConvBNLayer(TheseusLayer):
+    def __init__(self,
+                 num_channels,
+                 filter_size,
+                 num_filters,
+                 stride,
+                 padding,
+                 num_groups=1):
+        super().__init__()
+
+        self.conv = Conv2D(
+            in_channels=num_channels,
+            out_channels=num_filters,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            weight_attr=ParamAttr(initializer=KaimingNormal()),
+            bias_attr=False)
+        self.bn = BatchNorm(num_filters)
+        self.relu = ReLU()
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        x = self.relu(x)
+        return x
+
+
+class DepthwiseSeparable(TheseusLayer):
+    def __init__(self, num_channels, num_filters1, num_filters2, num_groups,
+                 stride, scale):
+        super().__init__()
+
+        self.depthwise_conv = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=int(num_filters1 * scale),
+            filter_size=3,
+            stride=stride,
+            padding=1,
+            num_groups=int(num_groups * scale))
+
+        self.pointwise_conv = ConvBNLayer(
+            num_channels=int(num_filters1 * scale),
+            filter_size=1,
+            num_filters=int(num_filters2 * scale),
+            stride=1,
+            padding=0)
+
+    def forward(self, x):
+        x = self.depthwise_conv(x)
+        x = self.pointwise_conv(x)
+        return x
+
+
+class MobileNet(TheseusLayer):
+    """
+    MobileNet
+    Args:
+        scale: float=1.0. The coefficient that controls the size of network parameters. 
+        class_num: int=1000. The number of classes.
+    Returns:
+        model: nn.Layer. Specific MobileNet model depends on args.
+    """
+
+    def __init__(self, scale=1.0, class_num=1000):
+        super().__init__()
+        self.scale = scale
+
+        self.conv = ConvBNLayer(
+            num_channels=3,
+            filter_size=3,
+            num_filters=int(32 * scale),
+            stride=2,
+            padding=1)
+
+        #num_channels, num_filters1, num_filters2, num_groups, stride
+        self.cfg = [[int(32 * scale), 32, 64, 32, 1],
+                    [int(64 * scale), 64, 128, 64, 2],
+                    [int(128 * scale), 128, 128, 128, 1],
+                    [int(128 * scale), 128, 256, 128, 2],
+                    [int(256 * scale), 256, 256, 256, 1],
+                    [int(256 * scale), 256, 512, 256, 2],
+                    [int(512 * scale), 512, 512, 512, 1],
+                    [int(512 * scale), 512, 512, 512, 1],
+                    [int(512 * scale), 512, 512, 512, 1],
+                    [int(512 * scale), 512, 512, 512, 1],
+                    [int(512 * scale), 512, 512, 512, 1],
+                    [int(512 * scale), 512, 1024, 512, 2],
+                    [int(1024 * scale), 1024, 1024, 1024, 1]]
+
+        self.blocks = nn.Sequential(*[
+            DepthwiseSeparable(
+                num_channels=params[0],
+                num_filters1=params[1],
+                num_filters2=params[2],
+                num_groups=params[3],
+                stride=params[4],
+                scale=scale) for params in self.cfg
+        ])
+
+        self.avg_pool = AdaptiveAvgPool2D(1)
+        self.flatten = Flatten(start_axis=1, stop_axis=-1)
+
+        self.fc = Linear(
+            int(1024 * scale),
+            class_num,
+            weight_attr=ParamAttr(initializer=KaimingNormal()))
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.blocks(x)
+        x = self.avg_pool(x)
+        x = self.flatten(x)
+        x = self.fc(x)
+        return x
+
+
+def _load_pretrained(pretrained, model, model_url, use_ssld):
+    if pretrained is False:
+        pass
+    elif pretrained is True:
+        load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld)
+    elif isinstance(pretrained, str):
+        load_dygraph_pretrain(model, pretrained)
+    else:
+        raise RuntimeError(
+            "pretrained type is not available. Please use `string` or `boolean` type."
+        )
+
+
+def MobileNetV1_x0_25(pretrained=False, use_ssld=False, **kwargs):
+    """
+    MobileNetV1_x0_25
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+    Returns:
+        model: nn.Layer. Specific `MobileNetV1_x0_25` model depends on args.
+    """
+    model = MobileNet(scale=0.25, **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1_x0_25"],
+                     use_ssld)
+    return model
+
+
+def MobileNetV1_x0_5(pretrained=False, use_ssld=False, **kwargs):
+    """
+    MobileNetV1_x0_5
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+    Returns:
+        model: nn.Layer. Specific `MobileNetV1_x0_5` model depends on args.
+    """
+    model = MobileNet(scale=0.5, **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1_x0_5"],
+                     use_ssld)
+    return model
+
+
+def MobileNetV1_x0_75(pretrained=False, use_ssld=False, **kwargs):
+    """
+    MobileNetV1_x0_75
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+    Returns:
+        model: nn.Layer. Specific `MobileNetV1_x0_75` model depends on args.
+    """
+    model = MobileNet(scale=0.75, **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1_x0_75"],
+                     use_ssld)
+    return model
+
+
+def MobileNetV1(pretrained=False, use_ssld=False, **kwargs):
+    """
+    MobileNetV1
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+    Returns:
+        model: nn.Layer. Specific `MobileNetV1` model depends on args.
+    """
+    model = MobileNet(scale=1.0, **kwargs)
+    _load_pretrained(pretrained, model, MODEL_URLS["MobileNetV1"], use_ssld)
+    return model
--- a/ppcls/arch/backbone/legendary_models/mobilenet_v3.py
+++ b/ppcls/arch/backbone/legendary_models/mobilenet_v3.py
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import, division, print_function
+
+import paddle
+import paddle.nn as nn
+from paddle import ParamAttr
+from paddle.nn import AdaptiveAvgPool2D, BatchNorm, Conv2D, Dropout, Linear
+from paddle.regularizer import L2Decay
+from ppcls.arch.backbone.base.theseus_layer import TheseusLayer
+from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url
+
+# Download URLs of the pretrained weights, keyed by the name of the factory
+# function that builds the matching model.
+MODEL_URLS = {
+    "MobileNetV3_small_x0_35":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x0_35_pretrained.pdparams",
+    "MobileNetV3_small_x0_5":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x0_5_pretrained.pdparams",
+    "MobileNetV3_small_x0_75":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x0_75_pretrained.pdparams",
+    "MobileNetV3_small_x1_0":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x1_0_pretrained.pdparams",
+    "MobileNetV3_small_x1_25":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_small_x1_25_pretrained.pdparams",
+    "MobileNetV3_large_x0_35":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x0_35_pretrained.pdparams",
+    "MobileNetV3_large_x0_5":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x0_5_pretrained.pdparams",
+    "MobileNetV3_large_x0_75":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x0_75_pretrained.pdparams",
+    "MobileNetV3_large_x1_0":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x1_0_pretrained.pdparams",
+    "MobileNetV3_large_x1_25":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/MobileNetV3_large_x1_25_pretrained.pdparams",
+}
+
+# Public API: one factory function per entry of MODEL_URLS. A real list is
+# used because `from module import *` indexes `__all__`, which a dict keys
+# view does not support.
+__all__ = list(MODEL_URLS.keys())
+
+# Per-variant block tables: "large" is used by the MobileNetV3_large_* models and
+# "small" by the MobileNetV3_small_* models.
+# Each table is a list; each element (itself a list) describes one depthwise block
+# as [k, exp, c, se, act, s]:
+# k: kernel_size
+# exp: middle channel number in depthwise block
+# c: output channel number in depthwise block
+# se: whether to use SE block
+# act: which activation to use
+# s: stride in depthwise block
+# The channel numbers (exp, c) are multiplied by the model's `scale` and rounded
+# with `_make_divisible` when the blocks are built.
+NET_CONFIG = {
+    "large": [
+        # k, exp, c, se, act, s
+        [3, 16, 16, False, "relu", 1],
+        [3, 64, 24, False, "relu", 2],
+        [3, 72, 24, False, "relu", 1],
+        [5, 72, 40, True, "relu", 2],
+        [5, 120, 40, True, "relu", 1],
+        [5, 120, 40, True, "relu", 1],
+        [3, 240, 80, False, "hardswish", 2],
+        [3, 200, 80, False, "hardswish", 1],
+        [3, 184, 80, False, "hardswish", 1],
+        [3, 184, 80, False, "hardswish", 1],
+        [3, 480, 112, True, "hardswish", 1],
+        [3, 672, 112, True, "hardswish", 1],
+        [5, 672, 160, True, "hardswish", 2],
+        [5, 960, 160, True, "hardswish", 1],
+        [5, 960, 160, True, "hardswish", 1],
+    ],
+    "small": [
+        # k, exp, c, se, act, s
+        [3, 16, 16, True, "relu", 2],
+        [3, 72, 24, False, "relu", 2],
+        [3, 88, 24, False, "relu", 1],
+        [5, 96, 40, True, "hardswish", 2],
+        [5, 240, 40, True, "hardswish", 1],
+        [5, 240, 40, True, "hardswish", 1],
+        [5, 120, 48, True, "hardswish", 1],
+        [5, 144, 48, True, "hardswish", 1],
+        [5, 288, 96, True, "hardswish", 2],
+        [5, 576, 96, True, "hardswish", 1],
+        [5, 576, 96, True, "hardswish", 1],
+    ]
+}
+# output channel number of the first conv in MobileNetV3 (before scaling)
+STEM_CONV_NUMBER = 16
+# output channel number of the penultimate conv for "small" (before scaling)
+LAST_SECOND_CONV_SMALL = 576
+# output channel number of the penultimate conv for "large" (before scaling)
+LAST_SECOND_CONV_LARGE = 960
+# output channel number of the last conv for "large" and "small" (not scaled)
+LAST_CONV = 1280
+
+
+def _make_divisible(v, divisor=8, min_value=None):
+    """
+    Round `v` to a nearby multiple of `divisor`.
+    Args:
+        v: number. The value to round, e.g. a scaled channel number.
+        divisor: int=8. The rounding granularity.
+        min_value: number=None. Lower bound of the result; `divisor` is used when `None`.
+    Returns:
+        The rounded value. One extra `divisor` is added when the rounded value
+        would otherwise be below 90% of `v`.
+    """
+    lower_bound = divisor if min_value is None else min_value
+    nearest = int(v + divisor / 2) // divisor * divisor
+    rounded = max(lower_bound, nearest)
+    # Never let rounding shrink the value by more than 10%.
+    return rounded + divisor if rounded < 0.9 * v else rounded
+
+
+def _create_act(act):
+    """
+    Build the activation layer named by `act`.
+    Args:
+        act: str or None. "hardswish", "relu" or `None`.
+    Returns:
+        nn.Hardswish() or nn.ReLU() for the matching name, `None` when `act` is `None`.
+    Raises:
+        RuntimeError: if `act` is any other value.
+    """
+    if act is None:
+        return None
+    if act == "relu":
+        return nn.ReLU()
+    if act == "hardswish":
+        return nn.Hardswish()
+    raise RuntimeError(
+        "The activation function is not supported: {}".format(act))
+
+
+class MobileNetV3(TheseusLayer):
+    """
+    MobileNetV3
+    Args:
+        config: list. MobileNetV3 depthwise blocks config; each entry is [k, exp, c, se, act, s].
+        scale: float=1.0. The coefficient that controls the size of network parameters.
+        class_num: int=1000. The number of classes.
+        inplanes: int=16. The output channel number of first convolution layer (before scaling).
+        class_squeeze: int=960. The output channel number of penultimate convolution layer (before scaling).
+        class_expand: int=1280. The output channel number of last convolution layer (not scaled).
+        dropout_prob: float=0.2.  Probability of setting units to zero.
+    Returns:
+        model: nn.Layer. Specific MobileNetV3 model depends on args.
+    """
+
+    def __init__(self,
+                 config,
+                 scale=1.0,
+                 class_num=1000,
+                 inplanes=STEM_CONV_NUMBER,
+                 class_squeeze=LAST_SECOND_CONV_LARGE,
+                 class_expand=LAST_CONV,
+                 dropout_prob=0.2):
+        super().__init__()
+
+        self.cfg = config
+        self.scale = scale
+        self.inplanes = inplanes
+        self.class_squeeze = class_squeeze
+        self.class_expand = class_expand
+        self.class_num = class_num
+
+        # Every scaled channel number is rounded by `_make_divisible`.
+        stem_c = _make_divisible(self.inplanes * self.scale)
+        self.conv = ConvBNLayer(
+            in_c=3,
+            out_c=stem_c,
+            filter_size=3,
+            stride=2,
+            padding=1,
+            num_groups=1,
+            if_act=True,
+            act="hardswish")
+
+        # Each unit consumes the output channels of the previous one; the
+        # first unit consumes the stem output.
+        units = []
+        prev_c = stem_c
+        for k, exp, c, se, act, s in self.cfg:
+            unit_c = _make_divisible(self.scale * c)
+            units.append(
+                ResidualUnit(
+                    in_c=prev_c,
+                    mid_c=_make_divisible(self.scale * exp),
+                    out_c=unit_c,
+                    filter_size=k,
+                    stride=s,
+                    use_se=se,
+                    act=act))
+            prev_c = unit_c
+        self.blocks = nn.Sequential(*units)
+
+        squeeze_c = _make_divisible(self.scale * self.class_squeeze)
+        self.last_second_conv = ConvBNLayer(
+            in_c=_make_divisible(self.cfg[-1][2] * self.scale),
+            out_c=squeeze_c,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            num_groups=1,
+            if_act=True,
+            act="hardswish")
+
+        self.avg_pool = AdaptiveAvgPool2D(1)
+
+        self.last_conv = Conv2D(
+            in_channels=squeeze_c,
+            out_channels=self.class_expand,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias_attr=False)
+
+        self.hardswish = nn.Hardswish()
+        self.dropout = Dropout(p=dropout_prob, mode="downscale_in_infer")
+        self.flatten = nn.Flatten(start_axis=1, stop_axis=-1)
+
+        self.fc = Linear(self.class_expand, class_num)
+
+    def forward(self, x):
+        # Feature extractor: stem conv, residual units, penultimate conv.
+        feat = self.last_second_conv(self.blocks(self.conv(x)))
+        # Classifier head: pool, 1x1 conv, hardswish, dropout, flatten, fc.
+        head = self.hardswish(self.last_conv(self.avg_pool(feat)))
+        head = self.flatten(self.dropout(head))
+        logits = self.fc(head)
+
+        return logits
+
+
+class ConvBNLayer(TheseusLayer):
+    """
+    Conv2D (no bias) -> BatchNorm -> optional activation.
+    Args:
+        in_c: int. Input channel number of the convolution.
+        out_c: int. Output channel number of the convolution and of the BatchNorm.
+        filter_size: int. Kernel size of the convolution.
+        stride: int. Stride of the convolution.
+        padding: int. Padding of the convolution.
+        num_groups: int=1. Groups of the convolution.
+        if_act: bool=True. Whether to apply the activation after the BatchNorm.
+        act: str=None. Activation name understood by `_create_act`
+             ("hardswish", "relu" or `None`).
+    """
+
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 padding,
+                 num_groups=1,
+                 if_act=True,
+                 act=None):
+        super().__init__()
+
+        self.conv = Conv2D(
+            in_channels=in_c,
+            out_channels=out_c,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            bias_attr=False)
+        # L2Decay(0.0) is set on both the BatchNorm scale and bias.
+        self.bn = BatchNorm(
+            num_channels=out_c,
+            act=None,
+            param_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+        self.if_act = if_act
+        self.act = _create_act(act)
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        # `_create_act(None)` yields None (and the defaults are if_act=True,
+        # act=None), so only call the activation when one was created.
+        if self.if_act and self.act is not None:
+            x = self.act(x)
+        return x
+
+
+class ResidualUnit(TheseusLayer):
+    """
+    One MobileNetV3 block: 1x1 expand conv -> depthwise conv -> optional SE
+    module -> 1x1 linear conv, with an identity shortcut when possible.
+    Args:
+        in_c: int. Input channel number.
+        mid_c: int. Channel number of the expanded (depthwise) part.
+        out_c: int. Output channel number.
+        filter_size: int. Kernel size of the depthwise convolution.
+        stride: int. Stride of the depthwise convolution.
+        use_se: bool. Whether to insert an `SEModule` after the depthwise convolution.
+        act: str=None. Activation name used by the expand and depthwise convolutions.
+    """
+
+    def __init__(self,
+                 in_c,
+                 mid_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 use_se,
+                 act=None):
+        super().__init__()
+        # The input can only be added back when the shape is left unchanged.
+        self.if_shortcut = stride == 1 and in_c == out_c
+        self.if_se = use_se
+
+        self.expand_conv = ConvBNLayer(
+            in_c=in_c,
+            out_c=mid_c,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            if_act=True,
+            act=act)
+        # num_groups == channel number makes this convolution depthwise.
+        depthwise_pad = int((filter_size - 1) // 2)
+        self.bottleneck_conv = ConvBNLayer(
+            in_c=mid_c,
+            out_c=mid_c,
+            filter_size=filter_size,
+            stride=stride,
+            padding=depthwise_pad,
+            num_groups=mid_c,
+            if_act=True,
+            act=act)
+        if self.if_se:
+            self.mid_se = SEModule(mid_c)
+        # No activation on the projection back to out_c channels.
+        self.linear_conv = ConvBNLayer(
+            in_c=mid_c,
+            out_c=out_c,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            if_act=False,
+            act=None)
+
+    def forward(self, x):
+        out = self.bottleneck_conv(self.expand_conv(x))
+        if self.if_se:
+            out = self.mid_se(out)
+        out = self.linear_conv(out)
+        if self.if_shortcut:
+            out = paddle.add(x, out)
+        return out
+
+
+# nn.Hardsigmoid can't pass "slope" and "offset" on to
+# nn.functional.hardsigmoid, so this layer stores them and calls it directly.
+class Hardsigmoid(TheseusLayer):
+    """
+    Layer wrapper around `nn.functional.hardsigmoid`.
+    Args:
+        slope: float=0.2. Forwarded as `slope`.
+        offset: float=0.5. Forwarded as `offset`.
+    """
+
+    def __init__(self, slope=0.2, offset=0.5):
+        super().__init__()
+        self.slope, self.offset = slope, offset
+
+    def forward(self, x):
+        out = nn.functional.hardsigmoid(
+            x, slope=self.slope, offset=self.offset)
+        return out
+
+
+class SEModule(TheseusLayer):
+    """
+    SE module: the input is multiplied by a gate computed from its own
+    globally pooled features.
+    Args:
+        channel: int. Channel number of the input (and output).
+        reduction: int=4. The hidden 1x1 conv has `channel // reduction` channels.
+    """
+
+    def __init__(self, channel, reduction=4):
+        super().__init__()
+        hidden_c = channel // reduction
+        self.avg_pool = AdaptiveAvgPool2D(1)
+        self.conv1 = Conv2D(
+            in_channels=channel,
+            out_channels=hidden_c,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+        self.relu = nn.ReLU()
+        self.conv2 = Conv2D(
+            in_channels=hidden_c,
+            out_channels=channel,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+        self.hardsigmoid = Hardsigmoid(slope=0.2, offset=0.5)
+
+    def forward(self, x):
+        # pool -> conv1 -> relu -> conv2 -> hardsigmoid gives the gate.
+        gate = self.relu(self.conv1(self.avg_pool(x)))
+        gate = self.hardsigmoid(self.conv2(gate))
+        return paddle.multiply(x=x, y=gate)
+
+
+def _load_pretrained(pretrained, model, model_url, use_ssld):
+    """
+    Optionally load pretrained weights into `model`.
+    Args:
+        pretrained: bool or str. `False` loads nothing, `True` loads from `model_url`,
+                    a str is passed to `load_dygraph_pretrain` as the weights location.
+        model: the model to load the weights into.
+        model_url: str. Used only when `pretrained` is `True`.
+        use_ssld: forwarded to `load_dygraph_pretrain_from_url` when `pretrained` is `True`.
+    Raises:
+        RuntimeError: if `pretrained` is neither a bool nor a str.
+    """
+    if isinstance(pretrained, bool):
+        # bool cannot be subclassed, so this covers exactly `True` and `False`.
+        if pretrained:
+            load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld)
+        return
+    if isinstance(pretrained, str):
+        load_dygraph_pretrain(model, pretrained)
+        return
+    raise RuntimeError(
+        "pretrained type is not available. Please use `string` or `boolean` type."
+    )
+
+
+def MobileNetV3_small_x0_35(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build MobileNetV3_small_x0_35: the "small" block config with scale 0.35.
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+        **kwargs: forwarded to the `MobileNetV3` constructor.
+    Returns:
+        model: nn.Layer. Specific `MobileNetV3_small_x0_35` model depends on args.
+    """
+    arch = "MobileNetV3_small_x0_35"
+    net = MobileNetV3(
+        config=NET_CONFIG["small"],
+        scale=0.35,
+        class_squeeze=LAST_SECOND_CONV_SMALL,
+        **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS[arch], use_ssld)
+    return net
+
+
+def MobileNetV3_small_x0_5(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build MobileNetV3_small_x0_5: the "small" block config with scale 0.5.
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+        **kwargs: forwarded to the `MobileNetV3` constructor.
+    Returns:
+        model: nn.Layer. Specific `MobileNetV3_small_x0_5` model depends on args.
+    """
+    arch = "MobileNetV3_small_x0_5"
+    net = MobileNetV3(
+        config=NET_CONFIG["small"],
+        scale=0.5,
+        class_squeeze=LAST_SECOND_CONV_SMALL,
+        **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS[arch], use_ssld)
+    return net
+
+
+def MobileNetV3_small_x0_75(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build MobileNetV3_small_x0_75: the "small" block config with scale 0.75.
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+        **kwargs: forwarded to the `MobileNetV3` constructor.
+    Returns:
+        model: nn.Layer. Specific `MobileNetV3_small_x0_75` model depends on args.
+    """
+    arch = "MobileNetV3_small_x0_75"
+    net = MobileNetV3(
+        config=NET_CONFIG["small"],
+        scale=0.75,
+        class_squeeze=LAST_SECOND_CONV_SMALL,
+        **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS[arch], use_ssld)
+    return net
+
+
+def MobileNetV3_small_x1_0(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build MobileNetV3_small_x1_0: the "small" block config with scale 1.0.
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+        **kwargs: forwarded to the `MobileNetV3` constructor.
+    Returns:
+        model: nn.Layer. Specific `MobileNetV3_small_x1_0` model depends on args.
+    """
+    arch = "MobileNetV3_small_x1_0"
+    net = MobileNetV3(
+        config=NET_CONFIG["small"],
+        scale=1.0,
+        class_squeeze=LAST_SECOND_CONV_SMALL,
+        **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS[arch], use_ssld)
+    return net
+
+
+def MobileNetV3_small_x1_25(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build MobileNetV3_small_x1_25: the "small" block config with scale 1.25.
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+        **kwargs: forwarded to the `MobileNetV3` constructor.
+    Returns:
+        model: nn.Layer. Specific `MobileNetV3_small_x1_25` model depends on args.
+    """
+    arch = "MobileNetV3_small_x1_25"
+    net = MobileNetV3(
+        config=NET_CONFIG["small"],
+        scale=1.25,
+        class_squeeze=LAST_SECOND_CONV_SMALL,
+        **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS[arch], use_ssld)
+    return net
+
+
+def MobileNetV3_large_x0_35(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build MobileNetV3_large_x0_35: the "large" block config with scale 0.35.
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+        **kwargs: forwarded to the `MobileNetV3` constructor.
+    Returns:
+        model: nn.Layer. Specific `MobileNetV3_large_x0_35` model depends on args.
+    """
+    arch = "MobileNetV3_large_x0_35"
+    net = MobileNetV3(
+        config=NET_CONFIG["large"],
+        scale=0.35,
+        class_squeeze=LAST_SECOND_CONV_LARGE,
+        **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS[arch], use_ssld)
+    return net
+
+
+def MobileNetV3_large_x0_5(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build MobileNetV3_large_x0_5: the "large" block config with scale 0.5.
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+        **kwargs: forwarded to the `MobileNetV3` constructor.
+    Returns:
+        model: nn.Layer. Specific `MobileNetV3_large_x0_5` model depends on args.
+    """
+    arch = "MobileNetV3_large_x0_5"
+    net = MobileNetV3(
+        config=NET_CONFIG["large"],
+        scale=0.5,
+        class_squeeze=LAST_SECOND_CONV_LARGE,
+        **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS[arch], use_ssld)
+    return net
+
+
+def MobileNetV3_large_x0_75(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build MobileNetV3_large_x0_75: the "large" block config with scale 0.75.
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+        **kwargs: forwarded to the `MobileNetV3` constructor.
+    Returns:
+        model: nn.Layer. Specific `MobileNetV3_large_x0_75` model depends on args.
+    """
+    arch = "MobileNetV3_large_x0_75"
+    net = MobileNetV3(
+        config=NET_CONFIG["large"],
+        scale=0.75,
+        class_squeeze=LAST_SECOND_CONV_LARGE,
+        **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS[arch], use_ssld)
+    return net
+
+
+def MobileNetV3_large_x1_0(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build MobileNetV3_large_x1_0: the "large" block config with scale 1.0.
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+        **kwargs: forwarded to the `MobileNetV3` constructor.
+    Returns:
+        model: nn.Layer. Specific `MobileNetV3_large_x1_0` model depends on args.
+    """
+    arch = "MobileNetV3_large_x1_0"
+    net = MobileNetV3(
+        config=NET_CONFIG["large"],
+        scale=1.0,
+        class_squeeze=LAST_SECOND_CONV_LARGE,
+        **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS[arch], use_ssld)
+    return net
+
+
+def MobileNetV3_large_x1_25(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build MobileNetV3_large_x1_25: the "large" block config with scale 1.25.
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+        **kwargs: forwarded to the `MobileNetV3` constructor.
+    Returns:
+        model: nn.Layer. Specific `MobileNetV3_large_x1_25` model depends on args.
+    """
+    arch = "MobileNetV3_large_x1_25"
+    net = MobileNetV3(
+        config=NET_CONFIG["large"],
+        scale=1.25,
+        class_squeeze=LAST_SECOND_CONV_LARGE,
+        **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS[arch], use_ssld)
+    return net
--- a/ppcls/arch/backbone/legendary_models/resnet.py
+++ b/ppcls/arch/backbone/legendary_models/resnet.py
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import, division, print_function
+
+import numpy as np
+import paddle
+from paddle import ParamAttr
+import paddle.nn as nn
+from paddle.nn import Conv2D, BatchNorm, Linear
+from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D
+from paddle.nn.initializer import Uniform
+import math
+
+from ppcls.arch.backbone.base.theseus_layer import TheseusLayer
+from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url
+
+# Download URLs of the pretrained weights, keyed by the name of the factory
+# function that builds the matching model.
+MODEL_URLS = {
+    "ResNet18":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_pretrained.pdparams",
+    "ResNet18_vd":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet18_vd_pretrained.pdparams",
+    "ResNet34":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_pretrained.pdparams",
+    "ResNet34_vd":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet34_vd_pretrained.pdparams",
+    "ResNet50":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_pretrained.pdparams",
+    "ResNet50_vd":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet50_vd_pretrained.pdparams",
+    "ResNet101":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_pretrained.pdparams",
+    "ResNet101_vd":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet101_vd_pretrained.pdparams",
+    "ResNet152":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_pretrained.pdparams",
+    "ResNet152_vd":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet152_vd_pretrained.pdparams",
+    "ResNet200_vd":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/ResNet200_vd_pretrained.pdparams",
+}
+
+# Public API: one factory function per entry of MODEL_URLS. A real list is
+# used because `from module import *` indexes `__all__`, which a dict keys
+# view does not support.
+__all__ = list(MODEL_URLS.keys())
+'''
+ResNet config: dict.
+    key: depth of ResNet.
+    values: config's dict of specific model.
+        keys:
+            block_type: Two different blocks in ResNet, BasicBlock and BottleneckBlock are optional.
+            block_depth: The number of blocks in different stages in ResNet.
+            num_channels: The number of channels to enter the next stage.
+'''
+# How the `ResNet` class in this file consumes each entry:
+#   block_type:   name of the block class, looked up in this module's globals.
+#   block_depth:  number of blocks built for each of the four stages.
+#   num_channels: input channel number of the first block of each stage; the
+#                 last value times 2 is also the input size of the final fc.
+NET_CONFIG = {
+    "18": {
+        "block_type": "BasicBlock",
+        "block_depth": [2, 2, 2, 2],
+        "num_channels": [64, 64, 128, 256]
+    },
+    "34": {
+        "block_type": "BasicBlock",
+        "block_depth": [3, 4, 6, 3],
+        "num_channels": [64, 64, 128, 256]
+    },
+    "50": {
+        "block_type": "BottleneckBlock",
+        "block_depth": [3, 4, 6, 3],
+        "num_channels": [64, 256, 512, 1024]
+    },
+    "101": {
+        "block_type": "BottleneckBlock",
+        "block_depth": [3, 4, 23, 3],
+        "num_channels": [64, 256, 512, 1024]
+    },
+    "152": {
+        "block_type": "BottleneckBlock",
+        "block_depth": [3, 8, 36, 3],
+        "num_channels": [64, 256, 512, 1024]
+    },
+    "200": {
+        "block_type": "BottleneckBlock",
+        "block_depth": [3, 12, 48, 3],
+        "num_channels": [64, 256, 512, 1024]
+    },
+}
+
+
+class ConvBNLayer(TheseusLayer):
+    """
+    Optional 2x2 average pooling (vd mode) -> Conv2D (no bias) -> BatchNorm
+    -> optional ReLU.
+    Args:
+        num_channels: int. Input channel number of the convolution.
+        num_filters: int. Output channel number of the convolution.
+        filter_size: int. Kernel size; the padding is `(filter_size - 1) // 2`.
+        stride: int=1. Stride of the convolution.
+        groups: int=1. Groups of the convolution.
+        is_vd_mode: bool=False. Whether to average-pool the input before the convolution.
+        act: str=None. Any truthy value enables the ReLU after the BatchNorm.
+        lr_mult: float=1.0. Learning rate set on the conv weight and the BatchNorm scale and bias.
+    """
+
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 stride=1,
+                 groups=1,
+                 is_vd_mode=False,
+                 act=None,
+                 lr_mult=1.0):
+        super().__init__()
+        self.is_vd_mode = is_vd_mode
+        self.act = act
+        # The pooling and ReLU layers are always created; `forward` decides
+        # whether they are used.
+        self.avg_pool = AvgPool2D(
+            kernel_size=2, stride=2, padding=0, ceil_mode=True)
+        conv_pad = (filter_size - 1) // 2
+        self.conv = Conv2D(
+            in_channels=num_channels,
+            out_channels=num_filters,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=conv_pad,
+            groups=groups,
+            weight_attr=ParamAttr(learning_rate=lr_mult),
+            bias_attr=False)
+        self.bn = BatchNorm(
+            num_filters,
+            param_attr=ParamAttr(learning_rate=lr_mult),
+            bias_attr=ParamAttr(learning_rate=lr_mult))
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        if self.is_vd_mode:
+            x = self.avg_pool(x)
+        out = self.bn(self.conv(x))
+        if self.act:
+            out = self.relu(out)
+        return out
+
+
+class BottleneckBlock(TheseusLayer):
+    """
+    Bottleneck residual block: 1x1 conv -> 3x3 conv -> 1x1 conv (4x the
+    filters), added to the shortcut and passed through ReLU.
+    Args:
+        num_channels: int. Input channel number.
+        num_filters: int. Channel number of the first two convolutions; the
+                     block outputs `num_filters * 4` channels.
+        stride: int. Stride of the 3x3 convolution.
+        shortcut: bool=True. `True` adds the input itself; `False` adds the
+                  input projected by a 1x1 `ConvBNLayer`.
+        if_first: bool=False. When `True` the projection uses `stride` and no
+                  pooling; otherwise it uses stride 1 with vd-mode pooling.
+        lr_mult: float=1.0. Learning rate passed to every `ConvBNLayer`.
+    """
+
+    def __init__(
+            self,
+            num_channels,
+            num_filters,
+            stride,
+            shortcut=True,
+            if_first=False,
+            lr_mult=1.0, ):
+        super().__init__()
+
+        out_filters = num_filters * 4
+        self.conv0 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=1,
+            act="relu",
+            lr_mult=lr_mult)
+        self.conv1 = ConvBNLayer(
+            num_channels=num_filters,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            act="relu",
+            lr_mult=lr_mult)
+        self.conv2 = ConvBNLayer(
+            num_channels=num_filters,
+            num_filters=out_filters,
+            filter_size=1,
+            act=None,
+            lr_mult=lr_mult)
+
+        if not shortcut:
+            # Projection used when the input cannot be added back directly.
+            self.short = ConvBNLayer(
+                num_channels=num_channels,
+                num_filters=out_filters,
+                filter_size=1,
+                stride=stride if if_first else 1,
+                is_vd_mode=not if_first,
+                lr_mult=lr_mult)
+        self.relu = nn.ReLU()
+        self.shortcut = shortcut
+
+    def forward(self, x):
+        residual = self.conv2(self.conv1(self.conv0(x)))
+
+        short = x
+        if not self.shortcut:
+            short = self.short(x)
+        out = paddle.add(x=residual, y=short)
+        return self.relu(out)
+
+
+class BasicBlock(TheseusLayer):
+    """
+    Basic residual block: two 3x3 convolutions, added to the shortcut and
+    passed through ReLU.
+    Args:
+        num_channels: int. Input channel number.
+        num_filters: int. Output channel number of both convolutions.
+        stride: int. Stride of the first convolution.
+        shortcut: bool=True. `True` adds the input itself; `False` adds the
+                  input projected by a 1x1 `ConvBNLayer`.
+        if_first: bool=False. When `True` the projection uses `stride` and no
+                  pooling; otherwise it uses stride 1 with vd-mode pooling.
+        lr_mult: float=1.0. Learning rate passed to every `ConvBNLayer`.
+    """
+
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 stride,
+                 shortcut=True,
+                 if_first=False,
+                 lr_mult=1.0):
+        super().__init__()
+
+        self.stride = stride
+        self.conv0 = ConvBNLayer(
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            act="relu",
+            lr_mult=lr_mult)
+        self.conv1 = ConvBNLayer(
+            num_channels=num_filters,
+            num_filters=num_filters,
+            filter_size=3,
+            act=None,
+            lr_mult=lr_mult)
+        if not shortcut:
+            # Projection used when the input cannot be added back directly.
+            self.short = ConvBNLayer(
+                num_channels=num_channels,
+                num_filters=num_filters,
+                filter_size=1,
+                stride=stride if if_first else 1,
+                is_vd_mode=not if_first,
+                lr_mult=lr_mult)
+        self.shortcut = shortcut
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        residual = self.conv1(self.conv0(x))
+        short = x
+        if not self.shortcut:
+            short = self.short(x)
+        out = paddle.add(x=residual, y=short)
+        return self.relu(out)
+
+
+class ResNet(TheseusLayer):
+    """
+    ResNet
+    Args:
+        config: dict. config of ResNet; must provide "block_type", "block_depth"
+                and "num_channels" (see NET_CONFIG).
+        version: str="vb". Different version of ResNet, version vd can perform better.
+                 Must be "vb" or "vd"; any other value raises KeyError when the stem is built.
+        class_num: int=1000. The number of classes.
+        lr_mult_list: list or tuple of length 5, or None. Control the learning rate of
+                      different stages: index 0 is used by the stem, indices 1-4 by the
+                      four block stages. `None` (the default) means 1.0 everywhere.
+    Returns:
+        model: nn.Layer. Specific ResNet model depends on args.
+    """
+
+    def __init__(self,
+                 config,
+                 version="vb",
+                 class_num=1000,
+                 lr_mult_list=None):
+        super().__init__()
+
+        # Build a fresh list per instance: a list literal used as the default
+        # would be one object shared (through self.lr_mult_list) by every
+        # model constructed without this argument.
+        if lr_mult_list is None:
+            lr_mult_list = [1.0, 1.0, 1.0, 1.0, 1.0]
+
+        self.cfg = config
+        self.lr_mult_list = lr_mult_list
+        self.is_vd_mode = version == "vd"
+        self.class_num = class_num
+        self.num_filters = [64, 128, 256, 512]
+        self.block_depth = self.cfg["block_depth"]
+        self.block_type = self.cfg["block_type"]
+        self.num_channels = self.cfg["num_channels"]
+        # Output channels of a block relative to its num_filters: 1 for the
+        # configs whose last num_channels entry is 256, 4 otherwise.
+        self.channels_mult = 1 if self.num_channels[-1] == 256 else 4
+
+        assert isinstance(self.lr_mult_list, (
+            list, tuple
+        )), "lr_mult_list should be in (list, tuple) but got {}".format(
+            type(self.lr_mult_list))
+        assert len(self.lr_mult_list
+                   ) == 5, "lr_mult_list length should be 5 but got {}".format(
+                       len(self.lr_mult_list))
+
+        self.stem_cfg = {
+            #num_channels, num_filters, filter_size, stride
+            "vb": [[3, 64, 7, 2]],
+            "vd": [[3, 32, 3, 2], [32, 32, 3, 1], [32, 64, 3, 1]]
+        }
+
+        self.stem = nn.Sequential(*[
+            ConvBNLayer(
+                num_channels=in_c,
+                num_filters=out_c,
+                filter_size=k,
+                stride=s,
+                act="relu",
+                lr_mult=self.lr_mult_list[0])
+            for in_c, out_c, k, s in self.stem_cfg[version]
+        ])
+
+        self.max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
+        block_list = []
+        for block_idx, depth in enumerate(self.block_depth):
+            for i in range(depth):
+                # The block class is resolved by name from this module.
+                block_list.append(globals()[self.block_type](
+                    # Only the first block of a stage sees the previous
+                    # stage's channel number.
+                    num_channels=self.num_channels[block_idx] if i == 0 else
+                    self.num_filters[block_idx] * self.channels_mult,
+                    num_filters=self.num_filters[block_idx],
+                    # Every stage except the first downsamples in its first block.
+                    stride=2 if i == 0 and block_idx != 0 else 1,
+                    # The first block of each stage projects its shortcut;
+                    # the remaining blocks add the input directly.
+                    shortcut=i != 0,
+                    if_first=(block_idx == i == 0)
+                    if version == "vd" else True,
+                    lr_mult=self.lr_mult_list[block_idx + 1]))
+        self.blocks = nn.Sequential(*block_list)
+
+        self.avg_pool = AdaptiveAvgPool2D(1)
+        self.flatten = nn.Flatten()
+        self.avg_pool_channels = self.num_channels[-1] * 2
+        stdv = 1.0 / math.sqrt(self.avg_pool_channels * 1.0)
+        self.fc = Linear(
+            self.avg_pool_channels,
+            self.class_num,
+            weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)))
+
+    def forward(self, x):
+        x = self.stem(x)
+        x = self.max_pool(x)
+        x = self.blocks(x)
+        x = self.avg_pool(x)
+        x = self.flatten(x)
+        x = self.fc(x)
+        return x
+
+
+def _load_pretrained(pretrained, model, model_url, use_ssld):
+    """
+    Optionally load pretrained weights into `model`.
+    Args:
+        pretrained: bool or str. `False` loads nothing, `True` loads from `model_url`,
+                    a str is passed to `load_dygraph_pretrain` as the weights location.
+        model: the model to load the weights into.
+        model_url: str. Used only when `pretrained` is `True`.
+        use_ssld: forwarded to `load_dygraph_pretrain_from_url` when `pretrained` is `True`.
+    Raises:
+        RuntimeError: if `pretrained` is neither a bool nor a str.
+    """
+    if isinstance(pretrained, bool):
+        # bool cannot be subclassed, so this covers exactly `True` and `False`.
+        if pretrained:
+            load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld)
+        return
+    if isinstance(pretrained, str):
+        load_dygraph_pretrain(model, pretrained)
+        return
+    raise RuntimeError(
+        "pretrained type is not available. Please use `string` or `boolean` type."
+    )
+
+
+def ResNet18(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build ResNet18: the "18" config with version "vb".
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+        **kwargs: forwarded to the `ResNet` constructor.
+    Returns:
+        model: nn.Layer. Specific `ResNet18` model depends on args.
+    """
+    arch = "ResNet18"
+    net = ResNet(config=NET_CONFIG["18"], version="vb", **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS[arch], use_ssld)
+    return net
+
+
+def ResNet18_vd(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build ResNet18_vd: the "18" config with version "vd".
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+        **kwargs: forwarded to the `ResNet` constructor.
+    Returns:
+        model: nn.Layer. Specific `ResNet18_vd` model depends on args.
+    """
+    arch = "ResNet18_vd"
+    net = ResNet(config=NET_CONFIG["18"], version="vd", **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS[arch], use_ssld)
+    return net
+
+
+def ResNet34(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build ResNet34: the "34" config with version "vb".
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+        **kwargs: forwarded to the `ResNet` constructor.
+    Returns:
+        model: nn.Layer. Specific `ResNet34` model depends on args.
+    """
+    arch = "ResNet34"
+    net = ResNet(config=NET_CONFIG["34"], version="vb", **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS[arch], use_ssld)
+    return net
+
+
+def ResNet34_vd(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build ResNet34_vd: the "34" config with version "vd".
+    Args:
+        pretrained: bool=False or str. If `True` load pretrained parameters, `False` otherwise.
+                    If str, means the path of the pretrained model.
+        use_ssld: bool=False. Whether using distillation pretrained model when pretrained=True.
+        **kwargs: forwarded to the `ResNet` constructor.
+    Returns:
+        model: nn.Layer. Specific `ResNet34_vd` model depends on args.
+    """
+    arch = "ResNet34_vd"
+    net = ResNet(config=NET_CONFIG["34"], version="vd", **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS[arch], use_ssld)
+    return net
+
+
+def ResNet50(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build ResNet50: `ResNet` with NET_CONFIG["50"], version "vb" and `kwargs`.
+    Args:
+        pretrained: bool or str, default False. `True` loads pretrained parameters,
+                    `False` loads nothing; a str is the path of the pretrained model.
+        use_ssld: bool, default False. Use the distillation pretrained model when pretrained=True.
+    Returns:
+        nn.Layer: the `ResNet50` model built from the arguments above.
+    """
+    net = ResNet(config=NET_CONFIG["50"], version="vb", **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS["ResNet50"], use_ssld)
+    return net
+
+
+def ResNet50_vd(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build ResNet50_vd: `ResNet` with NET_CONFIG["50"], version "vd" and `kwargs`.
+    Args:
+        pretrained: bool or str, default False. `True` loads pretrained parameters,
+                    `False` loads nothing; a str is the path of the pretrained model.
+        use_ssld: bool, default False. Use the distillation pretrained model when pretrained=True.
+    Returns:
+        nn.Layer: the `ResNet50_vd` model built from the arguments above.
+    """
+    net = ResNet(config=NET_CONFIG["50"], version="vd", **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS["ResNet50_vd"], use_ssld)
+    return net
+
+
+def ResNet101(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build ResNet101: `ResNet` with NET_CONFIG["101"], version "vb" and `kwargs`.
+    Args:
+        pretrained: bool or str, default False. `True` loads pretrained parameters,
+                    `False` loads nothing; a str is the path of the pretrained model.
+        use_ssld: bool, default False. Use the distillation pretrained model when pretrained=True.
+    Returns:
+        nn.Layer: the `ResNet101` model built from the arguments above.
+    """
+    net = ResNet(config=NET_CONFIG["101"], version="vb", **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS["ResNet101"], use_ssld)
+    return net
+
+
+def ResNet101_vd(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build ResNet101_vd: `ResNet` with NET_CONFIG["101"], version "vd" and `kwargs`.
+    Args:
+        pretrained: bool or str, default False. `True` loads pretrained parameters,
+                    `False` loads nothing; a str is the path of the pretrained model.
+        use_ssld: bool, default False. Use the distillation pretrained model when pretrained=True.
+    Returns:
+        nn.Layer: the `ResNet101_vd` model built from the arguments above.
+    """
+    net = ResNet(config=NET_CONFIG["101"], version="vd", **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS["ResNet101_vd"], use_ssld)
+    return net
+
+
+def ResNet152(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build ResNet152: `ResNet` with NET_CONFIG["152"], version "vb" and `kwargs`.
+    Args:
+        pretrained: bool or str, default False. `True` loads pretrained parameters,
+                    `False` loads nothing; a str is the path of the pretrained model.
+        use_ssld: bool, default False. Use the distillation pretrained model when pretrained=True.
+    Returns:
+        nn.Layer: the `ResNet152` model built from the arguments above.
+    """
+    net = ResNet(config=NET_CONFIG["152"], version="vb", **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS["ResNet152"], use_ssld)
+    return net
+
+
+def ResNet152_vd(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build ResNet152_vd: `ResNet` with NET_CONFIG["152"], version "vd" and `kwargs`.
+    Args:
+        pretrained: bool or str, default False. `True` loads pretrained parameters,
+                    `False` loads nothing; a str is the path of the pretrained model.
+        use_ssld: bool, default False. Use the distillation pretrained model when pretrained=True.
+    Returns:
+        nn.Layer: the `ResNet152_vd` model built from the arguments above.
+    """
+    net = ResNet(config=NET_CONFIG["152"], version="vd", **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS["ResNet152_vd"], use_ssld)
+    return net
+
+
+def ResNet200_vd(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build ResNet200_vd: `ResNet` with NET_CONFIG["200"], version "vd" and `kwargs`.
+    Args:
+        pretrained: bool or str, default False. `True` loads pretrained parameters,
+                    `False` loads nothing; a str is the path of the pretrained model.
+        use_ssld: bool, default False. Use the distillation pretrained model when pretrained=True.
+    Returns:
+        nn.Layer: the `ResNet200_vd` model built from the arguments above.
+    """
+    net = ResNet(config=NET_CONFIG["200"], version="vd", **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS["ResNet200_vd"], use_ssld)
+    return net
--- a/ppcls/arch/backbone/legendary_models/vgg.py
+++ b/ppcls/arch/backbone/legendary_models/vgg.py
@@ -14,16 +14,24 @@

 from __future__ import absolute_import, division, print_function

-import paddle
-from paddle import ParamAttr
 import paddle.nn as nn
-import paddle.nn.functional as F
 from paddle.nn import Conv2D, BatchNorm, Linear, Dropout
-from paddle.nn import AdaptiveAvgPool2D, MaxPool2D, AvgPool2D
+from paddle.nn import MaxPool2D

 from ppcls.arch.backbone.base.theseus_layer import TheseusLayer
-
-__all__ = ["VGG11", "VGG13", "VGG16", "VGG19"]
+from ppcls.utils.save_load import load_dygraph_pretrain, load_dygraph_pretrain_from_url
+
+MODEL_URLS = {
+    "VGG11":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG11_pretrained.pdparams",
+    "VGG13":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG13_pretrained.pdparams",
+    "VGG16":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG16_pretrained.pdparams",
+    "VGG19":
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/VGG19_pretrained.pdparams",
+}
+__all__ = list(MODEL_URLS.keys())  # a real list: a dict_keys view cannot be indexed by `import *`

 # VGG config
 # key: VGG network depth
@@ -36,68 +44,12 @@ NET_CONFIG = {
 }


-def VGG11(**args):
-    """
-    VGG11
-    Args:
-        kwargs: 
-            class_num: int=1000. Output dim of last fc layer.
-            stop_grad_layers: int=0. The parameters in blocks which index larger than `stop_grad_layers`, will be set `param.trainable=False`
-    Returns:
-        model: nn.Layer. Specific `VGG11` model depends on args.
-    """
-    model = VGGNet(config=NET_CONFIG[11], **args)
-    return model
-
-
-def VGG13(**args):
-    """
-    VGG13
-    Args:
-        kwargs: 
-            class_num: int=1000. Output dim of last fc layer.
-            stop_grad_layers: int=0. The parameters in blocks which index larger than `stop_grad_layers`, will be set `param.trainable=False`
-    Returns:
-        model: nn.Layer. Specific `VGG11` model depends on args.
-    """
-    model = VGGNet(config=NET_CONFIG[13], **args)
-    return model
-
-
-def VGG16(**args):
-    """
-    VGG16
-    Args:
-        kwargs: 
-            class_num: int=1000. Output dim of last fc layer.
-            stop_grad_layers: int=0. The parameters in blocks which index larger than `stop_grad_layers`, will be set `param.trainable=False`
-    Returns:
-        model: nn.Layer. Specific `VGG11` model depends on args.
-    """
-    model = VGGNet(config=NET_CONFIG[16], **args)
-    return model
-
-
-def VGG19(**args):
-    """
-    VGG19
-    Args:
-        kwargs: 
-            class_num: int=1000. Output dim of last fc layer.
-            stop_grad_layers: int=0. The parameters in blocks which index larger than `stop_grad_layers`, will be set `param.trainable=False`
-    Returns:
-        model: nn.Layer. Specific `VGG11` model depends on args.
-    """
-    model = VGGNet(config=NET_CONFIG[19], **args)
-    return model
-
-
 class ConvBlock(TheseusLayer):
    def __init__(self, input_channels, output_channels, groups):
-        super(ConvBlock, self).__init__()
+        super().__init__()

        self.groups = groups
-        self._conv_1 = Conv2D(
+        self.conv1 = Conv2D(
            in_channels=input_channels,
            out_channels=output_channels,
            kernel_size=3,
@@ -105,7 +57,7 @@ class ConvBlock(TheseusLayer):
            padding=1,
            bias_attr=False)
        if groups == 2 or groups == 3 or groups == 4:
-            self._conv_2 = Conv2D(
+            self.conv2 = Conv2D(
                in_channels=output_channels,
                out_channels=output_channels,
                kernel_size=3,
@@ -113,7 +65,7 @@ class ConvBlock(TheseusLayer):
                padding=1,
                bias_attr=False)
        if groups == 3 or groups == 4:
-            self._conv_3 = Conv2D(
+            self.conv3 = Conv2D(
                in_channels=output_channels,
                out_channels=output_channels,
                kernel_size=3,
@@ -121,7 +73,7 @@ class ConvBlock(TheseusLayer):
                padding=1,
                bias_attr=False)
        if groups == 4:
-            self._conv_4 = Conv2D(
+            self.conv4 = Conv2D(
                in_channels=output_channels,
                out_channels=output_channels,
                kernel_size=3,
@@ -129,65 +81,148 @@ class ConvBlock(TheseusLayer):
                padding=1,
                bias_attr=False)

-        self._pool = MaxPool2D(kernel_size=2, stride=2, padding=0)
-        self._relu = nn.ReLU()
+        self.max_pool = MaxPool2D(kernel_size=2, stride=2, padding=0)
+        self.relu = nn.ReLU()

    def forward(self, inputs):
-        x = self._conv_1(inputs)
-        x = self._relu(x)
+        x = self.conv1(inputs)
+        x = self.relu(x)
        if self.groups == 2 or self.groups == 3 or self.groups == 4:
-            x = self._conv_2(x)
-            x = self._relu(x)
+            x = self.conv2(x)
+            x = self.relu(x)
        if self.groups == 3 or self.groups == 4:
-            x = self._conv_3(x)
-            x = self._relu(x)
+            x = self.conv3(x)
+            x = self.relu(x)
        if self.groups == 4:
-            x = self._conv_4(x)
-            x = self._relu(x)
-        x = self._pool(x)
+            x = self.conv4(x)
+            x = self.relu(x)
+        x = self.max_pool(x)
        return x


 class VGGNet(TheseusLayer):
+    """
+    VGGNet: five ConvBlocks, flatten, then fc1/fc2 (each + ReLU + Dropout) and fc3.
+    Args:
+        config: list. VGGNet config; config[i] is the `groups` argument of block i+1.
+        stop_grad_layers: int=0. The parameters in the first `stop_grad_layers`
+                          conv blocks are set `param.trainable=False`.
+        class_num: int=1000. The number of classes (output size of `fc3`).
+        NOTE: Linear(7 * 7 * 512, ...) presumably assumes 224x224 inputs - confirm.
+    """
+
    def __init__(self, config, stop_grad_layers=0, class_num=1000):
        super().__init__()

        self.stop_grad_layers = stop_grad_layers

-        self._conv_block_1 = ConvBlock(3, 64, config[0])
-        self._conv_block_2 = ConvBlock(64, 128, config[1])
-        self._conv_block_3 = ConvBlock(128, 256, config[2])
-        self._conv_block_4 = ConvBlock(256, 512, config[3])
-        self._conv_block_5 = ConvBlock(512, 512, config[4])
+        self.conv_block_1 = ConvBlock(3, 64, config[0])
+        self.conv_block_2 = ConvBlock(64, 128, config[1])
+        self.conv_block_3 = ConvBlock(128, 256, config[2])
+        self.conv_block_4 = ConvBlock(256, 512, config[3])
+        self.conv_block_5 = ConvBlock(512, 512, config[4])

-        self._relu = nn.ReLU()
-        self._flatten = nn.Flatten(start_axis=1, stop_axis=-1)
+        self.relu = nn.ReLU()
+        self.flatten = nn.Flatten(start_axis=1, stop_axis=-1)

        for idx, block in enumerate([
-                self._conv_block_1, self._conv_block_2, self._conv_block_3,
-                self._conv_block_4, self._conv_block_5
+                self.conv_block_1, self.conv_block_2, self.conv_block_3,
+                self.conv_block_4, self.conv_block_5
        ]):
            if self.stop_grad_layers >= idx + 1:
                for param in block.parameters():
                    param.trainable = False

-        self._drop = Dropout(p=0.5, mode="downscale_in_infer")
-        self._fc1 = Linear(7 * 7 * 512, 4096)
-        self._fc2 = Linear(4096, 4096)
-        self._out = Linear(4096, class_num)
+        self.drop = Dropout(p=0.5, mode="downscale_in_infer")
+        self.fc1 = Linear(7 * 7 * 512, 4096)
+        self.fc2 = Linear(4096, 4096)
+        self.fc3 = Linear(4096, class_num)

    def forward(self, inputs):
-        x = self._conv_block_1(inputs)
-        x = self._conv_block_2(x)
-        x = self._conv_block_3(x)
-        x = self._conv_block_4(x)
-        x = self._conv_block_5(x)
-        x = self._flatten(x)
-        x = self._fc1(x)
-        x = self._relu(x)
-        x = self._drop(x)
-        x = self._fc2(x)
-        x = self._relu(x)
-        x = self._drop(x)
-        x = self._out(x)
+        x = self.conv_block_1(inputs)
+        x = self.conv_block_2(x)
+        x = self.conv_block_3(x)
+        x = self.conv_block_4(x)
+        x = self.conv_block_5(x)
+        x = self.flatten(x)
+        x = self.fc1(x)
+        x = self.relu(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.relu(x)
+        x = self.drop(x)
+        x = self.fc3(x)
        return x
+
+
+def _load_pretrained(pretrained, model, model_url, use_ssld):
+    """Load weights into `model`: from `model_url` if True, from a path if str."""
+    if isinstance(pretrained, str):
+        load_dygraph_pretrain(model, pretrained)
+    elif pretrained is True:
+        load_dygraph_pretrain_from_url(model, model_url, use_ssld=use_ssld)
+    elif pretrained is not False:
+        # Anything that is neither a bool nor a str is a caller error.
+        raise RuntimeError(
+            "pretrained type is not available. Please use `string` or `boolean` type."
+        )
+
+
+def VGG11(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build VGG11: `VGGNet` with NET_CONFIG[11] and `kwargs`.
+    Args:
+        pretrained: bool or str, default False. `True` loads the weights from
+                    MODEL_URLS["VGG11"]; a str is the path of the pretrained model.
+        use_ssld: bool, default False. Use the distillation pretrained model when pretrained=True.
+    Returns:
+        nn.Layer: the `VGG11` model built from the arguments above.
+    """
+    net = VGGNet(config=NET_CONFIG[11], **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS["VGG11"], use_ssld)
+    return net
+
+
+def VGG13(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build VGG13: `VGGNet` with NET_CONFIG[13] and `kwargs`.
+    Args:
+        pretrained: bool or str, default False. `True` loads the weights from
+                    MODEL_URLS["VGG13"]; a str is the path of the pretrained model.
+        use_ssld: bool, default False. Use the distillation pretrained model when pretrained=True.
+    Returns:
+        nn.Layer: the `VGG13` model built from the arguments above.
+    """
+    net = VGGNet(config=NET_CONFIG[13], **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS["VGG13"], use_ssld)
+    return net
+
+
+def VGG16(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build VGG16: `VGGNet` with NET_CONFIG[16] and `kwargs`.
+    Args:
+        pretrained: bool or str, default False. `True` loads the weights from
+                    MODEL_URLS["VGG16"]; a str is the path of the pretrained model.
+        use_ssld: bool, default False. Use the distillation pretrained model when pretrained=True.
+    Returns:
+        nn.Layer: the `VGG16` model built from the arguments above.
+    """
+    net = VGGNet(config=NET_CONFIG[16], **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS["VGG16"], use_ssld)
+    return net
+
+
+def VGG19(pretrained=False, use_ssld=False, **kwargs):
+    """
+    Build VGG19: `VGGNet` with NET_CONFIG[19] and `kwargs`.
+    Args:
+        pretrained: bool or str, default False. `True` loads the weights from
+                    MODEL_URLS["VGG19"]; a str is the path of the pretrained model.
+        use_ssld: bool, default False. Use the distillation pretrained model when pretrained=True.
+    Returns:
+        nn.Layer: the `VGG19` model built from the arguments above.
+    """
+    net = VGGNet(config=NET_CONFIG[19], **kwargs)
+    _load_pretrained(pretrained, net, MODEL_URLS["VGG19"], use_ssld)
+    return net
--- a/ppcls/arch/head/__init__.py
+++ b/ppcls/arch/head/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .arcmargin import ArcMargin
+from .cosmargin import CosMargin
+from .circlemargin import CircleMargin
+from .fc import FC
+
+__all__ = ['build_head']
+
+def build_head(config):
+    """
+    Instantiate the head layer selected by ``config['name']``.
+
+    Args:
+        config: dict. Must contain ``'name'`` (one of 'ArcMargin', 'CosMargin',
+            'CircleMargin', 'FC'); every other item is passed to the head
+            constructor as a keyword argument. The dict itself is not modified.
+    Returns:
+        The constructed head layer.
+    Raises:
+        KeyError: if ``config`` has no ``'name'`` entry.
+        AssertionError: if the requested name is not a supported head.
+    """
+    # Explicit name -> class table instead of eval(): a config string can only
+    # ever select one of these classes, never execute arbitrary code.
+    head_classes = {
+        'ArcMargin': ArcMargin,
+        'CosMargin': CosMargin,
+        'CircleMargin': CircleMargin,
+        'FC': FC,
+    }
+    # Work on a shallow copy so the caller's config keeps its 'name' entry.
+    config = dict(config)
+    module_name = config.pop('name')
+    if module_name not in head_classes:
+        # Raised explicitly (not via `assert`) so the check survives `python -O`;
+        # the exception type is kept for existing callers.
+        raise AssertionError('head only support {}'.format(list(head_classes)))
+    return head_classes[module_name](**config)
--- a/ppcls/arch/head/arcmargin.py
+++ b/ppcls/arch/head/arcmargin.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import math
+
+class ArcMargin(nn.Layer):
+    """
+    Additive angular margin head.
+
+    The input features and the columns of the linear weight are L2-normalized,
+    their product is used as cos(theta), and for the labelled class the logit
+    is replaced by phi = cos(theta) * cos(margin) - sin(theta) * sin(margin)
+    (with a fallback selected by `easy_margin`). All logits are multiplied by
+    `scale`.
+
+    Args:
+        embedding_size: input feature size of the internal Linear layer.
+        class_num: number of classes (output size of the Linear layer).
+        margin: float=0.5. Angular margin.
+        scale: float=80.0. Factor applied to the final logits.
+        easy_margin: bool=False. If True, phi is used only where cos > 0 and
+            plain cos elsewhere; otherwise the `th` / `mm` fallback is used.
+    """
+
+    def __init__(self,
+                 embedding_size,
+                 class_num,
+                 margin=0.5,
+                 scale=80.0,
+                 easy_margin=False):
+        super().__init__()
+        self.embedding_size = embedding_size
+        self.class_num = class_num
+        self.margin = margin
+        self.scale = scale
+        self.easy_margin = easy_margin
+
+        xavier = paddle.ParamAttr(initializer=paddle.nn.initializer.XavierNormal())
+        self.fc = nn.Linear(
+            self.embedding_size,
+            self.class_num,
+            weight_attr=xavier,
+            bias_attr=False)
+
+    def forward(self, input, label):
+        # Unit-length features (norm taken over axis 1).
+        feat_len = paddle.sqrt(
+            paddle.sum(paddle.square(input), axis=1, keepdim=True))
+        feat = paddle.divide(input, feat_len)
+
+        # Unit-length weight columns (norm taken over axis 0).
+        raw_weight = self.fc.weight
+        col_len = paddle.sqrt(
+            paddle.sum(paddle.square(raw_weight), axis=0, keepdim=True))
+        unit_weight = paddle.divide(raw_weight, col_len)
+
+        cos = paddle.matmul(feat, unit_weight)
+        # 1e-6 keeps the sqrt argument positive when |cos| is ~1.
+        sin = paddle.sqrt(1.0 - paddle.square(cos) + 1e-6)
+        cos_m = math.cos(self.margin)
+        sin_m = math.sin(self.margin)
+        phi = cos * cos_m - sin * sin_m
+
+        if self.easy_margin:
+            phi = self._paddle_where_more_than(cos, 0, phi, cos)
+        else:
+            th = cos_m * (-1)
+            mm = sin_m * self.margin
+            phi = self._paddle_where_more_than(cos, th, phi, cos - mm)
+
+        target_mask = paddle.nn.functional.one_hot(label, self.class_num)
+        target_mask = paddle.squeeze(target_mask, axis=[1])
+        blended = paddle.multiply(target_mask, phi) + paddle.multiply(
+            (1.0 - target_mask), cos)
+        return blended * self.scale
+
+    def _paddle_where_more_than(self, target, limit, x, y):
+        """Element-wise select: `x` where `target > limit`, otherwise `y`."""
+        keep_x = paddle.cast(x=(target > limit), dtype='float32')
+        return paddle.multiply(keep_x, x) + paddle.multiply((1.0 - keep_x), y)
--- a/ppcls/arch/head/circlemargin.py
+++ b/ppcls/arch/head/circlemargin.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+ 
+class CircleMargin(nn.Layer):
+    """
+    Circle-margin classification head.
+
+    The input features and the columns of the linear weight are L2-normalized
+    and multiplied to give cosine logits. Target-class logits are re-weighted
+    with `alpha_p` and shifted by `1 - margin`; all other logits are
+    re-weighted with `alpha_n` and shifted by `margin`. The result is
+    multiplied by `scale`.
+
+    Args:
+        embedding_size: input feature size of the internal Linear layer.
+        class_num: number of classes (output size of the Linear layer).
+        margin: margin used in the weighting factors and the offsets.
+        scale: factor applied to the final logits.
+    """
+
+    def __init__(self, embedding_size, class_num, margin, scale):
+        # The original passed `CircleSoftmax` to super(); no such name exists
+        # in this module, so constructing the layer raised NameError.
+        super(CircleMargin, self).__init__()
+        self.scale = scale
+        self.margin = margin
+        self.embedding_size = embedding_size
+        self.class_num = class_num
+
+        weight_attr = paddle.ParamAttr(
+            initializer=paddle.nn.initializer.XavierNormal())
+        # NOTE(review): fc0 is created with a bias, but forward() only reads
+        # fc0.weight. Left as is so existing checkpoints keep their keys.
+        self.fc0 = paddle.nn.Linear(
+            self.embedding_size, self.class_num, weight_attr=weight_attr)
+
+    def forward(self, input, label):
+        feat_norm = paddle.sqrt(
+            paddle.sum(paddle.square(input), axis=1, keepdim=True))
+        input = paddle.divide(input, feat_norm)
+
+        weight = self.fc0.weight
+        weight_norm = paddle.sqrt(
+            paddle.sum(paddle.square(weight), axis=0, keepdim=True))
+        weight = paddle.divide(weight, weight_norm)
+
+        logits = paddle.matmul(input, weight)
+
+        # The weighting factors are computed on detached logits, so no
+        # gradient flows through them.
+        alpha_p = paddle.clip(-logits.detach() + 1 + self.margin, min=0.)
+        alpha_n = paddle.clip(logits.detach() + self.margin, min=0.)
+        delta_p = 1 - self.margin
+        delta_n = self.margin
+        m_hot = F.one_hot(label.reshape([-1]), num_classes=logits.shape[1])
+        logits_p = alpha_p * (logits - delta_p)
+        logits_n = alpha_n * (logits - delta_n)
+        pre_logits = logits_p * m_hot + logits_n * (1 - m_hot)
+        pre_logits = self.scale * pre_logits
+
+        return pre_logits
--- a/ppcls/arch/head/cosmargin.py
+++ b/ppcls/arch/head/cosmargin.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import math
+import paddle.nn as nn
+
+class CosMargin(paddle.nn.Layer):
+    """
+    Additive cosine margin head.
+
+    The input features and the columns of the linear weight are L2-normalized
+    and multiplied to give cosine logits; `margin` is subtracted from the
+    logit of the labelled class and every logit is multiplied by `scale`.
+
+    Args:
+        embedding_size: input feature size of the internal Linear layer.
+        class_num: number of classes (output size of the Linear layer).
+        margin: float=0.35. Value subtracted from the target-class cosine.
+        scale: float=64.0. Factor applied to the final logits.
+    """
+
+    def __init__(self, embedding_size, class_num, margin=0.35, scale=64.0):
+        super(CosMargin, self).__init__()
+        self.scale = scale
+        self.margin = margin
+        self.embedding_size = embedding_size
+        self.class_num = class_num
+
+        weight_attr = paddle.ParamAttr(
+            initializer=paddle.nn.initializer.XavierNormal())
+        self.fc = nn.Linear(
+            self.embedding_size,
+            self.class_num,
+            weight_attr=weight_attr,
+            bias_attr=False)
+
+    def forward(self, input, label):
+        label.stop_gradient = True
+
+        input_norm = paddle.sqrt(
+            paddle.sum(paddle.square(input), axis=1, keepdim=True))
+        # The original divided by `x_norm`, which is never defined, so every
+        # forward pass raised NameError; the norm computed above is intended.
+        input = paddle.divide(input, input_norm)
+
+        weight = self.fc.weight
+        weight_norm = paddle.sqrt(
+            paddle.sum(paddle.square(weight), axis=0, keepdim=True))
+        weight = paddle.divide(weight, weight_norm)
+
+        cos = paddle.matmul(input, weight)
+        cos_m = cos - self.margin
+
+        one_hot = paddle.nn.functional.one_hot(label, self.class_num)
+        one_hot = paddle.squeeze(one_hot, axis=[1])
+        # Margin-adjusted logit for the labelled class, plain cosine elsewhere.
+        output = paddle.multiply(one_hot, cos_m) + paddle.multiply(
+            (1.0 - one_hot), cos)
+        output = output * self.scale
+        return output
--- a/ppcls/arch/head/fc.py
+++ b/ppcls/arch/head/fc.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+
+class FC(nn.Layer):
+    """
+    Plain fully connected head: a single Linear layer mapping
+    `embedding_size` features to `class_num` outputs, Xavier-normal initialized.
+    """
+
+    def __init__(self, embedding_size, class_num):
+        super().__init__()
+        self.embedding_size = embedding_size
+        self.class_num = class_num
+        xavier = paddle.ParamAttr(
+            initializer=paddle.nn.initializer.XavierNormal())
+        self.fc = paddle.nn.Linear(
+            self.embedding_size, self.class_num, weight_attr=xavier)
+
+    def forward(self, input, label):
+        # `label` is accepted but not used by this head.
+        return self.fc(input)
--- a/ppcls/arch/loss_metrics/__init__.py
+++ b/ppcls/arch/loss_metrics/__init__.py
+#copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import sys
+import copy
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+# TODO: fix the format
+class CELoss(nn.Layer):
+    """
+    Cross-entropy loss with optional label smoothing.
+
+    Args:
+        name: str="loss". Key under which the loss is stored in the returned dict.
+        epsilon: float or None. Label-smoothing factor; values outside the open
+            interval (0, 1) are treated as None, i.e. smoothing is disabled.
+    """
+
+    def __init__(self, name="loss", epsilon=None):
+        super().__init__()
+        self.name = name
+        if epsilon is not None and (epsilon <= 0 or epsilon >= 1):
+            epsilon = None
+        self.epsilon = epsilon
+
+    def _labelsmoothing(self, target, class_num):
+        """Return smoothed targets reshaped to [-1, class_num]."""
+        # A target whose last dimension already equals class_num is taken to
+        # be one-hot/soft already; anything else is one-hot encoded first.
+        if target.shape[-1] != class_num:
+            one_hot_target = F.one_hot(target, class_num)
+        else:
+            one_hot_target = target
+        soft_target = F.label_smooth(one_hot_target, epsilon=self.epsilon)
+        soft_target = paddle.reshape(soft_target, shape=[-1, class_num])
+        return soft_target
+
+    def forward(self, logits, label, mode="train"):
+        """
+        Compute the mean cross-entropy of `logits` against `label`.
+
+        Returns:
+            dict: {self.name: mean loss}. `mode` is accepted but not used.
+        """
+        loss_dict = {}
+        if self.epsilon is not None:
+            class_num = logits.shape[-1]
+            label = self._labelsmoothing(label, class_num)
+            # The original read an unassigned local `x` here, so this branch
+            # always failed; the log-softmax must be taken over `logits`.
+            neg_log_prob = -F.log_softmax(logits, axis=-1)
+            loss = paddle.sum(neg_log_prob * label, axis=-1)
+        else:
+            # A label with the same last dimension as the logits is treated
+            # as a soft label and normalized with softmax first.
+            if label.shape[-1] == logits.shape[-1]:
+                label = F.softmax(label, axis=-1)
+                soft_label = True
+            else:
+                soft_label = False
+            loss = F.cross_entropy(logits, label=label, soft_label=soft_label)
+        loss_dict[self.name] = paddle.mean(loss)
+        return loss_dict
+
+
+# TODO: fix the format
+class Topk(nn.Layer):
+    """
+    Top-k accuracy metric.
+
+    Args:
+        topk: int or list of int. The k values to evaluate; an int is wrapped
+            into a one-element list.
+    """
+
+    def __init__(self, topk=[1, 5]):
+        super().__init__()
+        assert isinstance(topk, (int, list))
+        self.topk = [topk] if isinstance(topk, int) else topk
+
+    def forward(self, x, label):
+        # One "top<k>" entry per configured k, in configuration order.
+        return {
+            "top{}".format(k): paddle.metric.accuracy(
+                x, label, k=k)
+            for k in self.topk
+        }
+
+
+# TODO: fix the format
+def build_loss(config):
+    """Return a default-constructed CELoss; `config` is currently not consulted."""
+    return CELoss()
+
+
+# TODO: fix the format
+def build_metrics(config):
+    """Return a default-constructed Topk metric; `config` is currently not consulted."""
+    return Topk()
--- a/ppcls/arch/neck/__init__.py
+++ b/ppcls/arch/neck/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['build_neck']  # quotes now match; the mixed '..." pair was a SyntaxError
+
+def build_neck(config):
+    """
+    Instantiate the neck layer selected by ``config['name']``.
+
+    Args:
+        config: dict. Must contain ``'name'`` ('FPN' or 'FC'); every other item
+            is passed to the neck constructor as a keyword argument. The dict
+            itself is not modified.
+    Returns:
+        The constructed neck layer.
+    Raises:
+        KeyError: if ``config`` has no ``'name'`` entry.
+        AssertionError: if the requested name is not in the supported list.
+        NotImplementedError: if the name is supported but no class with that
+            name is available in this module.
+    """
+    # This module has no top-level imports, so the original eval(module_name)
+    # could never resolve a class. FC is imported from the sibling module here.
+    from .fc import FC
+
+    support_dict = ['FPN', 'FC']
+    # Work on a shallow copy so the caller's config keeps its 'name' entry.
+    config = dict(config)
+    module_name = config.pop('name')
+    if module_name not in support_dict:
+        # Raised explicitly (not via `assert`) so the check survives `python -O`;
+        # the message now says "neck" instead of the copy-pasted "head".
+        raise AssertionError('neck only support {}'.format(support_dict))
+
+    # Plain lookup instead of eval(): FC from the import above, anything else
+    # only if it is already defined or imported in this module.
+    candidates = dict(globals(), FC=FC)
+    if module_name not in candidates:
+        raise NotImplementedError(
+            'neck {} is listed as supported but is not available'.format(
+                module_name))
+    return candidates[module_name](**config)
--- a/ppcls/arch/neck/fc.py
+++ b/ppcls/arch/neck/fc.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+
+class FC(nn.Layer):
+    """
+    Fully connected neck: a single Linear layer mapping `input_dim` features
+    to `embedding_size` outputs, Xavier-normal initialized.
+    """
+
+    def __init__(self, input_dim, embedding_size):
+        super().__init__()
+        self.input_dim = input_dim
+        self.embedding_size = embedding_size
+        xavier = paddle.ParamAttr(
+            initializer=paddle.nn.initializer.XavierNormal())
+        self.fc = paddle.nn.Linear(
+            self.input_dim, self.embedding_size, weight_attr=xavier)
+
+    def forward(self, x):
+        return self.fc(x)
--- a/ppcls/configs/ImageNet/ResNet/ResNet50.yaml
+++ b/ppcls/configs/ImageNet/ResNet/ResNet50.yaml
+# global configs
+Global:
+  checkpoints: null
+  pretrained_model: null
+  output_dir: "./output/"
+  device: "gpu"
+  class_num: 1000
+  save_interval: 1
+  eval_during_train: True
+  eval_interval: 1
+  epochs: 120
+  print_batch_step: 10
+  use_visualdl: False
+  image_shape: [3, 224, 224]
+  infer_imgs:
+
+# model architecture
+Arch:
+  name: "ResNet50"
+
+# loss function config for training/eval process
+Loss:
+  Train:
+    - CELoss:
+        weight: 1.0
+  Eval:
+    - CELoss:
+        weight: 1.0
+
+
+Optimizer:
+  name: Momentum
+  momentum: 0.9
+  lr:
+    name: Piecewise
+    learning_rate: 0.1
+    decay_epochs: [30, 60, 90]
+    values: [0.1, 0.01, 0.001, 0.0001]
+  regularizer:
+    name: 'L2'
+    coeff: 0.0001
+
+
+# data loader for train and eval
+DataLoader:
+  Train:
+    # Dataset:
+    # Sampler:
+    # Loader:
+    batch_size: 256
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/train_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - RandCropImage:
+            size: 224
+        - RandFlipImage:
+            flip_code: 1
+        - NormalizeImage:
+            scale: 1./255.
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
+  Eval:
+    # TODO: modify to the latest trainer
+    # Dataset:
+    # Sampler:
+    # Loader:
+    batch_size: 128
+    num_workers: 4
+    file_list: "./dataset/ILSVRC2012/val_list.txt"
+    data_dir: "./dataset/ILSVRC2012/"
+    shuffle_seed: 0
+    transforms:
+        - DecodeImage:
+            to_rgb: True
+            channel_first: False
+        - ResizeImage:
+            resize_short: 256
+        - CropImage:
+            size: 224
+        - NormalizeImage:
+            scale: 1.0/255.0
+            mean: [0.485, 0.456, 0.406]
+            std: [0.229, 0.224, 0.225]
+            order: ''
+        - ToCHWImage:
+
+Metric:
+    Train:
+    - Topk:
+        k: [1, 5]
+    Eval:
+    - Topk:
+        k: [1, 5]
+
--- a/ppcls/data/__init__.py
+++ b/ppcls/data/__init__.py
@@ -115,5 +115,4 @@ def build_dataloader(config, mode, device, seed=None):
    dataloader = Reader(config, mode=mode, places=device)()
    return dataloader

-    return data_loader
 '''
--- a/ppcls/data/reader.py
+++ b/ppcls/data/reader.py
@@ -250,13 +250,14 @@ class Reader:

    def __init__(self, config, mode='train', places=None):
        try:
-            self.params = config[mode.upper()]
+            self.params = config[mode.capitalize()]
        except KeyError:
            raise ModeException(mode=mode)

        use_mix = config.get('use_mix')
        self.params['mode'] = mode
        self.shuffle = mode == "train"
+        self.is_train = mode == "train"

        self.collate_fn = None
        self.batch_ops = []
@@ -298,7 +299,7 @@ class Reader:
                shuffle=False,
                num_workers=self.params["num_workers"])
        else:
-            is_train = self.params['mode'] == "train"
+            is_train = self.is_train
            batch_sampler = DistributedBatchSampler(
                dataset,
                batch_size=batch_size,

--- a/ppcls/engine/trainer.py
+++ b/ppcls/engine/trainer.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import os
+import sys
+import numpy as np
+__dir__ = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(os.path.abspath(os.path.join(__dir__, '../../')))
+
+import argparse
+import paddle
+import paddle.nn as nn
+import paddle.distributed as dist
+
+from ppcls.utils.check import check_gpu
+from ppcls.utils.misc import AverageMeter
+from ppcls.utils import logger
+from ppcls.data import build_dataloader
+from ppcls.arch import build_model
+from ppcls.arch.loss_metrics import build_loss
+from ppcls.arch.loss_metrics import build_metrics
+from ppcls.optimizer import build_optimizer
+from ppcls.utils.save_load import load_dygraph_pretrain
+from ppcls.utils.save_load import init_model
+from ppcls.utils import save_load
+
+
+class Trainer(object):
+    """Config-driven training and evaluation loop for a PaddlePaddle model.
+
+    The constructor reads the ``Global`` and ``Arch`` sections of ``config``;
+    ``train`` and ``eval`` additionally read ``Loss``, ``Metric``,
+    ``DataLoader`` and ``Optimizer``.
+    """
+
+    def __init__(self, config, mode="train"):
+        """
+        Select the device, build the model and set up optional logging.
+
+        Args:
+            config: nested config mapping. ``config["Global"]["distributed"]``
+                is overwritten here from the current world size.
+            mode: stored as ``self.mode``; not otherwise used in this class.
+        """
+        self.mode = mode
+        self.config = config
+        self.output_dir = self.config['Global']['output_dir']
+        # set device
+        assert self.config["Global"]["device"] in ["cpu", "gpu", "xpu"]
+        self.device = paddle.set_device(self.config["Global"]["device"])
+        # set dist
+        self.config["Global"][
+            "distributed"] = paddle.distributed.get_world_size() != 1
+        if self.config["Global"]["distributed"]:
+            dist.init_parallel_env()
+        self.model = build_model(self.config["Arch"])
+
+        if self.config["Global"]["pretrained_model"] is not None:
+            load_dygraph_pretrain(self.model,
+                                  self.config["Global"]["pretrained_model"])
+
+        if self.config["Global"]["distributed"]:
+            self.model = paddle.DataParallel(self.model)
+
+        self.vdl_writer = None
+        if self.config['Global']['use_visualdl']:
+            # imported lazily so visualdl is only required when enabled
+            from visualdl import LogWriter
+            vdl_writer_path = os.path.join(self.output_dir, "vdl")
+            if not os.path.exists(vdl_writer_path):
+                os.makedirs(vdl_writer_path)
+            self.vdl_writer = LogWriter(logdir=vdl_writer_path)
+        logger.info('train with paddle {} and device {}'.format(
+            paddle.__version__, self.device))
+
+    def _build_metric_info(self, metric_config, mode="train"):
+        """
+        _build_metric_info: build metrics according to current mode
+        Args:
+            metric_config: mapping keyed by capitalized mode ("Train"/"Eval")
+            mode: mode name; capitalized before the lookup
+        Return:
+            metric: result of build_metrics, or None when the mode has no
+                (or an empty) entry in metric_config
+        """
+        metric = None
+        mode = mode.capitalize()
+        if mode in metric_config and metric_config[mode] is not None:
+            metric = build_metrics(metric_config[mode])
+        return metric
+
+    def _build_loss_info(self, loss_config, mode="train"):
+        """
+        _build_loss_info: build loss according to current mode
+        Args:
+            loss_config: mapping keyed by capitalized mode ("Train"/"Eval")
+            mode: mode name; capitalized before the lookup
+        Return:
+            loss: result of build_loss, or None when the mode has no
+                (or an empty) entry in loss_config
+        """
+        loss = None
+        mode = mode.capitalize()
+        if mode in loss_config and loss_config[mode] is not None:
+            loss = build_loss(loss_config[mode])
+        return loss
+
+    def train(self):
+        """
+        Run the training loop from the epoch after the restored one up to
+        ``Global.epochs``, logging running averages, evaluating every
+        ``Global.eval_interval`` epochs (when ``Global.eval_during_train`` is
+        set) and saving a checkpoint every ``Global.save_interval`` epochs.
+        """
+        # build train loss and metric info
+        loss_func = self._build_loss_info(self.config["Loss"])
+
+        metric_func = self._build_metric_info(self.config["Metric"])
+
+        train_dataloader = build_dataloader(self.config["DataLoader"], "train",
+                                            self.device)
+
+        step_each_epoch = len(train_dataloader)
+
+        optimizer, lr_sch = build_optimizer(self.config["Optimizer"],
+                                            self.config["Global"]["epochs"],
+                                            step_each_epoch,
+                                            self.model.parameters())
+
+        print_batch_step = self.config['Global']['print_batch_step']
+        save_interval = self.config["Global"]["save_interval"]
+        # Configs without an eval_interval keep the previous behavior of
+        # evaluating after every epoch.
+        eval_interval = self.config["Global"].get("eval_interval", 1)
+
+        best_metric = {
+            "metric": 0.0,
+            "epoch": 0,
+        }
+        # maps each loss / metric name to its running AverageMeter
+        output_info = dict()
+        # global iter counter
+        global_step = 0
+
+        if self.config["Global"]["checkpoints"] is not None:
+            metric_info = init_model(self.config["Global"], self.model,
+                                     optimizer)
+            if metric_info is not None:
+                best_metric.update(metric_info)
+
+        for epoch_id in range(best_metric["epoch"] + 1,
+                              self.config["Global"]["epochs"] + 1):
+            acc = 0.0
+            self.model.train()
+            for iter_id, batch in enumerate(train_dataloader()):
+                batch_size = batch[0].shape[0]
+                batch[1] = paddle.to_tensor(batch[1].numpy().astype("int64")
+                                            .reshape([-1, 1]))
+                global_step += 1
+                # image input
+                out = self.model(batch[0])
+                # calc loss
+                loss_dict = loss_func(out, batch[-1])
+                for key in loss_dict:
+                    if not key in output_info:
+                        output_info[key] = AverageMeter(key, '7.5f')
+                    output_info[key].update(loss_dict[key].numpy()[0],
+                                            batch_size)
+                # calc metric
+                if metric_func is not None:
+                    metric_dict = metric_func(out, batch[-1])
+                    for key in metric_dict:
+                        if not key in output_info:
+                            output_info[key] = AverageMeter(key, '7.5f')
+                        output_info[key].update(metric_dict[key].numpy()[0],
+                                                batch_size)
+
+                if iter_id % print_batch_step == 0:
+                    lr_msg = "lr: {:.5f}".format(lr_sch.get_lr())
+                    metric_msg = ", ".join([
+                        "{}: {:.5f}".format(key, output_info[key].avg)
+                        for key in output_info
+                    ])
+                    logger.info("[Train][Epoch {}][Iter: {}/{}]{}, {}".format(
+                        epoch_id, iter_id,
+                        len(train_dataloader), lr_msg, metric_msg))
+
+                # step opt and lr
+                loss_dict["loss"].backward()
+                optimizer.step()
+                optimizer.clear_grad()
+                lr_sch.step()
+
+            metric_msg = ", ".join([
+                "{}: {:.5f}".format(key, output_info[key].avg)
+                for key in output_info
+            ])
+            logger.info("[Train][Epoch {}][Avg]{}".format(epoch_id,
+                                                          metric_msg))
+            # running averages are per epoch
+            output_info.clear()
+
+            # eval model and save model if possible
+            # (the modulo must use the interval, not the boolean switch:
+            # `epoch_id % True` is always 0)
+            if self.config["Global"][
+                    "eval_during_train"] and epoch_id % eval_interval == 0:
+                acc = self.eval(epoch_id)
+                if acc > best_metric["metric"]:
+                    best_metric["metric"] = acc
+                    best_metric["epoch"] = epoch_id
+                    save_load.save_model(
+                        self.model,
+                        optimizer,
+                        best_metric,
+                        self.output_dir,
+                        model_name=self.config["Arch"]["name"],
+                        prefix="best_model")
+
+            # save model
+            if epoch_id % save_interval == 0:
+                save_load.save_model(
+                    self.model,
+                    optimizer, {"metric": acc,
+                                "epoch": epoch_id},
+                    self.output_dir,
+                    model_name=self.config["Arch"]["name"],
+                    prefix="ppcls_epoch_{}".format(epoch_id))
+
+    def build_avg_metrics(self, info_dict):
+        """Return one fresh AverageMeter per key of ``info_dict``."""
+        return {key: AverageMeter(key, '7.5f') for key in info_dict}
+
+    @paddle.no_grad()
+    def eval(self, epoch_id=0):
+        """
+        Evaluate the model on the "eval" dataloader.
+
+        Args:
+            epoch_id: only used in the log messages.
+        Return:
+            running average of the first metric reported by the eval metric
+            function, or -1 when no eval metric is configured or no metric
+            value was produced (e.g. an empty dataloader).
+        """
+        output_info = dict()
+
+        eval_dataloader = build_dataloader(self.config["DataLoader"], "eval",
+                                           self.device)
+
+        self.model.eval()
+        print_batch_step = self.config["Global"]["print_batch_step"]
+
+        # build eval loss and metric info
+        loss_func = self._build_loss_info(self.config["Loss"], "eval")
+        metric_func = self._build_metric_info(self.config["Metric"], "eval")
+        metric_key = None
+
+        for iter_id, batch in enumerate(eval_dataloader()):
+            batch_size = batch[0].shape[0]
+            batch[0] = paddle.to_tensor(batch[0]).astype("float32")
+            batch[1] = paddle.to_tensor(batch[1]).reshape([-1, 1])
+            # image input
+            out = self.model(batch[0])
+            # calc loss
+            if loss_func is not None:
+                loss_dict = loss_func(out, batch[-1])
+                for key in loss_dict:
+                    if not key in output_info:
+                        output_info[key] = AverageMeter(key, '7.5f')
+                    output_info[key].update(loss_dict[key].numpy()[0],
+                                            batch_size)
+            # calc metric; deliberately independent of the loss branch so
+            # metrics are still produced when no eval loss is configured
+            if metric_func is not None:
+                metric_dict = metric_func(out, batch[-1])
+                if paddle.distributed.get_world_size() > 1:
+                    # average each metric over all processes
+                    for key in metric_dict:
+                        paddle.distributed.all_reduce(
+                            metric_dict[key],
+                            op=paddle.distributed.ReduceOp.SUM)
+                        metric_dict[key] = metric_dict[
+                            key] / paddle.distributed.get_world_size()
+                for key in metric_dict:
+                    if metric_key is None:
+                        metric_key = key
+                    if not key in output_info:
+                        output_info[key] = AverageMeter(key, '7.5f')
+
+                    output_info[key].update(metric_dict[key].numpy()[0],
+                                            batch_size)
+
+            if iter_id % print_batch_step == 0:
+                metric_msg = ", ".join([
+                    "{}: {:.5f}".format(key, output_info[key].val)
+                    for key in output_info
+                ])
+                logger.info("[Eval][Epoch {}][Iter: {}/{}]{}".format(
+                    epoch_id, iter_id, len(eval_dataloader), metric_msg))
+
+        metric_msg = ", ".join([
+            "{}: {:.5f}".format(key, output_info[key].avg)
+            for key in output_info
+        ])
+        logger.info("[Eval][Epoch {}][Avg]{}".format(epoch_id, metric_msg))
+
+        self.model.train()
+        # do not try to save best model
+        if metric_func is None or metric_key is None:
+            return -1
+        # return 1st metric in the dict
+        return output_info[metric_key].avg
--- a/ppcls/losses/__init__.py
+++ b/ppcls/losses/__init__.py
+import copy
+import paddle
+import paddle.nn as nn
+
+from .celoss import CELoss
+
+from .triplet import TripletLoss, TripletLossV2
+from .msmloss import MSMLoss
+from .emlloss import EmlLoss
+from .npairsloss  import NpairsLoss
+from .trihardloss import TriHardLoss
+from .centerloss  import CenterLoss
+
+class CombinedLoss(nn.Layer):
+    """Weighted combination of the loss functions listed in a config.
+
+    Args:
+        config_list: list of single-key dicts ``{LossName: {"weight": w, ...}}``.
+            ``weight`` scales every value returned by that loss; the remaining
+            entries are passed to the loss constructor as keyword arguments.
+            The list is not modified.
+    """
+
+    def __init__(self, config_list):
+        super().__init__()
+        self.loss_func = []
+        self.loss_weight = []
+        assert isinstance(config_list, list), (
+            'operator config should be a list')
+        # Work on a private copy: "weight" is popped below, and the caller may
+        # build a loss from the same config again later.
+        for config in copy.deepcopy(config_list):
+            assert isinstance(config,
+                              dict) and len(config) == 1, "yaml format error"
+            name = list(config)[0]
+            param = config[name]
+            assert "weight" in param, "weight must be in param, but param just contains {}".format(
+                param.keys())
+            self.loss_weight.append(param.pop("weight"))
+            # NOTE(review): eval() executes the name taken from the config as
+            # Python code; only safe while configs come from a trusted source.
+            self.loss_func.append(eval(name)(**param))
+
+    def __call__(self, input, batch):
+        """
+        Evaluate every configured loss and add their weighted sum.
+
+        Return:
+            loss_dict: the weighted entries of each loss plus a "loss" entry
+                holding the sum of all of them.
+        """
+        loss_dict = {}
+        for idx, loss_func in enumerate(self.loss_func):
+            loss = loss_func(input, batch)
+            weight = self.loss_weight[idx]
+            loss = {key: loss[key] * weight for key in loss}
+            loss_dict.update(loss)
+        loss_dict["loss"] = paddle.add_n(list(loss_dict.values()))
+        return loss_dict
+
+def build_loss(config):
+    """Build a CombinedLoss from a list of loss configs and log the result.
+
+    Args:
+        config: list passed unchanged to CombinedLoss.
+    Return:
+        the constructed CombinedLoss instance.
+    """
+    # logger is not imported at module level in this file
+    from ppcls.utils import logger
+
+    module_class = CombinedLoss(config)
+    logger.info("build loss {} success.".format(module_class))
+    return module_class
--- a/ppcls/losses/celoss.py
+++ b/ppcls/losses/celoss.py
+# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn.functional as F
+
+__all__ = ['CELoss', 'JSDivLoss', 'KLDivLoss']
+
+
+class Loss(object):
+    """
+    Base class bundling the computations shared by the concrete losses:
+    optional label smoothing, cross entropy and a symmetric KL-based divergence.
+    """
+
+    def __init__(self, class_dim=1000, epsilon=None):
+        assert class_dim > 1, "class_dim=%d is not larger than 1" % (class_dim)
+        self._class_dim = class_dim
+        # label smoothing is enabled only for an epsilon inside [0, 1]
+        smoothing = epsilon is not None and 0.0 <= epsilon <= 1.0
+        self._label_smoothing = True if smoothing else False
+        self._epsilon = epsilon if smoothing else None
+
+    def _labelsmoothing(self, target):
+        """Return `target` as smoothed labels reshaped to [-1, class_dim]."""
+        # a last dimension equal to class_dim is taken as already one-hot / soft
+        if target.shape[-1] == self._class_dim:
+            dense_target = target
+        else:
+            dense_target = F.one_hot(target, self._class_dim)
+
+        smoothed = F.label_smooth(dense_target, epsilon=self._epsilon)
+        return paddle.reshape(smoothed, shape=[-1, self._class_dim])
+
+    def _crossentropy(self, input, target, use_pure_fp16=False):
+        """Cross entropy of `input` against `target`, reduced to one value.
+
+        The per-sample cost is summed when `use_pure_fp16` is set and
+        averaged otherwise.
+        """
+        if not self._label_smoothing:
+            cost = F.cross_entropy(input=input, label=target)
+        else:
+            soft_target = self._labelsmoothing(target)
+            neg_log_prob = -F.log_softmax(input, axis=-1)
+            cost = paddle.sum(soft_target * neg_log_prob, axis=-1)
+
+        reduce_fn = paddle.sum if use_pure_fp16 else paddle.mean
+        return reduce_fn(cost)
+
+    def _kldiv(self, input, target, name=None):
+        """Element-wise target * log(target / input), scaled by class_dim."""
+        eps = 1.0e-10  # keeps the ratio and the log finite
+        ratio = (target + eps) / (input + eps)
+        return target * paddle.log(ratio) * self._class_dim
+
+    def _jsdiv(self, input, target):
+        """Mean of the two-way `_kldiv` average between softmax(input) and softmax(target)."""
+        # both arguments are softmax-ed here, so raw scores are expected
+        prob_in = F.softmax(input)
+        prob_tgt = F.softmax(target)
+
+        two_way = self._kldiv(prob_in, prob_tgt) + self._kldiv(prob_tgt, prob_in)
+        return paddle.mean(two_way / 2)
+
+    def __call__(self, input, target):
+        # intentionally empty; subclasses provide the actual loss
+        pass
+
+
+class CELoss(Loss):
+    """
+    Cross entropy loss computed on the "logits" entry of the model output.
+    """
+
+    def __init__(self, class_dim=1000, epsilon=None):
+        super(CELoss, self).__init__(class_dim, epsilon)
+
+    def __call__(self, input, target, use_pure_fp16=False):
+        # input is the model output dict; only its "logits" entry is used
+        ce_value = self._crossentropy(input["logits"], target, use_pure_fp16)
+        return {"CELoss": ce_value}
+
+class JSDivLoss(Loss):
+    """
+    Divergence loss delegating to Loss._jsdiv; returns the value itself,
+    not a dict.
+    """
+
+    def __init__(self, class_dim=1000, epsilon=None):
+        super(JSDivLoss, self).__init__(class_dim, epsilon)
+
+    def __call__(self, input, target):
+        return self._jsdiv(input, target)
+
+
+class KLDivLoss(paddle.nn.Layer):
+    """Mean over rows of -sum(p * log(q + 1e-8)) along axis 1."""
+
+    def __init__(self):
+        super(KLDivLoss, self).__init__()
+
+    def __call__(self, p, q, is_logit=True):
+        # with is_logit set, both inputs are turned into probabilities first
+        if is_logit:
+            softmax = paddle.nn.functional.softmax
+            p = softmax(p)
+            q = softmax(q)
+        per_row = (p * paddle.log(q + 1e-8)).sum(1)
+        return -per_row.mean()
+    
--- a/ppcls/losses/centerloss.py
+++ b/ppcls/losses/centerloss.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+class CenterLoss(nn.Layer):
+    """Squared distance between each feature and the center of its class.
+
+    The centers are a randomly initialised float64 tensor of shape
+    [num_classes, feat_dim] stored as a plain attribute.
+    """
+
+    def __init__(self, num_classes=5013, feat_dim=2048):
+        super(CenterLoss, self).__init__()
+        self.num_classes = num_classes
+        self.feat_dim = feat_dim
+        random_centers = paddle.randn(shape=[self.num_classes, self.feat_dim])
+        self.centers = random_centers.astype("float64")
+
+    def __call__(self, input, target):
+        """
+        input: network output dict; its "features" entry is used
+        target: image labels
+        returns: {'CenterLoss': scalar loss}
+        """
+        feats = input["features"]
+        num_samples = feats.shape[0]
+        num_classes = self.num_classes
+
+        # squared norm of every feature, repeated for each class
+        feat_sq = paddle.sum(paddle.square(feats), axis=1, keepdim=True)
+        feat_sq = paddle.expand(feat_sq, [num_samples, num_classes])
+
+        # squared norm of every center, repeated for each sample
+        center_sq = paddle.sum(paddle.square(self.centers), axis=1, keepdim=True)
+        center_sq = paddle.expand(center_sq, [num_classes, num_samples]).astype("float64")
+        center_sq = paddle.transpose(center_sq, [1, 0])
+
+        # |x|^2 + |c|^2 - 2 * x.c
+        distmat = paddle.add(feat_sq, center_sq)
+        cross_term = paddle.matmul(feats, paddle.transpose(self.centers, [1, 0]))
+        distmat = distmat - 2.0 * cross_term
+
+        # keep, for each sample, only the entry of its own class
+        class_ids = paddle.arange(num_classes).astype("int64")
+        label_grid = paddle.expand(paddle.unsqueeze(target, 1), (num_samples, num_classes))
+        class_grid = paddle.expand(class_ids, [num_samples, num_classes])
+        own_class = paddle.equal(class_grid, label_grid).astype("float64")
+
+        selected = paddle.multiply(distmat, own_class)
+        loss = paddle.sum(paddle.clip(selected, min=1e-12, max=1e+12)) / num_samples
+
+        return {'CenterLoss': loss}
+    
--- a/ppcls/losses/comfunc.py
+++ b/ppcls/losses/comfunc.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+def rerange_index(batch_size, samples_each_class):
+    """Build the gather index that regroups a flattened pairwise matrix.
+
+    Row i of the batch_size x batch_size matrix is reordered as: its own
+    (diagonal) entry first, then the other columns of the same group of
+    `samples_each_class` consecutive rows, then all remaining columns in
+    ascending order.
+
+    Returns a flat int32 array of batch_size * batch_size positions.
+    """
+    flat_positions = np.arange(0, batch_size * batch_size).reshape(-1, batch_size)
+    ordered = []
+
+    for row_id in range(batch_size):
+        group_start = (row_id // samples_each_class) * samples_each_class
+        group_end = group_start + samples_each_class
+        row = flat_positions[row_id]
+
+        same_group = [
+            row[col]
+            for col in range(group_start, min(group_end, batch_size))
+            if col != row_id
+        ]
+        other_groups = [
+            row[col]
+            for col in range(batch_size)
+            if not (group_start <= col < group_end)
+        ]
+        # the row's own entry always leads
+        ordered.extend([row[row_id]] + same_group + other_groups)
+
+    return np.array(ordered).astype(np.int32)
--- a/ppcls/losses/emlloss.py
+++ b/ppcls/losses/emlloss.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import paddle
+import numpy as np
+from .comfunc import rerange_index
+
+class EmlLoss(paddle.nn.Layer):
+    """Loss on pairwise squared feature distances, regrouped per anchor.
+
+    The gather index is precomputed by rerange_index for a fixed batch_size,
+    so every row starts with the anchor itself, followed by its same-group
+    entries and then all others.
+    """
+
+    def __init__(self, batch_size=40, samples_each_class=2):
+        super(EmlLoss, self).__init__()
+        assert (batch_size % samples_each_class == 0)
+        self.samples_each_class = samples_each_class
+        self.batch_size = batch_size
+        self.rerange_index = rerange_index(batch_size, samples_each_class)
+        self.thresh = 20.0
+        self.beta = 100000
+
+    def surrogate_function(self, beta, theta, bias):
+        """log(1 + beta * theta * exp(bias)) / log(1 + beta)."""
+        scaled = theta * paddle.exp(bias)
+        return paddle.log(1 + beta * scaled) / math.log(1 + beta)
+
+    def surrogate_function_approximate(self, beta, theta, bias):
+        """(log(theta) + bias + log(beta)) / log(1 + beta)."""
+        numerator = paddle.log(theta) + bias + math.log(beta)
+        return numerator / math.log(1 + beta)
+
+    def surrogate_function_stable(self, beta, theta, target, thresh):
+        """Combine both surrogate forms around the constant `thresh`.
+
+        The exact form gets min(target, thresh), the approximate form gets
+        max(target, thresh), and the exact form evaluated at thresh is
+        subtracted.
+        """
+        gap = paddle.to_tensor(thresh, dtype='float32')
+        gap.stop_gradient = True
+
+        above = paddle.maximum(target, gap)
+        below = paddle.minimum(target, gap)
+
+        exact_part = self.surrogate_function(beta, theta, below)
+        approx_part = self.surrogate_function_approximate(beta, theta, above)
+        offset = self.surrogate_function(beta, theta, gap)
+        return exact_part + approx_part - offset
+
+    def forward(self, input, target=None):
+        """Return {"emlloss": loss} from input["features"]; target is unused."""
+        feats = input["features"]
+        group_size = self.samples_each_class
+        n = self.batch_size
+
+        # pairwise squared distances
+        pair_diff = paddle.unsqueeze(feats, axis=1) - paddle.unsqueeze(feats, axis=0)
+        sq_dist = paddle.sum(paddle.square(pair_diff), axis=-1)
+
+        # reorder each row into [self | same group | others]
+        flat = paddle.reshape(sq_dist, shape=[-1, 1])
+        gather_ids = paddle.to_tensor(self.rerange_index)
+        flat = paddle.gather(flat, index=gather_ids)
+        sq_dist = paddle.reshape(flat, shape=[-1, n])
+
+        sections = [1, group_size - 1, n - group_size]
+        self_dist, pos, neg = paddle.split(sq_dist, num_or_sections=sections, axis=1)
+        self_dist.stop_gradient = True
+
+        # shift by the row max / min before exponentiating
+        pos_max = paddle.max(pos, axis=1, keepdim=True)
+        pos_mean = paddle.mean(paddle.exp(pos - pos_max), axis=1, keepdim=True)
+
+        neg_min = paddle.min(neg, axis=1, keepdim=True)
+        neg_mean = paddle.mean(paddle.exp(neg_min - neg), axis=1, keepdim=True)
+
+        bias = pos_max - neg_min
+        theta = paddle.multiply(neg_mean, pos_mean)
+
+        per_row = self.surrogate_function_stable(self.beta, theta, bias, self.thresh)
+        return {"emlloss": paddle.mean(per_row)}
+    
--- a/ppcls/losses/msmloss.py
+++ b/ppcls/losses/msmloss.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import paddle
+from .comfunc import rerange_index
+
+class MSMLoss(paddle.nn.Layer):
+    """
+    Margin loss on the hardest positive and hardest negative of the whole batch.
+
+    Features are L2-normalized and compared by squared Euclidean distance.
+    The gather index is precomputed by rerange_index for a fixed batch_size,
+    which treats every `samples_each_class` consecutive rows as one class,
+    so same-label samples must be adjacent in the batch.
+
+    Args:
+        batch_size: number of samples forward() must receive.
+        samples_each_class: size of each group of consecutive same-class rows.
+        margin: margin added to (hardest positive - hardest negative).
+    """
+    def __init__(self, batch_size = 120, samples_each_class=2,  margin=0.1):
+        super(MSMLoss, self).__init__()
+        self.margin = margin
+        self.samples_each_class = samples_each_class
+        self.batch_size         = batch_size
+        self.rerange_index      = rerange_index(batch_size, samples_each_class)
+
+    def forward(self, input, target=None):
+        """Return {"msmloss": loss} from input["features"]; target is unused."""
+        features = input["features"]
+        # The gather index and the reshape below are only valid for exactly
+        # batch_size rows (same check as TriHardLoss).
+        assert (self.batch_size == features.shape[0])
+
+        #normalization 
+        features = self._nomalize(features)
+        samples_each_class = self.samples_each_class
+        rerange_index      = paddle.to_tensor(self.rerange_index)
+
+        #calc sm
+        diffs = paddle.unsqueeze(features, axis=1) - paddle.unsqueeze(features, axis=0)
+        similary_matrix =  paddle.sum(paddle.square(diffs), axis=-1)
+        
+        #rerange: each row becomes [self | same class | other classes]
+        tmp = paddle.reshape(similary_matrix, shape = [-1, 1]) 
+        tmp = paddle.gather(tmp, index=rerange_index)   
+        similary_matrix = paddle.reshape(tmp, shape=[-1, self.batch_size])  
+        
+        #split
+        ignore, pos, neg = paddle.split(similary_matrix, num_or_sections= [1, 
+            samples_each_class - 1, -1], axis = 1)
+        ignore.stop_gradient = True   
+
+        # global extremes over the whole batch, not per row
+        hard_pos = paddle.max(pos)   
+        hard_neg = paddle.min(neg)
+
+        loss = hard_pos + self.margin - hard_neg
+        loss = paddle.nn.ReLU()(loss)  
+        return {"msmloss": loss}
+
+    def _nomalize(self, input):
+        """Divide each row of `input` by its L2 norm."""
+        input_norm = paddle.sqrt(paddle.sum(paddle.square(input), axis=1, keepdim=True))
+        return paddle.divide(input, input_norm)
+    
--- a/ppcls/losses/npairsloss.py
+++ b/ppcls/losses/npairsloss.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import paddle
+
+class NpairsLoss(paddle.nn.Layer):
+    """Cross entropy over anchor/positive similarities plus an L2 feature penalty.
+
+    Consecutive feature rows are taken as (anchor, positive) pairs.
+    """
+
+    def __init__(self, reg_lambda=0.01):
+        super(NpairsLoss, self).__init__()
+        self.reg_lambda = reg_lambda
+
+    def forward(self, input, target=None):
+        """
+        Return {"npairsloss": loss} from input["features"]; target is unused.
+        """
+        features = input["features"]
+        batch_size = features.shape[0]
+        fea_dim = features.shape[1]
+        num_class = batch_size // 2
+
+        # view rows as pairs and separate the two members
+        pairs = paddle.reshape(features, shape=[-1, 2, fea_dim])
+        anchors, positives = paddle.split(pairs, num_or_sections=2, axis=1)
+        anchors = paddle.squeeze(anchors, axis=1)
+        positives = paddle.squeeze(positives, axis=1)
+
+        # pair i must score highest against its own positive i
+        similarity = paddle.matmul(anchors, positives, transpose_y=True)
+        pair_ids = paddle.arange(0, num_class, dtype='int64')
+        xent = paddle.nn.CrossEntropyLoss()(similarity, pair_ids)
+
+        # mean squared L2 norm of the features as regularizer
+        sq_norm = paddle.mean(paddle.sum(paddle.square(features), axis=1))
+        penalty = 0.5 * self.reg_lambda * sq_norm
+        return {"npairsloss": xent + penalty}
+    
--- a/ppcls/losses/trihardloss.py
+++ b/ppcls/losses/trihardloss.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+from .comfunc import rerange_index
+
+class TriHardLoss(paddle.nn.Layer):
+    """
+    Triplet loss using, for every row, its hardest positive and hardest negative.
+
+    Features are L2-normalized and compared by squared Euclidean distance.
+    The gather index is precomputed by rerange_index for a fixed batch_size,
+    which treats every `samples_each_class` consecutive rows as one class.
+    """
+
+    def __init__(self, batch_size=120, samples_each_class=2, margin=0.1):
+        super(TriHardLoss, self).__init__()
+        self.margin = margin
+        self.samples_each_class = samples_each_class
+        self.batch_size = batch_size
+        self.rerange_index = rerange_index(batch_size, samples_each_class)
+
+    def forward(self, input, target=None):
+        """Return {"trihardloss": loss} from input["features"]; target is unused."""
+        feats = input["features"]
+        assert (self.batch_size == feats.shape[0])
+
+        feats = self._nomalize(feats)
+        gather_ids = paddle.to_tensor(self.rerange_index)
+
+        # pairwise squared distances
+        pair_diff = paddle.unsqueeze(feats, axis=1) - paddle.unsqueeze(feats, axis=0)
+        sq_dist = paddle.sum(paddle.square(pair_diff), axis=-1)
+
+        # reorder each row into [self | same class | other classes]
+        flat = paddle.reshape(sq_dist, shape=[-1, 1])
+        flat = paddle.gather(flat, index=gather_ids)
+        sq_dist = paddle.reshape(flat, shape=[-1, self.batch_size])
+
+        sections = [1, self.samples_each_class - 1, -1]
+        self_dist, pos, neg = paddle.split(sq_dist, num_or_sections=sections, axis=1)
+        self_dist.stop_gradient = True
+
+        # per-row extremes
+        hardest_pos = paddle.max(pos, axis=1)
+        hardest_neg = paddle.min(neg, axis=1)
+
+        hinge = paddle.nn.ReLU()(hardest_pos + self.margin - hardest_neg)
+        return {"trihardloss": paddle.mean(hinge)}
+
+    def _nomalize(self, input):
+        """Divide each row of `input` by its L2 norm."""
+        row_norm = paddle.sqrt(
+            paddle.sum(paddle.square(input), axis=1, keepdim=True))
+        return paddle.divide(input, row_norm)
+    
--- a/ppcls/losses/triplet.py
+++ b/ppcls/losses/triplet.py
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+
+class TripletLossV2(nn.Layer):
+    """Triplet loss with hard positive/negative mining.
+
+    For every anchor the farthest same-label sample and the closest
+    different-label sample of the batch are selected, and a margin ranking
+    loss is applied to the two distances.
+
+    Args:
+        margin (float): margin for triplet.
+    """
+    def __init__(self, margin=0.5):
+        super(TripletLossV2, self).__init__()
+        self.margin = margin
+        self.ranking_loss = paddle.nn.loss.MarginRankingLoss(margin=margin)
+
+    def forward(self, input, target, normalize_feature=True):
+        """
+        Args:
+            input (dict): holds the feature matrix with shape
+                (batch_size, feat_dim) under "features".
+            target: ground truth labels, one per sample; must be expandable
+                to (batch_size, batch_size).
+            normalize_feature (bool): L2-normalize the features first.
+        Returns:
+            dict: {"TripletLossV2": loss tensor}.
+        """
+        inputs = input["features"]
+
+        if normalize_feature:
+            inputs = 1. * inputs / (paddle.expand_as(
+                paddle.norm(inputs, p=2, axis=-1, keepdim=True), inputs) +
+                                    1e-12)
+
+        bs = inputs.shape[0]
+
+        # compute distance: ||a||^2 + ||b||^2 - 2 * a.b, then sqrt
+        dist = paddle.pow(inputs, 2).sum(axis=1, keepdim=True).expand([bs, bs])
+        dist = dist + dist.t()
+        dist = paddle.addmm(input=dist,
+                            x=inputs,
+                            y=inputs.t(),
+                            alpha=-2.0,
+                            beta=1.0)
+        # clip keeps the sqrt (and its gradient) finite on the diagonal
+        dist = paddle.clip(dist, min=1e-12).sqrt()
+
+        # hard negative mining
+        labels = paddle.expand(target, (bs, bs))
+        labels_t = labels.t()
+        is_pos = labels.equal(labels_t)
+        is_neg = labels.not_equal(labels_t)
+
+        # Masked row-wise max/min instead of masked_select + reshape: the
+        # reshape is only valid when every anchor has the same number of
+        # positives/negatives and otherwise raises or misaligns the rows.
+        # `dist_ap` means distance(anchor, positive), shape [N]
+        dist_ap = paddle.max(paddle.where(is_pos, dist,
+                                          paddle.full_like(dist,
+                                                           float("-inf"))),
+                             axis=1)
+        # `dist_an` means distance(anchor, negative), shape [N]
+        dist_an = paddle.min(paddle.where(is_neg, dist,
+                                          paddle.full_like(dist,
+                                                           float("inf"))),
+                             axis=1)
+
+        # Compute ranking hinge loss
+        y = paddle.ones_like(dist_an)
+        loss = self.ranking_loss(dist_an, dist_ap, y)
+        return {"TripletLossV2": loss}
+
+
+class TripletLoss(nn.Layer):
+    """Triplet loss with hard positive/negative mining.
+    Reference:
+    Hermans et al. In Defense of the Triplet Loss for Person Re-Identification. arXiv:1703.07737.
+    Code imported from https://github.com/Cysu/open-reid/blob/master/reid/loss/triplet.py.
+    Args:
+        margin (float): margin for triplet.
+    """
+    def __init__(self, margin=1.0):
+        super(TripletLoss, self).__init__()
+        self.margin = margin
+        self.ranking_loss = paddle.nn.loss.MarginRankingLoss(margin=margin)
+
+    def forward(self, input, target):
+        """
+        Args:
+            input (dict): holds the feature matrix with shape
+                (batch_size, feat_dim) under "features".
+            target: ground truth labels, one per sample; must be expandable
+                to (batch_size, batch_size).
+        Returns:
+            dict: {"TripletLoss": loss tensor}.
+        """
+        inputs = input["features"]
+
+        bs = inputs.shape[0]
+        # Compute pairwise distance, replace by the official when merged
+        dist = paddle.pow(inputs, 2).sum(axis=1, keepdim=True).expand([bs, bs])
+        dist = dist + dist.t()
+        dist = paddle.addmm(input=dist,
+                            x=inputs,
+                            y=inputs.t(),
+                            alpha=-2.0,
+                            beta=1.0)
+        # clip keeps the sqrt (and its gradient) finite on the diagonal
+        dist = paddle.clip(dist, min=1e-12).sqrt()
+
+        # mask[i][j] is True when samples i and j share a label
+        mask = paddle.equal(target.expand([bs, bs]),
+                            target.expand([bs, bs]).t())
+
+        # Row-wise masked max/min on the device. This replaces a Python
+        # loop over all bs * bs elements (with a host copy of the mask)
+        # that also failed when an anchor had no negative in the batch.
+        # hardest positive: largest distance among same-label samples
+        dist_ap = paddle.max(paddle.where(mask, dist,
+                                          paddle.full_like(dist,
+                                                           float("-inf"))),
+                             axis=1)
+        # hardest negative: smallest distance among other-label samples
+        dist_an = paddle.min(paddle.where(mask,
+                                          paddle.full_like(dist,
+                                                           float("inf")),
+                                          dist),
+                             axis=1)
+
+        # Compute ranking hinge loss
+        y = paddle.ones_like(dist_an)
+        loss = self.ranking_loss(dist_an, dist_ap, y)
+        return {"TripletLoss": loss}
+    
--- a/ppcls/optimizer/__init__.py
+++ b/ppcls/optimizer/__init__.py
@@ -12,8 +12,54 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import paddle
+
+from ppcls.utils import logger
+
 from . import optimizer
-from . import learning_rate

-from .optimizer import OptimizerBuilder
-from .learning_rate import LearningRateBuilder
+__all__ = ['build_optimizer']
+
+
+def build_lr_scheduler(lr_config, epochs, step_each_epoch):
+    """Build the learning rate described by `lr_config`.
+
+    Args:
+        lr_config (dict): learning-rate settings. With a 'name' key, the
+            class of that name in the local `learning_rate` module is
+            built from the remaining keys; otherwise 'learning_rate' is
+            returned as is. The dict is not modified.
+        epochs (int): passed to the scheduler as `epochs`.
+        step_each_epoch (int): passed to the scheduler as `step_each_epoch`.
+
+    Returns:
+        The result of calling the built scheduler wrapper, or the plain
+        `lr_config['learning_rate']` value.
+    """
+    from . import learning_rate
+    # Work on a merged copy so the caller's config is left untouched.
+    lr_config = dict(
+        lr_config, epochs=epochs, step_each_epoch=step_each_epoch)
+    if 'name' in lr_config:
+        lr_name = lr_config.pop('name')
+        # the wrapper class is instantiated, then called to get the scheduler
+        lr = getattr(learning_rate, lr_name)(**lr_config)()
+    else:
+        lr = lr_config['learning_rate']
+    return lr
+
+
+def build_optimizer(config, epochs, step_each_epoch, parameters):
+    """Build the optimizer and its learning rate from `config`.
+
+    Args:
+        config (dict): optimizer settings. Keys read here: 'lr' (passed to
+            build_lr_scheduler), 'name' (class looked up in the local
+            `optimizer` module), optional 'regularizer' (dict with a 'name'
+            such as 'L2', resolved to paddle.regularizer.<name>Decay) and
+            optional 'clip_norm'. All remaining keys are forwarded to the
+            optimizer class. The dict is deep-copied, not modified.
+        epochs (int): forwarded to build_lr_scheduler.
+        step_each_epoch (int): forwarded to build_lr_scheduler.
+        parameters: forwarded as `parameters` when the optimizer is built.
+
+    Returns:
+        tuple: (optimizer, lr).
+    """
+    config = copy.deepcopy(config)
+    # step1 build lr
+    lr = build_lr_scheduler(config.pop('lr'), epochs, step_each_epoch)
+    logger.info("build lr ({}) success..".format(lr))
+    # step2 build regularization
+    # Always pop the key: a `regularizer: None` entry must not be forwarded
+    # to the optimizer class through **config below.
+    reg_config = config.pop('regularizer', None)
+    if reg_config is not None:
+        reg_name = reg_config.pop('name') + 'Decay'
+        reg = getattr(paddle.regularizer, reg_name)(**reg_config)
+    else:
+        reg = None
+    logger.info("build regularizer ({}) success..".format(reg))
+    # step3 build optimizer
+    optim_name = config.pop('name')
+    if 'clip_norm' in config:
+        clip_norm = config.pop('clip_norm')
+        grad_clip = paddle.nn.ClipGradByNorm(clip_norm=clip_norm)
+    else:
+        grad_clip = None
+    # the wrapper class is instantiated first, then called with the parameters
+    optim = getattr(optimizer, optim_name)(learning_rate=lr,
+                                           weight_decay=reg,
+                                           grad_clip=grad_clip,
+                                           **config)(parameters=parameters)
+    logger.info("build optimizer ({}) success..".format(optim))
+    return optim, lr
--- a/ppcls/optimizer/learning_rate.py
+++ b/ppcls/optimizer/learning_rate.py
--- a/ppcls/optimizer/optimizer.py
+++ b/ppcls/optimizer/optimizer.py
--- a/ppcls/utils/config.py
+++ b/ppcls/utils/config.py
--- a/ppcls/utils/save_load.py
+++ b/ppcls/utils/save_load.py
@@ -24,6 +24,7 @@ import tempfile

 import paddle
 from paddle.static import load_program_state
+from paddle.utils.download import get_weights_path_from_url

 from ppcls.utils import logger

@@ -70,6 +71,20 @@ def load_dygraph_pretrain(model, path=None, load_static_weights=False):
    return


+def load_dygraph_pretrain_from_url(model,
+                                   pretrained_url,
+                                   use_ssld,
+                                   load_static_weights=False):
+    """Fetch pretrained weights from a URL and load them into `model`.
+
+    Args:
+        model: network handed to load_dygraph_pretrain.
+        pretrained_url (str): URL of the weights file.
+        use_ssld (bool): when true, "_pretrained" in the URL is replaced
+            by "_ssld_pretrained" before the weights are fetched.
+        load_static_weights (bool): forwarded to load_dygraph_pretrain.
+    """
+    url = pretrained_url
+    if use_ssld:
+        url = url.replace("_pretrained", "_ssld_pretrained")
+    downloaded_file = get_weights_path_from_url(url)
+    # ".pdparams" is removed from the local path before it is handed on
+    weight_prefix = downloaded_file.replace(".pdparams", "")
+    load_dygraph_pretrain(
+        model, path=weight_prefix, load_static_weights=load_static_weights)
+
+
 def load_distillation_model(model, pretrained_model, load_static_weights):
    logger.info("In distillation mode, teacher model will be "
                "loaded firstly before student model.")
@@ -112,10 +127,11 @@ def init_model(config, net, optimizer=None):
            "Given dir {}.pdopt not exist.".format(checkpoints)
        para_dict = paddle.load(checkpoints + ".pdparams")
        opti_dict = paddle.load(checkpoints + ".pdopt")
+        metric_dict = paddle.load(checkpoints + ".pdstates")
        net.set_dict(para_dict)
        optimizer.set_state_dict(opti_dict)
        logger.info("Finish load checkpoints from {}".format(checkpoints))
-        return
+        return metric_dict

    pretrained_model = config.get('pretrained_model')
    load_static_weights = config.get('load_static_weights', False)
@@ -146,13 +162,18 @@ def _save_student_model(net, model_prefix):
            student_model_prefix))


-def save_model(net, optimizer, model_path, epoch_id, prefix='ppcls'):
+def save_model(net,
+               optimizer,
+               metric_info,
+               model_path,
+               model_name="",
+               prefix='ppcls'):
    """
    save model to the target path
    """
    if paddle.distributed.get_rank() != 0:
        return
-    model_path = os.path.join(model_path, str(epoch_id))
+    model_path = os.path.join(model_path, model_name)
    _mkdir_if_not_exist(model_path)
    model_prefix = os.path.join(model_path, prefix)

@@ -160,4 +181,5 @@ def save_model(net, optimizer, model_path, epoch_id, prefix='ppcls'):

    paddle.save(net.state_dict(), model_prefix + ".pdparams")
    paddle.save(optimizer.state_dict(), model_prefix + ".pdopt")
+    paddle.save(metric_info, model_prefix + ".pdstates")
    logger.info("Already save model in {}".format(model_path))
--- a/tools/eval.py
+++ b/tools/eval.py
--- a/tools/train.py
+++ b/tools/train.py