diff --git a/python/paddle/tests/test_pretrained_model.py b/python/paddle/tests/test_pretrained_model.py
index bbde64f2e609cd511ca17b3a3c3e8d11712c84a5..4441faee14e021ab4c6d45c6bad36653e79db37e 100644
--- a/python/paddle/tests/test_pretrained_model.py
+++ b/python/paddle/tests/test_pretrained_model.py
@@ -61,6 +61,8 @@ class TestPretrainedModel(unittest.TestCase):
         arches = [
             'mobilenet_v1',
             'mobilenet_v2',
+            'mobilenet_v3_small',
+            'mobilenet_v3_large',
             'squeezenet1_0',
             'shufflenet_v2_x0_25',
         ]
diff --git a/python/paddle/tests/test_vision_models.py b/python/paddle/tests/test_vision_models.py
index 547c53345995c28c0f4b4fbdfa9ae17477f09864..dc98fc3219bff6d2ae5e65a4ad21e4303baba8c7 100644
--- a/python/paddle/tests/test_vision_models.py
+++ b/python/paddle/tests/test_vision_models.py
@@ -40,6 +40,12 @@ class TestVisonModels(unittest.TestCase):
     def test_mobilenetv1(self):
         self.models_infer('mobilenet_v1')
 
+    def test_mobilenetv3_small(self):
+        self.models_infer('mobilenet_v3_small')
+
+    def test_mobilenetv3_large(self):
+        self.models_infer('mobilenet_v3_large')
+
     def test_vgg11(self):
         self.models_infer('vgg11')
 
diff --git a/python/paddle/vision/__init__.py b/python/paddle/vision/__init__.py
index 37520175a719f7aedabf23e89b4436bb04469bd7..3749e0f64fc6a83433dd0533cc91736503a7495b 100644
--- a/python/paddle/vision/__init__.py
+++ b/python/paddle/vision/__init__.py
@@ -40,6 +40,10 @@ from .models import MobileNetV1  # noqa: F401
 from .models import mobilenet_v1  # noqa: F401
 from .models import MobileNetV2  # noqa: F401
 from .models import mobilenet_v2  # noqa: F401
+from .models import MobileNetV3Small  # noqa: F401
+from .models import MobileNetV3Large  # noqa: F401
+from .models import mobilenet_v3_small  # noqa: F401
+from .models import mobilenet_v3_large  # noqa: F401
 from .models import SqueezeNet  # noqa: F401
 from .models import squeezenet1_0  # noqa: F401
 from .models import squeezenet1_1  # noqa: F401
diff --git a/python/paddle/vision/models/__init__.py b/python/paddle/vision/models/__init__.py
index 044be6a42b7c28699747910bd610c86103a32d73..5ff3562e56ea8c2984d14cc9da00bdaccba9017b 100644
--- a/python/paddle/vision/models/__init__.py
+++ b/python/paddle/vision/models/__init__.py
@@ -24,6 +24,10 @@ from .mobilenetv1 import MobileNetV1  # noqa: F401
 from .mobilenetv1 import mobilenet_v1  # noqa: F401
 from .mobilenetv2 import MobileNetV2  # noqa: F401
 from .mobilenetv2 import mobilenet_v2  # noqa: F401
+from .mobilenetv3 import MobileNetV3Small  # noqa: F401
+from .mobilenetv3 import MobileNetV3Large  # noqa: F401
+from .mobilenetv3 import mobilenet_v3_small  # noqa: F401
+from .mobilenetv3 import mobilenet_v3_large  # noqa: F401
 from .vgg import VGG  # noqa: F401
 from .vgg import vgg11  # noqa: F401
 from .vgg import vgg13  # noqa: F401
@@ -79,6 +83,10 @@ __all__ = [  #noqa
     'mobilenet_v1',
     'MobileNetV2',
     'mobilenet_v2',
+    'MobileNetV3Small',
+    'MobileNetV3Large',
+    'mobilenet_v3_small',
+    'mobilenet_v3_large',
     'LeNet',
     'DenseNet',
     'densenet121',
diff --git a/python/paddle/vision/models/mobilenetv2.py b/python/paddle/vision/models/mobilenetv2.py
index 74071fc121688eafbf17833a6410b94d34191ec4..6c486037c7d30539eb7996f836028be2109413fd 100644
--- a/python/paddle/vision/models/mobilenetv2.py
+++ b/python/paddle/vision/models/mobilenetv2.py
@@ -12,14 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
 import paddle
-
 import paddle.nn as nn
-import paddle.nn.functional as F
-
 from paddle.utils.download import get_weights_path_from_url
 
+from .utils import _make_divisible
+
 __all__ = []
 
 model_urls = {
@@ -29,16 +27,6 @@ model_urls = {
 }
 
 
-def _make_divisible(v, divisor, min_value=None):
-    if min_value is None:
-        min_value = divisor
-    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
-
-    if new_v < 0.9 * v:
-        new_v += divisor
-    return new_v
-
-
 class ConvBNReLU(nn.Sequential):
     def __init__(self,
                  in_planes,
diff --git a/python/paddle/vision/models/mobilenetv3.py b/python/paddle/vision/models/mobilenetv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..da7ae010c58f6babbe5a949c642cdbcace4e951c
--- /dev/null
+++ b/python/paddle/vision/models/mobilenetv3.py
@@ -0,0 +1,445 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+from paddle.utils.download import get_weights_path_from_url
+from functools import partial
+
+from .utils import _make_divisible
+from ..ops import ConvNormActivation
+
+__all__ = []
+
+model_urls = {
+    "mobilenet_v3_small_x1.0":
+    ("https://paddle-hapi.bj.bcebos.com/models/mobilenet_v3_small_x1.0.pdparams",
+     "34fe0e7c1f8b00b2b056ad6788d0590c"),
+    "mobilenet_v3_large_x1.0":
+    ("https://paddle-hapi.bj.bcebos.com/models/mobilenet_v3_large_x1.0.pdparams",
+     "118db5792b4e183b925d8e8e334db3df"),
+}
+
+
+class SqueezeExcitation(nn.Layer):
+    """
+    This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1).
+    Parameters ``activation`` and ``scale_activation`` correspond to ``delta`` and ``sigma`` in eq. 3.
+    This code is based on the torchvision code with modifications.
+    You can also refer to https://github.com/pytorch/vision/blob/main/torchvision/ops/misc.py#L127
+    Args:
+        input_channels (int): Number of channels in the input image
+        squeeze_channels (int): Number of squeeze channels
+        activation (Callable[..., paddle.nn.Layer], optional): ``delta`` activation. Default: ``paddle.nn.ReLU``
+        scale_activation (Callable[..., paddle.nn.Layer], optional): ``sigma`` activation.
+            Default: ``paddle.nn.Sigmoid``
+    """
+
+    def __init__(self,
+                 input_channels,
+                 squeeze_channels,
+                 activation=nn.ReLU,
+                 scale_activation=nn.Sigmoid):
+        super().__init__()
+        self.avgpool = nn.AdaptiveAvgPool2D(1)
+        self.fc1 = nn.Conv2D(input_channels, squeeze_channels, 1)
+        self.fc2 = nn.Conv2D(squeeze_channels, input_channels, 1)
+        self.activation = activation()
+        self.scale_activation = scale_activation()
+
+    def _scale(self, input):
+        scale = self.avgpool(input)
+        scale = self.fc1(scale)
+        scale = self.activation(scale)
+        scale = self.fc2(scale)
+        return self.scale_activation(scale)
+
+    def forward(self, input):
+        scale = self._scale(input)
+        return scale * input
+
+
+class InvertedResidualConfig:
+    def __init__(self,
+                 in_channels,
+                 kernel,
+                 expanded_channels,
+                 out_channels,
+                 use_se,
+                 activation,
+                 stride,
+                 scale=1.0):
+        self.in_channels = self.adjust_channels(in_channels, scale=scale)
+        self.kernel = kernel
+        self.expanded_channels = self.adjust_channels(
+            expanded_channels, scale=scale)
+        self.out_channels = self.adjust_channels(out_channels, scale=scale)
+        self.use_se = use_se
+        if activation is None:
+            self.activation_layer = None
+        elif activation == "relu":
+            self.activation_layer = nn.ReLU
+        elif activation == "hardswish":
+            self.activation_layer = nn.Hardswish
+        else:
+            raise RuntimeError("The activation function is not supported: {}".
+                               format(activation))
+        self.stride = stride
+
+    @staticmethod
+    def adjust_channels(channels, scale=1.0):
+        return _make_divisible(channels * scale, 8)
+
+
+class InvertedResidual(nn.Layer):
+    def __init__(self, in_channels, expanded_channels, out_channels,
+                 filter_size, stride, use_se, activation_layer, norm_layer):
+        super().__init__()
+        self.use_res_connect = stride == 1 and in_channels == out_channels
+        self.use_se = use_se
+        self.expand = in_channels != expanded_channels
+
+        if self.expand:
+            self.expand_conv = ConvNormActivation(
+                in_channels=in_channels,
+                out_channels=expanded_channels,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                norm_layer=norm_layer,
+                activation_layer=activation_layer)
+
+        self.bottleneck_conv = ConvNormActivation(
+            in_channels=expanded_channels,
+            out_channels=expanded_channels,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=int((filter_size - 1) // 2),
+            groups=expanded_channels,
+            norm_layer=norm_layer,
+            activation_layer=activation_layer)
+
+        if self.use_se:
+            self.mid_se = SqueezeExcitation(
+                expanded_channels,
+                _make_divisible(expanded_channels // 4),
+                scale_activation=nn.Hardsigmoid)
+
+        self.linear_conv = ConvNormActivation(
+            in_channels=expanded_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            norm_layer=norm_layer,
+            activation_layer=None)
+
+    def forward(self, x):
+        identity = x
+        if self.expand:
+            x = self.expand_conv(x)
+        x = self.bottleneck_conv(x)
+        if self.use_se:
+            x = self.mid_se(x)
+        x = self.linear_conv(x)
+        if self.use_res_connect:
+            x = paddle.add(identity, x)
+        return x
+
+
+class MobileNetV3(nn.Layer):
+    """MobileNetV3 model from
+    `"Searching for MobileNetV3" <https://arxiv.org/abs/1905.02244>`_.
+
+    Args:
+        config (list[InvertedResidualConfig]): MobileNetV3 depthwise blocks config.
+        last_channel (int): The number of channels on the penultimate layer.
+        scale (float, optional): Scale of channels in each layer. Default: 1.0.
+        num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, the last fc layer
+                            will not be defined. Default: 1000.
+        with_pool (bool, optional): Use pool before the last fc layer or not. Default: True.
+ """ + + def __init__(self, + config, + last_channel, + scale=1.0, + num_classes=1000, + with_pool=True): + super().__init__() + + self.config = config + self.scale = scale + self.last_channel = last_channel + self.num_classes = num_classes + self.with_pool = with_pool + self.firstconv_in_channels = config[0].in_channels + self.lastconv_in_channels = config[-1].in_channels + self.lastconv_out_channels = self.lastconv_in_channels * 6 + norm_layer = partial(nn.BatchNorm2D, epsilon=0.001, momentum=0.99) + + self.conv = ConvNormActivation( + in_channels=3, + out_channels=self.firstconv_in_channels, + kernel_size=3, + stride=2, + padding=1, + groups=1, + activation_layer=nn.Hardswish, + norm_layer=norm_layer) + + self.blocks = nn.Sequential(*[ + InvertedResidual( + in_channels=cfg.in_channels, + expanded_channels=cfg.expanded_channels, + out_channels=cfg.out_channels, + filter_size=cfg.kernel, + stride=cfg.stride, + use_se=cfg.use_se, + activation_layer=cfg.activation_layer, + norm_layer=norm_layer) for cfg in self.config + ]) + + self.lastconv = ConvNormActivation( + in_channels=self.lastconv_in_channels, + out_channels=self.lastconv_out_channels, + kernel_size=1, + stride=1, + padding=0, + groups=1, + norm_layer=norm_layer, + activation_layer=nn.Hardswish) + + if with_pool: + self.avgpool = nn.AdaptiveAvgPool2D(1) + + if num_classes > 0: + self.classifier = nn.Sequential( + nn.Linear(self.lastconv_out_channels, self.last_channel), + nn.Hardswish(), + nn.Dropout(p=0.2), + nn.Linear(self.last_channel, num_classes)) + + def forward(self, x): + x = self.conv(x) + x = self.blocks(x) + x = self.lastconv(x) + + if self.with_pool: + x = self.avgpool(x) + + if self.num_classes > 0: + x = paddle.flatten(x, 1) + x = self.classifier(x) + + return x + + +class MobileNetV3Small(MobileNetV3): + """MobileNetV3 Small architecture model from + `"Searching for MobileNetV3" `_. + + Args: + scale (float, optional): Scale of channels in each layer. Default: 1.0. + num_classes (int, optional): Output dim of last fc layer. If num_classes <=0, last fc layer + will not be defined. Default: 1000. + with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. + + Examples: + .. 
code-block:: python + + import paddle + from paddle.vision.models import MobileNetV3Small + + # build model + model = MobileNetV3Small(scale=1.0) + + x = paddle.rand([1, 3, 224, 224]) + out = model(x) + + print(out.shape) + """ + + def __init__(self, scale=1.0, num_classes=1000, with_pool=True): + config = [ + InvertedResidualConfig(16, 3, 16, 16, True, "relu", 2, scale), + InvertedResidualConfig(16, 3, 72, 24, False, "relu", 2, scale), + InvertedResidualConfig(24, 3, 88, 24, False, "relu", 1, scale), + InvertedResidualConfig(24, 5, 96, 40, True, "hardswish", 2, scale), + InvertedResidualConfig(40, 5, 240, 40, True, "hardswish", 1, scale), + InvertedResidualConfig(40, 5, 240, 40, True, "hardswish", 1, scale), + InvertedResidualConfig(40, 5, 120, 48, True, "hardswish", 1, scale), + InvertedResidualConfig(48, 5, 144, 48, True, "hardswish", 1, scale), + InvertedResidualConfig(48, 5, 288, 96, True, "hardswish", 2, scale), + InvertedResidualConfig(96, 5, 576, 96, True, "hardswish", 1, scale), + InvertedResidualConfig(96, 5, 576, 96, True, "hardswish", 1, scale), + ] + last_channel = _make_divisible(1024 * scale, 8) + super().__init__( + config, + last_channel=last_channel, + scale=scale, + with_pool=with_pool, + num_classes=num_classes) + + +class MobileNetV3Large(MobileNetV3): + """MobileNetV3 Large architecture model from + `"Searching for MobileNetV3" `_. + + Args: + scale (float, optional): Scale of channels in each layer. Default: 1.0. + num_classes (int, optional): Output dim of last fc layer. If num_classes <=0, last fc layer + will not be defined. Default: 1000. + with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.models import MobileNetV3Large + + # build model + model = MobileNetV3Large(scale=1.0) + + x = paddle.rand([1, 3, 224, 224]) + out = model(x) + + print(out.shape) + """ + + def __init__(self, scale=1.0, num_classes=1000, with_pool=True): + config = [ + InvertedResidualConfig(16, 3, 16, 16, False, "relu", 1, scale), + InvertedResidualConfig(16, 3, 64, 24, False, "relu", 2, scale), + InvertedResidualConfig(24, 3, 72, 24, False, "relu", 1, scale), + InvertedResidualConfig(24, 5, 72, 40, True, "relu", 2, scale), + InvertedResidualConfig(40, 5, 120, 40, True, "relu", 1, scale), + InvertedResidualConfig(40, 5, 120, 40, True, "relu", 1, scale), + InvertedResidualConfig(40, 3, 240, 80, False, "hardswish", 2, + scale), + InvertedResidualConfig(80, 3, 200, 80, False, "hardswish", 1, + scale), + InvertedResidualConfig(80, 3, 184, 80, False, "hardswish", 1, + scale), + InvertedResidualConfig(80, 3, 184, 80, False, "hardswish", 1, + scale), + InvertedResidualConfig(80, 3, 480, 112, True, "hardswish", 1, + scale), + InvertedResidualConfig(112, 3, 672, 112, True, "hardswish", 1, + scale), + InvertedResidualConfig(112, 5, 672, 160, True, "hardswish", 2, + scale), + InvertedResidualConfig(160, 5, 960, 160, True, "hardswish", 1, + scale), + InvertedResidualConfig(160, 5, 960, 160, True, "hardswish", 1, + scale), + ] + last_channel = _make_divisible(1280 * scale, 8) + super().__init__( + config, + last_channel=last_channel, + scale=scale, + with_pool=with_pool, + num_classes=num_classes) + + +def _mobilenet_v3(arch, pretrained=False, scale=1.0, **kwargs): + if arch == "mobilenet_v3_large": + model = MobileNetV3Large(scale=scale, **kwargs) + else: + model = MobileNetV3Small(scale=scale, **kwargs) + if pretrained: + arch = "{}_x{}".format(arch, scale) + assert ( + arch in model_urls + ), 
"{} model do not have a pretrained model now, you should set pretrained=False".format( + arch) + weight_path = get_weights_path_from_url(model_urls[arch][0], + model_urls[arch][1]) + + param = paddle.load(weight_path) + model.set_dict(param) + return model + + +def mobilenet_v3_small(pretrained=False, scale=1.0, **kwargs): + """MobileNetV3 Small architecture model from + `"Searching for MobileNetV3" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False. + scale (float, optional): Scale of channels in each layer. Default: 1.0. + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.models import mobilenet_v3_small + + # build model + model = mobilenet_v3_small() + + # build model and load imagenet pretrained weight + # model = mobilenet_v3_small(pretrained=True) + + # build mobilenet v3 small model with scale=0.5 + model = mobilenet_v3_small(scale=0.5) + + x = paddle.rand([1, 3, 224, 224]) + out = model(x) + + print(out.shape) + + """ + model = _mobilenet_v3( + "mobilenet_v3_small", scale=scale, pretrained=pretrained, **kwargs) + return model + + +def mobilenet_v3_large(pretrained=False, scale=1.0, **kwargs): + """MobileNetV3 Large architecture model from + `"Searching for MobileNetV3" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet. Default: False. + scale (float, optional): Scale of channels in each layer. Default: 1.0. + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.models import mobilenet_v3_large + + # build model + model = mobilenet_v3_large() + + # build model and load imagenet pretrained weight + # model = mobilenet_v3_large(pretrained=True) + + # build mobilenet v3 large model with scale=0.5 + model = mobilenet_v3_large(scale=0.5) + + x = paddle.rand([1, 3, 224, 224]) + out = model(x) + + print(out.shape) + + """ + model = _mobilenet_v3( + "mobilenet_v3_large", scale=scale, pretrained=pretrained, **kwargs) + return model diff --git a/python/paddle/vision/models/utils.py b/python/paddle/vision/models/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..f61d0d601a44f5b47b5f36a1488a736f28df298c --- /dev/null +++ b/python/paddle/vision/models/utils.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def _make_divisible(v, divisor=8, min_value=None): + """ + This function ensures that all layers have a channel number that is divisible by divisor + You can also see at https://github.com/keras-team/keras/blob/8ecef127f70db723c158dbe9ed3268b3d610ab55/keras/applications/mobilenet_v2.py#L505 + + Args: + divisor (int): The divisor for number of channels. Default: 8. + min_value (int, optional): The minimum value of number of channels, if it is None, + the default is divisor. Default: None. 
+ """ + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. + if new_v < 0.9 * v: + new_v += divisor + return new_v diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 4983ca49ac32fb7806959349dc4efa6e27116456..b65bfa502c4dfefdc446d9661a6dc6dfbfa0872d 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -17,7 +17,7 @@ from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype from ..fluid import core, layers from ..fluid.layers import nn, utils -from ..nn import Layer +from ..nn import Layer, Conv2D, Sequential, ReLU, BatchNorm2D from ..fluid.initializer import Normal from paddle.common_ops_import import * @@ -1297,3 +1297,57 @@ class RoIAlign(Layer): output_size=self._output_size, spatial_scale=self._spatial_scale, aligned=aligned) + + +class ConvNormActivation(Sequential): + """ + Configurable block used for Convolution-Normalzation-Activation blocks. + This code is based on the torchvision code with modifications. + You can also see at https://github.com/pytorch/vision/blob/main/torchvision/ops/misc.py#L68 + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the Convolution-Normalzation-Activation block + kernel_size: (int, optional): Size of the convolving kernel. Default: 3 + stride (int, optional): Stride of the convolution. Default: 1 + padding (int, tuple or str, optional): Padding added to all four sides of the input. Default: None, + in wich case it will calculated as ``padding = (kernel_size - 1) // 2 * dilation`` + groups (int, optional): Number of blocked connections from input channels to output channels. Default: 1 + norm_layer (Callable[..., paddle.nn.Layer], optional): Norm layer that will be stacked on top of the convolutiuon layer. + If ``None`` this layer wont be used. Default: ``paddle.nn.BatchNorm2d`` + activation_layer (Callable[..., paddle.nn.Layer], optional): Activation function which will be stacked on top of the normalization + layer (if not ``None``), otherwise on top of the conv layer. If ``None`` this layer wont be used. Default: ``paddle.nn.ReLU`` + dilation (int): Spacing between kernel elements. Default: 1 + bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=None, + groups=1, + norm_layer=BatchNorm2D, + activation_layer=ReLU, + dilation=1, + bias=None): + if padding is None: + padding = (kernel_size - 1) // 2 * dilation + if bias is None: + bias = norm_layer is None + layers = [ + Conv2D( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=dilation, + groups=groups, + bias_attr=bias) + ] + if norm_layer is not None: + layers.append(norm_layer(out_channels)) + if activation_layer is not None: + layers.append(activation_layer()) + super().__init__(*layers)