diff --git a/demo/semantic_segmentation/N0007.jpg b/demo/semantic_segmentation/N0007.jpg new file mode 100644 index 0000000000000000000000000000000000000000..106a6939c20632c444ffec485f00d5a9553d57cd Binary files /dev/null and b/demo/semantic_segmentation/N0007.jpg differ diff --git a/demo/semantic_segmentation/README.md b/demo/semantic_segmentation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b490236a04399c8cc7f1f8733e2b085605783a37 --- /dev/null +++ b/demo/semantic_segmentation/README.md @@ -0,0 +1,164 @@ +# PaddleHub 图像分割 + +本示例将展示如何使用PaddleHub对预训练模型进行finetune并完成预测任务。 + + +## 如何开始Fine-tune + +在完成安装PaddlePaddle与PaddleHub后,通过执行`python train.py`即可开始使用ocrnet_hrnetw18_voc模型对OpticDiscSeg等数据集进行Fine-tune。 + +## 代码步骤 + +使用PaddleHub Fine-tune API进行Fine-tune可以分为4个步骤。 + +### Step1: 定义数据预处理方式 +```python +from paddlehub.vision.segmentation_transforms import Compose, Resize, Normalize + +transform = Compose([Resize(target_size=(512, 512)), Normalize()]) +``` + +`segmentation_transforms` 数据增强模块定义了丰富的针对图像分割数据的预处理方式,用户可按照需求替换自己需要的数据预处理方式。 + +### Step2: 下载数据集并使用 +```python +from paddlehub.datasets import OpticDiscSeg + +train_reader = OpticDiscSeg(transform, mode='train') + +``` +* `transform`: 数据预处理方式。 +* `mode`: 选择数据模式,可选项有 `train`, `test`, `val`, 默认为`train`。 + +数据集的准备代码可以参考 [opticdiscseg.py](../../paddlehub/datasets/opticdiscseg.py)。`hub.datasets.OpticDiscSeg()`会自动从网络下载数据集并解压到用户目录下`$HOME/.paddlehub/dataset`目录。 + +### Step3: 加载预训练模型 + +```python +model = hub.Module(name='ocrnet_hrnetw18_voc', num_classes=2, pretrained=None) +``` +* `name`: 选择预训练模型的名字。 +* `num_classes`: 分割模型的类别数目。 +* `pretrained`: 是否加载自己训练的模型,若为None,则加载提供的模型默认参数。 + +### Step4: 选择优化策略和运行配置 + +```python +scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.01, decay_steps=1000, power=0.9, end_lr=0.0001) +optimizer = paddle.optimizer.Adam(learning_rate=scheduler, parameters=model.parameters()) +trainer = Trainer(model, optimizer, checkpoint_dir='test_ckpt_img_ocr', 
use_gpu=True) +``` + +#### 优化策略 + +Paddle2.0rc提供了多种优化器选择,如`SGD`, `Adam`, `Adamax`等,详细参见[策略](https://www.paddlepaddle.org.cn/documentation/docs/zh/2.0-rc/api/paddle/optimizer/optimizer/Optimizer_cn.html)。 + +其中`Adam`: + +* `learning_rate`: 全局学习率。 +* `parameters`: 待优化模型参数。 + +#### 运行配置 +`Trainer` 主要控制Fine-tune的训练,包含以下可控制的参数: + +* `model`: 被优化模型; +* `optimizer`: 优化器选择; +* `use_gpu`: 是否使用gpu,默认为False; +* `use_vdl`: 是否使用vdl可视化训练过程; +* `checkpoint_dir`: 保存模型参数的地址; +* `compare_metrics`: 保存最优模型的衡量指标; + +`trainer.train` 主要控制具体的训练过程,包含以下可控制的参数: + +* `train_dataset`: 训练时所用的数据集; +* `epochs`: 训练轮数; +* `batch_size`: 训练的批大小,如果使用GPU,请根据实际情况调整batch_size; +* `num_workers`: worker的数量,默认为0; +* `eval_dataset`: 验证集; +* `log_interval`: 打印日志的间隔, 单位为执行批训练的次数。 +* `save_interval`: 保存模型的间隔频次,单位为执行训练的轮数。 + +## 模型预测 + +当完成Fine-tune后,Fine-tune过程在验证集上表现最优的模型会被保存在`${CHECKPOINT_DIR}/best_model`目录下,其中`${CHECKPOINT_DIR}`目录为Fine-tune时所选择的保存checkpoint的目录。 + +我们使用该模型来进行预测。predict.py脚本如下: + +```python +import paddle +import cv2 +import paddlehub as hub + +if __name__ == '__main__': + model = hub.Module(name='ocrnet_hrnetw18_voc', pretrained='/PATH/TO/CHECKPOINT') + img = cv2.imread("/PATH/TO/IMAGE") + model.predict(images=[img], visualization=True) +``` + +参数配置正确后,请执行脚本`python predict.py`。 +**Args** +* `images`:原始图像路径或BGR格式图片; +* `visualization`: 是否可视化,默认为True; +* `save_path`: 保存结果的路径,默认保存路径为'seg_result'。 + +**NOTE:** 进行预测时,所选择的module,checkpoint_dir,dataset必须和Fine-tune所用的一样。 + +## 服务部署 + +PaddleHub Serving可以部署一个在线图像分割服务。 + +### Step1: 启动PaddleHub Serving + +运行启动命令: + +```shell +$ hub serving start -m ocrnet_hrnetw18_voc +``` + +这样就完成了一个图像分割服务化API的部署,默认端口号为8866。 + +**NOTE:** 如使用GPU预测,则需要在启动服务之前设置CUDA_VISIBLE_DEVICES环境变量,否则不用设置。 + +### Step2: 发送预测请求 + +配置好服务端,以下数行代码即可实现发送预测请求,获取预测结果。 + +```python +import requests +import json +import cv2 +import base64 + +import numpy as np + + +def cv2_to_base64(image): + data = cv2.imencode('.jpg', image)[1] + return base64.b64encode(data.tostring()).decode('utf8') + 
+def base64_to_cv2(b64str): + data = base64.b64decode(b64str.encode('utf8')) + data = np.fromstring(data, np.uint8) + data = cv2.imdecode(data, cv2.IMREAD_COLOR) + return data + +# 发送HTTP请求 +org_im = cv2.imread('/PATH/TO/IMAGE') +data = {'images':[cv2_to_base64(org_im)]} +headers = {"Content-type": "application/json"} +url = "http://127.0.0.1:8866/predict/ocrnet_hrnetw18_voc" +r = requests.post(url=url, headers=headers, data=json.dumps(data)) +mask = base64_to_cv2(r.json()["results"][0]) +``` + +### 查看代码 + +https://github.com/PaddlePaddle/PaddleSeg + +### 依赖 + +paddlepaddle >= 2.0.0rc + +paddlehub >= 2.0.0 + + diff --git a/demo/semantic_segmentation/predict.py b/demo/semantic_segmentation/predict.py new file mode 100644 index 0000000000000000000000000000000000000000..a991f48cbc27d3724578788155f38385421b0a0c --- /dev/null +++ b/demo/semantic_segmentation/predict.py @@ -0,0 +1,6 @@ +import paddle +import paddlehub as hub + +if __name__ == '__main__': + model = hub.Module(name='ocrnet_hrnetw18_voc', num_classes=2, pretrained='/PATH/TO/CHECKPOINT') + model.predict(images=["N0007.jpg"], visualization=True) \ No newline at end of file diff --git a/demo/semantic_segmentation/train.py b/demo/semantic_segmentation/train.py new file mode 100644 index 0000000000000000000000000000000000000000..55f3ba596e84e52f12cbd9ce9b475f7a0eb50ec8 --- /dev/null +++ b/demo/semantic_segmentation/train.py @@ -0,0 +1,16 @@ +import paddle +import paddlehub as hub +from paddlehub.finetune.trainer import Trainer + +from paddlehub.datasets import OpticDiscSeg +from paddlehub.vision.segmentation_transforms import Compose, Resize, Normalize + +if __name__ == "__main__": + transform = Compose([Resize(target_size=(512, 512)), Normalize()]) + train_reader = OpticDiscSeg(transform) + + model = hub.Module(name='ocrnet_hrnetw18_voc', num_classes=2) + scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.01, decay_steps=1000, power=0.9, end_lr=0.0001) + optimizer = 
paddle.optimizer.Adam(learning_rate=scheduler, parameters=model.parameters()) + trainer = Trainer(model, optimizer, checkpoint_dir='test_ckpt_img_ocr', use_gpu=True) + trainer.train(train_reader, epochs=20, batch_size=4, eval_dataset=train_reader, log_interval=10, save_interval=4) \ No newline at end of file diff --git a/docs/docs_ch/reference/datasets.md b/docs/docs_ch/reference/datasets.md index cdbe0e0dfb0b978c03a483a6be523d3f0c1d92bd..c4409fc4bc3945b2f50875fc90678be982ce230e 100644 --- a/docs/docs_ch/reference/datasets.md +++ b/docs/docs_ch/reference/datasets.md @@ -39,3 +39,18 @@ Dataset for Style transfer. The dataset contains 2001 images for training set an **Args** * transforms(callmethod) : The method of preprocess images. * mode(str): The mode for preparing dataset. + +# Class `hub.datasets.OpticDiscSeg` + +```python +hub.datasets.OpticDiscSeg( + transforms: Callable, + mode: str = 'train') +``` + +Dataset for semantic segmentation. The dataset contains 267 images for training set, 76 images for validation set and 38 images for testing set. + +**Args** +* transforms(callmethod) : The method of preprocess images. +* mode(str): The mode for preparing dataset. + diff --git a/modules/image/semantic_segmentation/deeplabv3p_resnet50_voc/layers.py b/modules/image/semantic_segmentation/deeplabv3p_resnet50_voc/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..dd958e19c16fafeafdf67014ad40dd81faf94f29 --- /dev/null +++ b/modules/image/semantic_segmentation/deeplabv3p_resnet50_voc/layers.py @@ -0,0 +1,345 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.layer import activation
from paddle.nn import Conv2D, AvgPool2D


def _suffixed(name, suffix):
    """Return ``name + suffix``, or None when no base name was supplied."""
    return None if name is None else name + suffix


def SyncBatchNorm(*args, **kwargs):
    """Build a batch-norm layer suited to the current device.

    In cpu environment nn.SyncBatchNorm does not have kernel, so
    nn.BatchNorm2D is returned instead; all arguments are forwarded unchanged.
    """
    if paddle.get_device() == 'cpu':
        return nn.BatchNorm2D(*args, **kwargs)
    return nn.SyncBatchNorm(*args, **kwargs)


class ConvBNLayer(nn.Layer):
    """Conv2D -> batch norm -> optional activation, with an optional "vd" pooling pre-step.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Convolution kernel size.
        stride (int, optional): Convolution stride. Default: 1.
        dilation (int, optional): Convolution dilation. Default: 1.
        groups (int, optional): Convolution groups. Default: 1.
        is_vd_mode (bool, optional): If True, a 2x2 stride-2 average pooling is
            applied to the input before the convolution. Default: False.
        act (str, optional): Lower-case activation name handed to ``Activation``.
            Default: None (no activation).
        name (str, optional): Accepted for call-site compatibility; this layer
            does not use it. Default: None.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: int,
                 stride: int = 1,
                 dilation: int = 1,
                 groups: int = 1,
                 is_vd_mode: bool = False,
                 act: str = None,
                 name: str = None):
        super(ConvBNLayer, self).__init__()

        self.is_vd_mode = is_vd_mode
        self._pool2d_avg = AvgPool2D(
            kernel_size=2, stride=2, padding=0, ceil_mode=True)
        # With dilation != 1 the convolution itself is left unpadded; the
        # caller (see BottleneckBlock.forward) pads the input explicitly.
        self._conv = Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=(kernel_size - 1) // 2 if dilation == 1 else 0,
            dilation=dilation,
            groups=groups,
            bias_attr=False)

        self._batch_norm = SyncBatchNorm(out_channels)
        self._act_op = Activation(act=act)

    def forward(self, inputs: paddle.Tensor) -> paddle.Tensor:
        if self.is_vd_mode:
            inputs = self._pool2d_avg(inputs)
        y = self._conv(inputs)
        y = self._batch_norm(y)
        y = self._act_op(y)

        return y


class BottleneckBlock(nn.Layer):
    """Residual bottleneck block: 1x1 -> 3x3 -> 1x1 convolutions plus a shortcut.

    The block produces ``out_channels * 4`` output channels.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Width of the first two convolutions.
        stride (int): Stride of the 3x3 convolution.
        shortcut (bool, optional): If True the input is added as-is; if False it
            first passes through a 1x1 projection. Default: True.
        if_first (bool, optional): If True the projection skips the average-pool
            ("vd") step. Default: False.
        dilation (int, optional): Dilation of the 3x3 convolution. Default: 1.
        name (str, optional): Base name; a branch suffix is appended for each
            sub-layer. May be None. Default: None.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 stride: int,
                 shortcut: bool = True,
                 if_first: bool = False,
                 dilation: int = 1,
                 name: str = None):
        super(BottleneckBlock, self).__init__()

        # `name` defaults to None, so the suffix must be appended defensively
        # (plain `name + "_branch2a"` raised TypeError for unnamed blocks).
        self.conv0 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            act='relu',
            name=_suffixed(name, "_branch2a"))

        self.dilation = dilation

        self.conv1 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            dilation=dilation,
            name=_suffixed(name, "_branch2b"))
        self.conv2 = ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels * 4,
            kernel_size=1,
            act=None,
            name=_suffixed(name, "_branch2c"))

        if not shortcut:
            # The projection always uses stride 1; spatial reduction, when
            # needed, comes from the average pooling of "vd" mode.
            self.short = ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels * 4,
                kernel_size=1,
                stride=1,
                is_vd_mode=not (if_first or stride == 1),
                name=_suffixed(name, "_branch1"))

        self.shortcut = shortcut

    def forward(self, inputs: paddle.Tensor) -> paddle.Tensor:
        y = self.conv0(inputs)
        if self.dilation > 1:
            # conv1 is unpadded when dilated, so pad by the dilation here.
            padding = self.dilation
            y = F.pad(y, [padding, padding, padding, padding])

        conv1 = self.conv1(y)
        conv2 = self.conv2(conv1)

        short = inputs if self.shortcut else self.short(inputs)

        y = paddle.add(x=short, y=conv2)
        y = F.relu(y)
        return y


class SeparableConvBNReLU(nn.Layer):
    """Depthwise separable convolution: depthwise ConvBN followed by a 1x1 ConvBNReLU.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Kernel size of the depthwise convolution.
        padding (str|int, optional): Padding of the depthwise convolution.
            Default: 'same'.
        **kwargs: Extra keyword arguments forwarded to the depthwise ConvBN.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: int,
                 padding: str = 'same',
                 **kwargs: dict):
        super(SeparableConvBNReLU, self).__init__()
        self.depthwise_conv = ConvBN(
            in_channels,
            out_channels=in_channels,
            kernel_size=kernel_size,
            padding=padding,
            groups=in_channels,
            **kwargs)
        # NOTE(review): "piontwise" is a misspelling of "pointwise", but the
        # attribute name is presumably part of saved parameter keys, so it is
        # kept as-is — confirm before renaming.
        self.piontwise_conv = ConvBNReLU(
            in_channels, out_channels, kernel_size=1, groups=1)

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        x = self.depthwise_conv(x)
        x = self.piontwise_conv(x)
        return x


class ConvBN(nn.Layer):
    """Conv2D followed by batch normalization (no activation).

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Convolution kernel size.
        padding (str|int, optional): Convolution padding. Default: 'same'.
        **kwargs: Extra keyword arguments forwarded to Conv2D.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: int,
                 padding: str = 'same',
                 **kwargs: dict):
        super(ConvBN, self).__init__()
        self._conv = Conv2D(
            in_channels, out_channels, kernel_size, padding=padding, **kwargs)
        self._batch_norm = SyncBatchNorm(out_channels)

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        x = self._conv(x)
        x = self._batch_norm(x)
        return x


class ConvBNReLU(nn.Layer):
    """Conv2D followed by batch normalization and ReLU.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int): Convolution kernel size.
        padding (str|int, optional): Convolution padding. Default: 'same'.
        **kwargs: Extra keyword arguments forwarded to Conv2D.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 kernel_size: int,
                 padding: str = 'same',
                 **kwargs: dict):
        super(ConvBNReLU, self).__init__()

        self._conv = Conv2D(
            in_channels, out_channels, kernel_size, padding=padding, **kwargs)
        self._batch_norm = SyncBatchNorm(out_channels)

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        x = self._conv(x)
        x = self._batch_norm(x)
        x = F.relu(x)
        return x


class Activation(nn.Layer):
    """
    The wrapper of activations.

    Args:
        act (str, optional): The activation name in lowercase. It must be one of ['elu', 'gelu',
            'hardshrink', 'tanh', 'hardtanh', 'prelu', 'relu', 'relu6', 'selu', 'leakyrelu', 'sigmoid',
            'softmax', 'softplus', 'softshrink', 'softsign', 'tanhshrink', 'logsigmoid', 'logsoftmax',
            'hsigmoid']. Default: None, means identical transformation.

    Returns:
        A callable object of Activation.

    Raises:
        KeyError: When parameter `act` is not in the optional range.

    Examples:

        relu = Activation("relu")
        print(relu)
        # <paddle.nn.layer.activation.ReLU object at 0x...>

        sigmoid = Activation("sigmoid")
        print(sigmoid)
        # <paddle.nn.layer.activation.Sigmoid object at 0x...>

        not_exit_one = Activation("not_exit_one")
        # KeyError: "not_exit_one does not exist in the current dict_keys(['elu', 'gelu', 'hardshrink',
        # 'tanh', 'hardtanh', 'prelu', 'relu', 'relu6', 'selu', 'leakyrelu', 'sigmoid', 'softmax',
        # 'softplus', 'softshrink', 'softsign', 'tanhshrink', 'logsigmoid', 'logsoftmax', 'hsigmoid'])"
    """

    def __init__(self, act: str = None):
        super(Activation, self).__init__()

        self._act = act
        if act is None:
            # Identity transformation: nothing to look up.
            return

        upper_act_names = getattr(activation, '__all__', None)
        if not upper_act_names:
            # NOTE(review): some Paddle releases appear to ship an empty
            # `__all__` for this module — confirm. Fall back to the Layer
            # subclasses the module actually defines so lookups keep working.
            upper_act_names = [
                attr for attr, obj in vars(activation).items()
                if isinstance(obj, type) and issubclass(obj, nn.Layer)
                and obj is not nn.Layer
            ]
        act_dict = {upper.lower(): upper for upper in upper_act_names}

        if act not in act_dict:
            raise KeyError("{} does not exist in the current {}".format(
                act, act_dict.keys()))
        # Plain attribute lookup instead of eval() on a formatted string.
        self.act_func = getattr(activation, act_dict[act])()

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        if self._act is not None:
            return self.act_func(x)
        return x


class ASPPModule(nn.Layer):
    """
    Atrous Spatial Pyramid Pooling.

    Args:
        aspp_ratios (tuple): The dilation rate using in ASPP module.
        in_channels (int): The number of input channels.
        out_channels (int): The number of output channels.
        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
        use_sep_conv (bool, optional): If using separable conv in ASPP module. Default: False.
        image_pooling (bool, optional): If augmented with image-level features. Default: False
    """

    def __init__(self,
                 aspp_ratios: tuple,
                 in_channels: int,
                 out_channels: int,
                 align_corners: bool,
                 use_sep_conv: bool = False,
                 image_pooling: bool = False):
        super().__init__()

        self.align_corners = align_corners
        self.aspp_blocks = nn.LayerList()

        for ratio in aspp_ratios:
            # A ratio of 1 is a plain 1x1 convolution; larger ratios are
            # dilated 3x3 convolutions, optionally depthwise-separable.
            if use_sep_conv and ratio > 1:
                conv_func = SeparableConvBNReLU
            else:
                conv_func = ConvBNReLU

            block = conv_func(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1 if ratio == 1 else 3,
                dilation=ratio,
                padding=0 if ratio == 1 else ratio)
            self.aspp_blocks.append(block)

        out_size = len(self.aspp_blocks)

        if image_pooling:
            self.global_avg_pool = nn.Sequential(
                nn.AdaptiveAvgPool2D(output_size=(1, 1)),
                ConvBNReLU(in_channels, out_channels, kernel_size=1, bias_attr=False))
            out_size += 1
        self.image_pooling = image_pooling

        # Fuses the concatenated branch outputs back to `out_channels`.
        self.conv_bn_relu = ConvBNReLU(
            in_channels=out_channels * out_size,
            out_channels=out_channels,
            kernel_size=1)

        self.dropout = nn.Dropout(p=0.1)  # drop rate

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        outputs = []
        for block in self.aspp_blocks:
            y = block(x)
            y = F.interpolate(
                y,
                x.shape[2:],
                mode='bilinear',
                align_corners=self.align_corners)
            outputs.append(y)

        if self.image_pooling:
            img_avg = self.global_avg_pool(x)
            img_avg = F.interpolate(
                img_avg,
                x.shape[2:],
                mode='bilinear',
                align_corners=self.align_corners)
            outputs.append(img_avg)

        x = paddle.concat(outputs, axis=1)
        x = self.conv_bn_relu(x)
        x = self.dropout(x)

        return x
# PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import Union, List, Tuple

import paddle
from paddle import nn
import paddle.nn.functional as F
import numpy as np
from paddlehub.module.module import moduleinfo
import paddlehub.vision.segmentation_transforms as T
from paddlehub.module.cv_module import ImageSegmentationModule

from deeplabv3p_resnet50_voc.resnet import ResNet50_vd
import deeplabv3p_resnet50_voc.layers as L


@moduleinfo(
    name="deeplabv3p_resnet50_voc",
    type="CV/semantic_segmentation",
    author="paddlepaddle",
    author_email="",
    summary="DeepLabV3PResnet50 is a segmentation model.",
    version="1.0.0",
    meta=ImageSegmentationModule)
class DeepLabV3PResnet50(nn.Layer):
    """
    The DeepLabV3PResnet50 implementation based on PaddlePaddle.

    The original article refers to
     Liang-Chieh Chen, et, al. "Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation"
     (https://arxiv.org/abs/1802.02611)

    Args:
        num_classes (int): the unique number of target classes.
        backbone_indices (tuple): two values in the tuple indicate the indices of output of backbone.
            the first index will be taken as a low-level feature in Decoder component;
            the second one will be taken as input of ASPP component.
            Usually backbone consists of four downsampling stage, and return an output of
            each stage, so we set default (0, 3), which means taking feature map of the first
            stage in backbone as low-level feature used in Decoder, and feature map of the fourth
            stage as input of ASPP.
        aspp_ratios (tuple): the dilation rate using in ASPP module.
            if output_stride=16, aspp_ratios should be set as (1, 6, 12, 18).
            if output_stride=8, aspp_ratios is (1, 12, 24, 36).
        aspp_out_channels (int): the output channels of ASPP module.
        align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even,
            e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False.
        pretrained (str): the path of pretrained model. Default to None, in which case
            'deeplabv3p_model.pdparams' is loaded from the module directory.
    """

    def __init__(self,
                 num_classes: int = 21,
                 backbone_indices: Tuple[int, ...] = (0, 3),
                 aspp_ratios: Tuple[int, ...] = (1, 12, 24, 36),
                 aspp_out_channels: int = 256,
                 align_corners: bool = False,
                 pretrained: str = None):
        super(DeepLabV3PResnet50, self).__init__()
        self.backbone = ResNet50_vd()
        backbone_channels = [self.backbone.feat_channels[i] for i in backbone_indices]
        self.head = DeepLabV3PHead(num_classes, backbone_indices,
                                   backbone_channels, aspp_ratios,
                                   aspp_out_channels, align_corners)
        self.align_corners = align_corners
        self.transforms = T.Compose([T.Padding(target_size=(512, 512)), T.Normalize()])

        if pretrained is not None:
            model_dict = paddle.load(pretrained)
            self.set_dict(model_dict)
            print("load custom parameters success")

        else:
            # `self.directory` is not set in this class; presumably it is
            # provided by the @moduleinfo machinery — confirm.
            checkpoint = os.path.join(self.directory, 'deeplabv3p_model.pdparams')
            model_dict = paddle.load(checkpoint)
            self.set_dict(model_dict)
            print("load pretrained parameters success")

    def transform(self, img: Union[np.ndarray, str]) -> Union[np.ndarray, str]:
        """Apply the module's preprocessing pipeline (padding + normalization) to `img`."""
        return self.transforms(img)

    def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]:
        """Return the head's logits, each resized to the spatial size of `x`."""
        feat_list = self.backbone(x)
        logit_list = self.head(feat_list)
        return [
            F.interpolate(
                logit,
                x.shape[2:],
                mode='bilinear',
                align_corners=self.align_corners) for logit in logit_list]


class DeepLabV3PHead(nn.Layer):
    """
    The DeepLabV3PHead implementation based on PaddlePaddle.

    Args:
        num_classes (int): The unique number of target classes.
        backbone_indices (tuple): Two values in the tuple indicate the indices of output of backbone.
            the first index will be taken as a low-level feature in Decoder component;
            the second one will be taken as input of ASPP component.
            Usually backbone consists of four downsampling stage, and return an output of
            each stage. If we set it as (0, 3), it means taking feature map of the first
            stage in backbone as low-level feature used in Decoder, and feature map of the fourth
            stage as input of ASPP.
        backbone_channels (tuple): The same length with "backbone_indices". It indicates the channels of corresponding index.
        aspp_ratios (tuple): The dilation rates using in ASPP module.
        aspp_out_channels (int): The output channels of ASPP module.
        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
    """

    def __init__(self, num_classes: int, backbone_indices: Tuple[int, ...], backbone_channels: Tuple[int, ...],
                 aspp_ratios: Tuple[int, ...], aspp_out_channels: int, align_corners: bool):
        super().__init__()

        self.aspp = L.ASPPModule(
            aspp_ratios,
            backbone_channels[1],
            aspp_out_channels,
            align_corners,
            use_sep_conv=True,
            image_pooling=True)
        # Tell the decoder how many channels the ASPP branch delivers so it
        # does not have to assume the default of 256.
        self.decoder = Decoder(
            num_classes,
            backbone_channels[0],
            align_corners,
            aspp_out_channels=aspp_out_channels)
        self.backbone_indices = backbone_indices

    def forward(self, feat_list: List[paddle.Tensor]) -> List[paddle.Tensor]:
        logit_list = []
        low_level_feat = feat_list[self.backbone_indices[0]]
        x = feat_list[self.backbone_indices[1]]
        x = self.aspp(x)
        logit = self.decoder(x, low_level_feat)
        logit_list.append(logit)
        return logit_list


class Decoder(nn.Layer):
    """
    Decoder module of DeepLabV3P model

    Args:
        num_classes (int): The number of classes.
        in_channels (int): The number of input channels in decoder module.
        align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature
            is even, e.g. 1024x512, otherwise it is True, e.g. 769x769.
        aspp_out_channels (int, optional): Channel count of the ASPP feature passed as `x` to
            forward(). Default: 256, which reproduces the previously hard-coded 304-channel input.
    """

    def __init__(self, num_classes: int, in_channels: int, align_corners: bool, aspp_out_channels: int = 256):
        super(Decoder, self).__init__()

        # Channels the low-level feature is reduced to before concatenation.
        low_level_channels = 48

        self.conv_bn_relu1 = L.ConvBNReLU(
            in_channels=in_channels, out_channels=low_level_channels, kernel_size=1)

        # forward() concatenates the ASPP feature with the reduced low-level
        # feature, so the fused input has aspp_out_channels + 48 channels.
        self.conv_bn_relu2 = L.SeparableConvBNReLU(
            in_channels=aspp_out_channels + low_level_channels,
            out_channels=256,
            kernel_size=3,
            padding=1)
        self.conv_bn_relu3 = L.SeparableConvBNReLU(
            in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv = nn.Conv2D(
            in_channels=256, out_channels=num_classes, kernel_size=1)

        self.align_corners = align_corners

    def forward(self, x: paddle.Tensor, low_level_feat: paddle.Tensor) -> paddle.Tensor:
        low_level_feat = self.conv_bn_relu1(low_level_feat)
        # Bring the ASPP feature to the low-level feature's spatial size.
        x = F.interpolate(
            x,
            low_level_feat.shape[2:],
            mode='bilinear',
            align_corners=self.align_corners)
        x = paddle.concat([x, low_level_feat], axis=1)
        x = self.conv_bn_relu2(x)
        x = self.conv_bn_relu3(x)
        x = self.conv(x)
        return x
from typing import List

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import deeplabv3p_resnet50_voc.layers as L


def _suffixed(name, suffix):
    """Return ``name + suffix``, or None when no base name was supplied."""
    return None if name is None else name + suffix


class BasicBlock(nn.Layer):
    """Residual basic block: two 3x3 conv-bn layers plus a shortcut.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        stride (int): Stride of the first 3x3 convolution.
        shortcut (bool, optional): If True the input is added as-is; if False it
            first passes through a 1x1 projection. Default: True.
        if_first (bool, optional): If True the projection skips the average-pool
            ("vd") step. Default: False.
        name (str, optional): Base name; a branch suffix is appended for each
            sub-layer. May be None. Default: None.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 stride: int,
                 shortcut: bool = True,
                 if_first: bool = False,
                 name: str = None):
        super(BasicBlock, self).__init__()
        self.stride = stride
        # `name` defaults to None, so suffixes are appended defensively
        # (plain `name + "_branch2a"` raised TypeError for unnamed blocks).
        self.conv0 = L.ConvBNLayer(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=stride,
            act='relu',
            name=_suffixed(name, "_branch2a"))
        self.conv1 = L.ConvBNLayer(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=3,
            act=None,
            name=_suffixed(name, "_branch2b"))

        if not shortcut:
            self.short = L.ConvBNLayer(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=1,
                stride=1,
                is_vd_mode=not if_first,
                name=_suffixed(name, "_branch1"))

        self.shortcut = shortcut

    def forward(self, inputs: paddle.Tensor) -> paddle.Tensor:
        y = self.conv0(inputs)
        conv1 = self.conv1(y)

        short = inputs if self.shortcut else self.short(inputs)

        # NOTE(review): paddle.elementwise_add(..., act='relu') looks like the
        # pre-2.0 fluid-style call — confirm it is absent from the 2.x
        # top-level API. paddle.add + F.relu is the equivalent and is what
        # BottleneckBlock in layers.py already does.
        y = paddle.add(x=short, y=conv1)
        y = F.relu(y)

        return y


class ResNet50_vd(nn.Layer):
    """ResNet50-vd backbone that returns one feature map per stage.

    Stages 2 and 3 (0-based) use dilation 2 and 4 instead of striding; the
    last stage additionally scales its dilation per block by `multi_grid`.

    Args:
        multi_grid (tuple, optional): Per-block dilation multipliers for the
            last stage; needs one entry per block of that stage (3).
            Default: (1, 2, 4).
    """

    def __init__(self,
                 multi_grid: tuple = (1, 2, 4)):
        super(ResNet50_vd, self).__init__()
        depth = [3, 4, 6, 3]
        num_channels = [64, 256, 512, 1024]
        num_filters = [64, 128, 256, 512]
        # Each bottleneck block outputs 4x its filter count.
        self.feat_channels = [c * 4 for c in num_filters]
        dilation_dict = {2: 2, 3: 4}
        self.conv1_1 = L.ConvBNLayer(
            in_channels=3,
            out_channels=32,
            kernel_size=3,
            stride=2,
            act='relu',
            name="conv1_1")
        self.conv1_2 = L.ConvBNLayer(
            in_channels=32,
            out_channels=32,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_2")
        self.conv1_3 = L.ConvBNLayer(
            in_channels=32,
            out_channels=64,
            kernel_size=3,
            stride=1,
            act='relu',
            name="conv1_3")
        self.pool2d_max = nn.MaxPool2D(kernel_size=3, stride=2, padding=1)
        # Plain nested list; the blocks themselves are registered through
        # add_sublayer below.
        self.stage_list = []

        for block, num_blocks in enumerate(depth):
            # Only the first block of a stage needs a projection shortcut.
            shortcut = False
            block_list = []
            for i in range(num_blocks):
                conv_name = "res" + str(block + 2) + chr(97 + i)
                dilation_rate = dilation_dict.get(block, 1)
                if block == 3:
                    dilation_rate = dilation_rate * multi_grid[i]
                bottleneck_block = self.add_sublayer(
                    'bb_%d_%d' % (block, i),
                    L.BottleneckBlock(
                        in_channels=num_channels[block]
                        if i == 0 else num_filters[block] * 4,
                        out_channels=num_filters[block],
                        # Downsample only at the start of a non-first stage
                        # that is not dilated.
                        stride=2 if i == 0 and block != 0
                        and dilation_rate == 1 else 1,
                        shortcut=shortcut,
                        if_first=block == i == 0,
                        name=conv_name,
                        dilation=dilation_rate))
                block_list.append(bottleneck_block)
                shortcut = True
            self.stage_list.append(block_list)

    def forward(self, inputs: paddle.Tensor) -> List[paddle.Tensor]:
        """Run the stem and all stages; return the output of every stage in order."""
        y = self.conv1_1(inputs)
        y = self.conv1_2(y)
        y = self.conv1_3(y)
        y = self.pool2d_max(y)
        feat_list = []
        for stage in self.stage_list:
            for block in stage:
                y = block(y)
            feat_list.append(y)
        return feat_list
+# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +import ocrnet_hrnetw18_voc.layers as L + + +class HRNet_W18(nn.Layer): + """ + The HRNet implementation based on PaddlePaddle. + + The original article refers to + Jingdong Wang, et, al. "HRNet:Deep High-Resolution Representation Learning for Visual Recognition" + (https://arxiv.org/pdf/1908.07919.pdf). + + Args: + pretrained (str, optional): The path of pretrained model. + stage1_num_modules (int, optional): Number of modules for stage1. Default 1. + stage1_num_blocks (list, optional): Number of blocks per module for stage1. Default (4). + stage1_num_channels (list, optional): Number of channels per branch for stage1. Default (64). + stage2_num_modules (int, optional): Number of modules for stage2. Default 1. + stage2_num_blocks (list, optional): Number of blocks per module for stage2. Default (4, 4). + stage2_num_channels (list, optional): Number of channels per branch for stage2. Default (18, 36). + stage3_num_modules (int, optional): Number of modules for stage3. Default 4. + stage3_num_blocks (list, optional): Number of blocks per module for stage3. Default (4, 4, 4). + stage3_num_channels (list, optional): Number of channels per branch for stage3. Default [18, 36, 72). + stage4_num_modules (int, optional): Number of modules for stage4. Default 3. + stage4_num_blocks (list, optional): Number of blocks per module for stage4. Default (4, 4, 4, 4). + stage4_num_channels (list, optional): Number of channels per branch for stage4. Default (18, 36, 72. 144). + has_se (bool, optional): Whether to use Squeeze-and-Excitation module. Default False. + align_corners (bool, optional): An argument of F.interpolate. It should be set to False when the feature size is even, + e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. 
+ """ + + def __init__(self, + pretrained: str = None, + stage1_num_modules: int = 1, + stage1_num_blocks: tuple = (4,), + stage1_num_channels: tuple = (64,), + stage2_num_modules: int = 1, + stage2_num_blocks: tuple = (4, 4), + stage2_num_channels: tuple = (18, 36), + stage3_num_modules: int = 4, + stage3_num_blocks: tuple = (4, 4, 4), + stage3_num_channels: tuple = (18, 36, 72), + stage4_num_modules: int = 3, + stage4_num_blocks: tuple = (4, 4, 4, 4), + stage4_num_channels: tuple = (18, 36, 72, 144), + has_se: bool = False, + align_corners: bool = False): + super(HRNet_W18, self).__init__() + self.pretrained = pretrained + self.stage1_num_modules = stage1_num_modules + self.stage1_num_blocks = stage1_num_blocks + self.stage1_num_channels = stage1_num_channels + self.stage2_num_modules = stage2_num_modules + self.stage2_num_blocks = stage2_num_blocks + self.stage2_num_channels = stage2_num_channels + self.stage3_num_modules = stage3_num_modules + self.stage3_num_blocks = stage3_num_blocks + self.stage3_num_channels = stage3_num_channels + self.stage4_num_modules = stage4_num_modules + self.stage4_num_blocks = stage4_num_blocks + self.stage4_num_channels = stage4_num_channels + self.has_se = has_se + self.align_corners = align_corners + self.feat_channels = [sum(stage4_num_channels)] + + self.conv_layer1_1 = L.ConvBNReLU( + in_channels=3, + out_channels=64, + kernel_size=3, + stride=2, + padding='same', + bias_attr=False) + + self.conv_layer1_2 = L.ConvBNReLU( + in_channels=64, + out_channels=64, + kernel_size=3, + stride=2, + padding='same', + bias_attr=False) + + self.la1 = Layer1( + num_channels=64, + num_blocks=self.stage1_num_blocks[0], + num_filters=self.stage1_num_channels[0], + has_se=has_se, + name="layer2") + + self.tr1 = TransitionLayer( + in_channels=[self.stage1_num_channels[0] * 4], + out_channels=self.stage2_num_channels, + name="tr1") + + self.st2 = Stage( + num_channels=self.stage2_num_channels, + num_modules=self.stage2_num_modules, + 
num_blocks=self.stage2_num_blocks, + num_filters=self.stage2_num_channels, + has_se=self.has_se, + name="st2", + align_corners=align_corners) + + self.tr2 = TransitionLayer( + in_channels=self.stage2_num_channels, + out_channels=self.stage3_num_channels, + name="tr2") + self.st3 = Stage( + num_channels=self.stage3_num_channels, + num_modules=self.stage3_num_modules, + num_blocks=self.stage3_num_blocks, + num_filters=self.stage3_num_channels, + has_se=self.has_se, + name="st3", + align_corners=align_corners) + + self.tr3 = TransitionLayer( + in_channels=self.stage3_num_channels, + out_channels=self.stage4_num_channels, + name="tr3") + self.st4 = Stage( + num_channels=self.stage4_num_channels, + num_modules=self.stage4_num_modules, + num_blocks=self.stage4_num_blocks, + num_filters=self.stage4_num_channels, + has_se=self.has_se, + name="st4", + align_corners=align_corners) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + conv1 = self.conv_layer1_1(x) + conv2 = self.conv_layer1_2(conv1) + + la1 = self.la1(conv2) + + tr1 = self.tr1([la1]) + st2 = self.st2(tr1) + + tr2 = self.tr2(st2) + st3 = self.st3(tr2) + + tr3 = self.tr3(st3) + st4 = self.st4(tr3) + + x0_h, x0_w = st4[0].shape[2:] + x1 = F.interpolate( + st4[1], (x0_h, x0_w), + mode='bilinear', + align_corners=self.align_corners) + x2 = F.interpolate( + st4[2], (x0_h, x0_w), + mode='bilinear', + align_corners=self.align_corners) + x3 = F.interpolate( + st4[3], (x0_h, x0_w), + mode='bilinear', + align_corners=self.align_corners) + x = paddle.concat([st4[0], x1, x2, x3], axis=1) + + return [x] + + +class Layer1(nn.Layer): + def __init__(self, + num_channels: int, + num_filters: int, + num_blocks: int, + has_se: bool = False, + name: str = None): + super(Layer1, self).__init__() + + self.bottleneck_block_list = [] + + for i in range(num_blocks): + bottleneck_block = self.add_sublayer( + "bb_{}_{}".format(name, i + 1), + BottleneckBlock( + num_channels=num_channels if i == 0 else num_filters * 4, + 
num_filters=num_filters, + has_se=has_se, + stride=1, + downsample=True if i == 0 else False, + name=name + '_' + str(i + 1))) + self.bottleneck_block_list.append(bottleneck_block) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + conv = x + for block_func in self.bottleneck_block_list: + conv = block_func(conv) + return conv + + +class TransitionLayer(nn.Layer): + def __init__(self, in_channels: int, out_channels: int, name=None): + super(TransitionLayer, self).__init__() + + num_in = len(in_channels) + num_out = len(out_channels) + self.conv_bn_func_list = [] + for i in range(num_out): + residual = None + if i < num_in: + if in_channels[i] != out_channels[i]: + residual = self.add_sublayer( + "transition_{}_layer_{}".format(name, i + 1), + L.ConvBNReLU( + in_channels=in_channels[i], + out_channels=out_channels[i], + kernel_size=3, + padding='same', + bias_attr=False)) + else: + residual = self.add_sublayer( + "transition_{}_layer_{}".format(name, i + 1), + L.ConvBNReLU( + in_channels=in_channels[-1], + out_channels=out_channels[i], + kernel_size=3, + stride=2, + padding='same', + bias_attr=False)) + self.conv_bn_func_list.append(residual) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + outs = [] + for idx, conv_bn_func in enumerate(self.conv_bn_func_list): + if conv_bn_func is None: + outs.append(x[idx]) + else: + if idx < len(x): + outs.append(conv_bn_func(x[idx])) + else: + outs.append(conv_bn_func(x[-1])) + return outs + + +class Branches(nn.Layer): + def __init__(self, + num_blocks: int, + in_channels: int, + out_channels: int, + has_se: bool = False, + name: str = None): + super(Branches, self).__init__() + + self.basic_block_list = [] + + for i in range(len(out_channels)): + self.basic_block_list.append([]) + for j in range(num_blocks[i]): + in_ch = in_channels[i] if j == 0 else out_channels[i] + basic_block_func = self.add_sublayer( + "bb_{}_branch_layer_{}_{}".format(name, i + 1, j + 1), + BasicBlock( + num_channels=in_ch, + 
num_filters=out_channels[i], + has_se=has_se, + name=name + '_branch_layer_' + str(i + 1) + '_' + + str(j + 1))) + self.basic_block_list[i].append(basic_block_func) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + outs = [] + for idx, input in enumerate(x): + conv = input + for basic_block_func in self.basic_block_list[idx]: + conv = basic_block_func(conv) + outs.append(conv) + return outs + + +class BottleneckBlock(nn.Layer): + def __init__(self, + num_channels: int, + num_filters: int, + has_se: bool, + stride: int = 1, + downsample: bool = False, + name: str = None): + super(BottleneckBlock, self).__init__() + + self.has_se = has_se + self.downsample = downsample + + self.conv1 = L.ConvBNReLU( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=1, + padding='same', + bias_attr=False) + + self.conv2 = L.ConvBNReLU( + in_channels=num_filters, + out_channels=num_filters, + kernel_size=3, + stride=stride, + padding='same', + bias_attr=False) + + self.conv3 = L.ConvBN( + in_channels=num_filters, + out_channels=num_filters * 4, + kernel_size=1, + padding='same', + bias_attr=False) + + if self.downsample: + self.conv_down = L.ConvBN( + in_channels=num_channels, + out_channels=num_filters * 4, + kernel_size=1, + padding='same', + bias_attr=False) + + if self.has_se: + self.se = SELayer( + num_channels=num_filters * 4, + num_filters=num_filters * 4, + reduction_ratio=16, + name=name + '_fc') + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + residual = x + conv1 = self.conv1(x) + conv2 = self.conv2(conv1) + conv3 = self.conv3(conv2) + + if self.downsample: + residual = self.conv_down(x) + + if self.has_se: + conv3 = self.se(conv3) + + y = conv3 + residual + y = F.relu(y) + return y + + +class BasicBlock(nn.Layer): + def __init__(self, + num_channels: int, + num_filters: int, + stride: int = 1, + has_se: bool = False, + downsample: bool = False, + name: str = None): + super(BasicBlock, self).__init__() + + self.has_se = has_se + 
self.downsample = downsample + + self.conv1 = L.ConvBNReLU( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=3, + stride=stride, + padding='same', + bias_attr=False) + self.conv2 = L.ConvBN( + in_channels=num_filters, + out_channels=num_filters, + kernel_size=3, + padding='same', + bias_attr=False) + + if self.downsample: + self.conv_down = L.ConvBNReLU( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=1, + padding='same', + bias_attr=False) + + if self.has_se: + self.se = SELayer( + num_channels=num_filters, + num_filters=num_filters, + reduction_ratio=16, + name=name + '_fc') + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + residual = x + conv1 = self.conv1(x) + conv2 = self.conv2(conv1) + + if self.downsample: + residual = self.conv_down(x) + + if self.has_se: + conv2 = self.se(conv2) + + y = conv2 + residual + y = F.relu(y) + return y + + +class SELayer(nn.Layer): + def __init__(self, num_channels: int, num_filters: int, reduction_ratio: int, name: str = None): + super(SELayer, self).__init__() + + self.pool2d_gap = nn.AdaptiveAvgPool2D(1) + + self._num_channels = num_channels + + med_ch = int(num_channels / reduction_ratio) + stdv = 1.0 / math.sqrt(num_channels * 1.0) + self.squeeze = nn.Linear( + num_channels, + med_ch, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Uniform(-stdv, stdv))) + + stdv = 1.0 / math.sqrt(med_ch * 1.0) + self.excitation = nn.Linear( + med_ch, + num_filters, + weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Uniform(-stdv, stdv))) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + pool = self.pool2d_gap(x) + pool = paddle.reshape(pool, shape=[-1, self._num_channels]) + squeeze = self.squeeze(pool) + squeeze = F.relu(squeeze) + excitation = self.excitation(squeeze) + excitation = F.sigmoid(excitation) + excitation = paddle.reshape( + excitation, shape=[-1, self._num_channels, 1, 1]) + out = x * excitation + return out + + +class Stage(nn.Layer): + 
def __init__(self, + num_channels: tuple,  # per-branch input channels + num_modules: int, + num_blocks: tuple,  # per-branch block counts + num_filters: tuple,  # per-branch output channels + has_se: bool = False, + multi_scale_output: bool = True, + name: str = None, + align_corners: bool = False): + super(Stage, self).__init__() + + self._num_modules = num_modules + + self.stage_func_list = [] + for i in range(num_modules): + if i == num_modules - 1 and not multi_scale_output: + stage_func = self.add_sublayer( + "stage_{}_{}".format(name, i + 1), + HighResolutionModule( + num_channels=num_channels, + num_blocks=num_blocks, + num_filters=num_filters, + has_se=has_se, + multi_scale_output=False, + name=name + '_' + str(i + 1), + align_corners=align_corners)) + else: + stage_func = self.add_sublayer( + "stage_{}_{}".format(name, i + 1), + HighResolutionModule( + num_channels=num_channels, + num_blocks=num_blocks, + num_filters=num_filters, + has_se=has_se, + name=name + '_' + str(i + 1), + align_corners=align_corners)) + + self.stage_func_list.append(stage_func) + + def forward(self, x: list) -> list:  # one tensor per branch in, one per branch out + out = x + for idx in range(self._num_modules): + out = self.stage_func_list[idx](out) + return out + + +class HighResolutionModule(nn.Layer): + def __init__(self, + num_channels: tuple,  # per-branch input channels + num_blocks: tuple,  # per-branch block counts + num_filters: tuple,  # per-branch output channels + has_se: bool = False, + multi_scale_output: bool = True, + name: str = None, + align_corners: bool = False):  # boolean flag forwarded to FuseLayers + super(HighResolutionModule, self).__init__() + + self.branches_func = Branches( + num_blocks=num_blocks, + in_channels=num_channels, + out_channels=num_filters, + has_se=has_se, + name=name) + + self.fuse_func = FuseLayers( + in_channels=num_filters, + out_channels=num_filters, + multi_scale_output=multi_scale_output, + name=name, + align_corners=align_corners) + + def forward(self, x: list) -> list:  # run every branch, then fuse across resolutions + out = self.branches_func(x) + out = self.fuse_func(out) + return out + + +class FuseLayers(nn.Layer): + def __init__(self, + in_channels: tuple, + out_channels: tuple, + multi_scale_output: bool = 
True, + name: str = None, + align_corners: bool = False): + super(FuseLayers, self).__init__() + + self._actual_ch = len(in_channels) if multi_scale_output else 1 + self._in_channels = in_channels + self.align_corners = align_corners + + self.residual_func_list = [] + for i in range(self._actual_ch): + for j in range(len(in_channels)): + if j > i: + residual_func = self.add_sublayer( + "residual_{}_layer_{}_{}".format(name, i + 1, j + 1), + L.ConvBN( + in_channels=in_channels[j], + out_channels=out_channels[i], + kernel_size=1, + padding='same', + bias_attr=False)) + self.residual_func_list.append(residual_func) + elif j < i: + pre_num_filters = in_channels[j] + for k in range(i - j): + if k == i - j - 1: + residual_func = self.add_sublayer( + "residual_{}_layer_{}_{}_{}".format( + name, i + 1, j + 1, k + 1), + L.ConvBN( + in_channels=pre_num_filters, + out_channels=out_channels[i], + kernel_size=3, + stride=2, + padding='same', + bias_attr=False)) + pre_num_filters = out_channels[i] + else: + residual_func = self.add_sublayer( + "residual_{}_layer_{}_{}_{}".format( + name, i + 1, j + 1, k + 1), + L.ConvBNReLU( + in_channels=pre_num_filters, + out_channels=out_channels[j], + kernel_size=3, + stride=2, + padding='same', + bias_attr=False)) + pre_num_filters = out_channels[j] + self.residual_func_list.append(residual_func) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + outs = [] + residual_func_idx = 0 + for i in range(self._actual_ch): + residual = x[i] + residual_shape = residual.shape[-2:] + for j in range(len(self._in_channels)): + if j > i: + y = self.residual_func_list[residual_func_idx](x[j]) + residual_func_idx += 1 + + y = F.interpolate( + y, + residual_shape, + mode='bilinear', + align_corners=self.align_corners) + residual = residual + y + elif j < i: + y = x[j] + for k in range(i - j): + y = self.residual_func_list[residual_func_idx](y) + residual_func_idx += 1 + + residual = residual + y + + residual = F.relu(residual) + 
outs.append(residual) + + return outs \ No newline at end of file diff --git a/modules/image/semantic_segmentation/ocrnet_hrnetw18_voc/layers.py b/modules/image/semantic_segmentation/ocrnet_hrnetw18_voc/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..d5554a1b4c226cfb692e2bcd7da96659ffe129e7 --- /dev/null +++ b/modules/image/semantic_segmentation/ocrnet_hrnetw18_voc/layers.py @@ -0,0 +1,345 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.nn.layer import activation +from paddle.nn import Conv2D, AvgPool2D + + +def SyncBatchNorm(*args, **kwargs): + """In cpu environment nn.SyncBatchNorm does not have kernel so use nn.BatchNorm2D instead""" + if paddle.get_device() == 'cpu': + return nn.BatchNorm2D(*args, **kwargs) + else: + return nn.SyncBatchNorm(*args, **kwargs) + + +class ConvBNLayer(nn.Layer): + """Basic conv bn relu layer.""" + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: int, + stride: int = 1, + dilation: int = 1, + groups: int = 1, + is_vd_mode: bool = False, + act: str = None, + name: str = None): + super(ConvBNLayer, self).__init__() + + self.is_vd_mode = is_vd_mode + self._pool2d_avg = AvgPool2D( + kernel_size=2, stride=2, padding=0, ceil_mode=True) + self._conv = Conv2D( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2 if dilation == 1 else 0, + dilation=dilation, + groups=groups, + bias_attr=False) + + self._batch_norm = SyncBatchNorm(out_channels) + self._act_op = Activation(act=act) + + def forward(self, inputs: paddle.Tensor) -> paddle.Tensor: + if self.is_vd_mode: + inputs = self._pool2d_avg(inputs) + y = self._conv(inputs) + y = self._batch_norm(y) + y = self._act_op(y) + + return y + + +class BottleneckBlock(nn.Layer): + """Residual bottleneck block""" + + def __init__(self, + in_channels: int, + out_channels: int, + stride: int, + shortcut: bool = True, + if_first: bool = False, + dilation: int = 1, + name: str = None): + super(BottleneckBlock, self).__init__() + + self.conv0 = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + act='relu', + name=name + "_branch2a") + + self.dilation = dilation + + self.conv1 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride, + act='relu', + 
dilation=dilation, + name=name + "_branch2b") + self.conv2 = ConvBNLayer( + in_channels=out_channels, + out_channels=out_channels * 4, + kernel_size=1, + act=None, + name=name + "_branch2c") + + if not shortcut: + self.short = ConvBNLayer( + in_channels=in_channels, + out_channels=out_channels * 4, + kernel_size=1, + stride=1, + is_vd_mode=False if if_first or stride == 1 else True, + name=name + "_branch1") + + self.shortcut = shortcut + + def forward(self, inputs: paddle.Tensor) -> paddle.Tensor: + y = self.conv0(inputs) + if self.dilation > 1: + padding = self.dilation + y = F.pad(y, [padding, padding, padding, padding]) + + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = paddle.add(x=short, y=conv2) + y = F.relu(y) + return y + + +class SeparableConvBNReLU(nn.Layer): + """Depthwise Separable Convolution.""" + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: int, + padding: str = 'same', + **kwargs: dict): + super(SeparableConvBNReLU, self).__init__() + self.depthwise_conv = ConvBN( + in_channels, + out_channels=in_channels, + kernel_size=kernel_size, + padding=padding, + groups=in_channels, + **kwargs) + self.piontwise_conv = ConvBNReLU( + in_channels, out_channels, kernel_size=1, groups=1) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = self.depthwise_conv(x) + x = self.piontwise_conv(x) + return x + + +class ConvBN(nn.Layer): + """Basic conv bn layer""" + + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: int, + padding: str = 'same', + **kwargs: dict): + super(ConvBN, self).__init__() + self._conv = Conv2D( + in_channels, out_channels, kernel_size, padding=padding, **kwargs) + self._batch_norm = SyncBatchNorm(out_channels) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = self._conv(x) + x = self._batch_norm(x) + return x + + +class ConvBNReLU(nn.Layer): + """Basic conv bn relu layer.""" + + 
def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: int, + padding: str = 'same', + **kwargs: dict): + super(ConvBNReLU, self).__init__() + + self._conv = Conv2D( + in_channels, out_channels, kernel_size, padding=padding, **kwargs) + self._batch_norm = SyncBatchNorm(out_channels) + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + x = self._conv(x) + x = self._batch_norm(x) + x = F.relu(x) + return x + + +class Activation(nn.Layer): + """ + The wrapper of activations. + + Args: + act (str, optional): The activation name in lowercase. It must be one of ['elu', 'gelu', + 'hardshrink', 'tanh', 'hardtanh', 'prelu', 'relu', 'relu6', 'selu', 'leakyrelu', 'sigmoid', + 'softmax', 'softplus', 'softshrink', 'softsign', 'tanhshrink', 'logsigmoid', 'logsoftmax', + 'hsigmoid']. Default: None, means identical transformation. + + Returns: + A callable object of Activation. + + Raises: + KeyError: When parameter `act` is not in the optional range. + + Examples: + + from ocrnet_hrnetw18_voc.layers import Activation + + relu = Activation("relu") + print(relu) + # + + sigmoid = Activation("sigmoid") + print(sigmoid) + # + + not_exit_one = Activation("not_exit_one") + # KeyError: "not_exit_one does not exist in the current dict_keys(['elu', 'gelu', 'hardshrink', + # 'tanh', 'hardtanh', 'prelu', 'relu', 'relu6', 'selu', 'leakyrelu', 'sigmoid', 'softmax', + # 'softplus', 'softshrink', 'softsign', 'tanhshrink', 'logsigmoid', 'logsoftmax', 'hsigmoid'])" + """ + + def __init__(self, act: str = None): + super(Activation, self).__init__() + + self._act = act + upper_act_names = activation.__all__ + lower_act_names = [name.lower() for name in upper_act_names]  # distinct loop name: do not shadow `act` + act_dict = dict(zip(lower_act_names, upper_act_names)) + + if act is not None: + if act in act_dict: + act_name = act_dict[act] + self.act_func = getattr(activation, act_name)()  # attribute lookup instead of eval() on a built string + else: + raise KeyError("{} does not exist in the current {}".format( + act, act_dict.keys())) + + def 
forward(self, x: paddle.Tensor) -> paddle.Tensor: + + if self._act is not None: + return self.act_func(x) + else: + return x + + +class ASPPModule(nn.Layer): + """ + Atrous Spatial Pyramid Pooling. + + Args: + aspp_ratios (tuple): The dilation rate using in ASSP module. + in_channels (int): The number of input channels. + out_channels (int): The number of output channels. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. + use_sep_conv (bool, optional): If using separable conv in ASPP module. Default: False. + image_pooling (bool, optional): If augmented with image-level features. Default: False + """ + + def __init__(self, + aspp_ratios, + in_channels, + out_channels, + align_corners, + use_sep_conv=False, + image_pooling=False): + super().__init__() + + self.align_corners = align_corners + self.aspp_blocks = nn.LayerList() + + for ratio in aspp_ratios: + if use_sep_conv and ratio > 1: + conv_func = SeparableConvBNReLU + else: + conv_func = ConvBNReLU + + block = conv_func( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1 if ratio == 1 else 3, + dilation=ratio, + padding=0 if ratio == 1 else ratio) + self.aspp_blocks.append(block) + + out_size = len(self.aspp_blocks) + + if image_pooling: + self.global_avg_pool = nn.Sequential( + nn.AdaptiveAvgPool2D(output_size=(1, 1)), + ConvBNReLU(in_channels, out_channels, kernel_size=1, bias_attr=False)) + out_size += 1 + self.image_pooling = image_pooling + + self.conv_bn_relu = ConvBNReLU( + in_channels=out_channels * out_size, + out_channels=out_channels, + kernel_size=1) + + self.dropout = nn.Dropout(p=0.1) # drop rate + + def forward(self, x: paddle.Tensor) -> paddle.Tensor: + outputs = [] + for block in self.aspp_blocks: + y = block(x) + y = F.interpolate( + y, + x.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + outputs.append(y) + + if self.image_pooling: 
+ img_avg = self.global_avg_pool(x) + img_avg = F.interpolate( + img_avg, + x.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + outputs.append(img_avg) + + x = paddle.concat(outputs, axis=1) + x = self.conv_bn_relu(x) + x = self.dropout(x) + + return x \ No newline at end of file diff --git a/modules/image/semantic_segmentation/ocrnet_hrnetw18_voc/module.py b/modules/image/semantic_segmentation/ocrnet_hrnetw18_voc/module.py new file mode 100644 index 0000000000000000000000000000000000000000..1660d1c607db7778d4ea3b37f4dfcecd16344700 --- /dev/null +++ b/modules/image/semantic_segmentation/ocrnet_hrnetw18_voc/module.py @@ -0,0 +1,243 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +from typing import List + +import paddle +import numpy as np +import paddle.nn as nn +import paddle.nn.functional as F +from paddlehub.module.module import moduleinfo +import paddlehub.vision.segmentation_transforms as T +from paddlehub.module.cv_module import ImageSegmentationModule + +import ocrnet_hrnetw18_voc.layers as L +from ocrnet_hrnetw18_voc.hrnet import HRNet_W18 + +@moduleinfo( + name="ocrnet_hrnetw18_voc", + type="CV/semantic_segmentation", + author="paddlepaddle", + author_email="", + summary="OCRNetHRNetW18 is a segmentation model pretrained by pascal voc.", + version="1.0.0", + meta=ImageSegmentationModule) +class OCRNetHRNetW18(nn.Layer): + """ + The OCRNet implementation based on PaddlePaddle. + The original article refers to + Yuan, Yuhui, et al. "Object-Contextual Representations for Semantic Segmentation" + (https://arxiv.org/pdf/1909.11065.pdf) + Args: + num_classes (int): The unique number of target classes. + backbone_indices (list): A list indicates the indices of output of backbone. + It can be either one or two values, if two values, the first index will be taken as + a deep-supervision feature in auxiliary layer; the second one will be taken as + input of pixel representation. If one value, it is taken by both above. + ocr_mid_channels (int, optional): The number of middle channels in OCRHead. Default: 512. + ocr_key_channels (int, optional): The number of key channels in ObjectAttentionBlock. Default: 256. + align_corners (bool): An argument of F.interpolate. It should be set to False when the output size of feature + is even, e.g. 1024x512, otherwise it is True, e.g. 769x769. Default: False. + pretrained (str, optional): The path or url of pretrained model. Default: None. 
+ """ + + def __init__(self, + num_classes: int = 21, + backbone_indices: List[int] = [0], + ocr_mid_channels: int = 512, + ocr_key_channels: int = 256, + align_corners: bool = False, + pretrained: str = None): + super(OCRNetHRNetW18, self).__init__() + self.backbone = HRNet_W18(align_corners=align_corners)  # backbone upsampling must use the same flag as the head + self.backbone_indices = list(backbone_indices)  # copy so the shared default list is never aliased + in_channels = [self.backbone.feat_channels[i] for i in backbone_indices] + self.head = OCRHead( + num_classes=num_classes, + in_channels=in_channels, + ocr_mid_channels=ocr_mid_channels, + ocr_key_channels=ocr_key_channels) + self.align_corners = align_corners + self.transforms = T.Compose([T.Padding(target_size=(512, 512)), T.Normalize()]) + + if pretrained is not None: + model_dict = paddle.load(pretrained) + self.set_dict(model_dict) + print("load custom parameters success") + + else: + checkpoint = os.path.join(self.directory, 'ocrnet_hrnetw18.pdparams') + model_dict = paddle.load(checkpoint) + self.set_dict(model_dict) + print("load pretrained parameters success") + + def transform(self, img: np.ndarray) -> np.ndarray: + return self.transforms(img) + + def forward(self, x: paddle.Tensor) -> List[paddle.Tensor]: + feats = self.backbone(x) + feats = [feats[i] for i in self.backbone_indices] + logit_list = self.head(feats) + logit_list = [  # resize every head output back to the input's spatial size + F.interpolate( + logit, + x.shape[2:], + mode='bilinear', + align_corners=self.align_corners) for logit in logit_list] + return logit_list + + +class OCRHead(nn.Layer): + """ + The Object contextual representation head. + Args: + num_classes(int): The unique number of target classes. + in_channels(tuple): The number of input channels. + ocr_mid_channels(int, optional): The number of middle channels in OCRHead. Default: 512. + ocr_key_channels(int, optional): The number of key channels in ObjectAttentionBlock. Default: 256. 
+ """ + + def __init__(self, + num_classes: int, + in_channels: int, + ocr_mid_channels: int = 512, + ocr_key_channels: int = 256): + super().__init__() + + self.num_classes = num_classes + self.spatial_gather = SpatialGatherBlock() + self.spatial_ocr = SpatialOCRModule(ocr_mid_channels, ocr_key_channels, + ocr_mid_channels) + + self.indices = [-2, -1] if len(in_channels) > 1 else [-1, -1] + + self.conv3x3_ocr = L.ConvBNReLU( + in_channels[self.indices[1]], ocr_mid_channels, 3, padding=1) + self.cls_head = nn.Conv2D(ocr_mid_channels, self.num_classes, 1) + self.aux_head = nn.Sequential( + L.ConvBNReLU(in_channels[self.indices[0]], + in_channels[self.indices[0]], 1), + nn.Conv2D(in_channels[self.indices[0]], self.num_classes, 1)) + + + def forward(self, feat_list: List[paddle.Tensor]) -> paddle.Tensor: + feat_shallow, feat_deep = feat_list[self.indices[0]], feat_list[ + self.indices[1]] + + soft_regions = self.aux_head(feat_shallow) + pixels = self.conv3x3_ocr(feat_deep) + + object_regions = self.spatial_gather(pixels, soft_regions) + ocr = self.spatial_ocr(pixels, object_regions) + + logit = self.cls_head(ocr) + return [logit, soft_regions] + + +class SpatialGatherBlock(nn.Layer): + """Aggregation layer to compute the pixel-region representation.""" + + def forward(self, pixels: paddle.Tensor, regions: paddle.Tensor) -> paddle.Tensor: + n, c, h, w = pixels.shape + _, k, _, _ = regions.shape + + # pixels: from (n, c, h, w) to (n, h*w, c) + pixels = paddle.reshape(pixels, (n, c, h * w)) + pixels = paddle.transpose(pixels, [0, 2, 1]) + + # regions: from (n, k, h, w) to (n, k, h*w) + regions = paddle.reshape(regions, (n, k, h * w)) + regions = F.softmax(regions, axis=2) + + # feats: from (n, k, c) to (n, c, k, 1) + feats = paddle.bmm(regions, pixels) + feats = paddle.transpose(feats, [0, 2, 1]) + feats = paddle.unsqueeze(feats, axis=-1) + + return feats + + +class SpatialOCRModule(nn.Layer): + """Aggregate the global object representation to update the representation 
for each pixel.""" + + def __init__(self, + in_channels: int, + key_channels: int, + out_channels: int, + dropout_rate: float = 0.1): + super().__init__() + + self.attention_block = ObjectAttentionBlock(in_channels, key_channels) + self.conv1x1 = nn.Sequential( + L.ConvBNReLU(2 * in_channels, out_channels, 1), + nn.Dropout2D(dropout_rate)) + + def forward(self, pixels: paddle.Tensor, regions: paddle.Tensor) -> paddle.Tensor: + context = self.attention_block(pixels, regions) + feats = paddle.concat([context, pixels], axis=1) + feats = self.conv1x1(feats) + + return feats + + +class ObjectAttentionBlock(nn.Layer): + """A self-attention module.""" + + def __init__(self, in_channels: int, key_channels: int): + super().__init__() + + self.in_channels = in_channels + self.key_channels = key_channels + + self.f_pixel = nn.Sequential( + L.ConvBNReLU(in_channels, key_channels, 1), + L.ConvBNReLU(key_channels, key_channels, 1)) + + self.f_object = nn.Sequential( + L.ConvBNReLU(in_channels, key_channels, 1), + L.ConvBNReLU(key_channels, key_channels, 1)) + + self.f_down = L.ConvBNReLU(in_channels, key_channels, 1) + + self.f_up = L.ConvBNReLU(key_channels, in_channels, 1) + + def forward(self, x: paddle.Tensor, proxy: paddle.Tensor) -> paddle.Tensor: + n, _, h, w = x.shape + + # query : from (n, c1, h1, w1) to (n, h1*w1, key_channels) + query = self.f_pixel(x) + query = paddle.reshape(query, (n, self.key_channels, -1)) + query = paddle.transpose(query, [0, 2, 1]) + + # key : from (n, c2, h2, w2) to (n, key_channels, h2*w2) + key = self.f_object(proxy) + key = paddle.reshape(key, (n, self.key_channels, -1)) + + # value : from (n, c2, h2, w2) to (n, h2*w2, key_channels) + value = self.f_down(proxy) + value = paddle.reshape(value, (n, self.key_channels, -1)) + value = paddle.transpose(value, [0, 2, 1]) + + # sim_map (n, h1*w1, h2*w2) + sim_map = paddle.bmm(query, key) + sim_map = (self.key_channels**-.5) * sim_map + sim_map = F.softmax(sim_map, axis=-1) + + # context from (n, 
h1*w1, key_channels) to (n , out_channels, h1, w1) + context = paddle.bmm(sim_map, value) + context = paddle.transpose(context, [0, 2, 1]) + context = paddle.reshape(context, (n, self.key_channels, h, w)) + context = self.f_up(context) + + return context \ No newline at end of file diff --git a/paddlehub/datasets/__init__.py b/paddlehub/datasets/__init__.py index 4f097c2fda1bdd93cd6986f0f8bed845182a9051..26b8bfa63c68f00637d0ce2ceb62e710236e04c2 100644 --- a/paddlehub/datasets/__init__.py +++ b/paddlehub/datasets/__init__.py @@ -18,3 +18,5 @@ from paddlehub.datasets.minicoco import MiniCOCO from paddlehub.datasets.chnsenticorp import ChnSentiCorp from paddlehub.datasets.msra_ner import MSRA_NER from paddlehub.datasets.lcqmc import LCQMC +from paddlehub.datasets.base_seg_dataset import SegDataset +from paddlehub.datasets.opticdiscseg import OpticDiscSeg diff --git a/paddlehub/datasets/base_seg_dataset.py b/paddlehub/datasets/base_seg_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..1cea3e9e88819b30219adc2affcb165c573729fb --- /dev/null +++ b/paddlehub/datasets/base_seg_dataset.py @@ -0,0 +1,141 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Tuple, Callable + +import paddle +import numpy as np +from PIL import Image + + +class SegDataset(paddle.io.Dataset): + """ + Pass in a custom dataset that conforms to the format. 
+ + Args: + transforms (Callable): Transforms for image. + dataset_root (str): The dataset directory. + num_classes (int): Number of classes. + mode (str, optional): Which part of dataset to use. It is one of ('train', 'val', 'test'). Default: 'train'. + train_path (str, optional): The train dataset file. When mode is 'train', train_path is necessary. + The contents of train_path file are as follows: + image1.jpg ground_truth1.png + image2.jpg ground_truth2.png + val_path (str, optional): The evaluation dataset file. When mode is 'val', val_path is necessary. + The contents are the same as those of train_path. + test_path (str, optional): The test dataset file. When mode is 'test', test_path is necessary. + The annotation file is not necessary in test_path file. + separator (str, optional): The separator of dataset list. Default: ' '. + edge (bool, optional): Whether to compute edge while training. Default: False. + ignore_index (int, optional): Stored as `self.ignore_index`; presumably the label value to ignore (TODO confirm with consumers). Default: 255. + """ + + def __init__(self, + transforms: Callable, + dataset_root: str, + num_classes: int, + mode: str = 'train', + train_path: str = None, + val_path: str = None, + test_path: str = None, + separator: str = ' ', + ignore_index: int = 255, + edge: bool = False): + self.dataset_root = dataset_root + self.transforms = transforms + self.file_list = list() + mode = mode.lower() + self.mode = mode + self.num_classes = num_classes + self.ignore_index = ignore_index + self.edge = edge + + if mode.lower() not in ['train', 'val', 'test']: + raise ValueError( + "mode should be 'train', 'val' or 'test', but got {}.".format( + mode)) + + if self.transforms is None: + raise ValueError("`transforms` is necessary, but it is None.") + + self.dataset_root = dataset_root + if not os.path.exists(self.dataset_root): + raise FileNotFoundError('there is not `dataset_root`: {}.'.format( + self.dataset_root)) + + if mode == 'train': + if train_path is None: + raise ValueError( + 'When `mode` is "train", `train_path` is necessary, but it is None.' 
+ ) + elif not os.path.exists(train_path): + raise FileNotFoundError( + '`train_path` is not found: {}'.format(train_path)) + else: + file_path = train_path + elif mode == 'val': + if val_path is None: + raise ValueError( + 'When `mode` is "val", `val_path` is necessary, but it is None.' + ) + elif not os.path.exists(val_path): + raise FileNotFoundError( + '`val_path` is not found: {}'.format(val_path)) + else: + file_path = val_path + else: + if test_path is None: + raise ValueError( + 'When `mode` is "test", `test_path` is necessary, but it is None.' + ) + elif not os.path.exists(test_path): + raise FileNotFoundError( + '`test_path` is not found: {}'.format(test_path)) + else: + file_path = test_path + + with open(file_path, 'r') as f: + for line in f: + items = line.strip().split(separator) + if len(items) != 2: + if mode == 'train' or mode == 'val': + raise ValueError( + "File list format incorrect! In training or evaluation task it should be" + " image_name{}label_name\\n".format(separator)) + image_path = os.path.join(self.dataset_root, items[0]) + label_path = None + else: + image_path = os.path.join(self.dataset_root, items[0]) + label_path = os.path.join(self.dataset_root, items[1]) + self.file_list.append([image_path, label_path]) + + def __getitem__(self, idx: int) -> Tuple[np.ndarray]: + image_path, label_path = self.file_list[idx] + if self.mode == 'test': + im, _ = self.transforms(im=image_path) + im = im[np.newaxis, ...] 
+ return im, image_path + elif self.mode == 'val': + im, _ = self.transforms(im=image_path) + label = np.asarray(Image.open(label_path)) + label = label[np.newaxis, :, :] + return im, label + else: + im, label = self.transforms(im=image_path, label=label_path) + return im, label + + def __len__(self) -> int: + return len(self.file_list) + diff --git a/paddlehub/datasets/opticdiscseg.py b/paddlehub/datasets/opticdiscseg.py new file mode 100644 index 0000000000000000000000000000000000000000..2d100194806aca1ce130a703dfcbb1a672edb45c --- /dev/null +++ b/paddlehub/datasets/opticdiscseg.py @@ -0,0 +1,78 @@ +# coding:utf-8 +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from typing import Callable + +import paddle +import numpy as np +from PIL import Image + +import paddlehub.env as hubenv +from paddlehub.utils.download import download_data +from paddlehub.datasets.base_seg_dataset import SegDataset + +@download_data(url='https://paddleseg.bj.bcebos.com/dataset/optic_disc_seg.zip') +class OpticDiscSeg(SegDataset): + """ + OpticDiscSeg dataset is extraced from iChallenge-AMD + (https://ai.baidu.com/broad/subordinate?dataset=amd). + + Args: + transforms (Callable): Transforms for image. + mode (str, optional): Which part of dataset to use. it is one of ('train', 'val', 'test'). Default: 'train'. + edge (bool, optional): Whether to compute edge while training. 
@download_data(url='https://paddleseg.bj.bcebos.com/dataset/optic_disc_seg.zip')
class OpticDiscSeg(SegDataset):
    """
    OpticDiscSeg dataset is extracted from iChallenge-AMD
    (https://ai.baidu.com/broad/subordinate?dataset=amd).

    The file lists (``train_list.txt``, ``val_list.txt``, ``test_list.txt``) and the
    images are read from ``<hubenv.DATA_HOME>/optic_disc_seg``.

    Args:
        transforms (Callable): Transforms for image. Must not be None.
        mode (str, optional): Which part of dataset to use, one of
            ('train', 'val', 'test'); matched case-insensitively. Default: 'train'.

    Raises:
        ValueError: If ``mode`` is unknown, ``transforms`` is None, or a train/val
            list line does not have exactly two whitespace-separated fields.
    """

    def __init__(self,
                 transforms: Callable = None,
                 mode: str = 'train'):
        mode = mode.lower()
        self.transforms = transforms
        self.mode = mode
        self.file_list = list()
        self.num_classes = 2
        self.ignore_index = 255
        # Attributes the parent SegDataset also exposes; SegDataset.__init__ is
        # deliberately not called here (it was not called before either).
        self.edge = False
        self.dataset_root = os.path.join(hubenv.DATA_HOME, 'optic_disc_seg')

        if mode not in ('train', 'val', 'test'):
            raise ValueError(
                "`mode` should be 'train', 'val' or 'test', but got {}.".format(
                    mode))

        if self.transforms is None:
            raise ValueError("`transforms` is necessary, but it is None.")

        # Resolves to train_list.txt / val_list.txt / test_list.txt.
        file_path = os.path.join(self.dataset_root, '{}_list.txt'.format(mode))

        labels_required = mode in ('train', 'val')
        with open(file_path, 'r') as f:
            for line in f:
                items = line.split()
                if not items:
                    # Skip blank lines instead of failing on / indexing into them.
                    continue
                if len(items) != 2:
                    if labels_required:
                        # ValueError (a subclass of the Exception raised before)
                        # matches what SegDataset raises for the same condition.
                        raise ValueError(
                            "File list format incorrect! It should be"
                            " image_name label_name\\n")
                    grt_path = None
                else:
                    grt_path = os.path.join(self.dataset_root, items[1])
                image_path = os.path.join(self.dataset_root, items[0])
                self.file_list.append([image_path, grt_path])
class ImageSegmentationModule(ImageServing, RunModule):
    """Shared training, prediction and serving logic for image segmentation modules."""

    def training_step(self, batch: List[paddle.Tensor], batch_idx: int) -> dict:
        '''
        One step for training, which should be called as forward computation.

        Args:
            batch(list[paddle.Tensor]): The one batch data, which contains images and labels.
            batch_idx(int): The index of batch.

        Returns:
            results(dict): The model outputs, such as loss.
        '''
        # Training and validation compute the same loss.
        return self.validation_step(batch, batch_idx)

    def validation_step(self, batch: List[paddle.Tensor], batch_idx: int) -> dict:
        """
        One step for validation, which should be called as forward computation.

        Args:
            batch(list[paddle.Tensor]): The one batch data, which contains images and labels.
            batch_idx(int): The index of batch.

        Returns:
            results(dict) : ``{"loss": loss}``, the cross-entropy loss averaged over
                every output the model returns.
        """
        label = batch[1].astype('int64')
        criterion = nn.loss.CrossEntropyLoss()
        logits = self(batch[0])
        num_outputs = len(logits)
        loss = 0
        for logit in logits:
            # Bring an output to the label's spatial size before comparing.
            if logit.shape[-2:] != label.shape[-2:]:
                logit = F.resize_bilinear(logit, label.shape[-2:])
            # Move axis 1 to the end before the loss; assumes NCHW logits — TODO confirm.
            logit = logit.transpose([0, 2, 3, 1])
            loss += criterion(logit, label) / num_outputs
        return {"loss": loss}

    def predict(self,
                images: List[Union[str, np.ndarray]],
                batch_size: int = 1,
                visualization: bool = True,
                save_path: str = 'seg_result') -> List[np.ndarray]:
        '''
        Obtain segmentation results.

        Args:
            images(list[str|np.array]): Content image path or BGR image.
            batch_size(int): Batch size for prediction.
            visualization(bool): Whether to save colorized images.
            save_path(str) : Path to save colorized images. The blended image goes to
                ``<save_path>/image`` and the pseudo-color mask to ``<save_path>/mask``,
                both under the same timestamp-based file name.

        Returns:
            output(list[np.ndarray]) : The segmentation mask of every input, as uint8,
                at the input image's original height and width.
        '''
        self.eval()
        result = []

        total_num = len(images)
        loop_num = int(np.ceil(total_num / batch_size))
        for iter_id in range(loop_num):
            handle_id = iter_id * batch_size
            # Slicing clamps the last, possibly shorter, batch. This replaces a bare
            # `except: pass` around the indexing, which also swallowed genuine
            # transform failures and then paired predictions with the wrong inputs.
            batch_inputs = images[handle_id:handle_id + batch_size]

            batch_data = []
            for item in batch_inputs:
                # NOTE(review): `self.transform` (per-image callable) and
                # `self.transforms` (pipeline object, used below) are both expected
                # to be provided by the concrete module — confirm against subclasses.
                image, _ = self.transform(item)
                batch_data.append(image)
            batch_image = np.array(batch_data).astype('float32')
            pred = self(paddle.to_tensor(batch_image))
            # Only the first model output is used for prediction.
            pred = paddle.argmax(pred[0], axis=1, keepdim=True, dtype='int32')

            for num in range(pred.shape[0]):
                source = images[handle_id + num]
                image = cv2.imread(source) if isinstance(source, str) else source
                h, w = image.shape[:2]
                # Undo resize/padding so the mask matches the original image size.
                pred_final = utils.reverse_transform(pred[num:num + 1], (h, w), self.transforms.transforms)
                pred_final = paddle.squeeze(pred_final)
                pred_final = pred_final.numpy().astype('uint8')

                if visualization:
                    added_image = utils.visualize(image, pred_final, weight=0.6)
                    pred_mask = utils.get_pseudo_color_map(pred_final)
                    # One timestamp for both files so the overlay and its mask can
                    # be matched by name.
                    file_name = str(time.time()) + ".png"
                    pred_image_path = os.path.join(save_path, 'image', file_name)
                    pred_mask_path = os.path.join(save_path, 'mask', file_name)
                    os.makedirs(os.path.dirname(pred_image_path), exist_ok=True)
                    os.makedirs(os.path.dirname(pred_mask_path), exist_ok=True)
                    cv2.imwrite(pred_image_path, added_image)
                    pred_mask.save(pred_mask_path)

                result.append(pred_final)
        return result

    @serving
    def serving_method(self, images: List[str], **kwargs):
        """
        Run as a service.

        Args:
            images(list[str]): Base64-encoded images.
            **kwargs: Forwarded unchanged to ``predict``.

        Returns:
            list[str]: The predicted masks, base64-encoded.
        """
        images_decode = [base64_to_cv2(image) for image in images]
        masks = self.predict(images=images_decode, **kwargs)
        return [cv2_to_base64(mask) for mask in masks]
class Compose:
    """
    Do transformation on input data with corresponding pre-processing and augmentation operations.
    The shape of input data to all operations is [height, width, channels].

    Args:
        transforms (list): A list contains data pre-processing or augmentation. Each
            entry is called as ``op(im, label)`` and returns ``(im,)`` or ``(im, label)``.
        to_rgb (bool, optional): If converting image to RGB color space. Default: True.

    Raises:
        TypeError: When 'transforms' is not a list.
        ValueError: when the length of 'transforms' is less than 1.
    """

    def __init__(self, transforms: List[Callable], to_rgb: bool = True):
        if not isinstance(transforms, list):
            raise TypeError('The transforms must be a list!')
        if len(transforms) < 1:
            raise ValueError('The length of transforms '
                             'must be equal or larger than 1!')
        self.transforms = transforms
        self.to_rgb = to_rgb

    def __call__(self, im: Union[np.ndarray, str], label: Union[np.ndarray, str] = None) -> Tuple:
        """
        Args:
            im (str|np.ndarray): It is either image path or image object.
            label (str|np.ndarray): It is either label path or label ndarray.

        Returns:
            (tuple). ``(im, label)`` after transformation, with the image's last axis
            moved to the front. ``label`` is None when none was given and no
            operation produced one.

        Raises:
            ValueError: When the image is None or its file cannot be read.
        """
        if isinstance(im, str):
            im_path = im
            im = cv2.imread(im_path)
            # cv2.imread signals failure by returning None. Check before `.astype`
            # so the caller gets the intended message, including the path, rather
            # than an AttributeError.
            if im is None:
                raise ValueError('Can\'t read The image file {}!'.format(im_path))
            im = im.astype('float32')
        if isinstance(label, str):
            label = np.asarray(Image.open(label))
        if im is None:
            raise ValueError('Can\'t read The image file {}!'.format(im))
        if self.to_rgb:
            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)

        for op in self.transforms:
            outputs = op(im, label)
            im = outputs[0]
            # Operations return (im,) when they have no label to hand back.
            if len(outputs) == 2:
                label = outputs[1]
        im = np.transpose(im, (2, 0, 1))
        return (im, label)
class ColorMap:
    """
    Calculate color map for mapping segmentation result.

    Calling the instance returns one ``[c0, c1, c2]`` triplet per label for labels
    ``1 .. num_classes``. The entry for label 0 is dropped, so index ``k`` of the
    result holds the color generated for label ``k + 1``.

    Args:
        num_classes (int): Number of colors to generate. Default: 256.
    """

    def __init__(self, num_classes: int = 256):
        # One extra slot: label 0 is generated conceptually and then discarded.
        self.num_classes = num_classes + 1

    def __call__(self) -> List[List[int]]:
        """Return the list of color triplets (plain Python lists, not an ndarray)."""
        palette = []
        for label in range(1, self.num_classes):
            rgb = [0, 0, 0]
            # Each group of three label bits contributes one bit per channel,
            # filled from the most significant bit (7) downwards.
            shift = 7
            while label:
                rgb[0] |= ((label >> 0) & 1) << shift
                rgb[1] |= ((label >> 1) & 1) << shift
                rgb[2] |= ((label >> 2) & 1) << shift
                shift -= 1
                label >>= 3
            palette.append(rgb)
        return palette
class SegmentVisual:
    """Visualization the segmentation result.

    Args:
        weight(float): weight of original image in combining image, default is 0.6.
    """

    def __init__(self, weight: float = 0.6):
        self.weight = weight
        self.get_color_map_list = ColorMap(256)

    def __call__(self, image: str, result: np.ndarray, save_dir: str) -> np.ndarray:
        """
        Blend the colorized segmentation result over the original image.

        Args:
            image (str): Path of the original image.
            result (np.ndarray): The predicted label map for that image.
            save_dir (str): Directory to write the blended image to, under the
                original file name. Pass None to skip saving.

        Returns:
            np.ndarray: The blended image.

        Raises:
            ValueError: When the image file cannot be read.
        """
        color_map = np.array(self.get_color_map_list()).astype("uint8")
        # Use OpenCV LUT for color mapping, one channel at a time.
        channels = [cv2.LUT(result, color_map[:, idx]) for idx in range(3)]
        pseudo_img = np.dstack(channels)

        im = cv2.imread(image)
        if im is None:
            # cv2.imread returns None on failure; report the path instead of
            # letting addWeighted fail with an opaque OpenCV error.
            raise ValueError('Can\'t read The image file {}!'.format(image))
        vis_result = cv2.addWeighted(im, self.weight, pseudo_img, 1 - self.weight, 0)

        if save_dir is not None:
            os.makedirs(save_dir, exist_ok=True)
            out_path = os.path.join(save_dir, os.path.basename(image))
            cv2.imwrite(out_path, vis_result)

        return vis_result
class Padding:
    """
    Add bottom-right padding to a raw image or annotation image.

    Args:
        target_size (list|tuple|int): The target size after padding, given as
            (width, height). A single int means a square of that size.
        im_padding_value (list|tuple|int, optional): The padding value of raw image.
            Default: (128, 128, 128).
        label_padding_value (int, optional): The padding value of annotation image. Default: 255.

    Raises:
        TypeError: When target_size is neither list, tuple nor int.
        ValueError: When target_size is a list/tuple whose length is not 2.
    """

    def __init__(self,
                 target_size: Union[List[int], Tuple[int], int],
                 im_padding_value: Union[List[int], Tuple[int], int] = (128, 128, 128),
                 label_padding_value: int = 255):
        if isinstance(target_size, (list, tuple)):
            if len(target_size) != 2:
                raise ValueError(
                    '`target_size` should include 2 elements, but it is {}'.
                    format(target_size))
        elif isinstance(target_size, bool) or not isinstance(target_size, int):
            # A plain int is accepted: the annotation and __call__ already support
            # it, but this check used to reject it, leaving that branch unreachable.
            raise TypeError(
                "Type of target_size is invalid. It should be list or tuple, now is {}"
                .format(type(target_size)))
        self.target_size = target_size
        self.im_padding_value = im_padding_value
        self.label_padding_value = label_padding_value

    def __call__(self, im: np.ndarray, label: np.ndarray = None) -> Tuple:
        """
        Args:
            im (np.ndarray): The Image data.
            label (np.ndarray, optional): The label data. Default: None.

        Returns:
            (tuple). When label is None, it returns (im, ), otherwise it returns (im, label).

        Raises:
            ValueError: When the image is larger than the target size in either dimension.
        """
        im_height, im_width = im.shape[0], im.shape[1]
        if isinstance(self.target_size, int):
            target_height = target_width = self.target_size
        else:
            # target_size is (width, height).
            target_width, target_height = self.target_size[0], self.target_size[1]
        pad_height = target_height - im_height
        pad_width = target_width - im_width
        if pad_height < 0 or pad_width < 0:
            raise ValueError(
                'The size of image should be less than `target_size`, but the size of image ({}, {}) is larger than `target_size` ({}, {})'
                .format(im_width, im_height, target_width, target_height))

        # Pad only at the bottom and on the right.
        im = cv2.copyMakeBorder(im, 0, pad_height, 0, pad_width, cv2.BORDER_CONSTANT,
                                value=self.im_padding_value)
        if label is None:
            return (im,)
        label = cv2.copyMakeBorder(label, 0, pad_height, 0, pad_width, cv2.BORDER_CONSTANT,
                                   value=self.label_padding_value)
        return (im, label)
class Normalize:
    """
    Normalize an image.

    Args:
        mean (list|tuple): The mean value of a data set. Default: (0.5, 0.5, 0.5).
        std (list|tuple): The standard deviation of a data set. Default: (0.5, 0.5, 0.5).

    Raises:
        ValueError: When mean/std is not list or tuple, or std is empty or contains 0.
    """

    def __init__(self,
                 mean: Union[List[float], Tuple[float]] = (0.5, 0.5, 0.5),
                 std: Union[List[float], Tuple[float]] = (0.5, 0.5, 0.5)):
        self.mean = mean
        self.std = std
        if not (isinstance(self.mean, (list, tuple))
                and isinstance(self.std, (list, tuple))):
            raise ValueError(
                "{}: input type is invalid. It should be list or tuple".format(
                    self))
        # Test each entry rather than the product of all entries: the product can
        # underflow to 0 for small non-zero values, and an empty std cannot be
        # reduced at all.
        if not self.std or any(value == 0 for value in self.std):
            raise ValueError('{}: std is invalid!'.format(self))

    def __call__(self, im: np.ndarray, label: np.ndarray = None) -> Tuple:
        """
        Args:
            im (np.ndarray): The Image data.
            label (np.ndarray, optional): The label data. Default: None.

        Returns:
            (tuple). When label is None, it returns (im, ), otherwise it returns (im, label).
            The label is passed through unchanged.
        """
        # Shape (1, 1, C) so the statistics broadcast over an [H, W, C] image.
        mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
        std = np.array(self.std)[np.newaxis, np.newaxis, :]
        im = F.normalize(im, mean, std)

        if label is None:
            return (im,)
        return (im, label)
class Resize:
    """
    Resize an image.

    Args:
        target_size (list|tuple, optional): The target size of image. Default: (512, 512).
        interp (str, optional): The interpolation mode of resize is consistent with opencv.
            ['NEAREST', 'LINEAR', 'CUBIC', 'AREA', 'LANCZOS4', 'RANDOM']. Note that when it is
            'RANDOM', a random interpolation mode would be specified. Default: "LINEAR".

    Raises:
        TypeError: When 'target_size' type is neither list nor tuple.
        ValueError: When 'target_size' does not have 2 elements, or "interp" is out of
            pre-defined methods ('NEAREST', 'LINEAR', 'CUBIC', 'AREA', 'LANCZOS4', 'RANDOM').
    """

    # The interpolation mode
    interp_dict = {
        'NEAREST': cv2.INTER_NEAREST,
        'LINEAR': cv2.INTER_LINEAR,
        'CUBIC': cv2.INTER_CUBIC,
        'AREA': cv2.INTER_AREA,
        'LANCZOS4': cv2.INTER_LANCZOS4
    }

    def __init__(self, target_size: Union[List[int], Tuple[int]] = (512, 512), interp: str = 'LINEAR'):
        self.interp = interp
        if not (interp == "RANDOM" or interp in self.interp_dict):
            # List every accepted value: the previous message printed a raw
            # `dict_keys(...)` object and left out 'RANDOM'.
            raise ValueError("`interp` should be one of {}".format(
                list(self.interp_dict) + ['RANDOM']))
        if isinstance(target_size, (list, tuple)):
            if len(target_size) != 2:
                raise ValueError(
                    '`target_size` should include 2 elements, but it is {}'.
                    format(target_size))
        else:
            raise TypeError(
                "Type of `target_size` is invalid. It should be list or tuple, but it is {}"
                .format(type(target_size)))

        self.target_size = target_size

    def __call__(self, im: np.ndarray, label: np.ndarray = None) -> Tuple:
        """
        Args:
            im (np.ndarray): The Image data.
            label (np.ndarray, optional): The label data. Default: None.

        Returns:
            (tuple). When label is None, it returns (im, ), otherwise it returns (im, label),

        Raises:
            TypeError: When the 'img' type is not numpy.
            ValueError: When the length of "im" shape is not 3.
        """
        if not isinstance(im, np.ndarray):
            raise TypeError("Resize: image type is not numpy.")
        if len(im.shape) != 3:
            raise ValueError('Resize: image is not 3-dimensional.')

        if self.interp == "RANDOM":
            interp = random.choice(list(self.interp_dict.keys()))
        else:
            interp = self.interp
        im = F.resize(im, self.target_size, self.interp_dict[interp])

        if label is None:
            return (im,)
        # Labels are always resized with nearest-neighbor, whatever mode the
        # image uses.
        label = F.resize(label, self.target_size, cv2.INTER_NEAREST)
        return (im, label)
def visualize(image: Union[np.ndarray, str], result: np.ndarray, weight: float = 0.6) -> np.ndarray:
    """
    Convert segmentation result to color image and blend it with the original image.
    Nothing is written to disk; saving is left to the caller.

    Args:
        image (str|np.ndarray): The path of origin image or bgr image.
        result (np.ndarray): The predict result of image.
        weight (float): The image weight of visual image, and the result weight is (1 - weight). Default: 0.6

    Returns:
        vis_result (np.ndarray): return the visualized result.

    Raises:
        ValueError: When ``image`` is a path that cannot be read.
    """
    flat_map = get_color_map_list(256)
    # Regroup the flat list into one 3-value row per label.
    color_map = np.array([flat_map[i:i + 3] for i in range(0, len(flat_map), 3)]).astype("uint8")
    # Use OpenCV LUT for color mapping
    c1 = cv2.LUT(result, color_map[:, 0])
    c2 = cv2.LUT(result, color_map[:, 1])
    c3 = cv2.LUT(result, color_map[:, 2])
    # NOTE(review): the channels are stacked in palette order while `image` is
    # documented as BGR — confirm whether (c3, c2, c1) was intended.
    pseudo_img = np.dstack((c1, c2, c3))

    if isinstance(image, str):
        im = cv2.imread(image)
        if im is None:
            # cv2.imread returns None on failure; report the path instead of
            # letting addWeighted fail with an opaque OpenCV error.
            raise ValueError('Can\'t read The image file {}!'.format(image))
    else:
        im = image
    vis_result = cv2.addWeighted(im, weight, pseudo_img, 1 - weight, 0)

    return vis_result
def get_pseudo_color_map(pred: np.ndarray) -> "PIL.Image.Image":
    '''
    Visualize the segmentation mask as a palettized image.

    Args:
        pred (np.ndarray): The predicted label map; it is cast to uint8.

    Returns:
        PIL.Image.Image: A mode 'P' image carrying the palette from
        ``get_color_map_list(256)``.
    '''
    # Import the submodule explicitly: a bare `import PIL` does not guarantee that
    # `PIL.Image` is loaded. The return annotation is a string for the same reason.
    from PIL import Image

    pred_mask = Image.fromarray(pred.astype(np.uint8), mode='P')
    pred_mask.putpalette(get_color_map_list(256))
    return pred_mask


def get_color_map_list(num_classes: int) -> List[int]:
    """
    Returns the color map for visualizing the segmentation mask,
    which can support arbitrary number of classes.

    The result is flat: three consecutive values per label, for labels
    ``1 .. num_classes``. The entry for label 0 is dropped, so values
    ``[3 * k : 3 * k + 3]`` are the color generated for label ``k + 1``.

    Args:
        num_classes (int): Number of classes.

    Returns:
        (list). The color map, of length ``3 * num_classes``.
    """
    color_map = []
    for label in range(1, num_classes + 1):
        rgb = [0, 0, 0]
        # Each group of three label bits contributes one bit per channel,
        # filled from the most significant bit (7) downwards.
        shift = 7
        while label:
            rgb[0] |= ((label >> 0) & 1) << shift
            rgb[1] |= ((label >> 1) & 1) << shift
            rgb[2] |= ((label >> 2) & 1) << shift
            shift -= 1
            label >>= 3
        color_map.extend(rgb)
    return color_map
def get_reverse_list(ori_shape: List[int], transforms: List[Callable]) -> List[tuple]:
    """
    get reverse list of transform.

    Transforms are recognized by class name; any transform other than
    'Resize', 'ResizeByLong' and 'Padding' is ignored.

    Args:
        ori_shape (list): Origin shape of image, (h, w) first.
        transforms (list): List of transform.

    Returns:
        list: List of tuple, there are two format:
            ('resize', (h, w)) The image shape before resize,
            ('padding', (h, w)) The image shape before padding.
    """
    reverse_list = []
    h, w = ori_shape[0], ori_shape[1]
    for op in transforms:
        op_name = op.__class__.__name__
        if op_name in ('Resize', 'ResizeByLong'):
            reverse_list.append(('resize', (h, w)))
            # NOTE(review): target_size is unpacked as (h, w) here but as (w, h)
            # for Padding below. This only matters for non-square sizes when
            # another recorded op follows — confirm which order Resize uses.
            h, w = op.target_size[0], op.target_size[1]
        if op_name == 'Padding':
            reverse_list.append(('padding', (h, w)))
            w, h = op.target_size[0], op.target_size[1]
    return reverse_list


def reverse_transform(pred: "paddle.Tensor", ori_shape: List[int], transforms: List[Callable]) -> "paddle.Tensor":
    """
    recover pred to origin shape

    Args:
        pred (paddle.Tensor): The prediction; sliced on its last two axes as
            ``pred[:, :, 0:h, 0:w]`` when padding is undone.
        ori_shape (list): Origin shape of image, (h, w) first.
        transforms (list): The transforms that were applied to the image.

    Returns:
        paddle.Tensor: The prediction with every recorded resize/padding undone, last first.
    """
    reverse_list = get_reverse_list(ori_shape, transforms)
    for kind, (h, w) in reversed(reverse_list):
        if kind == 'resize':
            # Nearest-neighbor keeps the values as valid class ids.
            pred = F.interpolate(pred, (h, w), mode='nearest')
        elif kind == 'padding':
            # Padding was added at the bottom/right, so cropping from the origin undoes it.
            pred = pred[:, :, 0:h, 0:w]
        else:
            raise Exception("Unexpected info '{}' in im_info".format(kind))
    return pred