From 98d598b7fe14ddca68f8107a66a1f8a3e4ce2bd8 Mon Sep 17 00:00:00 2001
From: jm12138 <2286040843@qq.com>
Date: Fri, 23 Sep 2022 17:42:16 +0800
Subject: [PATCH] Add LSeg Module (#2038)
* add LSeg
* add LSeg README
* add requirements.txt
* update README
* update module
* update
* update
* update
* update
* pre-commit
* update
* save jpg -> save png
* bgr -> bgra
* fix typo
* pre-commit
---
.../semantic_segmentation/lseg/README.md | 178 ++++++++++
.../lseg/models/__init__.py | 3 +
.../semantic_segmentation/lseg/models/clip.py | 45 +++
.../semantic_segmentation/lseg/models/lseg.py | 19 ++
.../lseg/models/scratch.py | 318 ++++++++++++++++++
.../semantic_segmentation/lseg/models/vit.py | 228 +++++++++++++
.../semantic_segmentation/lseg/module.py | 194 +++++++++++
.../lseg/requirements.txt | 4 +
.../image/semantic_segmentation/lseg/test.py | 67 ++++
9 files changed, 1056 insertions(+)
create mode 100644 modules/image/semantic_segmentation/lseg/README.md
create mode 100644 modules/image/semantic_segmentation/lseg/models/__init__.py
create mode 100644 modules/image/semantic_segmentation/lseg/models/clip.py
create mode 100644 modules/image/semantic_segmentation/lseg/models/lseg.py
create mode 100644 modules/image/semantic_segmentation/lseg/models/scratch.py
create mode 100644 modules/image/semantic_segmentation/lseg/models/vit.py
create mode 100644 modules/image/semantic_segmentation/lseg/module.py
create mode 100644 modules/image/semantic_segmentation/lseg/requirements.txt
create mode 100644 modules/image/semantic_segmentation/lseg/test.py
diff --git a/modules/image/semantic_segmentation/lseg/README.md b/modules/image/semantic_segmentation/lseg/README.md
new file mode 100644
index 00000000..63a92931
--- /dev/null
+++ b/modules/image/semantic_segmentation/lseg/README.md
@@ -0,0 +1,178 @@
+# lseg
+
+|Model Name|lseg|
+| :--- | :---: |
+|Category|Image - Image Segmentation|
+|Network|LSeg|
+|Dataset|-|
+|Fine-tuning supported|No|
+|Module Size|1.63GB|
+|Metrics|-|
+|Latest update date|2022-09-22|
+
+
+## I. Basic Information
+
+- ### Application Effect Display
+
+  - Network architecture:
+
+
+
+
+  - Sample results:
+
+
+
+
+- ### Module Introduction
+
+  - LSeg (Language-driven Semantic Segmentation) is a text-driven semantic segmentation model: the text labels supplied at inference time determine which classes are segmented, so the target categories can be specified freely through text.
+
+
+
+## II. Installation
+
+- ### 1. Environmental Dependence
+
+  - paddlepaddle >= 2.0.0
+
+  - paddlehub >= 2.0.0
+
+- ### 2. Installation
+
+  - ```shell
+    $ hub install lseg
+    ```
+  - In case of any problems during installation, please refer to: [Windows Installation Guide](../../../../docs/docs_ch/get_start/windows_quickstart.md)
+    | [Linux Installation Guide](../../../../docs/docs_ch/get_start/linux_quickstart.md) | [MacOS Installation Guide](../../../../docs/docs_ch/get_start/mac_quickstart.md)
+
+## III. Module API Prediction
+- ### 1. Command line Prediction
+
+    ```shell
+    $ hub run lseg \
+        --input_path "/PATH/TO/IMAGE" \
+        --labels "Category 1" "Category 2" "Category n" \
+        --output_dir "lseg_output"
+    ```
+
+- ### 2. Prediction Code Example
+
+    ```python
+    import paddlehub as hub
+    import cv2
+
+    module = hub.Module(name="lseg")
+    result = module.segment(
+        image=cv2.imread('/PATH/TO/IMAGE'),
+        labels=["Category 1", "Category 2", "Category n"],
+        visualization=True,
+        output_dir='lseg_output'
+    )
+    ```
+
+- ### 3. API
+
+    ```python
+    def segment(
+        image: Union[str, numpy.ndarray],
+        labels: Union[str, List[str]],
+        visualization: bool = False,
+        output_dir: str = 'lseg_output'
+    ) -> Dict[str, Union[numpy.ndarray, Dict[str, numpy.ndarray]]]
+    ```
+
+  - Semantic segmentation API.
+
+  - **Parameters**
+
+    * image (Union\[str, numpy.ndarray\]): image path or image data with ndarray.shape \[H, W, C\] in BGR format;
+    * labels (Union\[str, List\[str\]\]): text label(s) of the segmentation classes;
+    * visualization (bool): whether to save the segmentation results as image files;
+    * output\_dir (str): directory in which the results are saved.
+
+  - **Return**
+
+    * res (Dict\[str, Union\[numpy.ndarray, Dict\[str, numpy.ndarray\]\]\]): a dictionary of segmentation results (see the usage sketch below) with the following entries:
+      * gray (numpy.ndarray): grayscale segmentation map (GRAY);
+      * color (numpy.ndarray): pseudo-color segmentation map (BGR);
+      * mix (numpy.ndarray): the original image blended with the pseudo-color map (BGR);
+      * classes (Dict\[str, numpy.ndarray\]): per-label segmentation cutouts with an alpha mask (BGRA).
+
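+  - A minimal sketch of consuming the returned dictionary (the labels and file names below are only illustrative): every entry of `classes` is a BGRA image whose alpha channel masks one label, so it can be written straight to a PNG file.
+
+    ```python
+    import cv2
+    import paddlehub as hub
+
+    module = hub.Module(name="lseg")
+    res = module.segment(image=cv2.imread('/PATH/TO/IMAGE'), labels=['cat', 'other'])
+
+    cv2.imwrite('color.png', res['color'])        # pseudo-color map (BGR)
+    cv2.imwrite('mix.png', res['mix'])            # original image blended with the pseudo-color map
+    for label, cutout in res['classes'].items():
+        cv2.imwrite('%s.png' % label, cutout)     # transparent cutout per label (BGRA)
+    ```
+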
+## IV. Server Deployment
+
+- PaddleHub Serving can deploy an online service for language-driven semantic segmentation.
+
+- ### Step 1: Start the PaddleHub Serving service
+
+  - Run the startup command:
+
+    ```shell
+    $ hub serving start -m lseg
+    ```
+
+  - This deploys the language-driven semantic segmentation service API on the server; the default port is 8866.
+
+- ### Step 2: Send a prediction request
+
+  - With the server configured, the following lines of code send a prediction request and parse the returned results (a note on optional request fields follows the example):
+
+    ```python
+    import requests
+    import json
+    import base64
+
+    import cv2
+    import numpy as np
+
+    def cv2_to_base64(image):
+        data = cv2.imencode('.jpg', image)[1]
+        return base64.b64encode(data.tobytes()).decode('utf8')
+
+    def base64_to_cv2(b64str):
+        data = base64.b64decode(b64str.encode('utf8'))
+        data = np.frombuffer(data, np.uint8)
+        data = cv2.imdecode(data, cv2.IMREAD_COLOR)
+        return data
+
+    # send the HTTP request
+    org_im = cv2.imread('/PATH/TO/IMAGE')
+    data = {
+        'image': cv2_to_base64(org_im),
+        'labels': ["Category 1", "Category 2", "Category n"]
+    }
+    headers = {"Content-type": "application/json"}
+    url = "http://127.0.0.1:8866/predict/lseg"
+    r = requests.post(url=url, headers=headers, data=json.dumps(data))
+
+    # decode the results
+    results = r.json()['results']
+    results = {
+        'gray': base64_to_cv2(results['gray']),
+        'color': base64_to_cv2(results['color']),
+        'mix': base64_to_cv2(results['mix']),
+        'classes': {
+            k: base64_to_cv2(v) for k, v in results['classes'].items()
+        }
+    }
+
+    # save the output
+    cv2.imwrite('mix.jpg', results['mix'])
+    ```
+
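+  - Because the serving method forwards extra keyword arguments of the request to `segment`, the JSON payload may also carry `visualization` and `output_dir` to make the server save the result images. A minimal sketch of such a request body (values chosen only for illustration):
+
+    ```python
+    data = {
+        'image': cv2_to_base64(org_im),
+        'labels': ["Category 1", "Category 2", "Category n"],
+        'visualization': True,         # ask the server to save the result images
+        'output_dir': 'lseg_output'    # directory on the server side
+    }
+    ```
+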
+## V. References
+
+* Paper: [Language-driven Semantic Segmentation](https://arxiv.org/abs/2201.03546)
+
+* Official implementation: [isl-org/lang-seg](https://github.com/isl-org/lang-seg)
+
+## VI. Release Note
+
+* 1.0.0
+
+  First release
+
+  ```shell
+  $ hub install lseg==1.0.0
+  ```
diff --git a/modules/image/semantic_segmentation/lseg/models/__init__.py b/modules/image/semantic_segmentation/lseg/models/__init__.py
new file mode 100644
index 00000000..7718276c
--- /dev/null
+++ b/modules/image/semantic_segmentation/lseg/models/__init__.py
@@ -0,0 +1,3 @@
+from .lseg import LSeg
+
+__all__ = ['LSeg']
diff --git a/modules/image/semantic_segmentation/lseg/models/clip.py b/modules/image/semantic_segmentation/lseg/models/clip.py
new file mode 100644
index 00000000..791f3c4b
--- /dev/null
+++ b/modules/image/semantic_segmentation/lseg/models/clip.py
@@ -0,0 +1,45 @@
+import paddle
+import paddle.nn as nn
+from paddlenlp.transformers.clip.modeling import TextTransformer
+
+
+class CLIPText(nn.Layer):
+
+ def __init__(self,
+ max_text_length: int = 77,
+ vocab_size: int = 49408,
+ text_embed_dim: int = 512,
+ text_heads: int = 8,
+ text_layers: int = 12,
+ text_hidden_act: str = "quick_gelu",
+ projection_dim: int = 512):
+ super().__init__()
+
+ self.text_model = TextTransformer(context_length=max_text_length,
+ transformer_width=text_embed_dim,
+ transformer_heads=text_heads,
+ transformer_layers=text_layers,
+ vocab_size=vocab_size,
+ activation=text_hidden_act,
+ normalize_before=True)
+
+ self.text_projection = paddle.create_parameter((text_embed_dim, projection_dim), paddle.get_default_dtype())
+
+ def get_text_features(
+ self,
+ input_ids,
+ attention_mask=None,
+ position_ids=None,
+ output_attentions=False,
+ output_hidden_states=False,
+ return_dict=False,
+ ):
+ text_outputs = self.text_model(input_ids=input_ids,
+ position_ids=position_ids,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict)
+        # project the pooled text representation into the joint image-text embedding space
+        pooled_output = text_outputs[1]
+        text_features = paddle.matmul(pooled_output, self.text_projection)
+        return text_features
diff --git a/modules/image/semantic_segmentation/lseg/models/lseg.py b/modules/image/semantic_segmentation/lseg/models/lseg.py
new file mode 100644
index 00000000..f2ace02b
--- /dev/null
+++ b/modules/image/semantic_segmentation/lseg/models/lseg.py
@@ -0,0 +1,19 @@
+import paddle.nn as nn
+
+from .clip import CLIPText
+from .scratch import Scratch
+from .vit import ViT
+
+
+class LSeg(nn.Layer):
+
+ def __init__(self):
+ super().__init__()
+ self.clip = CLIPText()
+ self.vit = ViT()
+ self.scratch = Scratch()
+
+    def forward(self, images, texts):
+        # multi-scale image features from the ViT backbone
+        layer_1, layer_2, layer_3, layer_4 = self.vit.forward(images)
+        # text embeddings from the CLIP text encoder
+        text_features = self.clip.get_text_features(texts)
+        # correlate pixel features with the text embeddings to get per-pixel class logits
+        return self.scratch.forward(layer_1, layer_2, layer_3, layer_4, text_features)
diff --git a/modules/image/semantic_segmentation/lseg/models/scratch.py b/modules/image/semantic_segmentation/lseg/models/scratch.py
new file mode 100644
index 00000000..3e407461
--- /dev/null
+++ b/modules/image/semantic_segmentation/lseg/models/scratch.py
@@ -0,0 +1,318 @@
+import numpy as np
+import paddle
+import paddle.nn as nn
+
+
+class Interpolate(nn.Layer):
+ """Interpolation module."""
+
+ def __init__(self, scale_factor, mode, align_corners=False):
+ """Init.
+
+ Args:
+ scale_factor (float): scaling
+ mode (str): interpolation mode
+ """
+ super(Interpolate, self).__init__()
+
+ self.interp = nn.functional.interpolate
+ self.scale_factor = scale_factor
+ self.mode = mode
+ self.align_corners = align_corners
+
+ def forward(self, x):
+ """Forward pass.
+
+ Args:
+ x (tensor): input
+
+ Returns:
+ tensor: interpolated data
+ """
+
+ x = self.interp(
+ x,
+ scale_factor=self.scale_factor,
+ mode=self.mode,
+ align_corners=self.align_corners,
+ )
+
+ return x
+
+
+class ResidualConvUnit(nn.Layer):
+ """Residual convolution module."""
+
+ def __init__(self, features):
+ """Init.
+
+ Args:
+ features (int): number of features
+ """
+ super().__init__()
+
+ self.conv1 = nn.Conv2D(features, features, kernel_size=3, stride=1, padding=1)
+
+ self.conv2 = nn.Conv2D(features, features, kernel_size=3, stride=1, padding=1)
+
+        self.relu = nn.ReLU()
+
+ def forward(self, x):
+ """Forward pass.
+
+ Args:
+ x (tensor): input
+
+ Returns:
+ tensor: output
+ """
+ out = self.relu(x)
+ out = self.conv1(out)
+ out = self.relu(out)
+ out = self.conv2(out)
+
+ return out + x
+
+
+class FeatureFusionBlock(nn.Layer):
+ """Feature fusion block."""
+
+ def __init__(self, features):
+ """Init.
+
+ Args:
+ features (int): number of features
+ """
+ super(FeatureFusionBlock, self).__init__()
+
+ self.resConfUnit1 = ResidualConvUnit(features)
+ self.resConfUnit2 = ResidualConvUnit(features)
+
+ def forward(self, *xs):
+ """Forward pass.
+
+ Returns:
+ tensor: output
+ """
+ output = xs[0]
+
+ if len(xs) == 2:
+ output += self.resConfUnit1(xs[1])
+
+ output = self.resConfUnit2(output)
+
+ output = nn.functional.interpolate(output, scale_factor=2, mode="bilinear", align_corners=True)
+
+ return output
+
+
+class ResidualConvUnit_custom(nn.Layer):
+ """Residual convolution module."""
+
+ def __init__(self, features, activation, bn):
+ """Init.
+
+ Args:
+ features (int): number of features
+ """
+ super().__init__()
+
+ self.bn = bn
+
+ self.groups = 1
+
+ self.conv1 = nn.Conv2D(
+ features,
+ features,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias_attr=not self.bn,
+ groups=self.groups,
+ )
+
+ self.conv2 = nn.Conv2D(
+ features,
+ features,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias_attr=not self.bn,
+ groups=self.groups,
+ )
+
+        if self.bn:
+ self.bn1 = nn.BatchNorm2D(features)
+ self.bn2 = nn.BatchNorm2D(features)
+
+ self.activation = activation
+
+ def forward(self, x):
+ """Forward pass.
+
+ Args:
+ x (tensor): input
+
+ Returns:
+ tensor: output
+ """
+
+ out = self.activation(x)
+ out = self.conv1(out)
+        if self.bn:
+ out = self.bn1(out)
+
+ out = self.activation(out)
+ out = self.conv2(out)
+        if self.bn:
+ out = self.bn2(out)
+
+ if self.groups > 1:
+ out = self.conv_merge(out)
+
+ return out + x
+
+
+class FeatureFusionBlock_custom(nn.Layer):
+ """Feature fusion block."""
+
+ def __init__(
+ self,
+ features,
+ activation=nn.ReLU(),
+ deconv=False,
+ bn=False,
+ expand=False,
+ align_corners=True,
+ ):
+ """Init.
+
+ Args:
+ features (int): number of features
+ """
+ super(FeatureFusionBlock_custom, self).__init__()
+
+ self.deconv = deconv
+ self.align_corners = align_corners
+
+ self.groups = 1
+
+ self.expand = expand
+ out_features = features
+        if self.expand:
+ out_features = features // 2
+
+ self.out_conv = nn.Conv2D(
+ features,
+ out_features,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ bias_attr=True,
+ groups=1,
+ )
+
+ self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
+ self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
+
+ def forward(self, *xs):
+ """Forward pass.
+
+ Returns:
+ tensor: output
+ """
+ output = xs[0]
+
+ if len(xs) == 2:
+ res = self.resConfUnit1(xs[1])
+ output += res
+
+ output = self.resConfUnit2(output)
+
+ output = nn.functional.interpolate(output, scale_factor=2, mode="bilinear", align_corners=self.align_corners)
+
+ output = self.out_conv(output)
+
+ return output
+
+
+class Scratch(nn.Layer):
+
+ def __init__(self, in_channels=[256, 512, 1024, 1024], out_channels=256):
+ super().__init__()
+        self.out_c = 512
+        # fixed CLIP logit scale: exp(log(1 / 0.07)) == 1 / 0.07
+        self.logit_scale = paddle.to_tensor(np.exp(np.log([1 / 0.07])))
+ self.layer1_rn = nn.Conv2D(
+ in_channels[0],
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias_attr=False,
+ groups=1,
+ )
+ self.layer2_rn = nn.Conv2D(
+ in_channels[1],
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias_attr=False,
+ groups=1,
+ )
+ self.layer3_rn = nn.Conv2D(
+ in_channels[2],
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias_attr=False,
+ groups=1,
+ )
+ self.layer4_rn = nn.Conv2D(
+ in_channels[3],
+ out_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias_attr=False,
+ groups=1,
+ )
+
+ self.refinenet1 = FeatureFusionBlock_custom(out_channels, bn=True)
+ self.refinenet2 = FeatureFusionBlock_custom(out_channels, bn=True)
+ self.refinenet3 = FeatureFusionBlock_custom(out_channels, bn=True)
+ self.refinenet4 = FeatureFusionBlock_custom(out_channels, bn=True)
+
+ self.head1 = nn.Conv2D(out_channels, self.out_c, kernel_size=1)
+
+ self.output_conv = nn.Sequential(Interpolate(scale_factor=2, mode="bilinear", align_corners=True))
+
+ def forward(self, layer_1, layer_2, layer_3, layer_4, text_features):
+
+ layer_1_rn = self.layer1_rn(layer_1)
+ layer_2_rn = self.layer2_rn(layer_2)
+ layer_3_rn = self.layer3_rn(layer_3)
+ layer_4_rn = self.layer4_rn(layer_4)
+
+        # top-down fusion of the reassembled features, from the coarsest to the finest scale
+        path_4 = self.refinenet4(layer_4_rn)
+        path_3 = self.refinenet3(path_4, layer_3_rn)
+        path_2 = self.refinenet2(path_3, layer_2_rn)
+        path_1 = self.refinenet1(path_2, layer_1_rn)
+
+ image_features = self.head1(path_1)
+
+ imshape = image_features.shape
+ image_features = image_features.transpose((0, 2, 3, 1)).reshape((-1, self.out_c))
+
+ # normalized features
+ image_features = image_features / image_features.norm(axis=-1, keepdim=True)
+ text_features = text_features / text_features.norm(axis=-1, keepdim=True)
+
+        # similarity between every pixel feature and every text embedding
+        logits_per_image = self.logit_scale * image_features @ text_features.t()
+
+        # restore the spatial layout: (N, num_labels, H, W)
+        out = logits_per_image.reshape((imshape[0], imshape[2], imshape[3], -1)).transpose((0, 3, 1, 2))
+
+ out = self.output_conv(out)
+
+ return out
diff --git a/modules/image/semantic_segmentation/lseg/models/vit.py b/modules/image/semantic_segmentation/lseg/models/vit.py
new file mode 100644
index 00000000..75c5d019
--- /dev/null
+++ b/modules/image/semantic_segmentation/lseg/models/vit.py
@@ -0,0 +1,228 @@
+import math
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddleclas.ppcls.arch.backbone.model_zoo.vision_transformer import VisionTransformer
+
+
+class Slice(nn.Layer):
+
+ def __init__(self, start_index=1):
+ super(Slice, self).__init__()
+ self.start_index = start_index
+
+ def forward(self, x):
+ return x[:, self.start_index:]
+
+
+class AddReadout(nn.Layer):
+
+ def __init__(self, start_index=1):
+ super(AddReadout, self).__init__()
+ self.start_index = start_index
+
+ def forward(self, x):
+ if self.start_index == 2:
+ readout = (x[:, 0] + x[:, 1]) / 2
+ else:
+ readout = x[:, 0]
+ return x[:, self.start_index:] + readout.unsqueeze(1)
+
+
+class Transpose(nn.Layer):
+
+ def __init__(self, dim0, dim1):
+ super(Transpose, self).__init__()
+ self.dim0 = dim0
+ self.dim1 = dim1
+
+ def forward(self, x):
+        perms = list(range(x.dim()))
+        perms[self.dim0], perms[self.dim1] = perms[self.dim1], perms[self.dim0]
+        x = x.transpose(perms)
+ return x
+
+
+class Unflatten(nn.Layer):
+
+ def __init__(self, start_axis, shape):
+ super(Unflatten, self).__init__()
+ self.start_axis = start_axis
+ self.shape = shape
+
+ def forward(self, x):
+        return paddle.reshape(x, x.shape[:self.start_axis] + list(self.shape))
+
+
+class ProjectReadout(nn.Layer):
+
+ def __init__(self, in_features, start_index=1):
+ super(ProjectReadout, self).__init__()
+ self.start_index = start_index
+
+ self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU())
+
+ def forward(self, x):
+ readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index:])
+ features = paddle.concat((x[:, self.start_index:], readout), -1)
+
+ return self.project(features)
+
+
+class ViT(VisionTransformer):
+
+ def __init__(self,
+ img_size=384,
+ patch_size=16,
+ in_chans=3,
+ class_num=1000,
+ embed_dim=1024,
+ depth=24,
+ num_heads=16,
+ mlp_ratio=4,
+ qkv_bias=True,
+ qk_scale=None,
+ drop_rate=0,
+ attn_drop_rate=0,
+ drop_path_rate=0,
+ norm_layer='nn.LayerNorm',
+ epsilon=1e-6,
+ **kwargs):
+ super().__init__(img_size, patch_size, in_chans, class_num, embed_dim, depth, num_heads, mlp_ratio, qkv_bias,
+ qk_scale, drop_rate, attn_drop_rate, drop_path_rate, norm_layer, epsilon, **kwargs)
+ self.patch_size = patch_size
+ self.start_index = 1
+ features = [256, 512, 1024, 1024]
+ readout_oper = [ProjectReadout(embed_dim, self.start_index) for out_feat in features]
+ self.act_postprocess1 = nn.Sequential(
+ readout_oper[0],
+ Transpose(1, 2),
+ Unflatten(2, [img_size // 16, img_size // 16]),
+ nn.Conv2D(
+ in_channels=embed_dim,
+ out_channels=features[0],
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ ),
+ nn.Conv2DTranspose(
+ in_channels=features[0],
+ out_channels=features[0],
+ kernel_size=4,
+ stride=4,
+ padding=0,
+ dilation=1,
+ groups=1,
+ ),
+ )
+
+ self.act_postprocess2 = nn.Sequential(
+ readout_oper[1],
+ Transpose(1, 2),
+ Unflatten(2, [img_size // 16, img_size // 16]),
+ nn.Conv2D(
+ in_channels=embed_dim,
+ out_channels=features[1],
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ ),
+ nn.Conv2DTranspose(
+ in_channels=features[1],
+ out_channels=features[1],
+ kernel_size=2,
+ stride=2,
+ padding=0,
+ dilation=1,
+ groups=1,
+ ),
+ )
+
+ self.act_postprocess3 = nn.Sequential(
+ readout_oper[2],
+ Transpose(1, 2),
+ Unflatten(2, [img_size // 16, img_size // 16]),
+ nn.Conv2D(
+ in_channels=embed_dim,
+ out_channels=features[2],
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ ),
+ )
+
+ self.act_postprocess4 = nn.Sequential(
+ readout_oper[3],
+ Transpose(1, 2),
+ Unflatten(2, [img_size // 16, img_size // 16]),
+ nn.Conv2D(
+ in_channels=embed_dim,
+ out_channels=features[3],
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ ),
+ nn.Conv2D(
+ in_channels=features[3],
+ out_channels=features[3],
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ ),
+ )
+
+ self.norm = nn.Identity()
+ self.head = nn.Identity()
+
+ def _resize_pos_embed(self, posemb, gs_h, gs_w):
+ posemb_tok, posemb_grid = (
+ posemb[:, :self.start_index],
+ posemb[0, self.start_index:],
+ )
+
+ gs_old = int(math.sqrt(len(posemb_grid)))
+
+ posemb_grid = posemb_grid.reshape((1, gs_old, gs_old, -1)).transpose((0, 3, 1, 2))
+ posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear")
+ posemb_grid = posemb_grid.transpose((0, 2, 3, 1)).reshape((1, gs_h * gs_w, -1))
+
+ posemb = paddle.concat([posemb_tok, posemb_grid], axis=1)
+
+ return posemb
+
+ def forward(self, x):
+ b, c, h, w = x.shape
+
+ pos_embed = self._resize_pos_embed(self.pos_embed, h // self.patch_size, w // self.patch_size)
+ x = self.patch_embed.proj(x).flatten(2).transpose((0, 2, 1))
+
+ cls_tokens = self.cls_token.expand((b, -1, -1))
+ x = paddle.concat((cls_tokens, x), axis=1)
+
+ x = x + pos_embed
+ x = self.pos_drop(x)
+
+ outputs = []
+        # keep the activations of transformer blocks 6, 12, 18 and 24 as multi-scale features
+        for index, blk in enumerate(self.blocks):
+            x = blk(x)
+            if index in [5, 11, 17, 23]:
+                outputs.append(x)
+
+        # readout projection and transpose; the Unflatten stage (index 2) is replaced by the explicit reshape below
+        layer_1 = self.act_postprocess1[0:2](outputs[0])
+        layer_2 = self.act_postprocess2[0:2](outputs[1])
+        layer_3 = self.act_postprocess3[0:2](outputs[2])
+        layer_4 = self.act_postprocess4[0:2](outputs[3])
+
+        # reshape the token sequence back into a 2-D feature map: (N, embed_dim, H // patch_size, W // patch_size)
+        shape = (-1, 1024, h // self.patch_size, w // self.patch_size)
+        layer_1 = layer_1.reshape(shape)
+        layer_2 = layer_2.reshape(shape)
+        layer_3 = layer_3.reshape(shape)
+        layer_4 = layer_4.reshape(shape)
+
+        # per-scale convolutions / transposed convolutions that map to the decoder resolutions
+        layer_1 = self.act_postprocess1[3:](layer_1)
+        layer_2 = self.act_postprocess2[3:](layer_2)
+        layer_3 = self.act_postprocess3[3:](layer_3)
+        layer_4 = self.act_postprocess4[3:](layer_4)
+
+ return layer_1, layer_2, layer_3, layer_4
diff --git a/modules/image/semantic_segmentation/lseg/module.py b/modules/image/semantic_segmentation/lseg/module.py
new file mode 100644
index 00000000..55ba891e
--- /dev/null
+++ b/modules/image/semantic_segmentation/lseg/module.py
@@ -0,0 +1,194 @@
+import argparse
+import base64
+import os
+import time
+from typing import Dict
+from typing import List
+from typing import Union
+
+import cv2
+import numpy as np
+import paddle
+import paddle.vision.transforms as transforms
+from paddlenlp.transformers.clip.tokenizer import CLIPTokenizer
+
+import paddlehub as hub
+from . import models
+from paddlehub.module.module import moduleinfo
+from paddlehub.module.module import runnable
+from paddlehub.module.module import serving
+
+
+def cv2_to_base64(image):
+    # encode as PNG so the alpha channel of the BGRA class cutouts is preserved
+    data = cv2.imencode('.png', image)[1]
+    return base64.b64encode(data.tobytes()).decode('utf8')
+
+
+def base64_to_cv2(b64str):
+ data = base64.b64decode(b64str.encode('utf8'))
+ data = np.frombuffer(data, np.uint8)
+ data = cv2.imdecode(data, cv2.IMREAD_COLOR)
+ return data
+
+
+@moduleinfo(
+ name='lseg',
+ version='1.0.0',
+ type="CV/semantic_segmentation",
+ author="",
+ author_email="",
+ summary="Language-driven Semantic Segmentation.",
+)
+class LSeg(models.LSeg):
+
+ def __init__(self):
+ super(LSeg, self).__init__()
+ self.default_pretrained_model_path = os.path.join(self.directory, 'ckpts', 'LSeg.pdparams')
+ state_dict = paddle.load(self.default_pretrained_model_path)
+ self.set_state_dict(state_dict)
+ self.eval()
+ self.transforms = transforms.Compose([
+ transforms.ToTensor(),
+ transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
+ ])
+ self.tokenizer = CLIPTokenizer.from_pretrained('openai/clip-vit-base-patch32')
+
+        # auxiliary modules that detect the label language and translate non-English labels
+        # into English for the CLIP text encoder
+        self.language_recognition = hub.Module(name='baidu_language_recognition')
+        self.translate = hub.Module(name='baidu_translate')
+
+    @staticmethod
+    def get_colormap(n):
+        """Build a pseudo-color lookup table (256 x 1 x 3, uint8) for cv2.applyColorMap.
+
+        The first n entries follow the PASCAL VOC bit-interleaving scheme, so every
+        class index is mapped to a distinct color.
+        """
+        assert n <= 256, "the number of classes should be no more than 256."
+
+        palette = [0] * (256 * 3)
+
+        for j in range(0, n):
+            lab = j
+            palette[j * 3 + 0] = 0
+            palette[j * 3 + 1] = 0
+            palette[j * 3 + 2] = 0
+            i = 0
+            while lab > 0:
+                palette[j * 3 + 0] |= (((lab >> 0) & 1) << (7 - i))
+                palette[j * 3 + 1] |= (((lab >> 1) & 1) << (7 - i))
+                palette[j * 3 + 2] |= (((lab >> 2) & 1) << (7 - i))
+                i = i + 1
+                lab >>= 3
+
+        return np.asarray(palette, dtype=np.uint8).reshape(256, 1, 3)
+
+ def segment(self,
+ image: Union[str, np.ndarray],
+ labels: Union[str, List[str]],
+ visualization: bool = False,
+ output_dir: str = 'lseg_output') -> Dict[str, Union[np.ndarray, Dict[str, np.ndarray]]]:
+        if isinstance(image, str):
+            image = cv2.imread(image)
+        elif not isinstance(image, np.ndarray):
+            raise Exception("image should be a str / np.ndarray")
+
+ if isinstance(labels, str):
+ labels = [labels, 'other']
+ print('"other" category label is automatically added because the length of labels is equal to 1')
+ print('new labels: ', labels)
+ elif isinstance(labels, list):
+ if len(labels) == 1:
+ labels.append('other')
+ print('"other" category label is automatically added because the length of labels is equal to 1')
+ print('new labels: ', labels)
+ elif len(labels) == 0:
+ raise Exception("labels should not be empty.")
+ else:
+ raise Exception("labels should be a str or list.")
+
+        class_num = len(labels)
+
+        # drop duplicate labels while preserving their original order
+        labels_ = list(set(labels))
+        labels_.sort(key=labels.index)
+        labels = labels_
+
+        # translate non-English labels to English before feeding them to the CLIP text encoder
+        input_labels = []
+        for label in labels:
+            from_lang = self.language_recognition.recognize(query=label)
+            if from_lang != 'en':
+                label = self.translate.translate(query=label, from_lang=from_lang, to_lang='en')
+            input_labels.append(label)
+
+        # map each translated label back to the label the user supplied
+        labels_dict = {k: v for k, v in zip(input_labels, labels)}
+
+        # labels that collide after translation are merged into a single class
+        input_labels_ = list(set(input_labels))
+        input_labels_.sort(key=input_labels.index)
+        input_labels = input_labels_
+
+        labels = []
+        for input_label in input_labels:
+            labels.append(labels_dict[input_label])
+
+ if len(labels) < class_num:
+ print('remove the same labels...')
+ print('new labels: ', labels)
+
+        # crop height and width down to multiples of 32 so every feature-map stage keeps an integer size
+        h, w = image.shape[:2]
+        image = image[:-(h % 32) if h % 32 else None, :-(w % 32) if w % 32 else None]
+        images = self.transforms(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)).unsqueeze(0)
+        texts = self.tokenizer(input_labels, padding=True, return_tensors="pd")['input_ids']
+
+        with paddle.no_grad():
+            results = self.forward(images, texts)
+            # per-pixel argmax over the label logits gives the grayscale label map
+            results = paddle.argmax(results, 1).cast(paddle.uint8)
+            gray_seg = results.numpy()[0]
+
+        # pseudo-color map and blended overlay for visualization
+        colormap = self.get_colormap(len(labels))
+        color_seg = cv2.applyColorMap(gray_seg, colormap)
+        mix_seg = cv2.addWeighted(image, 0.5, color_seg, 0.5, 0.0)
+
+        # one BGRA cutout per label: the alpha channel is the binary mask of that label
+        classes_seg = {}
+        for i, label in enumerate(labels):
+            mask = ((gray_seg == i).astype('uint8') * 255)[..., None]
+            classes_seg[label] = np.concatenate([image, mask], 2)
+
+ if visualization:
+ save_dir = os.path.join(output_dir, str(int(time.time())))
+ if not os.path.isdir(save_dir):
+ os.makedirs(save_dir)
+ for label, dst in classes_seg.items():
+ cv2.imwrite(os.path.join(save_dir, '%s.png' % label), dst)
+
+ return {'gray': gray_seg, 'color': color_seg, 'mix': mix_seg, 'classes': classes_seg}
+
+ @runnable
+ def run_cmd(self, argvs):
+ """
+ Run as a command.
+ """
+ self.parser = argparse.ArgumentParser(description="Run the {} module.".format(self.name),
+ prog='hub run {}'.format(self.name),
+ usage='%(prog)s',
+ add_help=True)
+ self.parser.add_argument('--input_path', type=str, help="path to image.")
+ self.parser.add_argument('--labels', type=str, nargs='+', help="segmentation labels.")
+ self.parser.add_argument('--output_dir',
+ type=str,
+ default='lseg_output',
+ help="The directory to save output images.")
+ args = self.parser.parse_args(argvs)
+ self.segment(image=args.input_path, labels=args.labels, visualization=True, output_dir=args.output_dir)
+ return 'segmentation results are saved in %s' % args.output_dir
+
+ @serving
+ def serving_method(self, image, **kwargs):
+ """
+ Run as a service.
+ """
+ image = base64_to_cv2(image)
+ results = self.segment(image=image, **kwargs)
+
+ return {
+ 'gray': cv2_to_base64(results['gray']),
+ 'color': cv2_to_base64(results['color']),
+ 'mix': cv2_to_base64(results['mix']),
+ 'classes': {k: cv2_to_base64(v)
+ for k, v in results['classes'].items()}
+ }
diff --git a/modules/image/semantic_segmentation/lseg/requirements.txt b/modules/image/semantic_segmentation/lseg/requirements.txt
new file mode 100644
index 00000000..1bd663bd
--- /dev/null
+++ b/modules/image/semantic_segmentation/lseg/requirements.txt
@@ -0,0 +1,4 @@
+paddleclas>=2.4.0
+paddlenlp>=2.4.0
+ftfy
+regex
diff --git a/modules/image/semantic_segmentation/lseg/test.py b/modules/image/semantic_segmentation/lseg/test.py
new file mode 100644
index 00000000..d6860e60
--- /dev/null
+++ b/modules/image/semantic_segmentation/lseg/test.py
@@ -0,0 +1,67 @@
+import os
+import shutil
+import unittest
+
+import cv2
+import numpy as np
+import requests
+
+import paddlehub as hub
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+class TestHubModule(unittest.TestCase):
+
+ @classmethod
+ def setUpClass(cls) -> None:
+ img_url = 'https://unsplash.com/photos/mJaD10XeD7w/download?ixid=MnwxMjA3fDB8MXxzZWFyY2h8M3x8Y2F0fGVufDB8fHx8MTY2MzczNDc3Mw&force=true&w=640'
+ if not os.path.exists('tests'):
+ os.makedirs('tests')
+ response = requests.get(img_url)
+ assert response.status_code == 200, 'Network Error.'
+ with open('tests/test.jpg', 'wb') as f:
+ f.write(response.content)
+ cls.module = hub.Module(name="lseg")
+
+ @classmethod
+ def tearDownClass(cls) -> None:
+ shutil.rmtree('tests')
+ shutil.rmtree('lseg_output')
+
+ def test_segment1(self):
+ results = self.module.segment(image='tests/test.jpg', labels=['other', 'cat'], visualization=False)
+
+ self.assertIsInstance(results['mix'], np.ndarray)
+ self.assertIsInstance(results['color'], np.ndarray)
+ self.assertIsInstance(results['gray'], np.ndarray)
+ self.assertIsInstance(results['classes']['other'], np.ndarray)
+ self.assertIsInstance(results['classes']['cat'], np.ndarray)
+
+ def test_segment2(self):
+ results = self.module.segment(image=cv2.imread('tests/test.jpg'), labels=['other', 'cat'], visualization=True)
+
+ self.assertIsInstance(results['mix'], np.ndarray)
+ self.assertIsInstance(results['color'], np.ndarray)
+ self.assertIsInstance(results['gray'], np.ndarray)
+ self.assertIsInstance(results['classes']['other'], np.ndarray)
+ self.assertIsInstance(results['classes']['cat'], np.ndarray)
+
+ def test_segment3(self):
+ results = self.module.segment(image=cv2.imread('tests/test.jpg'), labels=['其他', '猫'], visualization=False)
+
+ self.assertIsInstance(results['mix'], np.ndarray)
+ self.assertIsInstance(results['color'], np.ndarray)
+ self.assertIsInstance(results['gray'], np.ndarray)
+ self.assertIsInstance(results['classes']['其他'], np.ndarray)
+ self.assertIsInstance(results['classes']['猫'], np.ndarray)
+
+ def test_segment4(self):
+ self.assertRaises(Exception, self.module.segment, image=['tests/test.jpg'], labels=['other', 'cat'])
+
+ def test_segment5(self):
+ self.assertRaises(AttributeError, self.module.segment, image='no.jpg', labels=['other', 'cat'])
+
+
+if __name__ == "__main__":
+ unittest.main()
--
GitLab