未验证 提交 3d6a027c 编写于 作者: S shangliang Xu 提交者: GitHub

[dev] add ms_deformable_attn cuda op (#7521)

上级 e1a8f660
...@@ -84,7 +84,7 @@ class DETR(BaseArch): ...@@ -84,7 +84,7 @@ class DETR(BaseArch):
preds, self.inputs['im_shape'], self.inputs['scale_factor']) preds, self.inputs['im_shape'], self.inputs['scale_factor'])
return bbox, bbox_num return bbox, bbox_num
def get_loss(self, ): def get_loss(self):
losses = self._forward() losses = self._forward()
losses.update({ losses.update({
'loss': 'loss':
......
...@@ -492,19 +492,21 @@ class DETRBBoxPostProcess(object): ...@@ -492,19 +492,21 @@ class DETRBBoxPostProcess(object):
if scores.shape[1] > self.num_top_queries: if scores.shape[1] > self.num_top_queries:
scores, index = paddle.topk( scores, index = paddle.topk(
scores, self.num_top_queries, axis=-1) scores, self.num_top_queries, axis=-1)
labels = paddle.stack( batch_ind = paddle.arange(
[paddle.gather(l, i) for l, i in zip(labels, index)]) end=scores.shape[0]).unsqueeze(-1).tile(
bbox_pred = paddle.stack( [1, self.num_top_queries])
[paddle.gather(b, i) for b, i in zip(bbox_pred, index)]) index = paddle.stack([batch_ind, index], axis=-1)
labels = paddle.gather_nd(labels, index)
bbox_pred = paddle.gather_nd(bbox_pred, index)
else: else:
scores, index = paddle.topk( scores, index = paddle.topk(
scores.reshape([logits.shape[0], -1]), scores.flatten(1), self.num_top_queries, axis=-1)
self.num_top_queries, labels = index % self.num_classes
axis=-1) index = index // self.num_classes
labels = index % logits.shape[2] batch_ind = paddle.arange(end=scores.shape[0]).unsqueeze(-1).tile(
index = index // logits.shape[2] [1, self.num_top_queries])
bbox_pred = paddle.stack( index = paddle.stack([batch_ind, index], axis=-1)
[paddle.gather(b, i) for b, i in zip(bbox_pred, index)]) bbox_pred = paddle.gather_nd(bbox_pred, index)
bbox_pred = paddle.concat( bbox_pred = paddle.concat(
[ [
......
...@@ -28,7 +28,7 @@ from paddle import ParamAttr ...@@ -28,7 +28,7 @@ from paddle import ParamAttr
from ppdet.core.workspace import register from ppdet.core.workspace import register
from ..layers import MultiHeadAttention from ..layers import MultiHeadAttention
from .position_encoding import PositionEmbedding from .position_encoding import PositionEmbedding
from .utils import _get_clones, deformable_attention_core_func from .utils import _get_clones, get_valid_ratio
from ..initializer import linear_init_, constant_, xavier_uniform_, normal_ from ..initializer import linear_init_, constant_, xavier_uniform_, normal_
__all__ = ['DeformableTransformer'] __all__ = ['DeformableTransformer']
...@@ -63,6 +63,13 @@ class MSDeformableAttention(nn.Layer): ...@@ -63,6 +63,13 @@ class MSDeformableAttention(nn.Layer):
self.attention_weights = nn.Linear(embed_dim, self.total_points) self.attention_weights = nn.Linear(embed_dim, self.total_points)
self.value_proj = nn.Linear(embed_dim, embed_dim) self.value_proj = nn.Linear(embed_dim, embed_dim)
self.output_proj = nn.Linear(embed_dim, embed_dim) self.output_proj = nn.Linear(embed_dim, embed_dim)
try:
# use cuda op
from deformable_detr_ops import ms_deformable_attn
except:
# use paddle func
from .utils import deformable_attention_core_func as ms_deformable_attn
self.ms_deformable_attn_core = ms_deformable_attn
self._reset_parameters() self._reset_parameters()
...@@ -95,6 +102,7 @@ class MSDeformableAttention(nn.Layer): ...@@ -95,6 +102,7 @@ class MSDeformableAttention(nn.Layer):
reference_points, reference_points,
value, value,
value_spatial_shapes, value_spatial_shapes,
value_level_start_index,
value_mask=None): value_mask=None):
""" """
Args: Args:
...@@ -103,6 +111,7 @@ class MSDeformableAttention(nn.Layer): ...@@ -103,6 +111,7 @@ class MSDeformableAttention(nn.Layer):
bottom-right (1, 1), including padding area bottom-right (1, 1), including padding area
value (Tensor): [bs, value_length, C] value (Tensor): [bs, value_length, C]
value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]
value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements
Returns: Returns:
...@@ -131,8 +140,9 @@ class MSDeformableAttention(nn.Layer): ...@@ -131,8 +140,9 @@ class MSDeformableAttention(nn.Layer):
bs, Len_q, 1, self.num_levels, 1, 2 bs, Len_q, 1, self.num_levels, 1, 2
]) + sampling_offsets / offset_normalizer ]) + sampling_offsets / offset_normalizer
output = deformable_attention_core_func( output = self.ms_deformable_attn_core(
value, value_spatial_shapes, sampling_locations, attention_weights) value, value_spatial_shapes, value_level_start_index,
sampling_locations, attention_weights)
output = self.output_proj(output) output = self.output_proj(output)
return output return output
...@@ -185,12 +195,13 @@ class DeformableTransformerEncoderLayer(nn.Layer): ...@@ -185,12 +195,13 @@ class DeformableTransformerEncoderLayer(nn.Layer):
src, src,
reference_points, reference_points,
spatial_shapes, spatial_shapes,
level_start_index,
src_mask=None, src_mask=None,
pos_embed=None): pos_embed=None):
# self attention # self attention
src2 = self.self_attn( src2 = self.self_attn(
self.with_pos_embed(src, pos_embed), reference_points, src, self.with_pos_embed(src, pos_embed), reference_points, src,
spatial_shapes, src_mask) spatial_shapes, level_start_index, src_mask)
src = src + self.dropout1(src2) src = src + self.dropout1(src2)
src = self.norm1(src) src = self.norm1(src)
# ffn # ffn
...@@ -206,13 +217,12 @@ class DeformableTransformerEncoder(nn.Layer): ...@@ -206,13 +217,12 @@ class DeformableTransformerEncoder(nn.Layer):
self.num_layers = num_layers self.num_layers = num_layers
@staticmethod @staticmethod
def get_reference_points(spatial_shapes, valid_ratios): def get_reference_points(spatial_shapes, valid_ratios, offset=0.5):
valid_ratios = valid_ratios.unsqueeze(1) valid_ratios = valid_ratios.unsqueeze(1)
reference_points = [] reference_points = []
for i, (H, W) in enumerate(spatial_shapes.tolist()): for i, (H, W) in enumerate(spatial_shapes):
ref_y, ref_x = paddle.meshgrid( ref_y, ref_x = paddle.meshgrid(
paddle.linspace(0.5, H - 0.5, H), paddle.arange(end=H) + offset, paddle.arange(end=W) + offset)
paddle.linspace(0.5, W - 0.5, W))
ref_y = ref_y.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 1] * ref_y = ref_y.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 1] *
H) H)
ref_x = ref_x.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 0] * ref_x = ref_x.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 0] *
...@@ -225,6 +235,7 @@ class DeformableTransformerEncoder(nn.Layer): ...@@ -225,6 +235,7 @@ class DeformableTransformerEncoder(nn.Layer):
def forward(self, def forward(self,
src, src,
spatial_shapes, spatial_shapes,
level_start_index,
src_mask=None, src_mask=None,
pos_embed=None, pos_embed=None,
valid_ratios=None): valid_ratios=None):
...@@ -235,8 +246,8 @@ class DeformableTransformerEncoder(nn.Layer): ...@@ -235,8 +246,8 @@ class DeformableTransformerEncoder(nn.Layer):
reference_points = self.get_reference_points(spatial_shapes, reference_points = self.get_reference_points(spatial_shapes,
valid_ratios) valid_ratios)
for layer in self.layers: for layer in self.layers:
output = layer(output, reference_points, spatial_shapes, src_mask, output = layer(output, reference_points, spatial_shapes,
pos_embed) level_start_index, src_mask, pos_embed)
return output return output
...@@ -296,6 +307,7 @@ class DeformableTransformerDecoderLayer(nn.Layer): ...@@ -296,6 +307,7 @@ class DeformableTransformerDecoderLayer(nn.Layer):
reference_points, reference_points,
memory, memory,
memory_spatial_shapes, memory_spatial_shapes,
memory_level_start_index,
memory_mask=None, memory_mask=None,
query_pos_embed=None): query_pos_embed=None):
# self attention # self attention
...@@ -307,7 +319,7 @@ class DeformableTransformerDecoderLayer(nn.Layer): ...@@ -307,7 +319,7 @@ class DeformableTransformerDecoderLayer(nn.Layer):
# cross attention # cross attention
tgt2 = self.cross_attn( tgt2 = self.cross_attn(
self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, self.with_pos_embed(tgt, query_pos_embed), reference_points, memory,
memory_spatial_shapes, memory_mask) memory_spatial_shapes, memory_level_start_index, memory_mask)
tgt = tgt + self.dropout2(tgt2) tgt = tgt + self.dropout2(tgt2)
tgt = self.norm2(tgt) tgt = self.norm2(tgt)
...@@ -329,13 +341,15 @@ class DeformableTransformerDecoder(nn.Layer): ...@@ -329,13 +341,15 @@ class DeformableTransformerDecoder(nn.Layer):
reference_points, reference_points,
memory, memory,
memory_spatial_shapes, memory_spatial_shapes,
memory_level_start_index,
memory_mask=None, memory_mask=None,
query_pos_embed=None): query_pos_embed=None):
output = tgt output = tgt
intermediate = [] intermediate = []
for lid, layer in enumerate(self.layers): for lid, layer in enumerate(self.layers):
output = layer(output, reference_points, memory, output = layer(output, reference_points, memory,
memory_spatial_shapes, memory_mask, query_pos_embed) memory_spatial_shapes, memory_level_start_index,
memory_mask, query_pos_embed)
if self.return_intermediate: if self.return_intermediate:
intermediate.append(output) intermediate.append(output)
...@@ -447,14 +461,7 @@ class DeformableTransformer(nn.Layer): ...@@ -447,14 +461,7 @@ class DeformableTransformer(nn.Layer):
def from_config(cls, cfg, input_shape): def from_config(cls, cfg, input_shape):
return {'backbone_num_channels': [i.channels for i in input_shape], } return {'backbone_num_channels': [i.channels for i in input_shape], }
def _get_valid_ratio(self, mask): def forward(self, src_feats, src_mask=None, *args, **kwargs):
_, H, W = mask.shape
valid_ratio_h = paddle.sum(mask[:, :, 0], 1) / H
valid_ratio_w = paddle.sum(mask[:, 0, :], 1) / W
valid_ratio = paddle.stack([valid_ratio_w, valid_ratio_h], -1)
return valid_ratio
def forward(self, src_feats, src_mask=None):
srcs = [] srcs = []
for i in range(len(src_feats)): for i in range(len(src_feats)):
srcs.append(self.input_proj[i](src_feats[i])) srcs.append(self.input_proj[i](src_feats[i]))
...@@ -471,33 +478,38 @@ class DeformableTransformer(nn.Layer): ...@@ -471,33 +478,38 @@ class DeformableTransformer(nn.Layer):
spatial_shapes = [] spatial_shapes = []
valid_ratios = [] valid_ratios = []
for level, src in enumerate(srcs): for level, src in enumerate(srcs):
bs, c, h, w = src.shape bs, _, h, w = paddle.shape(src)
spatial_shapes.append([h, w]) spatial_shapes.append(paddle.concat([h, w]))
src = src.flatten(2).transpose([0, 2, 1]) src = src.flatten(2).transpose([0, 2, 1])
src_flatten.append(src) src_flatten.append(src)
if src_mask is not None: if src_mask is not None:
mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0] mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]
else: else:
mask = paddle.ones([bs, h, w]) mask = paddle.ones([bs, h, w])
valid_ratios.append(self._get_valid_ratio(mask)) valid_ratios.append(get_valid_ratio(mask))
pos_embed = self.position_embedding(mask).flatten(2).transpose( pos_embed = self.position_embedding(mask).flatten(1, 2)
[0, 2, 1]) lvl_pos_embed = pos_embed + self.level_embed.weight[level]
lvl_pos_embed = pos_embed + self.level_embed.weight[level].reshape(
[1, 1, -1])
lvl_pos_embed_flatten.append(lvl_pos_embed) lvl_pos_embed_flatten.append(lvl_pos_embed)
mask = mask.flatten(1) mask = mask.flatten(1)
mask_flatten.append(mask) mask_flatten.append(mask)
src_flatten = paddle.concat(src_flatten, 1) src_flatten = paddle.concat(src_flatten, 1)
mask_flatten = paddle.concat(mask_flatten, 1) mask_flatten = None if src_mask is None else paddle.concat(mask_flatten,
1)
lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)
# [l, 2] # [l, 2]
spatial_shapes = paddle.to_tensor(spatial_shapes, dtype='int64') spatial_shapes = paddle.to_tensor(
paddle.stack(spatial_shapes).astype('int64'))
# [l], 每一个level的起始index
level_start_index = paddle.concat([
paddle.zeros(
[1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]
])
# [b, l, 2] # [b, l, 2]
valid_ratios = paddle.stack(valid_ratios, 1) valid_ratios = paddle.stack(valid_ratios, 1)
# encoder # encoder
memory = self.encoder(src_flatten, spatial_shapes, mask_flatten, memory = self.encoder(src_flatten, spatial_shapes, level_start_index,
lvl_pos_embed_flatten, valid_ratios) mask_flatten, lvl_pos_embed_flatten, valid_ratios)
# prepare input for decoder # prepare input for decoder
bs, _, c = memory.shape bs, _, c = memory.shape
...@@ -509,6 +521,6 @@ class DeformableTransformer(nn.Layer): ...@@ -509,6 +521,6 @@ class DeformableTransformer(nn.Layer):
# decoder # decoder
hs = self.decoder(tgt, reference_points_input, memory, spatial_shapes, hs = self.decoder(tgt, reference_points_input, memory, spatial_shapes,
mask_flatten, query_embed) level_start_index, mask_flatten, query_embed)
return (hs, memory, reference_points) return (hs, memory, reference_points)
...@@ -295,7 +295,7 @@ class DETRTransformer(nn.Layer): ...@@ -295,7 +295,7 @@ class DETRTransformer(nn.Layer):
def _convert_attention_mask(self, mask): def _convert_attention_mask(self, mask):
return (mask - 1.0) * 1e9 return (mask - 1.0) * 1e9
def forward(self, src, src_mask=None): def forward(self, src, src_mask=None, *args, **kwargs):
r""" r"""
Applies a Transformer model on the inputs. Applies a Transformer model on the inputs.
...@@ -325,8 +325,7 @@ class DETRTransformer(nn.Layer): ...@@ -325,8 +325,7 @@ class DETRTransformer(nn.Layer):
src_mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0] src_mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]
else: else:
src_mask = paddle.ones([bs, h, w]) src_mask = paddle.ones([bs, h, w])
pos_embed = self.position_embedding(src_mask).flatten(2).transpose( pos_embed = self.position_embedding(src_mask).flatten(1, 2)
[0, 2, 1])
if self.training: if self.training:
src_mask = self._convert_attention_mask(src_mask) src_mask = self._convert_attention_mask(src_mask)
......
# Multi-scale deformable attention自定义OP编译
该自定义OP是参考[自定义外部算子](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html)
## 1. 环境依赖
- Paddle >= 2.3.2
- gcc 8.2
## 2. 安装
请在当前路径下进行编译安装
```
cd PaddleDetection/ppdet/modeling/transformers/ext_op/
python setup_ms_deformable_attn_op.py install
```
编译完成后即可使用,以下为`ms_deformable_attn`的使用示例
```
# 引入自定义op
from deformable_detr_ops import ms_deformable_attn
# 构造fake input tensor
bs, n_heads, c = 2, 8, 8
query_length, n_levels, n_points = 2, 2, 2
spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
level_start_index = paddle.concat((paddle.to_tensor(
[0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
value_length = sum([(H * W).item() for H, W in spatial_shapes])
def get_test_tensors(channels):
value = paddle.rand(
[bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
sampling_locations = paddle.rand(
[bs, query_length, n_heads, n_levels, n_points, 2],
dtype=paddle.float32)
attention_weights = paddle.rand(
[bs, query_length, n_heads, n_levels, n_points],
dtype=paddle.float32) + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
-2, keepdim=True)
return [value, sampling_locations, attention_weights]
value, sampling_locations, attention_weights = get_test_tensors(c)
output = ms_deformable_attn(value,
spatial_shapes,
level_start_index,
sampling_locations,
attention_weights)
```
## 3. 单元测试
可以通过执行单元测试来确认自定义算子功能的正确性,执行单元测试的示例如下所示:
```
python test_ms_deformable_attn_op.py
```
运行成功后,打印如下:
```
*True check_forward_equal_with_paddle_float: max_abs_err 6.98e-10 max_rel_err 2.03e-07
*tensor1 True check_gradient_numerical(D=30)
*tensor2 True check_gradient_numerical(D=30)
*tensor3 True check_gradient_numerical(D=30)
*tensor1 True check_gradient_numerical(D=32)
*tensor2 True check_gradient_numerical(D=32)
*tensor3 True check_gradient_numerical(D=32)
*tensor1 True check_gradient_numerical(D=64)
*tensor2 True check_gradient_numerical(D=64)
*tensor3 True check_gradient_numerical(D=64)
*tensor1 True check_gradient_numerical(D=71)
*tensor2 True check_gradient_numerical(D=71)
*tensor3 True check_gradient_numerical(D=71)
*tensor1 True check_gradient_numerical(D=128)
*tensor2 True check_gradient_numerical(D=128)
*tensor3 True check_gradient_numerical(D=128)
*tensor1 True check_gradient_numerical(D=1024)
*tensor2 True check_gradient_numerical(D=1024)
*tensor3 True check_gradient_numerical(D=1024)
*tensor1 True check_gradient_numerical(D=1025)
*tensor2 True check_gradient_numerical(D=1025)
*tensor3 True check_gradient_numerical(D=1025)
*tensor1 True check_gradient_numerical(D=2048)
*tensor2 True check_gradient_numerical(D=2048)
*tensor3 True check_gradient_numerical(D=2048)
*tensor1 True check_gradient_numerical(D=3096)
*tensor2 True check_gradient_numerical(D=3096)
*tensor3 True check_gradient_numerical(D=3096)
```
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/extension.h"
#include <vector>
// declare GPU implementation
std::vector<paddle::Tensor>
MSDeformableAttnCUDAForward(const paddle::Tensor &value,
const paddle::Tensor &value_spatial_shapes,
const paddle::Tensor &value_level_start_index,
const paddle::Tensor &sampling_locations,
const paddle::Tensor &attention_weights);
std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(
const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
const paddle::Tensor &value_level_start_index,
const paddle::Tensor &sampling_locations,
const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out);
//// CPU not implemented
std::vector<std::vector<int64_t>>
MSDeformableAttnInferShape(std::vector<int64_t> value_shape,
std::vector<int64_t> value_spatial_shapes_shape,
std::vector<int64_t> value_level_start_index_shape,
std::vector<int64_t> sampling_locations_shape,
std::vector<int64_t> attention_weights_shape) {
return {{value_shape[0], sampling_locations_shape[1],
value_shape[2] * value_shape[3]}};
}
std::vector<paddle::DataType>
MSDeformableAttnInferDtype(paddle::DataType value_dtype,
paddle::DataType value_spatial_shapes_dtype,
paddle::DataType value_level_start_index_dtype,
paddle::DataType sampling_locations_dtype,
paddle::DataType attention_weights_dtype) {
return {value_dtype};
}
PD_BUILD_OP(ms_deformable_attn)
.Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
"AttentionWeights"})
.Outputs({"Out"})
.SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward))
.SetInferShapeFn(PD_INFER_SHAPE(MSDeformableAttnInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(MSDeformableAttnInferDtype));
PD_BUILD_GRAD_OP(ms_deformable_attn)
.Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
"AttentionWeights", paddle::Grad("Out")})
.Outputs({paddle::Grad("Value"), paddle::Grad("SpatialShapes"),
paddle::Grad("LevelIndex"), paddle::Grad("SamplingLocations"),
paddle::Grad("AttentionWeights")})
.SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward));
from paddle.utils.cpp_extension import CUDAExtension, setup
if __name__ == "__main__":
setup(
name='deformable_detr_ops',
ext_modules=CUDAExtension(
sources=['ms_deformable_attn_op.cc', 'ms_deformable_attn_op.cu']))
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import os
import sys
import random
import numpy as np
import paddle
# add python path of PadleDetection to sys.path
parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 5)))
if parent_path not in sys.path:
sys.path.append(parent_path)
from ppdet.modeling.transformers.utils import deformable_attention_core_func
ms_deform_attn_core_paddle = deformable_attention_core_func
try:
gpu_index = int(sys.argv[1])
except:
gpu_index = 0
print(f'Use gpu {gpu_index} to test...')
paddle.set_device(f'gpu:{gpu_index}')
try:
from deformable_detr_ops import ms_deformable_attn
except Exception as e:
print('import deformable_detr_ops error', e)
sys.exit(-1)
paddle.seed(1)
random.seed(1)
np.random.seed(1)
bs, n_heads, c = 2, 8, 8
query_length, n_levels, n_points = 2, 2, 2
spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
level_start_index = paddle.concat((paddle.to_tensor(
[0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
value_length = sum([(H * W).item() for H, W in spatial_shapes])
def get_test_tensors(channels):
value = paddle.rand(
[bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
sampling_locations = paddle.rand(
[bs, query_length, n_heads, n_levels, n_points, 2],
dtype=paddle.float32)
attention_weights = paddle.rand(
[bs, query_length, n_heads, n_levels, n_points],
dtype=paddle.float32) + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
-2, keepdim=True)
return [value, sampling_locations, attention_weights]
@paddle.no_grad()
def check_forward_equal_with_paddle_float():
value, sampling_locations, attention_weights = get_test_tensors(c)
output_paddle = ms_deform_attn_core_paddle(
value, spatial_shapes, level_start_index, sampling_locations,
attention_weights).detach().cpu()
output_cuda = ms_deformable_attn(value, spatial_shapes, level_start_index,
sampling_locations,
attention_weights).detach().cpu()
fwdok = paddle.allclose(
output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item()
max_abs_err = (output_cuda - output_paddle).abs().max().item()
max_rel_err = (
(output_cuda - output_paddle).abs() / output_paddle.abs()).max().item()
print(
f'*{fwdok} check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}'
)
def check_gradient_numerical(channels=4):
value_paddle, sampling_locations_paddle, attention_weights_paddle = get_test_tensors(
channels)
value_paddle.stop_gradient = False
sampling_locations_paddle.stop_gradient = False
attention_weights_paddle.stop_gradient = False
value_cuda = value_paddle.detach().clone()
sampling_locations_cuda = sampling_locations_paddle.detach().clone()
attention_weights_cuda = attention_weights_paddle.detach().clone()
value_cuda.stop_gradient = False
sampling_locations_cuda.stop_gradient = False
attention_weights_cuda.stop_gradient = False
output_paddle = ms_deform_attn_core_paddle(
value_paddle, spatial_shapes, level_start_index,
sampling_locations_paddle, attention_weights_paddle)
output_paddle.sum().backward()
output_cuda = ms_deformable_attn(value_cuda, spatial_shapes,
level_start_index, sampling_locations_cuda,
attention_weights_cuda)
output_cuda.sum().backward()
res = paddle.allclose(
value_paddle.grad, value_cuda.grad, rtol=1e-2, atol=1e-3).item()
print(f'*tensor1 {res} check_gradient_numerical(D={channels})')
res = paddle.allclose(
sampling_locations_paddle.grad,
sampling_locations_cuda.grad,
rtol=1e-2,
atol=1e-3).item()
print(f'*tensor2 {res} check_gradient_numerical(D={channels})')
res = paddle.allclose(
attention_weights_paddle.grad,
attention_weights_cuda.grad,
rtol=1e-2,
atol=1e-3).item()
print(f'*tensor3 {res} check_gradient_numerical(D={channels})')
if __name__ == '__main__':
check_forward_equal_with_paddle_float()
for channels in [30, 32, 64, 71, 128, 1024, 1025, 2048, 3096]:
check_gradient_numerical(channels)
...@@ -33,37 +33,34 @@ class PositionEmbedding(nn.Layer): ...@@ -33,37 +33,34 @@ class PositionEmbedding(nn.Layer):
num_pos_feats=128, num_pos_feats=128,
temperature=10000, temperature=10000,
normalize=True, normalize=True,
scale=None, scale=2 * math.pi,
embed_type='sine', embed_type='sine',
num_embeddings=50, num_embeddings=50,
offset=0.): offset=0.,
eps=1e-6):
super(PositionEmbedding, self).__init__() super(PositionEmbedding, self).__init__()
assert embed_type in ['sine', 'learned'] assert embed_type in ['sine', 'learned']
self.embed_type = embed_type self.embed_type = embed_type
self.offset = offset self.offset = offset
self.eps = 1e-6 self.eps = eps
if self.embed_type == 'sine': if self.embed_type == 'sine':
self.num_pos_feats = num_pos_feats self.num_pos_feats = num_pos_feats
self.temperature = temperature self.temperature = temperature
self.normalize = normalize self.normalize = normalize
if scale is not None and normalize is False:
raise ValueError("normalize should be True if scale is passed")
if scale is None:
scale = 2 * math.pi
self.scale = scale self.scale = scale
elif self.embed_type == 'learned': elif self.embed_type == 'learned':
self.row_embed = nn.Embedding(num_embeddings, num_pos_feats) self.row_embed = nn.Embedding(num_embeddings, num_pos_feats)
self.col_embed = nn.Embedding(num_embeddings, num_pos_feats) self.col_embed = nn.Embedding(num_embeddings, num_pos_feats)
else: else:
raise ValueError(f"not supported {self.embed_type}") raise ValueError(f"{self.embed_type} is not supported.")
def forward(self, mask): def forward(self, mask):
""" """
Args: Args:
mask (Tensor): [B, H, W] mask (Tensor): [B, H, W]
Returns: Returns:
pos (Tensor): [B, C, H, W] pos (Tensor): [B, H, W, C]
""" """
if self.embed_type == 'sine': if self.embed_type == 'sine':
y_embed = mask.cumsum(1) y_embed = mask.cumsum(1)
...@@ -86,20 +83,18 @@ class PositionEmbedding(nn.Layer): ...@@ -86,20 +83,18 @@ class PositionEmbedding(nn.Layer):
pos_y = paddle.stack( pos_y = paddle.stack(
(pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
axis=4).flatten(3) axis=4).flatten(3)
pos = paddle.concat((pos_y, pos_x), axis=3).transpose([0, 3, 1, 2]) return paddle.concat((pos_y, pos_x), axis=3)
return pos
elif self.embed_type == 'learned': elif self.embed_type == 'learned':
h, w = mask.shape[-2:] h, w = mask.shape[-2:]
i = paddle.arange(w) i = paddle.arange(w)
j = paddle.arange(h) j = paddle.arange(h)
x_emb = self.col_embed(i) x_emb = self.col_embed(i)
y_emb = self.row_embed(j) y_emb = self.row_embed(j)
pos = paddle.concat( return paddle.concat(
[ [
x_emb.unsqueeze(0).tile([h, 1, 1]), x_emb.unsqueeze(0).tile([h, 1, 1]),
y_emb.unsqueeze(1).tile([1, w, 1]), y_emb.unsqueeze(1).tile([1, w, 1]),
], ],
axis=-1).transpose([2, 0, 1]).unsqueeze(0) axis=-1).unsqueeze(0)
return pos
else: else:
raise ValueError(f"not supported {self.embed_type}") raise ValueError(f"not supported {self.embed_type}")
...@@ -38,15 +38,14 @@ def _get_clones(module, N): ...@@ -38,15 +38,14 @@ def _get_clones(module, N):
def bbox_cxcywh_to_xyxy(x): def bbox_cxcywh_to_xyxy(x):
x_c, y_c, w, h = x.split(4, axis=-1) cxcy, wh = paddle.split(x, 2, axis=-1)
b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] return paddle.concat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1)
return paddle.concat(b, axis=-1)
def bbox_xyxy_to_cxcywh(x): def bbox_xyxy_to_cxcywh(x):
x0, y0, x1, y1 = x.split(4, axis=-1) x1, y1, x2, y2 = x.split(4, axis=-1)
b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] return paddle.concat(
return paddle.concat(b, axis=-1) [(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1), (y2 - y1)], axis=-1)
def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0): def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0):
...@@ -67,24 +66,27 @@ def inverse_sigmoid(x, eps=1e-6): ...@@ -67,24 +66,27 @@ def inverse_sigmoid(x, eps=1e-6):
def deformable_attention_core_func(value, value_spatial_shapes, def deformable_attention_core_func(value, value_spatial_shapes,
sampling_locations, attention_weights): value_level_start_index, sampling_locations,
attention_weights):
""" """
Args: Args:
value (Tensor): [bs, value_length, n_head, c] value (Tensor): [bs, value_length, n_head, c]
value_spatial_shapes (Tensor): [n_levels, 2] value_spatial_shapes (Tensor): [n_levels, 2]
value_level_start_index (Tensor): [n_levels]
sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2] sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2]
attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points] attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points]
Returns: Returns:
output (Tensor): [bs, Length_{query}, C] output (Tensor): [bs, Length_{query}, C]
""" """
bs, Len_v, n_head, c = value.shape bs, _, n_head, c = value.shape
_, Len_q, n_head, n_levels, n_points, _ = sampling_locations.shape _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape
value_list = value.split(value_spatial_shapes.prod(1).tolist(), axis=1) value_list = value.split(
value_spatial_shapes.prod(1).split(n_levels), axis=1)
sampling_grids = 2 * sampling_locations - 1 sampling_grids = 2 * sampling_locations - 1
sampling_value_list = [] sampling_value_list = []
for level, (h, w) in enumerate(value_spatial_shapes.tolist()): for level, (h, w) in enumerate(value_spatial_shapes):
# N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
value_l_ = value_list[level].flatten(2).transpose( value_l_ = value_list[level].flatten(2).transpose(
[0, 2, 1]).reshape([bs * n_head, c, h, w]) [0, 2, 1]).reshape([bs * n_head, c, h, w])
...@@ -107,3 +109,11 @@ def deformable_attention_core_func(value, value_spatial_shapes, ...@@ -107,3 +109,11 @@ def deformable_attention_core_func(value, value_spatial_shapes,
attention_weights).sum(-1).reshape([bs, n_head * c, Len_q]) attention_weights).sum(-1).reshape([bs, n_head * c, Len_q])
return output.transpose([0, 2, 1]) return output.transpose([0, 2, 1])
def get_valid_ratio(mask):
_, H, W = paddle.shape(mask)
valid_ratio_h = paddle.sum(mask[:, :, 0], 1) / H
valid_ratio_w = paddle.sum(mask[:, 0, :], 1) / W
# [b, 2]
return paddle.stack([valid_ratio_w, valid_ratio_h], -1)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册