Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
OpenCV
opencv
提交
cd44aa0b
O
opencv
项目概览
OpenCV
/
opencv
上一次同步 大约 1 年
通知
1005
Star
71102
Fork
55580
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
cd44aa0b
编写于
1月 28, 2023
作者:
A
Alexander Alekhin
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #23162 from zihaomu:issue_23151
上级
d3ae175b
f45a1243
变更
2
隐藏空白更改
内联
并排
Showing
2 changed files
with
49 additions
and
39 deletions
+49
-39
modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
...dnn/src/layers/fast_convolution/depthwise_convolution.cpp
+44
-39
modules/dnn/test/test_onnx_importer.cpp
modules/dnn/test/test_onnx_importer.cpp
+5
-0
未找到文件。
modules/dnn/src/layers/fast_convolution/depthwise_convolution.cpp
浏览文件 @
cd44aa0b
...
...
@@ -29,7 +29,7 @@ static void depthWiseBlockConv2D(const float* wptr,
const
float
w00_
=
wptr
[
0
],
w01_
=
wptr
[
1
],
w02_
=
wptr
[
2
],
w10
=
wptr
[
3
],
w11
=
wptr
[
4
],
w12
=
wptr
[
5
],
w20_
=
wptr
[
6
],
w21_
=
wptr
[
7
],
w22_
=
wptr
[
8
];
int
outW1
=
min
(
outW
,
(
width
-
dilation_w
*
(
kernel_w
-
1
)
+
pad_l
)
/
stride_w
);
const
int
outW1
=
min
(
outW
,
(
width
-
dilation_w
*
(
kernel_w
-
1
)
+
pad_l
)
/
stride_w
);
float
relu_coeff
=
relu
?
relu
[
out_d
]
:
1.
f
,
bias
=
biasptr
[
out_d
];
for
(
int
out_i
=
0
;
out_i
<
outH
;
out_i
++
)
...
...
@@ -67,35 +67,37 @@ static void depthWiseBlockConv2D(const float* wptr,
#if CV_SIMD128
const
int
VEC_NLANES
=
4
;
if
(
fusedAdd
)
outW1
=
max
(
out_j
,
outW1
-
outW1
%
VEC_NLANES
);
v_float32x4
vw00
=
v_setall_f32
(
w00
);
v_float32x4
vw01
=
v_setall_f32
(
w01
);
v_float32x4
vw02
=
v_setall_f32
(
w02
);
v_float32x4
vw10
=
v_setall_f32
(
w10
);
v_float32x4
vw11
=
v_setall_f32
(
w11
);
v_float32x4
vw12
=
v_setall_f32
(
w12
);
v_float32x4
vw20
=
v_setall_f32
(
w20
);
v_float32x4
vw21
=
v_setall_f32
(
w21
);
v_float32x4
vw22
=
v_setall_f32
(
w22
);
v_float32x4
z
=
v_setzero_f32
();
v_float32x4
vbias
=
v_setall_f32
(
bias
);
v_float32x4
vrc
=
v_setall_f32
(
relu_coeff
);
if
(
stride_w
==
1
||
(
stride_w
==
2
&&
dilation_w
==
1
))
if
((
stride_w
==
1
||
(
stride_w
==
2
&&
dilation_w
==
1
))
&&
(
outW1
-
out_j
)
>=
VEC_NLANES
)
{
if
(
stride_w
==
1
)
v_float32x4
vw00
=
v_setall_f32
(
w00
);
v_float32x4
vw01
=
v_setall_f32
(
w01
);
v_float32x4
vw02
=
v_setall_f32
(
w02
);
v_float32x4
vw10
=
v_setall_f32
(
w10
);
v_float32x4
vw11
=
v_setall_f32
(
w11
);
v_float32x4
vw12
=
v_setall_f32
(
w12
);
v_float32x4
vw20
=
v_setall_f32
(
w20
);
v_float32x4
vw21
=
v_setall_f32
(
w21
);
v_float32x4
vw22
=
v_setall_f32
(
w22
);
v_float32x4
z
=
v_setzero_f32
();
v_float32x4
vbias
=
v_setall_f32
(
bias
);
v_float32x4
vrc
=
v_setall_f32
(
relu_coeff
);
if
(
stride_w
==
1
)
{
for
(
;
out_j
<
outW1
;
out_j
+=
VEC_NLANES
)
for
(;
out_j
<
outW1
;
out_j
+=
VEC_NLANES
)
{
if
(
out_j
+
VEC_NLANES
>
outW1
)
// Tail processing.
if
(
out_j
>
outW1
-
VEC_NLANES
)
{
if
(
out_j
<=
pad_l
||
outW1
-
VEC_NLANES
<
0
)
// If fusedAdd is true, outptr does not hold a meaningless value: it holds
// the number being added. Tail processing must therefore be avoided in this case,
// because tail processing makes some elements be computed twice,
// which would lead to incorrect results.
if
(
fusedAdd
)
break
;
out_j
=
outW1
-
VEC_NLANES
;
}
int
in_j
=
out_j
*
stride_w
-
pad_l
;
v_float32x4
v00
=
v_load
(
imgptr0
+
in_j
),
v01
=
v_load
(
imgptr0
+
in_j
+
dilation_w
),
...
...
@@ -119,11 +121,12 @@ static void depthWiseBlockConv2D(const float* wptr,
}
else
// (stride_w == 2 && dilation_w == 1)
{
for
(
;
out_j
<
outW1
;
out_j
+=
VEC_NLANES
)
for
(;
out_j
<
outW1
;
out_j
+=
VEC_NLANES
)
{
if
(
out_j
+
VEC_NLANES
>
outW1
&&
out_j
>
pad_l
)
// Tail processing.
if
(
out_j
>
outW1
-
VEC_NLANES
)
{
if
(
outW1
-
VEC_NLANES
<
0
)
if
(
fusedAdd
)
break
;
out_j
=
outW1
-
VEC_NLANES
;
}
...
...
@@ -204,7 +207,7 @@ static void depthWiseBlockConv1D(const float* wptr,
int
out_d
,
int
outW
,
bool
fusedAdd
)
{
const
float
w00_
=
wptr
[
0
],
w01_
=
wptr
[
1
],
w02_
=
wptr
[
2
];
int
outW1
=
min
(
outW
,
(
width
-
dilation_w
*
(
kernel_w
-
1
)
+
pad_l
)
/
stride_w
);
const
int
outW1
=
min
(
outW
,
(
width
-
dilation_w
*
(
kernel_w
-
1
)
+
pad_l
)
/
stride_w
);
float
relu_coeff
=
relu
?
relu
[
out_d
]
:
1.
f
,
bias
=
biasptr
[
out_d
];
int
out_j
=
0
;
...
...
@@ -225,27 +228,27 @@ static void depthWiseBlockConv1D(const float* wptr,
#if CV_SIMD128
const
int
VEC_NLANES
=
4
;
if
(
fusedAdd
)
outW1
=
max
(
out_j
,
outW1
-
outW1
%
VEC_NLANES
);
v_float32x4
vw00
=
v_setall_f32
(
w00
);
v_float32x4
vw01
=
v_setall_f32
(
w01
);
v_float32x4
vw02
=
v_setall_f32
(
w02
);
v_float32x4
z
=
v_setzero_f32
();
v_float32x4
vbias
=
v_setall_f32
(
bias
);
v_float32x4
vrc
=
v_setall_f32
(
relu_coeff
);
if
(
stride_w
==
1
||
(
stride_w
==
2
&&
dilation_w
==
1
))
if
((
stride_w
==
1
||
(
stride_w
==
2
&&
dilation_w
==
1
))
&&
(
outW1
-
out_j
)
>=
VEC_NLANES
)
{
v_float32x4
vw00
=
v_setall_f32
(
w00
);
v_float32x4
vw01
=
v_setall_f32
(
w01
);
v_float32x4
vw02
=
v_setall_f32
(
w02
);
v_float32x4
z
=
v_setzero_f32
();
v_float32x4
vbias
=
v_setall_f32
(
bias
);
v_float32x4
vrc
=
v_setall_f32
(
relu_coeff
);
if
(
stride_w
==
1
)
{
for
(
;
out_j
<
outW1
;
out_j
+=
VEC_NLANES
)
{
// Tail processing.
if
(
out_j
+
VEC_NLANES
>
outW1
)
{
if
(
out_j
<=
pad_l
||
outW1
-
VEC_NLANES
<
0
)
if
(
fusedAdd
)
break
;
out_j
=
outW1
-
VEC_NLANES
;
}
int
in_j
=
out_j
*
stride_w
-
pad_l
;
v_float32x4
v00
=
v_load
(
imgptr0
+
in_j
),
v01
=
v_load
(
imgptr0
+
in_j
+
dilation_w
),
...
...
@@ -263,12 +266,14 @@ static void depthWiseBlockConv1D(const float* wptr,
{
for
(
;
out_j
<
outW1
;
out_j
+=
VEC_NLANES
)
{
// Tail processing.
if
(
out_j
+
VEC_NLANES
>
outW1
)
{
if
(
out_j
<=
pad_l
||
outW1
-
VEC_NLANES
<
0
)
if
(
fusedAdd
)
break
;
out_j
=
outW1
-
VEC_NLANES
;
}
int
in_j
=
out_j
*
stride_w
-
pad_l
;
v_float32x4
v00
,
v01
,
v02
,
unused
;
...
...
modules/dnn/test/test_onnx_importer.cpp
浏览文件 @
cd44aa0b
...
...
@@ -1731,6 +1731,11 @@ TEST_P(Test_ONNX_layers, DepthWiseAdd)
testONNXModels
(
"depthwiseconv_add"
);
}
// Parameterized ONNX layer test: runs the "depthwise_stride2" model through testONNXModels.
// NOTE(review): presumably the regression test for the stride-2 depthwise
// convolution tail-processing change shown in this commit — confirm against the PR.
TEST_P
(
Test_ONNX_layers
,
DepthStride2
)
{
testONNXModels
(
"depthwise_stride2"
);
}
TEST_P
(
Test_ONNX_layers
,
SubFromConst
)
{
testONNXModels
(
"sub_from_const1"
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录