Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
ac2dc295
O
Opencv
项目概览
Greenplum
/
Opencv
11 个月 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
ac2dc295
编写于
11月 14, 2019
作者:
A
Alexander Alekhin
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #15852 from akhakim:gauss_blur_kernel_5x5
上级
d1c4e4b5
beb14c70
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
640 addition
and
14 deletion
+640
-14
modules/gapi/src/backends/fluid/gfluidimgproc.cpp
modules/gapi/src/backends/fluid/gfluidimgproc.cpp
+12
-2
modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
...s/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
+22
-0
modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
+19
-0
modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
+587
-12
未找到文件。
modules/gapi/src/backends/fluid/gfluidimgproc.cpp
浏览文件 @
ac2dc295
...
...
@@ -599,6 +599,7 @@ static void run_sepfilter(Buffer& dst, const View& src,
{
constexpr
int
kMax
=
11
;
GAPI_Assert
(
kxLen
<=
kMax
&&
kyLen
<=
kMax
);
GAPI_Assert
(
kxLen
==
kyLen
);
const
SRC
*
in
[
kMax
];
DST
*
out
;
...
...
@@ -625,6 +626,13 @@ static void run_sepfilter(Buffer& dst, const View& src,
int
border
=
xborder
;
run_sepfilter3x3_impl
(
out
,
in
,
width
,
chan
,
kx
,
ky
,
border
,
scale
,
delta
,
buf
,
y
,
y0
);
}
else
if
(
kxLen
==
5
&&
kyLen
==
5
)
{
int
y
=
dst
.
y
();
int
y0
=
dst
.
priv
().
writeStart
();
run_sepfilter5x5_impl
(
out
,
in
,
width
,
chan
,
kx
,
ky
,
xborder
,
scale
,
delta
,
buf
,
y
,
y0
);
}
else
{
int
length
=
chan
*
width
;
...
...
@@ -788,7 +796,9 @@ GAPI_FLUID_KERNEL(GFluidGaussBlur, cv::gapi::imgproc::GGaussBlur, true)
Buffer
&
dst
,
Buffer
&
scratch
)
{
int
kxsize
=
ksize
.
width
;
GAPI_Assert
(
ksize
.
height
==
ksize
.
width
);
GAPI_Assert
((
ksize
.
height
==
3
)
||
(
ksize
.
height
==
5
));
const
int
kxsize
=
ksize
.
width
;
int
kysize
=
ksize
.
height
;
auto
*
kx
=
scratch
.
OutLine
<
float
>
();
// cached kernX data
...
...
@@ -801,7 +811,7 @@ GAPI_FLUID_KERNEL(GFluidGaussBlur, cv::gapi::imgproc::GGaussBlur, true)
constexpr
int
buffSize
=
5
;
GAPI_Assert
(
ksize
.
height
<=
buffSize
);
float
*
buf
[
buffSize
]
{
};
float
*
buf
[
buffSize
]
=
{
nullptr
};
buf
[
0
]
=
ky
+
kysize
;
for
(
int
i
=
1
;
i
<
ksize
.
height
;
++
i
)
...
...
modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
浏览文件 @
ac2dc295
...
...
@@ -119,6 +119,28 @@ RUN_SEPFILTER3X3_IMPL( float, float)
#undef RUN_SEPFILTER3X3_IMPL
// Public entry points of the separable 5x5 filter, one overload per
// supported (DST, SRC) pair.  Each overload hands its arguments unchanged
// to CV_CPU_DISPATCH with CV_CPU_DISPATCH_MODES_ALL.
#define RUN_SEPFILTER5x5_IMPL(DST, SRC)                                          \
void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan,      \
                           const float kx[], const float ky[], int border,       \
                           float scale, float delta,                             \
                           float *buf[], int y, int y0)                          \
{                                                                                \
    CV_CPU_DISPATCH(run_sepfilter5x5_impl,                                       \
        (out, in, width, chan, kx, ky, border, scale, delta, buf,y, y0),         \
        CV_CPU_DISPATCH_MODES_ALL);                                              \
}

RUN_SEPFILTER5x5_IMPL(uchar , uchar )
RUN_SEPFILTER5x5_IMPL(short , uchar )
RUN_SEPFILTER5x5_IMPL(float , uchar )
RUN_SEPFILTER5x5_IMPL(ushort, ushort)
RUN_SEPFILTER5x5_IMPL(short , ushort)
RUN_SEPFILTER5x5_IMPL(float , ushort)
RUN_SEPFILTER5x5_IMPL(short , short )
RUN_SEPFILTER5x5_IMPL(float , short )
RUN_SEPFILTER5x5_IMPL(float , float )

#undef RUN_SEPFILTER5x5_IMPL
//-------------------------
//
// Fluid kernels: Filter 2D
...
...
modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
浏览文件 @
ac2dc295
...
...
@@ -78,6 +78,25 @@ RUN_SEPFILTER3X3_IMPL( float, float)
#undef RUN_SEPFILTER3X3_IMPL
// Declarations of the separable 5x5 filter entry points, one overload per
// supported (DST, SRC) pair.
#define RUN_SEPFILTER5x5_IMPL(DST, SRC)                                          \
void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan,      \
                           const float kx[], const float ky[], int border,       \
                           float scale, float delta,                             \
                           float *buf[], int y, int y0);

RUN_SEPFILTER5x5_IMPL(uchar , uchar )
RUN_SEPFILTER5x5_IMPL(short , uchar )
RUN_SEPFILTER5x5_IMPL(float , uchar )
RUN_SEPFILTER5x5_IMPL(ushort, ushort)
RUN_SEPFILTER5x5_IMPL(short , ushort)
RUN_SEPFILTER5x5_IMPL(float , ushort)
RUN_SEPFILTER5x5_IMPL(short , short )
RUN_SEPFILTER5x5_IMPL(float , short )
RUN_SEPFILTER5x5_IMPL(float , float )

#undef RUN_SEPFILTER5x5_IMPL
//-------------------------
//
// Fluid kernels: Filter 2D
...
...
modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
浏览文件 @
ac2dc295
...
...
@@ -100,6 +100,23 @@ RUN_SEPFILTER3X3_IMPL( float, float)
#undef RUN_SEPFILTER3X3_IMPL
// Forward declarations of run_sepfilter5x5_impl for every supported
// (DST, SRC) pair; the matching definitions appear further down this file.
#define RUN_SEPFILTER5x5_IMPL(DST, SRC)                                          \
void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan,      \
                           const float kx[], const float ky[], int border,       \
                           float scale, float delta,                             \
                           float *buf[], int y, int y0);

RUN_SEPFILTER5x5_IMPL(uchar , uchar )
RUN_SEPFILTER5x5_IMPL(short , uchar )
RUN_SEPFILTER5x5_IMPL(float , uchar )
RUN_SEPFILTER5x5_IMPL(ushort, ushort)
RUN_SEPFILTER5x5_IMPL(short , ushort)
RUN_SEPFILTER5x5_IMPL(float , ushort)
RUN_SEPFILTER5x5_IMPL(short , short )
RUN_SEPFILTER5x5_IMPL(float , short )
RUN_SEPFILTER5x5_IMPL(float , float )

#undef RUN_SEPFILTER5x5_IMPL
//-------------------------
//
// Fluid kernels: Filter 2D
...
...
@@ -978,11 +995,11 @@ void run_rgb2yuv422_impl(uchar out[], const uchar in[], int width)
}
}
//-----------------------------
//
// Fluid kernels: sepFilter 3x3
//
//-----------------------------
#if CV_SIMD
// this variant not using buf[] appears 15% faster than reference any-2-float code below
...
...
@@ -1322,7 +1339,7 @@ static void run_sepfilter3x3_char2short(short out[], const uchar *in[], int widt
}
}
}
#endif
#endif
//USE_SEPFILTER3X3_CHAR2SHORT
#endif // CV_SIMD
...
...
@@ -1464,18 +1481,576 @@ void run_sepfilter3x3_impl(DST out[], const SRC *in[], int width, int chan, \
} \
}
RUN_SEPFILTER3X3_IMPL
(
uchar
,
uchar
)
RUN_SEPFILTER3X3_IMPL
(
short
,
uchar
)
RUN_SEPFILTER3X3_IMPL
(
float
,
uchar
)
RUN_SEPFILTER3X3_IMPL
(
uchar
,
uchar
)
RUN_SEPFILTER3X3_IMPL
(
short
,
uchar
)
RUN_SEPFILTER3X3_IMPL
(
float
,
uchar
)
RUN_SEPFILTER3X3_IMPL
(
ushort
,
ushort
)
RUN_SEPFILTER3X3_IMPL
(
short
,
ushort
)
RUN_SEPFILTER3X3_IMPL
(
float
,
ushort
)
RUN_SEPFILTER3X3_IMPL
(
short
,
short
)
RUN_SEPFILTER3X3_IMPL
(
float
,
short
)
RUN_SEPFILTER3X3_IMPL
(
float
,
float
)
RUN_SEPFILTER3X3_IMPL
(
short
,
ushort
)
RUN_SEPFILTER3X3_IMPL
(
float
,
ushort
)
RUN_SEPFILTER3X3_IMPL
(
short
,
short
)
RUN_SEPFILTER3X3_IMPL
(
float
,
short
)
RUN_SEPFILTER3X3_IMPL
(
float
,
float
)
#undef RUN_SEPFILTER3X3_IMPL
//-----------------------------
//
// Fluid kernels: sepFilter 5x5
//
//-----------------------------
#if CV_SIMD
// Separable 5x5 filter with uchar output; rounding and packing of the float
// result down to uchar is vectorized by hand.
//
//   out          - output row, width*chan elements
//   in           - the 5 input rows of the vertical window
//   kx, ky       - 5-tap horizontal / vertical kernels
//   border       - number of pixels the horizontal window extends to the left
//   scale, delta - applied as sum*scale + delta unless noscale is set
//   buf          - 5 float rows caching horizontally filtered input rows
//   y, y0        - current row and the first row handled by this kernel
template<bool noscale, typename SRC>
static void run_sepfilter5x5_any2char(uchar out[], const SRC *in[], int width, int chan,
                                      const float kx[], const float ky[], int border,
                                      float scale, float delta,
                                      float *buf[], int y, int y0)
{
    constexpr int kxLen = 5;
    constexpr int kyLen = kxLen;
    constexpr int buffSize = 5;

    // rows[n]: index in buf[] of the cached row matching the n-th row of the
    // vertical window; the cache is addressed cyclically as rows advance
    int rows[buffSize];
    for (int n = 0; n < buffSize; ++n)
        rows[n] = (y - y0 + n) % 5;

    const int length = width * chan;
    const int shift = chan;

    // horizontal pass: all five rows for the very first output row,
    // afterwards only the newest (last) row of the window
    const int firstRow = (y == y0) ? 0 : 4;
    for (int k = firstRow; k < kxLen; ++k)
    {
        // taps[i] points at the i-th pixel of the horizontal window
        const SRC *taps[kxLen] = {nullptr};
        for (int i = 0; i < kxLen; ++i)
            taps[i] = in[k] + (i - border)*shift;

        // rely on compiler vectoring
        float *cached = buf[rows[k]];
        for (int l = 0; l < length; ++l)
        {
            float acc = 0;
            for (int j = 0; j < kxLen; ++j)
                acc += taps[j][l] * kx[j];
            cached[l] = acc;
        }
    }

    // vertical pass: one uchar vector of output is assembled from four
    // float vectors
    constexpr int nlanes = v_uint8::nlanes;
    constexpr int quarter = nlanes / 4;

    for (int l = 0; l < length;)
    {
        // main part of row
        for (; l <= length - nlanes; l += nlanes)
        {
            v_float32 acc[4];

            const v_float32 w0 = vx_setall_f32(ky[0]);
            for (int q = 0; q < 4; ++q)
                acc[q] = vx_load(&buf[rows[0]][l + q*quarter]) * w0;

            for (int n = 1; n < kyLen; ++n)
            {
                const v_float32 wn = vx_setall_f32(ky[n]);
                for (int q = 0; q < 4; ++q)
                    acc[q] = v_fma(vx_load(&buf[rows[n]][l + q*quarter]), wn, acc[q]);
            }

            if (!noscale)
            {
                const v_float32 vscale = vx_setall_f32(scale);
                const v_float32 vdelta = vx_setall_f32(delta);
                for (int q = 0; q < 4; ++q)
                    acc[q] = v_fma(acc[q], vscale, vdelta);
            }

            const v_int16 lower = v_pack(v_round(acc[0]), v_round(acc[1]));
            const v_int16 upper = v_pack(v_round(acc[2]), v_round(acc[3]));

            v_store(reinterpret_cast<uchar*>(&out[l]), v_pack_u(lower, upper));
        }

        // tail (if any): step back so that the last full vector ends exactly
        // at the end of the row; the overlapping elements are recomputed
        if (l < length)
        {
            GAPI_DbgAssert(length >= nlanes);
            l = length - nlanes;
        }
    }
}
// this variant with manually vectored rounding to short/ushort
template
<
bool
noscale
,
typename
DST
,
typename
SRC
>
static
void
run_sepfilter5x5_any2short
(
DST
out
[],
const
SRC
*
in
[],
int
width
,
int
chan
,
const
float
kx
[],
const
float
ky
[],
int
border
,
float
scale
,
float
delta
,
float
*
buf
[],
int
y
,
int
y0
)
{
constexpr
int
kxLen
=
5
;
constexpr
int
kyLen
=
kxLen
;
constexpr
int
buffSize
=
5
;
int
r
[
buffSize
];
for
(
int
n
=
0
;
n
<
buffSize
;
++
n
)
{
r
[
n
]
=
(
y
-
y0
+
n
)
%
5
;
// previous, this, next rows
}
const
int
length
=
width
*
chan
;
const
int
shift
=
chan
;
// horizontal pass
int
k0
=
(
y
==
y0
)
?
0
:
4
;
for
(
int
k
=
k0
;
k
<
kyLen
;
++
k
)
{
const
SRC
*
s
[
kxLen
]
=
{
nullptr
};
for
(
int
i
=
0
;
i
<
kxLen
;
++
i
)
{
// previous , this , next pixels
s
[
i
]
=
in
[
k
]
+
(
i
-
border
)
*
shift
;
}
// rely on compiler vectoring
for
(
int
l
=
0
;
l
<
length
;
++
l
)
{
float
sum
=
0
;
for
(
int
j
=
0
;
j
<
kxLen
;
++
j
)
{
sum
+=
s
[
j
][
l
]
*
kx
[
j
];
}
buf
[
r
[
k
]][
l
]
=
sum
;
}
}
// vertical pass
constexpr
int
nlanes
=
v_int16
::
nlanes
;
for
(
int
l
=
0
;
l
<
length
;)
{
//GAPI_Assert(length >= nlanes);
// main part of row
for
(;
l
<=
length
-
nlanes
;
l
+=
nlanes
)
{
v_float32
sum0
=
vx_load
(
&
buf
[
r
[
0
]][
l
])
*
vx_setall_f32
(
ky
[
0
]);
v_float32
sum1
=
vx_load
(
&
buf
[
r
[
0
]][
l
+
nlanes
/
2
])
*
vx_setall_f32
(
ky
[
0
]);
for
(
int
j
=
1
;
j
<
kyLen
;
++
j
)
{
sum0
=
v_fma
(
vx_load
(
&
buf
[
r
[
j
]][
l
]),
vx_setall_f32
(
ky
[
j
]),
sum0
);
sum1
=
v_fma
(
vx_load
(
&
buf
[
r
[
j
]][
l
+
nlanes
/
2
]),
vx_setall_f32
(
ky
[
j
]),
sum1
);
}
if
(
!
noscale
)
{
sum0
=
v_fma
(
sum0
,
vx_setall_f32
(
scale
),
vx_setall_f32
(
delta
));
sum1
=
v_fma
(
sum1
,
vx_setall_f32
(
scale
),
vx_setall_f32
(
delta
));
}
v_int32
isum0
=
v_round
(
sum0
),
isum1
=
v_round
(
sum1
);
if
(
std
::
is_same
<
DST
,
short
>::
value
)
{
// signed short
v_int16
res
=
v_pack
(
isum0
,
isum1
);
v_store
(
reinterpret_cast
<
short
*>
(
&
out
[
l
]),
res
);
}
else
{
// unsigned short
v_uint16
res
=
v_pack_u
(
isum0
,
isum1
);
v_store
(
reinterpret_cast
<
ushort
*>
(
&
out
[
l
]),
res
);
}
}
// tail (if any)
if
(
l
<
length
)
{
GAPI_DbgAssert
(
length
>=
nlanes
);
l
=
length
-
nlanes
;
}
}
return
;
}
// this variant not using buf[]
template
<
bool
noscale
,
typename
SRC
>
static
void
run_sepfilter5x5_any2float
(
float
out
[],
const
SRC
*
in
[],
int
width
,
int
chan
,
const
float
kx
[],
const
float
ky
[],
int
border
,
float
scale
,
float
delta
)
{
constexpr
int
kxLen
=
5
;
constexpr
int
kyLen
=
kxLen
;
constexpr
int
buffSize
=
5
;
const
int
length
=
width
*
chan
;
const
int
shift
=
chan
;
static
const
int
nlanes
=
v_float32
::
nlanes
;
for
(
int
l
=
0
;
l
<
length
;
)
{
//GAPI_Assert(length >= nlanes);
// main part
for
(;
l
<=
length
-
nlanes
;
l
+=
nlanes
)
{
auto
xsum
=
[
l
,
border
,
shift
,
kx
](
const
SRC
inp
[])
{
v_float32
t
[
5
];
for
(
int
i
=
0
;
i
<
5
;
++
i
)
{
t
[
i
]
=
vx_load_f32
(
&
inp
[
l
+
(
i
-
border
)
*
shift
]);
}
v_float32
sum
=
t
[
0
]
*
vx_setall_f32
(
kx
[
0
]);
for
(
int
j
=
1
;
j
<
5
;
++
j
)
{
sum
=
v_fma
(
t
[
j
],
vx_setall_f32
(
kx
[
j
]),
sum
);
}
return
sum
;
};
v_float32
s
[
buffSize
];
for
(
int
m
=
0
;
m
<
buffSize
;
++
m
)
{
s
[
m
]
=
xsum
(
in
[
m
]);
}
v_float32
sum
=
s
[
0
]
*
vx_setall_f32
(
ky
[
0
]);
for
(
int
n
=
1
;
n
<
kyLen
;
++
n
)
{
sum
=
v_fma
(
s
[
n
],
vx_setall_f32
(
ky
[
n
]),
sum
);
}
if
(
!
noscale
)
{
sum
=
v_fma
(
sum
,
vx_setall_f32
(
scale
),
vx_setall_f32
(
delta
));
}
v_store
(
&
out
[
l
],
sum
);
}
// tail (if any)
if
(
l
<
length
)
{
GAPI_DbgAssert
(
length
>=
nlanes
);
l
=
length
-
nlanes
;
}
}
return
;
}
#define USE_SEPFILTER5X5_CHAR2SHORT 1
#if USE_SEPFILTER5X5_CHAR2SHORT
template
<
bool
noscale
>
static
void
run_sepfilter5x5_char2short
(
short
out
[],
const
uchar
*
in
[],
int
width
,
int
chan
,
const
float
kx
[],
const
float
ky
[],
int
border
,
float
scale
,
float
delta
,
float
*
buf
[],
int
y
,
int
y0
)
{
constexpr
int
kxLen
=
5
;
constexpr
int
kyLen
=
kxLen
;
constexpr
int
buffSize
=
5
;
schar
ikx
[
kxLen
];
schar
iky
[
kyLen
];
for
(
int
i
=
0
;
i
<
kxLen
;
++
i
)
{
ikx
[
i
]
=
saturate
<
schar
>
(
kx
[
i
],
rintf
);
iky
[
i
]
=
saturate
<
schar
>
(
ky
[
i
],
rintf
);
}
const
short
iscale
=
saturate
<
short
>
(
scale
*
(
1
<<
15
),
rintf
);
const
short
idelta
=
saturate
<
short
>
(
delta
,
rintf
);
// check if this code is applicable
if
(
ikx
[
0
]
!=
kx
[
0
]
||
ikx
[
1
]
!=
kx
[
1
]
||
ikx
[
2
]
!=
kx
[
2
]
||
ikx
[
3
]
!=
kx
[
3
]
||
ikx
[
4
]
!=
kx
[
4
]
||
iky
[
0
]
!=
ky
[
0
]
||
iky
[
1
]
!=
ky
[
1
]
||
iky
[
2
]
!=
ky
[
2
]
||
iky
[
3
]
!=
ky
[
3
]
||
iky
[
4
]
!=
ky
[
4
]
||
idelta
!=
delta
||
std
::
abs
(
scale
)
>
1
||
std
::
abs
(
scale
)
<
0.01
)
{
run_sepfilter5x5_any2short
<
noscale
>
(
out
,
in
,
width
,
chan
,
kx
,
ky
,
border
,
scale
,
delta
,
buf
,
y
,
y0
);
return
;
}
short
*
ibuf
[
buffSize
];
int
r
[
buffSize
];
for
(
int
n
=
0
;
n
<
buffSize
;
++
n
)
{
ibuf
[
n
]
=
reinterpret_cast
<
short
*>
(
buf
[
n
]);
r
[
n
]
=
(
y
-
y0
+
n
)
%
5
;
// previous, this, next rows
}
const
int
length
=
width
*
chan
;
const
int
shift
=
chan
;
// horizontal pass
// full horizontal pass is needed only if the very 1st row in ROI is handled;
// for 2nd and further rows, it's enough to convolve only the
// "next" row - as we can reuse buffers from previous calls to
// this kernel (Fluid does rows consequently: y=y0, y0+1, ...)
int
k0
=
(
y
==
y0
)
?
0
:
4
;
constexpr
int
nlanes
=
v_int16
::
nlanes
;
for
(
int
k
=
k0
;
k
<
kyLen
;
++
k
)
{
for
(
int
l
=
0
;
l
<
length
;)
{
GAPI_Assert
(
length
>=
nlanes
);
// main part of output row
for
(;
l
<=
length
-
nlanes
;
l
+=
nlanes
)
{
v_uint16
t
[
kxLen
];
v_int16
sum
;
for
(
int
i
=
0
;
i
<
kxLen
;
++
i
)
{
// previous, current, next pixels
t
[
i
]
=
vx_load_expand
(
&
in
[
k
][
l
+
(
i
-
border
)
*
shift
]);
sum
+=
v_reinterpret_as_s16
(
t
[
i
])
*
vx_setall_s16
(
ikx
[
i
]);
}
v_store
(
&
ibuf
[
r
[
k
]][
l
],
sum
);
}
// tail (if any)
if
(
l
<
length
)
{
GAPI_DbgAssert
(
length
>=
nlanes
);
l
=
length
-
nlanes
;
}
}
}
// vertical pass
for
(
int
l
=
0
;
l
<
length
;)
{
//GAPI_Assert(length >= nlanes);
// main part of output row
for
(;
l
<=
length
-
nlanes
;
l
+=
nlanes
)
{
v_int16
s
[
buffSize
];
v_int16
sum
;
for
(
int
i
=
0
;
i
<
kyLen
;
++
i
)
{
// previous, current, next rows
s
[
i
]
=
vx_load
(
&
ibuf
[
r
[
i
]][
l
]);
sum
+=
s
[
i
]
*
vx_setall_s16
(
iky
[
i
]);
}
if
(
!
noscale
)
{
sum
=
v_mul_hi
(
sum
<<
1
,
vx_setall_s16
(
iscale
))
+
vx_setall_s16
(
idelta
);
}
v_store
(
&
out
[
l
],
sum
);
}
// tail (if any)
if
(
l
<
length
)
{
GAPI_DbgAssert
(
length
>=
nlanes
);
l
=
length
-
nlanes
;
}
}
return
;
}
#endif //USE_SEPFILTER5X5_CHAR2SHORT
#endif //CV_SIMD
// Scalar reference implementation of the separable 5x5 filter for any
// supported (DST, SRC) pair.
//
//   out          - output row, width*chan elements
//   in           - the 5 input rows of the vertical window
//   kx, ky       - 5-tap horizontal / vertical kernels
//   border       - number of pixels the horizontal window extends to the left
//   scale, delta - applied as sum*scale + delta unless noscale is set
//   buf          - 5 float rows caching horizontally filtered input rows
//   y, y0        - current row and the first row handled by this kernel
template<bool noscale, typename DST, typename SRC>
static void run_sepfilter5x5_reference(DST out[], const SRC *in[], int width, int chan,
                                       const float kx[], const float ky[], int border,
                                       float scale, float delta,
                                       float *buf[], int y, int y0)
{
    constexpr int kxLen = 5; // kernel size
    constexpr int kyLen = kxLen;

    // slot[n]: index in buf[] of the cached row matching the n-th row of the
    // vertical window; the cache is addressed cyclically as rows advance
    int slot[kyLen];
    for (int n = 0; n < kyLen; ++n)
        slot[n] = (y - y0 + n) % 5;

    const int length = width * chan;
    const int shift = chan;

    // horizontal pass

    // A full horizontal pass is needed only for the very 1st row in ROI;
    // for the 2nd and further rows it is enough to convolve only the newest
    // row, since the buffers filled by previous calls to this kernel are
    // reused (Fluid does rows consequently: y=y0, y0+1, ...)
    const int firstRow = (y == y0) ? 0 : 4;

    for (int k = firstRow; k < kyLen; ++k)
    {
        const SRC *row = in[k];
        float *cached = buf[slot[k]];

        // rely on compiler vectoring
        for (int l = 0; l < length; ++l)
        {
            float acc = 0;
            for (int i = 0; i < kxLen; ++i)
            {
                // i-th pixel of the horizontal window around element l
                acc += row[l + (i - border)*shift] * kx[i];
            }
            cached[l] = acc;
        }
    }

    // vertical pass

    for (int l = 0; l < length; ++l)
    {
        float acc = 0;
        for (int j = 0; j < kyLen; ++j)
        {
            acc += buf[slot[j]][l] * ky[j];
        }

        if (!noscale)
        {
            acc = acc*scale + delta;
        }

        out[l] = saturate<DST>(acc, rintf);
    }
}
// Selects the 5x5 separable filter implementation for the given (DST, SRC)
// pair.  With CV_SIMD enabled, a hand-vectorized variant is used when one
// exists for the types and the row holds at least one full vector of
// output; in every other case the scalar reference code runs.
template<bool noscale, typename DST, typename SRC>
static void run_sepfilter5x5_code(DST out[], const SRC *in[], int width, int chan,
                                  const float kx[], const float ky[], int border,
                                  float scale, float delta,
                                  float *buf[], int y, int y0)
{
#if CV_SIMD
    int length = width * chan;

    // length variable may be unused if types do not match at 'if' statements below
    (void) length;

    constexpr bool toShort   = std::is_same<DST, short >::value;
    constexpr bool toUshort  = std::is_same<DST, ushort>::value;
    constexpr bool toUchar   = std::is_same<DST, uchar >::value;
    constexpr bool toFloat   = std::is_same<DST, float >::value;
    constexpr bool fromUchar = std::is_same<SRC, uchar >::value;
    constexpr bool fromFloat = std::is_same<SRC, float >::value;

    // uchar -> short: integer arithmetic variant
    if (toShort && fromUchar && length >= v_int16::nlanes)
    {
        run_sepfilter5x5_char2short<noscale>(reinterpret_cast<short*>(out),
                                             reinterpret_cast<const uchar**>(in),
                                             width, chan, kx, ky, border, scale, delta,
                                             buf, y, y0);
        return;
    }

    // float -> float: variant not using buf[]
    if (toFloat && fromFloat && length >= v_float32::nlanes)
    {
        run_sepfilter5x5_any2float<noscale>(reinterpret_cast<float*>(out), in, width,
                                            chan, kx, ky, border, scale, delta);
        return;
    }

    // any -> short
    if (toShort && length >= v_int16::nlanes)
    {
        run_sepfilter5x5_any2short<noscale>(reinterpret_cast<short*>(out), in, width,
                                            chan, kx, ky, border, scale, delta,
                                            buf, y, y0);
        return;
    }

    // any -> ushort
    if (toUshort && length >= v_uint16::nlanes)
    {
        run_sepfilter5x5_any2short<noscale>(reinterpret_cast<ushort*>(out), in, width,
                                            chan, kx, ky, border, scale, delta,
                                            buf, y, y0);
        return;
    }

    // any -> uchar
    if (toUchar && length >= v_uint8::nlanes)
    {
        run_sepfilter5x5_any2char<noscale>(reinterpret_cast<uchar*>(out), in, width,
                                           chan, kx, ky, border, scale, delta,
                                           buf, y, y0);
        return;
    }
#endif // CV_SIMD

    // reference code is quite fast for any-to-float case,
    // but not for any-to-integral due to very slow rounding
    run_sepfilter5x5_reference<noscale>(out, in, width, chan, kx, ky, border,
                                        scale, delta, buf, y, y0);
}
// Defines run_sepfilter5x5_impl for one (DST, SRC) pair.  When scale is
// exactly 1 and delta exactly 0 the noscale=true instantiation of
// run_sepfilter5x5_code is called, otherwise the noscale=false one.
#define RUN_SEPFILTER5x5_IMPL(DST, SRC)                                                        \
void run_sepfilter5x5_impl(DST out[], const SRC *in[], int width, int chan, const float kx[],  \
                           const float ky[], int border, float scale, float delta,             \
                           float *buf[], int y, int y0)                                        \
{                                                                                              \
    const bool identity = (scale == 1 && delta == 0);                                          \
    if (identity)                                                                              \
        run_sepfilter5x5_code<true>(out, in, width, chan, kx, ky, border,                      \
                                    scale, delta, buf, y, y0);                                 \
    else                                                                                       \
        run_sepfilter5x5_code<false>(out, in, width, chan, kx, ky, border,                     \
                                     scale, delta, buf, y, y0);                                \
}

RUN_SEPFILTER5x5_IMPL(uchar , uchar )
RUN_SEPFILTER5x5_IMPL(short , uchar )
RUN_SEPFILTER5x5_IMPL(float , uchar )
RUN_SEPFILTER5x5_IMPL(ushort, ushort)
RUN_SEPFILTER5x5_IMPL(short , ushort)
RUN_SEPFILTER5x5_IMPL(float , ushort)
RUN_SEPFILTER5x5_IMPL(short , short )
RUN_SEPFILTER5x5_IMPL(float , short )
RUN_SEPFILTER5x5_IMPL(float , float )

#undef RUN_SEPFILTER5x5_IMPL
//-------------------------
//
// Fluid kernels: Filter 2D
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录