Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
1f8b41f3
O
Opencv
项目概览
Greenplum
/
Opencv
11 个月 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
1f8b41f3
编写于
4月 23, 2014
作者:
A
Aaron Kunze
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Optimizes filter2D for Intel GPUs
上级
dcc4d364
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
488 addition
and
66 deletion
+488
-66
modules/core/src/ocl.cpp
modules/core/src/ocl.cpp
+1
-1
modules/imgproc/src/filter.cpp
modules/imgproc/src/filter.cpp
+136
-55
modules/imgproc/src/opencl/filter2DSmall.cl
modules/imgproc/src/opencl/filter2DSmall.cl
+335
-0
modules/imgproc/test/ocl/test_filter2d.cpp
modules/imgproc/test/ocl/test_filter2d.cpp
+16
-10
未找到文件。
modules/core/src/ocl.cpp
浏览文件 @
1f8b41f3
...
...
@@ -4379,7 +4379,7 @@ String kernelToStr(InputArray _kernel, int ddepth, const char * name)
typedef
std
::
string
(
*
func_t
)(
const
Mat
&
);
static
const
func_t
funcs
[]
=
{
kerToStr
<
uchar
>
,
kerToStr
<
char
>
,
kerToStr
<
ushort
>
,
kerToStr
<
short
>
,
kerToStr
<
int
>
,
kerToStr
<
float
>
,
kerToStr
<
double
>
,
0
};
const
func_t
func
=
funcs
[
depth
];
const
func_t
func
=
funcs
[
d
d
epth
];
CV_Assert
(
func
!=
0
);
return
cv
::
format
(
" -D %s=%s"
,
name
?
name
:
"COEFF"
,
func
(
kernel
).
c_str
());
...
...
modules/imgproc/src/filter.cpp
浏览文件 @
1f8b41f3
...
...
@@ -3191,11 +3191,10 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
"BORDER_WRAP"
,
"BORDER_REFLECT_101"
};
cv
::
Mat
kernelMat
=
_kernel
.
getMat
();
std
::
vector
<
float
>
kernelMatDataFloat
;
int
kernel_size_y2_aligned
=
_prepareKernelFilter2D
<
float
>
(
kernelMatDataFloat
,
kernelMat
);
cv
::
Size
sz
=
_src
.
size
(),
wholeSize
;
size_t
globalsize
[
2
]
=
{
sz
.
width
,
sz
.
height
},
localsize
[
2
]
=
{
0
,
1
};
size_t
globalsize
[
2
]
=
{
sz
.
width
,
sz
.
height
};
size_t
localsize_general
[
2
]
=
{
0
,
1
};
size_t
*
localsize
=
NULL
;
ocl
::
Kernel
k
;
UMat
src
=
_src
.
getUMat
();
...
...
@@ -3210,63 +3209,134 @@ static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth,
size_t
tryWorkItems
=
maxWorkItemSizes
[
0
];
char
cvt
[
2
][
40
];
String
kerStr
=
ocl
::
kernelToStr
(
kernelMatDataFloat
,
CV_32F
);
for
(
;
;
)
// For smaller filter kernels, there is a special kernel that is more
// efficient than the general one.
UMat
kernalDataUMat
;
if
(
device
.
isIntel
()
&&
(
device
.
type
()
&
ocl
::
Device
::
TYPE_GPU
)
&&
((
ksize
.
width
<
5
&&
ksize
.
height
<
5
)
||
(
ksize
.
width
==
5
&&
ksize
.
height
==
5
&&
cn
==
1
)))
{
size_t
BLOCK_SIZE
=
tryWorkItems
;
while
(
BLOCK_SIZE
>
32
&&
BLOCK_SIZE
>=
(
size_t
)
ksize
.
width
*
2
&&
BLOCK_SIZE
>
(
size_t
)
sz
.
width
*
2
)
BLOCK_SIZE
/=
2
;
#if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices
size_t
BLOCK_SIZE_Y
=
1
;
#else
size_t
BLOCK_SIZE_Y
=
8
;
// TODO Check heuristic value on devices
while
(
BLOCK_SIZE_Y
<
BLOCK_SIZE
/
8
&&
BLOCK_SIZE_Y
*
src
.
clCxt
->
getDeviceInfo
().
maxComputeUnits
*
32
<
(
size_t
)
src
.
rows
)
BLOCK_SIZE_Y
*=
2
;
#endif
if
((
size_t
)
ksize
.
width
>
BLOCK_SIZE
)
return
false
;
int
requiredTop
=
anchor
.
y
;
int
requiredLeft
=
(
int
)
BLOCK_SIZE
;
// not this: anchor.x;
int
requiredBottom
=
ksize
.
height
-
1
-
anchor
.
y
;
int
requiredRight
=
(
int
)
BLOCK_SIZE
;
// not this: ksize.width - 1 - anchor.x;
kernelMat
.
reshape
(
0
,
1
);
String
kerStr
=
ocl
::
kernelToStr
(
kernelMat
,
CV_32F
);
int
h
=
isolated
?
sz
.
height
:
wholeSize
.
height
;
int
w
=
isolated
?
sz
.
width
:
wholeSize
.
width
;
bool
extra_extrapolation
=
h
<
requiredTop
||
h
<
requiredBottom
||
w
<
requiredLeft
||
w
<
requiredRight
;
if
((
w
<
ksize
.
width
)
||
(
h
<
ksize
.
height
))
return
false
;
String
opts
=
format
(
"-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D cn=%d "
"-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
"-D KERNEL_SIZE_Y2_ALIGNED=%d -D %s -D %s -D %s%s%s "
"-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
"-D convertToWT=%s -D convertToDstT=%s"
,
(
int
)
BLOCK_SIZE
,
(
int
)
BLOCK_SIZE_Y
,
cn
,
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
kernel_size_y2_aligned
,
borderMap
[
borderType
],
extra_extrapolation
?
"EXTRA_EXTRAPOLATION"
:
"NO_EXTRA_EXTRAPOLATION"
,
isolated
?
"BORDER_ISOLATED"
:
"NO_BORDER_ISOLATED"
,
doubleSupport
?
" -D DOUBLE_SUPPORT"
:
""
,
kerStr
.
c_str
(),
ocl
::
typeToStr
(
type
),
ocl
::
typeToStr
(
sdepth
),
ocl
::
typeToStr
(
dtype
),
ocl
::
typeToStr
(
ddepth
),
ocl
::
typeToStr
(
wtype
),
ocl
::
typeToStr
(
wdepth
),
ocl
::
convertTypeStr
(
sdepth
,
wdepth
,
cn
,
cvt
[
0
]),
ocl
::
convertTypeStr
(
wdepth
,
ddepth
,
cn
,
cvt
[
1
]));
localsize
[
0
]
=
BLOCK_SIZE
;
globalsize
[
0
]
=
DIVUP
(
sz
.
width
,
BLOCK_SIZE
-
(
ksize
.
width
-
1
))
*
BLOCK_SIZE
;
globalsize
[
1
]
=
DIVUP
(
sz
.
height
,
BLOCK_SIZE_Y
);
if
(
!
k
.
create
(
"filter2D"
,
cv
::
ocl
::
imgproc
::
filter2D_oclsrc
,
opts
))
return
false
;
// Figure out what vector size to use for loading the pixels.
int
pxLoadNumPixels
=
((
cn
!=
1
)
||
sz
.
width
%
4
)
?
1
:
4
;
int
pxLoadVecSize
=
cn
*
pxLoadNumPixels
;
size_t
kernelWorkGroupSize
=
k
.
workGroupSize
();
if
(
localsize
[
0
]
<=
kernelWorkGroupSize
)
break
;
if
(
BLOCK_SIZE
<
kernelWorkGroupSize
)
// Figure out how many pixels per work item to compute in X and Y
// directions. Too many and we run out of registers.
int
pxPerWorkItemX
=
1
;
int
pxPerWorkItemY
=
1
;
if
(
cn
<=
2
&&
ksize
.
width
<=
4
&&
ksize
.
height
<=
4
)
{
pxPerWorkItemX
=
sz
.
width
%
8
?
sz
.
width
%
4
?
sz
.
width
%
2
?
1
:
2
:
4
:
8
;
pxPerWorkItemY
=
sz
.
width
%
2
?
1
:
2
;
}
else
if
(
cn
<
4
||
(
ksize
.
width
<=
4
&&
ksize
.
height
<=
4
))
{
pxPerWorkItemX
=
sz
.
width
%
2
?
1
:
2
;
pxPerWorkItemY
=
sz
.
width
%
2
?
1
:
2
;
}
globalsize
[
0
]
=
sz
.
width
/
pxPerWorkItemX
;
globalsize
[
1
]
=
sz
.
height
/
pxPerWorkItemY
;
// Need some padding in the private array for pixels
int
privDataWidth
=
ROUNDUP
(
pxPerWorkItemX
+
ksize
.
width
-
1
,
pxLoadNumPixels
);
// Make the global size a nice round number so the runtime can pick
// from reasonable choices for the workgroup size
const
int
wgRound
=
256
;
globalsize
[
0
]
=
ROUNDUP
(
globalsize
[
0
],
wgRound
);
char
build_options
[
1024
];
sprintf
(
build_options
,
"-D cn=%d "
"-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
"-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d "
"-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s "
"-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d "
"-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
"-D convertToWT=%s -D convertToDstT=%s %s"
,
cn
,
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
pxLoadVecSize
,
pxLoadNumPixels
,
pxPerWorkItemX
,
pxPerWorkItemY
,
privDataWidth
,
borderMap
[
borderType
],
isolated
?
"BORDER_ISOLATED"
:
"NO_BORDER_ISOLATED"
,
privDataWidth
/
pxLoadNumPixels
,
pxPerWorkItemY
+
ksize
.
height
-
1
,
ocl
::
typeToStr
(
type
),
ocl
::
typeToStr
(
sdepth
),
ocl
::
typeToStr
(
dtype
),
ocl
::
typeToStr
(
ddepth
),
ocl
::
typeToStr
(
wtype
),
ocl
::
typeToStr
(
wdepth
),
ocl
::
convertTypeStr
(
sdepth
,
wdepth
,
cn
,
cvt
[
0
]),
ocl
::
convertTypeStr
(
wdepth
,
ddepth
,
cn
,
cvt
[
1
]),
kerStr
.
c_str
());
cv
::
String
errmsg
;
if
(
!
k
.
create
(
"filter2DSmall"
,
cv
::
ocl
::
imgproc
::
filter2DSmall_oclsrc
,
build_options
,
&
errmsg
))
return
false
;
tryWorkItems
=
kernelWorkGroupSize
;
}
else
{
localsize
=
localsize_general
;
std
::
vector
<
float
>
kernelMatDataFloat
;
int
kernel_size_y2_aligned
=
_prepareKernelFilter2D
<
float
>
(
kernelMatDataFloat
,
kernelMat
);
String
kerStr
=
ocl
::
kernelToStr
(
kernelMatDataFloat
,
CV_32F
);
for
(
;
;
)
{
size_t
BLOCK_SIZE
=
tryWorkItems
;
while
(
BLOCK_SIZE
>
32
&&
BLOCK_SIZE
>=
(
size_t
)
ksize
.
width
*
2
&&
BLOCK_SIZE
>
(
size_t
)
sz
.
width
*
2
)
BLOCK_SIZE
/=
2
;
#if 1 // TODO Mode with several blocks requires a much more VGPRs, so this optimization is not actual for the current devices
size_t
BLOCK_SIZE_Y
=
1
;
#else
size_t
BLOCK_SIZE_Y
=
8
;
// TODO Check heuristic value on devices
while
(
BLOCK_SIZE_Y
<
BLOCK_SIZE
/
8
&&
BLOCK_SIZE_Y
*
src
.
clCxt
->
getDeviceInfo
().
maxComputeUnits
*
32
<
(
size_t
)
src
.
rows
)
BLOCK_SIZE_Y
*=
2
;
#endif
if
((
size_t
)
ksize
.
width
>
BLOCK_SIZE
)
return
false
;
int
requiredTop
=
anchor
.
y
;
int
requiredLeft
=
(
int
)
BLOCK_SIZE
;
// not this: anchor.x;
int
requiredBottom
=
ksize
.
height
-
1
-
anchor
.
y
;
int
requiredRight
=
(
int
)
BLOCK_SIZE
;
// not this: ksize.width - 1 - anchor.x;
int
h
=
isolated
?
sz
.
height
:
wholeSize
.
height
;
int
w
=
isolated
?
sz
.
width
:
wholeSize
.
width
;
bool
extra_extrapolation
=
h
<
requiredTop
||
h
<
requiredBottom
||
w
<
requiredLeft
||
w
<
requiredRight
;
if
((
w
<
ksize
.
width
)
||
(
h
<
ksize
.
height
))
return
false
;
String
opts
=
format
(
"-D LOCAL_SIZE=%d -D BLOCK_SIZE_Y=%d -D cn=%d "
"-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d "
"-D KERNEL_SIZE_Y2_ALIGNED=%d -D %s -D %s -D %s%s%s "
"-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s "
"-D convertToWT=%s -D convertToDstT=%s"
,
(
int
)
BLOCK_SIZE
,
(
int
)
BLOCK_SIZE_Y
,
cn
,
anchor
.
x
,
anchor
.
y
,
ksize
.
width
,
ksize
.
height
,
kernel_size_y2_aligned
,
borderMap
[
borderType
],
extra_extrapolation
?
"EXTRA_EXTRAPOLATION"
:
"NO_EXTRA_EXTRAPOLATION"
,
isolated
?
"BORDER_ISOLATED"
:
"NO_BORDER_ISOLATED"
,
doubleSupport
?
" -D DOUBLE_SUPPORT"
:
""
,
kerStr
.
c_str
(),
ocl
::
typeToStr
(
type
),
ocl
::
typeToStr
(
sdepth
),
ocl
::
typeToStr
(
dtype
),
ocl
::
typeToStr
(
ddepth
),
ocl
::
typeToStr
(
wtype
),
ocl
::
typeToStr
(
wdepth
),
ocl
::
convertTypeStr
(
sdepth
,
wdepth
,
cn
,
cvt
[
0
]),
ocl
::
convertTypeStr
(
wdepth
,
ddepth
,
cn
,
cvt
[
1
]));
localsize
[
0
]
=
BLOCK_SIZE
;
globalsize
[
0
]
=
DIVUP
(
sz
.
width
,
BLOCK_SIZE
-
(
ksize
.
width
-
1
))
*
BLOCK_SIZE
;
globalsize
[
1
]
=
DIVUP
(
sz
.
height
,
BLOCK_SIZE_Y
);
if
(
!
k
.
create
(
"filter2D"
,
cv
::
ocl
::
imgproc
::
filter2D_oclsrc
,
opts
))
return
false
;
size_t
kernelWorkGroupSize
=
k
.
workGroupSize
();
if
(
localsize
[
0
]
<=
kernelWorkGroupSize
)
break
;
if
(
BLOCK_SIZE
<
kernelWorkGroupSize
)
return
false
;
tryWorkItems
=
kernelWorkGroupSize
;
}
}
_dst
.
create
(
sz
,
dtype
);
...
...
@@ -3678,9 +3748,20 @@ void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth,
temp
=
dst
;
else
temp
.
create
(
dst
.
size
(),
dst
.
type
());
crossCorr
(
src
,
kernel
,
temp
,
src
.
size
(),
CV_MAKETYPE
(
ddepth
,
src
.
channels
()),
anchor
,
delta
,
borderType
);
// crossCorr doesn't accept non-zero delta with multiple channels
if
(
src
.
channels
()
!=
1
&&
delta
!=
0
)
{
crossCorr
(
src
,
kernel
,
temp
,
src
.
size
(),
CV_MAKETYPE
(
ddepth
,
src
.
channels
()),
anchor
,
0
,
borderType
);
add
(
temp
,
delta
,
temp
);
}
else
{
crossCorr
(
src
,
kernel
,
temp
,
src
.
size
(),
CV_MAKETYPE
(
ddepth
,
src
.
channels
()),
anchor
,
delta
,
borderType
);
}
if
(
temp
.
data
!=
dst
.
data
)
temp
.
copyTo
(
dst
);
return
;
...
...
modules/imgproc/src/opencl/filter2DSmall.cl
0 → 100755
浏览文件 @
1f8b41f3
/*M///////////////////////////////////////////////////////////////////////////////////////
//
//
IMPORTANT:
READ
BEFORE
DOWNLOADING,
COPYING,
INSTALLING
OR
USING.
//
//
By
downloading,
copying,
installing
or
using
the
software
you
agree
to
this
license.
//
If
you
do
not
agree
to
this
license,
do
not
download,
install,
//
copy
or
use
the
software.
//
//
//
License
Agreement
//
For
Open
Source
Computer
Vision
Library
//
//
Copyright
(
C
)
2010-2013,
Advanced
Micro
Devices,
Inc.,
all
rights
reserved.
//
Copyright
(
C
)
2014
,
Intel
Corporation,
all
rights
reserved.
//
Third
party
copyrights
are
property
of
their
respective
owners.
//
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
are
permitted
provided
that
the
following
conditions
are
met:
//
//
*
Redistribution
's
of
source
code
must
retain
the
above
copyright
notice,
//
this
list
of
conditions
and
the
following
disclaimer.
//
//
*
Redistribution
's
in
binary
form
must
reproduce
the
above
copyright
notice,
//
this
list
of
conditions
and
the
following
disclaimer
in
the
documentation
//
and/or
other
materials
provided
with
the
distribution.
//
//
*
The
name
of
the
copyright
holders
may
not
be
used
to
endorse
or
promote
products
//
derived
from
this
software
without
specific
prior
written
permission.
//
//
This
software
is
provided
by
the
copyright
holders
and
contributors
as
is
and
//
any
express
or
implied
warranties,
including,
but
not
limited
to,
the
implied
//
warranties
of
merchantability
and
fitness
for
a
particular
purpose
are
disclaimed.
//
In
no
event
shall
the
Intel
Corporation
or
contributors
be
liable
for
any
direct,
//
indirect,
incidental,
special,
exemplary,
or
consequential
damages
//
(
including,
but
not
limited
to,
procurement
of
substitute
goods
or
services
;
//
loss
of
use,
data,
or
profits
; or business interruption) however caused
//
and
on
any
theory
of
liability,
whether
in
contract,
strict
liability,
//
or
tort
(
including
negligence
or
otherwise
)
arising
in
any
way
out
of
//
the
use
of
this
software,
even
if
advised
of
the
possibility
of
such
damage.
//
//M*/
#
ifdef
BORDER_REPLICATE
//BORDER_REPLICATE:
aaaaaa|abcdefgh|hhhhhhh
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
(
l_edge
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
(
r_edge
)
-1
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
(
t_edge
)
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
(
b_edge
)
-1
:
(
addr
))
#
endif
#
ifdef
BORDER_REFLECT
//BORDER_REFLECT:
fedcba|abcdefgh|hgfedcb
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
-
(
i
)
-1
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
-
(
i
)
-1+
((
r_edge
)
<<1
)
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
-
(
i
)
-1
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
-
(
i
)
-1+
((
b_edge
)
<<1
)
:
(
addr
))
#
endif
#
ifdef
BORDER_REFLECT_101
//BORDER_REFLECT_101:
gfedcb|abcdefgh|gfedcba
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
-
(
i
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
-
(
i
)
-2+
((
r_edge
)
<<1
)
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
-
(
i
)
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
-
(
i
)
-2+
((
b_edge
)
<<1
)
:
(
addr
))
#
endif
//blur
function
does
not
support
BORDER_WRAP
#
ifdef
BORDER_WRAP
//BORDER_WRAP:
cdefgh|abcdefgh|abcdefg
#
define
ADDR_L
(
i,
l_edge,
r_edge
)
((
i
)
<
(
l_edge
)
?
(
i
)
+
(
r_edge
)
:
(
i
))
#
define
ADDR_R
(
i,
r_edge,
addr
)
((
i
)
>=
(
r_edge
)
?
(
i
)
-
(
r_edge
)
:
(
addr
))
#
define
ADDR_H
(
i,
t_edge,
b_edge
)
((
i
)
<
(
t_edge
)
?
(
i
)
+
(
b_edge
)
:
(
i
))
#
define
ADDR_B
(
i,
b_edge,
addr
)
((
i
)
>=
(
b_edge
)
?
(
i
)
-
(
b_edge
)
:
(
addr
))
#
endif
#
ifdef
BORDER_ISOLATED
#
define
ISOLATED_MIN
(
VAL
)
(
VAL
)
#
else
#
define
ISOLATED_MIN
(
VAL
)
0
#
endif
#
ifdef
EXTRA_EXTRAPOLATION
//
border
>
src
image
size
#
ifdef
BORDER_CONSTANT
//
None
#
elif
defined
BORDER_REPLICATE
#
define
EXTRAPOLATE
(
x,
y,
minX,
minY,
maxX,
maxY
)
\
{
\
x
=
max
(
min
(
x,
maxX
-
1
)
,
minX
)
; \
y
=
max
(
min
(
y,
maxY
-
1
)
,
minY
)
; \
}
#
elif
defined
BORDER_WRAP
#
define
EXTRAPOLATE
(
x,
y,
minX,
minY,
maxX,
maxY
)
\
{
\
if
(
x
<
minX
)
\
x
-=
((
x
-
maxX
+
1
)
/
maxX
)
*
maxX
; \
if
(
x
>=
maxX
)
\
x
%=
maxX
; \
if
(
y
<
minY
)
\
y
-=
((
y
-
maxY
+
1
)
/
maxY
)
*
maxY
; \
if
(
y
>=
maxY
)
\
y
%=
maxY
; \
}
#
elif
defined
(
BORDER_REFLECT
)
|
| defined(BORDER_REFLECT_101)
#define EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, delta) \
{ \
if (maxX - minX == 1) \
x = minX; \
else \
do \
{ \
if (x < minX) \
x = minX - (x - minX) - 1 + delta; \
else \
x = maxX - 1 - (x - maxX) - delta; \
} \
while (x >= maxX || x < minX); \
\
if (maxY - minY == 1) \
y = minY; \
else \
do \
{ \
if (y < minY) \
y = minY - (y - minY) - 1 + delta; \
else \
y = maxY - 1 - (y - maxY) - delta; \
} \
while (y >= maxY || y < minY); \
}
#ifdef BORDER_REFLECT
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 0)
#elif defined(BORDER_REFLECT_101) || defined(BORDER_REFLECT101)
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) EXTRAPOLATE_(x, y, minX, minY, maxX, maxY, 1)
#endif
#else
#error No extrapolation method
#endif
#else
#define EXTRAPOLATE(x, y, minX, minY, maxX, maxY) \
{ \
int _row = y - ISOLATED_MIN(minY), _col = x - ISOLATED_MIN(minX); \
_row = ADDR_H(_row, 0, maxY - ISOLATED_MIN(minY)); \
_row = ADDR_B(_row, maxY - ISOLATED_MIN(minY), _row); \
y = _row + ISOLATED_MIN(minY); \
\
_col = ADDR_L(_col, 0, maxX - ISOLATED_MIN(minX)); \
_col = ADDR_R(_col, maxX - ISOLATED_MIN(minX), _col); \
x = _col + ISOLATED_MIN(minX); \
}
#endif
#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#endif
#if cn != 3
#define loadpix(addr) *(__global const srcT *)(addr)
#define storepix(val, addr) *(__global dstT *)(addr) = val
#define SRCSIZE (int)sizeof(srcT)
#define DSTSIZE (int)sizeof(dstT)
#else
#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
#define SRCSIZE (int)sizeof(srcT1) * cn
#define DSTSIZE (int)sizeof(dstT1) * cn
#endif
#define noconvert
struct RectCoords
{
int x1, y1, x2, y2;
};
#ifdef BORDER_ISOLATED
inline bool isBorder(const struct RectCoords bounds, int2 coord, int numPixels)
{
return (coord.x < bounds.x1 || coord.y < bounds.y1 || coord.x + numPixels > bounds.x2 || coord.y >= bounds.y2);
}
#else
inline bool isBorder(const struct RectCoords bounds, int2 coord, int numPixels)
{
return (coord.x < 0 || coord.y < 0 || coord.x + numPixels > bounds.x2 || coord.y >= bounds.y2);
}
#endif
WT getBorderPixel(const struct RectCoords bounds, int2 coord,
__global const uchar* srcptr, int srcstep)
{
#ifdef BORDER_CONSTANT
return (WT)(0);
#else
int selected_col = coord.x;
int selected_row = coord.y;
EXTRAPOLATE(selected_col, selected_row,
bounds.x1, bounds.y1,
bounds.x2, bounds.y2
);
coord = (int2)(selected_col, selected_row);
__global const uchar* ptr = srcptr + mul24(coord.y, srcstep) +
coord.x * SRCSIZE;
return convertToWT(loadpix(ptr));
#endif
}
inline WT readSrcPixelSingle(int2 pos, __global const uchar* srcptr,
int srcstep, const struct RectCoords srcCoords)
{
if (!isBorder(srcCoords, pos, 1))
{
__global const uchar* ptr = srcptr + mul24(pos.y, srcstep) +
pos.x * SRCSIZE;
return convertToWT(loadpix(ptr));
}
else
{
return getBorderPixel(srcCoords, pos, srcptr, srcstep);
}
}
#define __CAT(x, y) x##y
#define CAT(x, y) __CAT(x, y)
#define vload1(OFFSET, PTR) (*(PTR + OFFSET))
#define PX_LOAD_VEC_TYPE CAT(srcT1, PX_LOAD_VEC_SIZE)
#define PX_LOAD_FLOAT_VEC_TYPE CAT(WT1, PX_LOAD_VEC_SIZE)
#define PX_LOAD_FLOAT_VEC_CONV CAT(convert_, PX_LOAD_FLOAT_VEC_TYPE)
#define PX_LOAD CAT(vload, PX_LOAD_VEC_SIZE)
#define float1 float
inline PX_LOAD_FLOAT_VEC_TYPE readSrcPixelGroup(int2 pos, __global const uchar* srcptr,
int srcstep, const struct RectCoords srcCoords)
{
__global const srcT1* ptr = (__global const srcT1*)
(srcptr + mul24(pos.y, srcstep) +
pos.x * SRCSIZE);
return PX_LOAD_FLOAT_VEC_CONV(PX_LOAD(0, ptr));
}
// Macros to ensure unrolled loops
#define LOOP1(VAR, STMT) (STMT); (VAR)++;
#define LOOP2(VAR, STMT) LOOP1(VAR, STMT); (STMT); (VAR)++;
#define LOOP3(VAR, STMT) LOOP2(VAR, STMT); (STMT); (VAR)++;
#define LOOP4(VAR, STMT) LOOP3(VAR, STMT); (STMT); (VAR)++;
#define LOOP5(VAR, STMT) LOOP4(VAR, STMT); (STMT); (VAR)++;
#define LOOP6(VAR, STMT) LOOP5(VAR, STMT); (STMT); (VAR)++;
#define LOOP7(VAR, STMT) LOOP6(VAR, STMT); (STMT); (VAR)++;
#define LOOP8(VAR, STMT) LOOP7(VAR, STMT); (STMT); (VAR)++;
#define LOOP9(VAR, STMT) LOOP8(VAR, STMT); (STMT); (VAR)++;
#define LOOP10(VAR, STMT) LOOP9(VAR, STMT); (STMT); (VAR)++;
#define LOOP11(VAR, STMT) LOOP10(VAR, STMT); (STMT); (VAR)++;
#define LOOP12(VAR, STMT) LOOP11(VAR, STMT); (STMT); (VAR)++;
#define LOOP13(VAR, STMT) LOOP12(VAR, STMT); (STMT); (VAR)++;
#define LOOP(N, VAR, STMT) CAT(LOOP, N)((VAR), (STMT))
#define DIG(a) a,
__constant WT1 kernelData[] = { COEFF };
__kernel void filter2DSmall(__global const uchar * srcptr, int src_step, int srcOffsetX, int srcOffsetY, int srcEndX, int srcEndY,
__global uchar * dstptr, int dst_step, int dst_offset, int rows, int cols, float delta)
{
const struct RectCoords srcCoords = { srcOffsetX, srcOffsetY, srcEndX, srcEndY }; // for non-isolated border: offsetX, offsetY, wholeX, wholeY
const int startX = get_global_id(0) * PX_PER_WI_X;
const int startY = get_global_id(1) * PX_PER_WI_Y;
if ((startX >= cols) |
|
(
startY
>=
rows
))
{
return
;
}
WT
privateData[PX_PER_WI_Y
+
KERNEL_SIZE_Y
-
1][PRIV_DATA_WIDTH]
;
//
Load
all
of
the
pixels
needed
for
the
calculation
int
py
=
0
;
LOOP
(
PX_LOAD_Y_ITERATIONS,
py,
{
int
y
=
startY
+
py
;
int
px
=
0
;
LOOP
(
PX_LOAD_X_ITERATIONS,
px,
{
int
x
=
startX
+
(
px
*
PX_LOAD_NUM_PX
)
;
int2
srcPos
=
(
int2
)(
srcCoords.x1
+
x
-
ANCHOR_X,
srcCoords.y1
+
y
-
ANCHOR_Y
)
;
if
(
!isBorder
(
srcCoords,
srcPos,
PX_LOAD_NUM_PX
))
{
PX_LOAD_FLOAT_VEC_TYPE
p
=
readSrcPixelGroup
(
srcPos,
srcptr,
src_step,
srcCoords
)
;
*
((
PX_LOAD_FLOAT_VEC_TYPE*
)
&privateData[py][px
*
PX_LOAD_NUM_PX]
)
=
p
;
}
else
{
int
lx
=
0
;
LOOP
(
PX_LOAD_NUM_PX,
lx,
{
WT
p
=
readSrcPixelSingle
(
srcPos,
srcptr,
src_step,
srcCoords
)
;
*
((
WT*
)
&privateData[py][px
*
PX_LOAD_NUM_PX
+
lx]
)
=
p
;
srcPos.x++
;
}
)
;
}
}
)
;
}
)
;
//
Use
the
stored
pixels
to
compute
the
results
py
=
0
;
LOOP
(
PX_PER_WI_Y,
py,
{
int
y
=
startY
+
py
;
int
px
=
0
;
LOOP
(
PX_PER_WI_X,
px,
{
int
x
=
startX
+
px
;
WT
total_sum
=
0
;
int
sy
=
0
;
int
kernelIndex
=
0
;
LOOP
(
KERNEL_SIZE_Y,
sy,
{
int
sx
=
0
;
LOOP
(
KERNEL_SIZE_X,
sx,
{
total_sum
=
mad
(
kernelData[kernelIndex++],
privateData[py
+
sy][px
+
sx],
total_sum
)
;
}
)
;
}
)
;
__global
dstT*
dstPtr
=
(
__global
dstT*
)(
dstptr
+
y
*
dst_step
+
dst_offset
+
x
*
DSTSIZE
)
; // Pointer can be out of bounds!
storepix
(
convertToDstT
(
total_sum
+
(
WT
)(
delta
))
,
dstPtr
)
;
}
)
;
}
)
;
}
modules/imgproc/test/ocl/test_filter2d.cpp
浏览文件 @
1f8b41f3
...
...
@@ -51,7 +51,7 @@ namespace ocl {
/////////////////////////////////////////////////////////////////////////////////////////////////
// Filter2D
PARAM_TEST_CASE
(
Filter2D
,
MatDepth
,
Channels
,
BorderType
,
bool
,
bool
)
PARAM_TEST_CASE
(
Filter2D
,
MatDepth
,
Channels
,
int
,
int
,
BorderType
,
bool
,
bool
)
{
static
const
int
kernelMinSize
=
2
;
static
const
int
kernelMaxSize
=
10
;
...
...
@@ -60,6 +60,7 @@ PARAM_TEST_CASE(Filter2D, MatDepth, Channels, BorderType, bool, bool)
Size
dsize
;
Point
anchor
;
int
borderType
;
int
widthMultiple
;
bool
useRoi
;
Mat
kernel
;
double
delta
;
...
...
@@ -70,27 +71,30 @@ PARAM_TEST_CASE(Filter2D, MatDepth, Channels, BorderType, bool, bool)
virtual
void
SetUp
()
{
type
=
CV_MAKE_TYPE
(
GET_PARAM
(
0
),
GET_PARAM
(
1
));
borderType
=
GET_PARAM
(
2
)
|
(
GET_PARAM
(
3
)
?
BORDER_ISOLATED
:
0
);
useRoi
=
GET_PARAM
(
4
);
Size
ksize
(
GET_PARAM
(
2
),
GET_PARAM
(
2
));
widthMultiple
=
GET_PARAM
(
3
);
borderType
=
GET_PARAM
(
4
)
|
(
GET_PARAM
(
5
)
?
BORDER_ISOLATED
:
0
);
useRoi
=
GET_PARAM
(
6
);
Mat
temp
=
randomMat
(
ksize
,
CV_MAKE_TYPE
(((
CV_64F
==
CV_MAT_DEPTH
(
type
))
?
CV_64F
:
CV_32F
),
1
),
-
MAX_VALUE
,
MAX_VALUE
);
cv
::
normalize
(
temp
,
kernel
,
1.0
,
0.0
,
NORM_L1
);
}
void
random_roi
()
{
dsize
=
randomSize
(
1
,
MAX_VALUE
);
// Make sure the width is a multiple of the requested value, and no more.
dsize
.
width
&=
~
((
widthMultiple
*
2
)
-
1
);
dsize
.
width
+=
widthMultiple
;
Size
ksize
=
randomSize
(
kernelMinSize
,
kernelMaxSize
);
Mat
temp
=
randomMat
(
ksize
,
CV_MAKE_TYPE
(((
CV_64F
==
CV_MAT_DEPTH
(
type
))
?
CV_64F
:
CV_32F
),
1
),
-
MAX_VALUE
,
MAX_VALUE
);
cv
::
normalize
(
temp
,
kernel
,
1.0
,
0.0
,
NORM_L1
);
Size
roiSize
=
randomSize
(
ksize
.
width
,
MAX_VALUE
,
ksize
.
height
,
MAX_VALUE
);
Size
roiSize
=
randomSize
(
kernel
.
size
[
0
],
MAX_VALUE
,
kernel
.
size
[
1
],
MAX_VALUE
);
Border
srcBorder
=
randomBorder
(
0
,
useRoi
?
MAX_VALUE
:
0
);
randomSubMat
(
src
,
src_roi
,
roiSize
,
srcBorder
,
type
,
-
MAX_VALUE
,
MAX_VALUE
);
Border
dstBorder
=
randomBorder
(
0
,
useRoi
?
MAX_VALUE
:
0
);
randomSubMat
(
dst
,
dst_roi
,
dsize
,
dstBorder
,
type
,
-
MAX_VALUE
,
MAX_VALUE
);
anchor
.
x
=
randomInt
(
-
1
,
k
size
.
width
);
anchor
.
y
=
randomInt
(
-
1
,
k
size
.
height
);
anchor
.
x
=
randomInt
(
-
1
,
k
ernel
.
size
[
0
]
);
anchor
.
y
=
randomInt
(
-
1
,
k
ernel
.
size
[
1
]
);
delta
=
randomDouble
(
-
100
,
100
);
...
...
@@ -122,6 +126,8 @@ OCL_INSTANTIATE_TEST_CASE_P(ImageProc, Filter2D,
Combine
(
Values
(
CV_8U
,
CV_16U
,
CV_32F
),
OCL_ALL_CHANNELS
,
Values
(
3
,
5
,
9
),
// Kernel size
Values
(
1
,
4
,
8
),
// Width mutiple
Values
((
BorderType
)
BORDER_CONSTANT
,
(
BorderType
)
BORDER_REPLICATE
,
(
BorderType
)
BORDER_REFLECT
,
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录