Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
ad6aae45
O
Opencv
项目概览
Greenplum
/
Opencv
11 个月 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
ad6aae45
编写于
3月 26, 2013
作者:
Y
yao
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
more fix of mismatch functions on CPU OCL
上级
2c06e59a
变更
6
展开全部
隐藏空白更改
内联
并排
Showing
6 changed file
with
231 addition
and
798 deletion
+231
-798
modules/ocl/src/brute_force_matcher.cpp
modules/ocl/src/brute_force_matcher.cpp
+119
-573
modules/ocl/src/haar.cpp
modules/ocl/src/haar.cpp
+2
-2
modules/ocl/src/moments.cpp
modules/ocl/src/moments.cpp
+3
-3
modules/ocl/src/opencl/brute_force_match.cl
modules/ocl/src/opencl/brute_force_match.cl
+97
-218
modules/ocl/src/opencl/haarobjectdetect.cl
modules/ocl/src/opencl/haarobjectdetect.cl
+8
-0
modules/ocl/test/test_brute_force_matcher.cpp
modules/ocl/test/test_brute_force_matcher.cpp
+2
-2
未找到文件。
modules/ocl/src/brute_force_matcher.cpp
浏览文件 @
ad6aae45
此差异已折叠。
点击以展开。
modules/ocl/src/haar.cpp
浏览文件 @
ad6aae45
...
...
@@ -953,8 +953,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
//int flag = 0;
oclMat
gimg1
(
gimg
.
rows
,
gimg
.
cols
,
CV_8UC1
);
oclMat
gsum
(
totalheight
,
gimg
.
cols
+
1
,
CV_32SC1
);
oclMat
gsqsum
(
totalheight
,
gimg
.
cols
+
1
,
CV_32FC1
);
oclMat
gsum
(
totalheight
+
4
,
gimg
.
cols
+
1
,
CV_32SC1
);
oclMat
gsqsum
(
totalheight
+
4
,
gimg
.
cols
+
1
,
CV_32FC1
);
//cl_mem cascadebuffer;
cl_mem
stagebuffer
;
...
...
modules/ocl/src/moments.cpp
浏览文件 @
ad6aae45
...
...
@@ -106,7 +106,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
bool
is_float
=
CV_SEQ_ELTYPE
(
contour
)
==
CV_32FC2
;
if
(
!
cv
::
ocl
::
Context
::
getContext
()
->
impl
->
double_support
&&
is_float
)
if
(
!
cv
::
ocl
::
Context
::
getContext
()
->
supportsFeature
(
Context
::
CL_DOUBLE
)
&&
is_float
)
{
CV_Error
(
CV_StsUnsupportedFormat
,
"Moments - double is not supported by your GPU!"
);
}
...
...
@@ -146,7 +146,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
cv
::
Mat
dst
(
dst_a
);
a00
=
a10
=
a01
=
a20
=
a11
=
a02
=
a30
=
a21
=
a12
=
a03
=
0.0
;
if
(
!
cv
::
ocl
::
Context
::
getContext
()
->
impl
->
double_support
)
if
(
!
cv
::
ocl
::
Context
::
getContext
()
->
supportsFeature
(
Context
::
CL_DOUBLE
)
)
{
for
(
int
i
=
0
;
i
<
contour
->
total
;
++
i
)
{
...
...
@@ -161,7 +161,7 @@ static void icvContourMoments( CvSeq* contour, CvMoments* mom )
a12
+=
dst
.
at
<
cl_long
>
(
8
,
i
);
a03
+=
dst
.
at
<
cl_long
>
(
9
,
i
);
}
}
}
else
{
a00
=
cv
::
sum
(
dst
.
row
(
0
))[
0
];
...
...
modules/ocl/src/opencl/brute_force_match.cl
浏览文件 @
ad6aae45
...
...
@@ -5,19 +5,93 @@ int bit1Count(float x)
{
int
c
=
0
;
int
ix
=
(
int
)
x
;
for
(
int
i
=
0
; i < 32 ; i++)
{
c
+=
ix
&
0x1
;
ix
>>=
1
;
}
return
(
float
)
c
;
}
/*
 * Accumulates a partial distance between one query row and one train column
 * stored in the two __local buffers.
 *
 *   s_query     query values, read at [lidy * block_size + j]
 *   s_train     train values, read at [j * block_size + lidx]
 *   block_size  number of terms summed (j runs over [0, block_size))
 *   lidx, lidy  select the train column / query row being compared
 *   distType    0 - L1 distance:      sum of |v1 - v2|
 *               1 - L2 distance:      sum of (v1 - v2) * (v1 - v2)
 *               2 - Hamming distance: sum of bit1Count(v1 ^ v2), with both
 *                   operands cast to uint before the XOR
 *
 * Any other distType leaves the sum at 0. The caller adds the returned value
 * to its own running total.
 */
float reduce_block(__local float *s_query,
                   __local float *s_train,
                   int block_size,
                   int lidx,
                   int lidy,
                   int distType)
{
    // Offset of the query row; invariant across the summation loop.
    const int q_row = lidy * block_size;
    float acc = 0;

    if (distType == 0)
    {
        // L1: absolute differences.
        for (int k = 0; k < block_size; ++k)
        {
            acc += fabs(s_query[q_row + k] - s_train[k * block_size + lidx]);
        }
    }
    else if (distType == 1)
    {
        // L2: squared differences (no square root is taken here).
        for (int k = 0; k < block_size; ++k)
        {
            float diff = s_query[q_row + k] - s_train[k * block_size + lidx];
            acc += diff * diff;
        }
    }
    else if (distType == 2)
    {
        // Hamming: count the set bits of the XOR of the two values.
        // The train index is computed in unsigned arithmetic, as before.
        for (int k = 0; k < block_size; ++k)
        {
            acc += bit1Count((uint)s_query[q_row + k] ^ (uint)s_train[(uint)k * block_size + lidx]);
        }
    }

    return acc;
}
/*
 * Accumulates a partial distance for one block of a longer query descriptor.
 * The query buffer holds max_desc_len values per row; block_index selects
 * which block_size-wide slice of that row is compared with the train buffer.
 *
 *   s_query       query values, read at
 *                 [lidy * max_desc_len + block_index * block_size + j]
 *   s_train       train values, read at [j * block_size + lidx]
 *   max_desc_len  row stride of s_query
 *   block_size    number of terms summed (j runs over [0, block_size))
 *   block_index   index of the slice within the query row
 *   lidx, lidy    select the train column / query row being compared
 *   distType      0 - L1 distance:      sum of |v1 - v2|
 *                 1 - L2 distance:      sum of (v1 - v2) * (v1 - v2)
 *                 2 - Hamming distance: sum of bit1Count(v1 ^ v2), with both
 *                     operands cast to uint before the XOR
 *
 * Any other distType leaves the sum at 0. The caller adds the returned value
 * to its own running total.
 */
float reduce_multi_block(__local float *s_query,
                         __local float *s_train,
                         int max_desc_len,
                         int block_size,
                         int block_index,
                         int lidx,
                         int lidy,
                         int distType)
{
    // Start of the selected slice in the query row; invariant across the loop.
    const int q_base = lidy * max_desc_len + block_index * block_size;
    float acc = 0;

    if (distType == 0)
    {
        // L1: absolute differences.
        for (int k = 0; k < block_size; ++k)
        {
            acc += fabs(s_query[q_base + k] - s_train[k * block_size + lidx]);
        }
    }
    else if (distType == 1)
    {
        // L2: squared differences (no square root is taken here).
        for (int k = 0; k < block_size; ++k)
        {
            float diff = s_query[q_base + k] - s_train[k * block_size + lidx];
            acc += diff * diff;
        }
    }
    else if (distType == 2)
    {
        // Hamming: count the set bits of the XOR of the two values.
        // A variant using the builtin popcount() was left commented out in
        // the original; bit1Count() is the one actually called.
        for (int k = 0; k < block_size; ++k)
        {
            acc += bit1Count((uint)s_query[q_base + k] ^ (uint)s_train[k * block_size + lidx]);
        }
    }

    return acc;
}
/*
2dim
launch,
global
size:
dim0
is
(
query
rows
+
block_size
-
1
)
/
block_size
*
block_size,
dim1
is
block_size
local
size:
dim0
is
block_size,
dim1
is
block_size.
*/
__kernel
void
BruteForceMatch_UnrollMatch
(
__kernel
void
BruteForceMatch_UnrollMatch
_D5
(
__global
float
*query,
__global
float
*train,
//__global
float
*mask,
...
...
@@ -42,7 +116,6 @@ __kernel void BruteForceMatch_UnrollMatch(
__local
float
*s_train
=
sharebuffer
+
block_size
*
max_desc_len
;
int
queryIdx
=
groupidx
*
block_size
+
lidy
;
//
load
the
query
into
local
memory.
for
(
int
i
=
0
; i < max_desc_len / block_size; i ++)
{
...
...
@@ -55,11 +128,9 @@ __kernel void BruteForceMatch_UnrollMatch(
//
loopUnrolledCached
to
find
the
best
trainIdx
and
best
distance.
volatile
int
imgIdx
=
0
;
for
(
int
t
=
0
; t < (train_rows + block_size - 1) / block_size ; t++)
{
float
result
=
0
;
for
(
int
i
=
0
; i < max_desc_len / block_size ; i++)
{
//load
a
block_size
*
block_size
block
into
local
train.
...
...
@@ -69,38 +140,7 @@ __kernel void BruteForceMatch_UnrollMatch(
//synchronize
to
make
sure
each
elem
for
reduceIteration
in
share
memory
is
written
already.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
/*
there
are
threee
types
in
the
reducer.
the
first
is
L1Dist,
which
to
sum
the
abs
(
v1,
v2
)
,
the
second
is
L2Dist,
which
to
sum
the
(
v1
-
v2
)
*
(
v1
-
v2
)
,
the
third
is
humming,
which
to
popc
(
v1
^
v2
)
,
popc
is
to
count
the
bits
are
set
to
1*/
switch
(
distType
)
{
case
0:
for
(
int
j
=
0
; j < block_size ; j++)
{
result
+=
fabs
(
s_query[lidy
*
max_desc_len
+
i
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
case
1:
for
(
int
j
=
0
; j < block_size ; j++)
{
float
qr
=
s_query[lidy
*
max_desc_len
+
i
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
;
result
+=
qr
*
qr
;
}
break
;
case
2:
for
(
int
j
=
0
; j < block_size ; j++)
{
//result
+=
popcount
((
uint
)
s_query[lidy
*
max_desc_len
+
i
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
result
+=
bit1Count
((
uint
)
s_query[lidy
*
max_desc_len
+
i
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
}
result
+=
reduce_multi_block
(
s_query,
s_train,
max_desc_len,
block_size,
i,
lidx,
lidy,
distType
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
...
...
@@ -116,8 +156,8 @@ __kernel void BruteForceMatch_UnrollMatch(
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
__local
float
*s_distance
=
(
__local
float
*
)(
sharebuffer
)
;
__local
int
*
s_trainIdx
=
(
__local
int
*
)(
sharebuffer
+
block_size
*
block_size
)
;
__local
float
*s_distance
=
(
__local
float*
)(
sharebuffer
)
;
__local
int
*
s_trainIdx
=
(
__local
int
*
)(
sharebuffer
+
block_size
*
block_size
)
;
//find
BestMatch
s_distance
+=
lidy
*
block_size
;
...
...
@@ -144,7 +184,7 @@ __kernel void BruteForceMatch_UnrollMatch(
}
}
__kernel
void
BruteForceMatch_Match
(
__kernel
void
BruteForceMatch_Match
_D5
(
__global
float
*query,
__global
float
*train,
//__global
float
*mask,
...
...
@@ -177,7 +217,6 @@ __kernel void BruteForceMatch_Match(
{
//Dist
dist
;
float
result
=
0
;
for
(
int
i
=
0
; i < (query_cols + block_size - 1) / block_size ; i++)
{
const
int
loadx
=
lidx
+
i
*
block_size
;
...
...
@@ -193,38 +232,7 @@ __kernel void BruteForceMatch_Match(
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
/*
there
are
threee
types
in
the
reducer.
the
first
is
L1Dist,
which
to
sum
the
abs
(
v1,
v2
)
,
the
second
is
L2Dist,
which
to
sum
the
(
v1
-
v2
)
*
(
v1
-
v2
)
,
the
third
is
humming,
which
to
popc
(
v1
^
v2
)
,
popc
is
to
count
the
bits
are
set
to
1*/
switch
(
distType
)
{
case
0:
for
(
int
j
=
0
; j < block_size ; j++)
{
result
+=
fabs
(
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
case
1:
for
(
int
j
=
0
; j < block_size ; j++)
{
float
qr
=
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
;
result
+=
qr
*
qr
;
}
break
;
case
2:
for
(
int
j
=
0
; j < block_size ; j++)
{
//result
+=
popcount
((
uint
)
s_query[lidy
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
result
+=
bit1Count
((
uint
)
s_query[lidy
*
block_size
+
j]
^
(
uint
)
s_train[
(
uint
)
j
*
block_size
+
lidx]
)
;
}
break
;
}
result
+=
reduce_block
(
s_query,
s_train,
block_size,
lidx,
lidy,
distType
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
...
...
@@ -270,7 +278,7 @@ __kernel void BruteForceMatch_Match(
}
//radius_unrollmatch
__kernel
void
BruteForceMatch_RadiusUnrollMatch
(
__kernel
void
BruteForceMatch_RadiusUnrollMatch
_D5
(
__global
float
*query,
__global
float
*train,
float
maxDistance,
...
...
@@ -303,7 +311,6 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
__local
float
*s_train
=
sharebuffer
+
block_size
*
block_size
;
float
result
=
0
;
for
(
int
i
=
0
; i < max_desc_len / block_size ; ++i)
{
//load
a
block_size
*
block_size
block
into
local
train.
...
...
@@ -315,37 +322,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
//synchronize
to
make
sure
each
elem
for
reduceIteration
in
share
memory
is
written
already.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
/*
there
are
three
types
in
the
reducer.
the
first
is
L1Dist,
which
to
sum
the
abs
(
v1,
v2
)
,
the
second
is
L2Dist,
which
to
sum
the
(
v1
-
v2
)
*
(
v1
-
v2
)
,
the
third
is
humming,
which
to
popc
(
v1
^
v2
)
,
popc
is
to
count
the
bits
are
set
to
1*/
switch
(
distType
)
{
case
0:
for
(
int
j
=
0
; j < block_size ; ++j)
{
result
+=
fabs
(
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
case
1:
for
(
int
j
=
0
; j < block_size ; ++j)
{
float
qr
=
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
;
result
+=
qr
*
qr
;
}
break
;
case
2:
for
(
int
j
=
0
; j < block_size ; ++j)
{
result
+=
bit1Count
((
uint
)
s_query[lidy
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
}
result
+=
reduce_block
(
s_query,
s_train,
block_size,
lidx,
lidy,
distType
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
...
...
@@ -354,7 +331,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
{
unsigned
int
ind
=
atom_inc
(
nMatches
+
queryIdx/*,
(
unsigned
int
)
-1*/
)
;
if
(
ind
<
bestTrainIdx_cols
)
if
(
ind
<
bestTrainIdx_cols
)
{
//bestImgIdx
=
imgIdx
;
bestTrainIdx[queryIdx
*
(
ostep
/
sizeof
(
int
))
+
ind]
=
trainIdx
;
...
...
@@ -364,7 +341,7 @@ __kernel void BruteForceMatch_RadiusUnrollMatch(
}
//radius_match
__kernel
void
BruteForceMatch_RadiusMatch
(
__kernel
void
BruteForceMatch_RadiusMatch
_D5
(
__global
float
*query,
__global
float
*train,
float
maxDistance,
...
...
@@ -396,7 +373,6 @@ __kernel void BruteForceMatch_RadiusMatch(
__local
float
*s_train
=
sharebuffer
+
block_size
*
block_size
;
float
result
=
0
;
for
(
int
i
=
0
; i < (query_cols + block_size - 1) / block_size ; ++i)
{
//load
a
block_size
*
block_size
block
into
local
train.
...
...
@@ -408,46 +384,16 @@ __kernel void BruteForceMatch_RadiusMatch(
//synchronize
to
make
sure
each
elem
for
reduceIteration
in
share
memory
is
written
already.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
/*
there
are
three
types
in
the
reducer.
the
first
is
L1Dist,
which
to
sum
the
abs
(
v1,
v2
)
,
the
second
is
L2Dist,
which
to
sum
the
(
v1
-
v2
)
*
(
v1
-
v2
)
,
the
third
is
humming,
which
to
popc
(
v1
^
v2
)
,
popc
is
to
count
the
bits
are
set
to
1*/
switch
(
distType
)
{
case
0:
for
(
int
j
=
0
; j < block_size ; ++j)
{
result
+=
fabs
(
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
case
1:
for
(
int
j
=
0
; j < block_size ; ++j)
{
float
qr
=
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
;
result
+=
qr
*
qr
;
}
break
;
case
2:
for
(
int
j
=
0
; j < block_size ; ++j)
{
result
+=
bit1Count
((
uint
)
s_query[lidy
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
}
result
+=
reduce_block
(
s_query,
s_train,
block_size,
lidx,
lidy,
distType
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
queryIdx
<
query_rows
&&
trainIdx
<
train_rows
&&
result
<
maxDistance/*
&&
mask
(
queryIdx,
trainIdx
)
*/
)
{
unsigned
int
ind
=
atom_inc
(
nMatches
+
queryIdx
/*,
(
unsigned
int
)
-1*/
)
;
unsigned
int
ind
=
atom_inc
(
nMatches
+
queryIdx
)
;
if
(
ind
<
bestTrainIdx_cols
)
if
(
ind
<
bestTrainIdx_cols
)
{
//bestImgIdx
=
imgIdx
;
bestTrainIdx[queryIdx
*
(
ostep
/
sizeof
(
int
))
+
ind]
=
trainIdx
;
...
...
@@ -457,7 +403,7 @@ __kernel void BruteForceMatch_RadiusMatch(
}
__kernel
void
BruteForceMatch_knnUnrollMatch
(
__kernel
void
BruteForceMatch_knnUnrollMatch
_D5
(
__global
float
*query,
__global
float
*train,
//__global
float
*mask,
...
...
@@ -496,11 +442,9 @@ __kernel void BruteForceMatch_knnUnrollMatch(
//loopUnrolledCached
volatile
int
imgIdx
=
0
;
for
(
int
t
=
0
; t < (train_rows + block_size - 1) / block_size ; t++)
{
float
result
=
0
;
for
(
int
i
=
0
; i < max_desc_len / block_size ; i++)
{
const
int
loadX
=
lidx
+
i
*
block_size
;
...
...
@@ -511,38 +455,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
//synchronize
to
make
sure
each
elem
for
reduceIteration
in
share
memory
is
written
already.
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
/*
there
are
threee
types
in
the
reducer.
the
first
is
L1Dist,
which
to
sum
the
abs
(
v1,
v2
)
,
the
second
is
L2Dist,
which
to
sum
the
(
v1
-
v2
)
*
(
v1
-
v2
)
,
the
third
is
humming,
which
to
popc
(
v1
^
v2
)
,
popc
is
to
count
the
bits
are
set
to
1*/
switch
(
distType
)
{
case
0:
for
(
int
j
=
0
; j < block_size ; j++)
{
result
+=
fabs
(
s_query[lidy
*
max_desc_len
+
i
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
case
1:
for
(
int
j
=
0
; j < block_size ; j++)
{
float
qr
=
s_query[lidy
*
max_desc_len
+
i
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
;
result
+=
qr
*
qr
;
}
break
;
case
2:
for
(
int
j
=
0
; j < block_size ; j++)
{
//result
+=
popcount
((
uint
)
s_query[lidy
*
max_desc_len
+
i
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
result
+=
bit1Count
((
uint
)
s_query[lidy
*
max_desc_len
+
i
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
}
result
+=
reduce_multi_block
(
s_query,
s_train,
max_desc_len,
block_size,
i,
lidx,
lidy,
distType
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
...
...
@@ -589,7 +502,6 @@ __kernel void BruteForceMatch_knnUnrollMatch(
for
(
int
i
=
0
; i < block_size ; i++)
{
float
val
=
s_distance[i]
;
if
(
val
<
bestDistance1
)
{
bestDistance2
=
bestDistance1
;
...
...
@@ -640,7 +552,7 @@ __kernel void BruteForceMatch_knnUnrollMatch(
}
}
__kernel
void
BruteForceMatch_knnMatch
(
__kernel
void
BruteForceMatch_knnMatch
_D5
(
__global
float
*query,
__global
float
*train,
//__global
float
*mask,
...
...
@@ -673,8 +585,7 @@ __kernel void BruteForceMatch_knnMatch(
for
(
int
t
=
0
; t < (train_rows + block_size - 1) / block_size ; t++)
{
float
result
=
0.0f
;
for
(
int
i
=
0
; i < (query_cols + block_size - 1) / block_size ; i++)
for
(
int
i
=
0
; i < (query_cols + block_size -1) / block_size ; i++)
{
const
int
loadx
=
lidx
+
i
*
block_size
;
//load
query
and
train
into
local
memory
...
...
@@ -689,38 +600,7 @@ __kernel void BruteForceMatch_knnMatch(
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
/*
there
are
threee
types
in
the
reducer.
the
first
is
L1Dist,
which
to
sum
the
abs
(
v1,
v2
)
,
the
second
is
L2Dist,
which
to
sum
the
(
v1
-
v2
)
*
(
v1
-
v2
)
,
the
third
is
humming,
which
to
popc
(
v1
^
v2
)
,
popc
is
to
count
the
bits
are
set
to
1*/
switch
(
distType
)
{
case
0:
for
(
int
j
=
0
; j < block_size ; j++)
{
result
+=
fabs
(
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
)
;
}
break
;
case
1:
for
(
int
j
=
0
; j < block_size ; j++)
{
float
qr
=
s_query[lidy
*
block_size
+
j]
-
s_train[j
*
block_size
+
lidx]
;
result
+=
qr
*
qr
;
}
break
;
case
2:
for
(
int
j
=
0
; j < block_size ; j++)
{
//result
+=
popcount
((
uint
)
s_query[lidy
*
block_size
+
j]
^
(
uint
)
s_train[j
*
block_size
+
lidx]
)
;
result
+=
bit1Count
((
uint
)
s_query[lidy
*
block_size
+
j]
^
(
uint
)
s_train[
(
uint
)
j
*
block_size
+
lidx]
)
;
}
break
;
}
result
+=
reduce_block
(
s_query,
s_train,
block_size,
lidx,
lidy,
distType
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
...
...
@@ -767,7 +647,6 @@ __kernel void BruteForceMatch_knnMatch(
for
(
int
i
=
0
; i < block_size ; i++)
{
float
val
=
s_distance[i]
;
if
(
val
<
bestDistance1
)
{
bestDistance2
=
bestDistance1
;
...
...
@@ -818,7 +697,7 @@ __kernel void BruteForceMatch_knnMatch(
}
}
kernel
void
BruteForceMatch_calcDistanceUnrolled
(
kernel
void
BruteForceMatch_calcDistanceUnrolled
_D5
(
__global
float
*query,
__global
float
*train,
//__global
float
*mask,
...
...
@@ -836,7 +715,7 @@ kernel void BruteForceMatch_calcDistanceUnrolled(
/*
Todo
*/
}
kernel
void
BruteForceMatch_calcDistance
(
kernel
void
BruteForceMatch_calcDistance
_D5
(
__global
float
*query,
__global
float
*train,
//__global
float
*mask,
...
...
@@ -853,7 +732,7 @@ kernel void BruteForceMatch_calcDistance(
/*
Todo
*/
}
kernel
void
BruteForceMatch_findBestMatch
(
kernel
void
BruteForceMatch_findBestMatch
_D5
(
__global
float
*allDist,
__global
int
*bestTrainIdx,
__global
float
*bestDistance,
...
...
modules/ocl/src/opencl/haarobjectdetect.cl
浏览文件 @
ad6aae45
...
...
@@ -211,10 +211,14 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int4
data
=
*
(
__global
int4*
)
&sum[glb_off]
;
int
lcl_off
=
mad24
(
lcl_y,
readwidth,
lcl_x<<2
)
;
#
if
OFF
lcldata[lcl_off]
=
data.x
;
lcldata[lcl_off+1]
=
data.y
;
lcldata[lcl_off+2]
=
data.z
;
lcldata[lcl_off+3]
=
data.w
;
#
else
vstore4
(
data,
0
,
&lcldata[lcl_off]
)
;
#
endif
}
lcloutindex[lcl_id]
=
0
;
...
...
@@ -559,3 +563,7 @@ if(result)
}
}
*/
modules/ocl/test/test_brute_force_matcher.cpp
浏览文件 @
ad6aae45
...
...
@@ -110,7 +110,7 @@ namespace
}
};
TEST_P
(
BruteForceMatcher
,
DISABLED_
Match_Single
)
TEST_P
(
BruteForceMatcher
,
Match_Single
)
{
cv
::
ocl
::
BruteForceMatcher_OCL_base
matcher
(
distType
);
...
...
@@ -130,7 +130,7 @@ namespace
ASSERT_EQ
(
0
,
badCount
);
}
TEST_P
(
BruteForceMatcher
,
DISABLED_
KnnMatch_2_Single
)
TEST_P
(
BruteForceMatcher
,
KnnMatch_2_Single
)
{
const
int
knn
=
2
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录