Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
917b883c
O
Opencv
项目概览
Greenplum
/
Opencv
大约 1 年 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
917b883c
编写于
12月 17, 2013
作者:
K
krodyush
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
remove extra calculations from haar to be consistent with native implementation
上级
5d5527d0
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
62 addition
and
33 deletion
+62
-33
modules/ocl/src/haar.cpp
modules/ocl/src/haar.cpp
+49
-20
modules/ocl/src/opencl/haarobjectdetect.cl
modules/ocl/src/opencl/haarobjectdetect.cl
+13
-13
未找到文件。
modules/ocl/src/haar.cpp
浏览文件 @
917b883c
...
...
@@ -866,16 +866,17 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
if
(
gcascade
->
is_stump_based
&&
gsum
.
clCxt
->
supportsFeature
(
FEATURE_CL_INTEL_DEVICE
))
{
//setup local group size
localThreads
[
0
]
=
8
;
localThreads
[
1
]
=
16
;
//setup local group size
for "pixel step" = 1
localThreads
[
0
]
=
16
;
localThreads
[
1
]
=
32
;
localThreads
[
2
]
=
1
;
//
init
maximal number of workgroups
//
calc
maximal number of workgroups
int
WGNumX
=
1
+
(
sizev
[
0
].
width
/
(
localThreads
[
0
]));
int
WGNumY
=
1
+
(
sizev
[
0
].
height
/
(
localThreads
[
1
]));
int
WGNumZ
=
loopcount
;
int
WGNum
=
0
;
//accurate number of non -empty workgroups
int
WGNumTotal
=
0
;
//accurate number of non-empty workgroups
int
WGNumSampled
=
0
;
//accurate number of workgroups processed only 1/4 part of all pixels. it is made for large images with scale <= 2
oclMat
oclWGInfo
(
1
,
sizeof
(
cl_int4
)
*
WGNumX
*
WGNumY
*
WGNumZ
,
CV_8U
);
{
cl_int4
*
pWGInfo
=
(
cl_int4
*
)
clEnqueueMapBuffer
(
getClCommandQueue
(
oclWGInfo
.
clCxt
),(
cl_mem
)
oclWGInfo
.
datastart
,
true
,
CL_MAP_WRITE
,
0
,
oclWGInfo
.
step
,
0
,
0
,
0
,
&
status
);
...
...
@@ -895,12 +896,16 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
if
(
gx
>=
(
Width
-
cascade
->
orig_window_size
.
width
))
continue
;
// no data to process
if
(
scaleinfo
[
z
].
factor
<=
2
)
{
WGNumSampled
++
;
}
// save no-empty workgroup info into array
pWGInfo
[
WGNum
].
s
[
0
]
=
scaleinfo
[
z
].
width_height
;
pWGInfo
[
WGNum
].
s
[
1
]
=
(
gx
<<
16
)
|
gy
;
pWGInfo
[
WGNum
].
s
[
2
]
=
scaleinfo
[
z
].
imgoff
;
memcpy
(
&
(
pWGInfo
[
WGNum
].
s
[
3
]),
&
(
scaleinfo
[
z
].
factor
),
sizeof
(
float
));
WGNum
++
;
pWGInfo
[
WGNum
Total
].
s
[
0
]
=
scaleinfo
[
z
].
width_height
;
pWGInfo
[
WGNum
Total
].
s
[
1
]
=
(
gx
<<
16
)
|
gy
;
pWGInfo
[
WGNum
Total
].
s
[
2
]
=
scaleinfo
[
z
].
imgoff
;
memcpy
(
&
(
pWGInfo
[
WGNum
Total
].
s
[
3
]),
&
(
scaleinfo
[
z
].
factor
),
sizeof
(
float
));
WGNum
Total
++
;
}
}
}
...
...
@@ -908,13 +913,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
pWGInfo
=
NULL
;
}
// setup global sizes to have linear array of workgroups with WGNum size
globalThreads
[
0
]
=
localThreads
[
0
]
*
WGNum
;
globalThreads
[
1
]
=
localThreads
[
1
];
globalThreads
[
2
]
=
1
;
#define NODE_SIZE 12
// pack node info to have less memory loads
// pack node info to have less memory loads
on the device side
oclMat
oclNodesPK
(
1
,
sizeof
(
cl_int
)
*
NODE_SIZE
*
nodenum
,
CV_8U
);
{
cl_int
status
;
...
...
@@ -963,8 +963,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
options
+=
format
(
" -D WND_SIZE_X=%d"
,
cascade
->
orig_window_size
.
width
);
options
+=
format
(
" -D WND_SIZE_Y=%d"
,
cascade
->
orig_window_size
.
height
);
options
+=
format
(
" -D STUMP_BASED=%d"
,
gcascade
->
is_stump_based
);
options
+=
format
(
" -D LSx=%d"
,
localThreads
[
0
]);
options
+=
format
(
" -D LSy=%d"
,
localThreads
[
1
]);
options
+=
format
(
" -D SPLITNODE=%d"
,
splitnode
);
options
+=
format
(
" -D SPLITSTAGE=%d"
,
splitstage
);
options
+=
format
(
" -D OUTPUTSZ=%d"
,
outputsz
);
...
...
@@ -972,8 +970,39 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
// init candiate global count by 0
int
pattern
=
0
;
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
candidatebuffer
,
1
,
0
,
1
*
sizeof
(
pattern
),
&
pattern
,
0
,
NULL
,
NULL
));
// execute face detector
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect
,
"gpuRunHaarClassifierCascadePacked"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
,
options
.
c_str
());
if
(
WGNumTotal
>
WGNumSampled
)
{
// small images and each pixel is processed
// setup global sizes to have linear array of workgroups with WGNum size
int
pixelstep
=
1
;
size_t
LS
[
3
]
=
{
localThreads
[
0
]
/
pixelstep
,
localThreads
[
1
]
/
pixelstep
,
1
};
globalThreads
[
0
]
=
LS
[
0
]
*
(
WGNumTotal
-
WGNumSampled
);
globalThreads
[
1
]
=
LS
[
1
];
globalThreads
[
2
]
=
1
;
string
options1
=
options
;
options1
+=
format
(
" -D PIXEL_STEP=%d"
,
pixelstep
);
options1
+=
format
(
" -D WGSTART=%d"
,
WGNumSampled
);
options1
+=
format
(
" -D LSx=%d"
,
LS
[
0
]);
options1
+=
format
(
" -D LSy=%d"
,
LS
[
1
]);
// execute face detector
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect
,
"gpuRunHaarClassifierCascadePacked"
,
globalThreads
,
LS
,
args
,
-
1
,
-
1
,
options1
.
c_str
());
}
if
(
WGNumSampled
>
0
)
{
// large images each 4th pixel is processed
// setup global sizes to have linear array of workgroups with WGNum size
int
pixelstep
=
2
;
size_t
LS
[
3
]
=
{
localThreads
[
0
]
/
pixelstep
,
localThreads
[
1
]
/
pixelstep
,
1
};
globalThreads
[
0
]
=
LS
[
0
]
*
WGNumSampled
;
globalThreads
[
1
]
=
LS
[
1
];
globalThreads
[
2
]
=
1
;
string
options2
=
options
;
options2
+=
format
(
" -D PIXEL_STEP=%d"
,
pixelstep
);
options2
+=
format
(
" -D WGSTART=%d"
,
0
);
options2
+=
format
(
" -D LSx=%d"
,
LS
[
0
]);
options2
+=
format
(
" -D LSy=%d"
,
LS
[
1
]);
// execute face detector
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect
,
"gpuRunHaarClassifierCascadePacked"
,
globalThreads
,
LS
,
args
,
-
1
,
-
1
,
options2
.
c_str
());
}
//read candidate buffer back and put it into host list
openCLReadBuffer
(
gsum
.
clCxt
,
candidatebuffer
,
candidate
,
4
*
sizeof
(
int
)
*
outputsz
);
assert
(
candidate
[
0
]
<
outputsz
);
...
...
modules/ocl/src/opencl/haarobjectdetect.cl
浏览文件 @
917b883c
...
...
@@ -126,13 +126,11 @@ __kernel void gpuRunHaarClassifierCascadePacked(
)
{
//
this
version
used
information
provided
for
each
workgroup
//
no
empty
WG
int
gid
=
(
int
)
get_group_id
(
0
)
;
int
lid_x
=
(
int
)
get_local_id
(
0
)
;
int
lid_y
=
(
int
)
get_local_id
(
1
)
;
int
lid
=
lid_y*LSx+lid_x
;
int4
WGInfo
=
pWGInfo[gid]
;
int4
WGInfo
=
pWGInfo[
WGSTART+
gid]
;
int
GroupX
=
(
WGInfo.y
>>
16
)
&0xFFFF
;
int
GroupY
=
(
WGInfo.y
>>
0
)
&
0xFFFF
;
int
Width
=
(
WGInfo.x
>>
16
)
&0xFFFF
;
...
...
@@ -140,8 +138,8 @@ __kernel void gpuRunHaarClassifierCascadePacked(
int
ImgOffset
=
WGInfo.z
;
float
ScaleFactor
=
as_float
(
WGInfo.w
)
;
#
define
DATA_SIZE_X
(
LSx+WND_SIZE_X
)
#
define
DATA_SIZE_Y
(
LSy+WND_SIZE_Y
)
#
define
DATA_SIZE_X
(
PIXEL_STEP*
LSx+WND_SIZE_X
)
#
define
DATA_SIZE_Y
(
PIXEL_STEP*
LSy+WND_SIZE_Y
)
#
define
DATA_SIZE
(
DATA_SIZE_X*DATA_SIZE_Y
)
local
int
SumL[DATA_SIZE]
;
...
...
@@ -165,9 +163,11 @@ __kernel void gpuRunHaarClassifierCascadePacked(
int4
info1
=
p
;
int4
info2
=
pq
;
{
int
xl
=
lid_x
;
int
yl
=
lid_y
;
//
calc
processed
ROI
coordinate
in
local
mem
int
xl
=
lid_x*PIXEL_STEP
;
int
yl
=
lid_y*PIXEL_STEP
;
{//
calc
variance_norm_factor
for
all
stages
int
OffsetLocal
=
yl
*
DATA_SIZE_X
+
xl
;
int
OffsetGlobal
=
(
GroupY+yl
)
*
pixelstep
+
(
GroupX+xl
)
;
...
...
@@ -194,13 +194,13 @@ __kernel void gpuRunHaarClassifierCascadePacked(
int
result
=
(
1.0f>0.0f
)
;
for
(
int
stageloop
=
start_stage
; (stageloop < end_stage) && result; stageloop++ )
{//
iterate
until
candidate
is
exist
{//
iterate
until
candidate
is
valid
float
stage_sum
=
0.0f
;
__global
GpuHidHaarStageClassifier*
stageinfo
=
(
__global
GpuHidHaarStageClassifier*
)
((
__global
uchar*
)
stagecascadeptr+stageloop*sizeof
(
GpuHidHaarStageClassifier
))
;
int
lcl_off
=
(
yl*DATA_SIZE_X
)
+
(
xl
)
;
int
stagecount
=
stageinfo->count
;
float
stagethreshold
=
stageinfo->threshold
;
int
lcl_off
=
(
lid_y*DATA_SIZE_X
)
+
(
lid_x
)
;
for
(
int
nodeloop
=
0
; nodeloop < stagecount; nodecounter++,nodeloop++ )
{
//
simple
macro
to
extract
shorts
from
int
...
...
@@ -212,7 +212,7 @@ __kernel void gpuRunHaarClassifierCascadePacked(
int4
n1
=
pN[1]
;
int4
n2
=
pN[2]
;
float
nodethreshold
=
as_float
(
n2.y
)
*
variance_norm_factor
;
//
calc
sum
of
intensity
pixels
according
to
node
information
//
calc
sum
of
intensity
pixels
according
to
classifier
node
information
float
classsum
=
(
SumL[M0
(
n0.x
)
+lcl_off]
-
SumL[M1
(
n0.x
)
+lcl_off]
-
SumL[M0
(
n0.y
)
+lcl_off]
+
SumL[M1
(
n0.y
)
+lcl_off]
)
*
as_float
(
n1.z
)
+
(
SumL[M0
(
n0.z
)
+lcl_off]
-
SumL[M1
(
n0.z
)
+lcl_off]
-
SumL[M0
(
n0.w
)
+lcl_off]
+
SumL[M1
(
n0.w
)
+lcl_off]
)
*
as_float
(
n1.w
)
+
...
...
@@ -228,8 +228,8 @@ __kernel void gpuRunHaarClassifierCascadePacked(
int
index
=
1+atomic_inc
((
volatile
global
int*
)
candidate
)
; //get index to write global data with face info
if
(
index<OUTPUTSZ
)
{
int
x
=
GroupX+
lid_x
;
int
y
=
GroupY+
lid_y
;
int
x
=
GroupX+
xl
;
int
y
=
GroupY+
yl
;
int4
candidate_result
;
candidate_result.x
=
convert_int_rtn
(
x*ScaleFactor
)
;
candidate_result.y
=
convert_int_rtn
(
y*ScaleFactor
)
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录