Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
69a0b5dd
O
Opencv
项目概览
Greenplum
/
Opencv
11 个月 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
69a0b5dd
编写于
4月 15, 2013
作者:
Y
yao
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add OclCascadeClassifierBuf interface
上级
abe2ea59
变更
5
隐藏空白更改
内联
并排
Showing
5 changed files
with
956 additions
and
499 deletions
+956
-499
modules/ocl/include/opencv2/ocl/ocl.hpp
modules/ocl/include/opencv2/ocl/ocl.hpp
+38
-0
modules/ocl/src/haar.cpp
modules/ocl/src/haar.cpp
+587
-128
modules/ocl/src/opencl/haarobjectdetect.cl
modules/ocl/src/opencl/haarobjectdetect.cl
+142
-176
modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
+141
-145
modules/ocl/test/test_haar.cpp
modules/ocl/test/test_haar.cpp
+48
-50
未找到文件。
modules/ocl/include/opencv2/ocl/ocl.hpp
浏览文件 @
69a0b5dd
...
...
@@ -802,6 +802,44 @@ namespace cv
int
minNeighbors
,
int
flags
,
CvSize
minSize
=
cvSize
(
0
,
0
),
CvSize
maxSize
=
cvSize
(
0
,
0
));
};
// Haar cascade detector that keeps its working state (image pyramid sizes,
// scale factors, integral images and an opaque bundle of OpenCL buffers) as
// members, so that detectMultiScale() can use them across calls instead of
// declaring them locally on every invocation.
//
// NOTE(review): the class owns a raw `buffers` pointer but declares no copy
// constructor / assignment operator; copying an instance would alias that
// pointer. Confirm that callers never copy it.
class CV_EXPORTS OclCascadeClassifierBuf : public cv::CascadeClassifier
{
public:
    // Every scalar member is given a defined starting value (listed in
    // declaration order) so that no member is ever read while indeterminate.
    OclCascadeClassifierBuf()
        : m_rows(0), m_cols(0), m_flags(0), m_loopcount(0), m_nodenum(0),
          findBiggestObject(false), initialized(false), m_scaleFactor(0),
          buffers(NULL)
    {}

    // Deliberately empty: the destructor does not call release().
    // NOTE(review): presumably callers must invoke release() themselves to
    // free `buffers` -- confirm against the implementation of release().
    ~OclCascadeClassifierBuf() {}

    // Detects objects in `image` and stores their bounding boxes in `faces`.
    //   image        - input image (taken by non-const reference).
    //   faces        - output vector of detected rectangles.
    //   scaleFactor  - scale step between successive detection scales.
    //   minNeighbors - neighbour threshold used when grouping candidates.
    //   flags        - CV_HAAR_* option bits.
    //   minSize      - minimum object size; Size() selects the default.
    //   maxSize      - maximum object size; Size() selects the default.
    void detectMultiScale(oclMat &image, CV_OUT std::vector<cv::Rect>& faces,
                          double scaleFactor = 1.1, int minNeighbors = 3, int flags = 0,
                          Size minSize = Size(), Size maxSize = Size());

    // Releases the resources held by this object (defined out of line).
    void release();

private:
    // Prepares the cached state for an image of rows x cols with the given
    // detection parameters; detectMultiScale() calls this before running.
    void Init(const int rows, const int cols, double scaleFactor, int flags,
              const int outputsz, const size_t localThreads[],
              CvSize minSize, CvSize maxSize);

    // NOTE(review): presumably creates the buffers that depend only on the
    // cascade and output size -- confirm against the definition.
    void CreateBaseBufs(const int datasize, const int totalclassifier,
                        const int flags, const int outputsz);

    // NOTE(review): presumably (re)creates the buffers that depend on image
    // size and scale factor -- confirm against the definition.
    void CreateFactorRelatedBufs(const int rows, const int cols, const int flags,
                                 const double scaleFactor, const size_t localThreads[],
                                 CvSize minSize, CvSize maxSize);

    // Builds the final `faces` result from the candidate rectangles and their
    // weights (defined out of line).
    void GenResult(CV_OUT std::vector<cv::Rect>& faces,
                   const std::vector<cv::Rect> &rectList,
                   const std::vector<int> &rweights);

    // NOTE(review): m_rows / m_cols / m_flags / m_scaleFactor / m_minSize /
    // m_maxSize look like the parameters of the last Init() call, kept to
    // detect when the cached state is stale -- confirm.
    int m_rows;
    int m_cols;
    int m_flags;
    int m_loopcount;   // number of detection scales; loop bound over sizev / scalev
    int m_nodenum;     // number of GpuHidHaarTreeNode entries uploaded to the node buffer
    bool findBiggestObject;
    bool initialized;
    double m_scaleFactor;
    Size m_minSize;
    Size m_maxSize;
    std::vector<CvSize> sizev;   // per-scale image size
    std::vector<float> scalev;   // per-scale factor
    oclMat gimg1, gsum, gsqsum;  // resize target, integral image, squared integral image
    void *buffers;               // opaque; the implementation casts this to OclBuffers*
};
/////////////////////////////// Pyramid /////////////////////////////////////
...
...
modules/ocl/src/haar.cpp
浏览文件 @
69a0b5dd
...
...
@@ -20,6 +20,7 @@
// Jia Haipeng, jiahaipeng95@gmail.com
// Wu Xinglong, wxl370@126.com
// Wang Yao, bitwangyaoyao@gmail.com
// Sen Liu, swjtuls1987@126.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
...
...
@@ -842,15 +843,13 @@ static void gpuSetHaarClassifierCascade( CvHaarClassifierCascade *_cascade
}
/* j */
}
}
CvSeq
*
cv
::
ocl
::
OclCascadeClassifier
::
oclHaarDetectObjects
(
oclMat
&
gimg
,
CvMemStorage
*
storage
,
double
scaleFactor
,
int
minNeighbors
,
int
flags
,
CvSize
minSize
,
CvSize
maxSize
)
{
CvHaarClassifierCascade
*
cascade
=
oldCascade
;
//double alltime = (double)cvGetTickCount();
//double t = (double)cvGetTickCount();
const
double
GROUP_EPS
=
0.2
;
oclMat
gtemp
,
gsum1
,
gtilted1
,
gsqsum1
,
gnormImg
,
gsumcanny
;
CvSeq
*
result_seq
=
0
;
cv
::
Ptr
<
CvMemStorage
>
temp_storage
;
...
...
@@ -861,7 +860,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
int
datasize
=
0
;
int
totalclassifier
=
0
;
//void *out;
GpuHidHaarClassifierCascade
*
gcascade
;
GpuHidHaarStageClassifier
*
stage
;
GpuHidHaarClassifier
*
classifier
;
...
...
@@ -870,11 +868,8 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
int
*
candidate
;
cl_int
status
;
// bool doCannyPruning = (flags & CV_HAAR_DO_CANNY_PRUNING) != 0;
bool
findBiggestObject
=
(
flags
&
CV_HAAR_FIND_BIGGEST_OBJECT
)
!=
0
;
// bool roughSearch = (flags & CV_HAAR_DO_ROUGH_SEARCH) != 0;
//double t = 0;
if
(
maxSize
.
height
==
0
||
maxSize
.
width
==
0
)
{
maxSize
.
height
=
gimg
.
rows
;
...
...
@@ -896,27 +891,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
if
(
findBiggestObject
)
flags
&=
~
CV_HAAR_SCALE_IMAGE
;
//gtemp = oclMat( gimg.rows, gimg.cols, CV_8UC1);
//gsum1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32SC1 );
//gsqsum1 = oclMat( gimg.rows + 1, gimg.cols + 1, CV_32FC1 );
if
(
!
cascade
->
hid_cascade
)
/*out = (void *)*/
gpuCreateHidHaarClassifierCascade
(
cascade
,
&
datasize
,
&
totalclassifier
);
if
(
cascade
->
hid_cascade
->
has_tilted_features
)
gtilted1
=
oclMat
(
gimg
.
rows
+
1
,
gimg
.
cols
+
1
,
CV_32SC1
);
gpuCreateHidHaarClassifierCascade
(
cascade
,
&
datasize
,
&
totalclassifier
);
result_seq
=
cvCreateSeq
(
0
,
sizeof
(
CvSeq
),
sizeof
(
CvAvgComp
),
storage
);
if
(
CV_MAT_CN
(
gimg
.
type
())
>
1
)
{
oclMat
gtemp
;
cvtColor
(
gimg
,
gtemp
,
CV_BGR2GRAY
);
gimg
=
gtemp
;
}
if
(
findBiggestObject
)
flags
&=
~
(
CV_HAAR_SCALE_IMAGE
|
CV_HAAR_DO_CANNY_PRUNING
);
//t = (double)cvGetTickCount() - t;
//printf( "before if time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
if
(
gimg
.
cols
<
minSize
.
width
||
gimg
.
rows
<
minSize
.
height
)
CV_Error
(
CV_StsError
,
"Image too small"
);
...
...
@@ -924,12 +912,9 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
if
(
(
flags
&
CV_HAAR_SCALE_IMAGE
)
)
{
CvSize
winSize0
=
cascade
->
orig_window_size
;
//float scalefactor = 1.1f;
//float factor = 1.f;
int
totalheight
=
0
;
int
indexy
=
0
;
CvSize
sz
;
//t = (double)cvGetTickCount();
vector
<
CvSize
>
sizev
;
vector
<
float
>
scalev
;
for
(
factor
=
1.
f
;;
factor
*=
scaleFactor
)
...
...
@@ -950,20 +935,15 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
sizev
.
push_back
(
sz
);
scalev
.
push_back
(
factor
);
}
//int flag = 0;
oclMat
gimg1
(
gimg
.
rows
,
gimg
.
cols
,
CV_8UC1
);
oclMat
gsum
(
totalheight
+
4
,
gimg
.
cols
+
1
,
CV_32SC1
);
oclMat
gsqsum
(
totalheight
+
4
,
gimg
.
cols
+
1
,
CV_32FC1
);
//cl_mem cascadebuffer;
cl_mem
stagebuffer
;
//cl_mem classifierbuffer;
cl_mem
nodebuffer
;
cl_mem
candidatebuffer
;
cl_mem
scaleinfobuffer
;
//cl_kernel kernel;
//kernel = openCLGetKernelFromSource(gimg.clCxt, &haarobjectdetect, "gpuRunHaarClassifierCascade");
cv
::
Rect
roi
,
roi2
;
cv
::
Mat
imgroi
,
imgroisq
;
cv
::
ocl
::
oclMat
resizeroi
,
gimgroi
,
gimgroisq
;
...
...
@@ -971,18 +951,13 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
size_t
blocksize
=
8
;
size_t
localThreads
[
3
]
=
{
blocksize
,
blocksize
,
1
};
size_t
globalThreads
[
3
]
=
{
grp_per_CU
*
((
gsum
.
clCxt
)
->
computeUnits
()
)
*
localThreads
[
0
],
size_t
globalThreads
[
3
]
=
{
grp_per_CU
*
gsum
.
clCxt
->
computeUnits
(
)
*
localThreads
[
0
],
localThreads
[
1
],
1
};
int
outputsz
=
256
*
globalThreads
[
0
]
/
localThreads
[
0
];
int
loopcount
=
sizev
.
size
();
detect_piramid_info
*
scaleinfo
=
(
detect_piramid_info
*
)
malloc
(
sizeof
(
detect_piramid_info
)
*
loopcount
);
//t = (double)cvGetTickCount() - t;
// printf( "pre time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
//int *it =scaleinfo;
// t = (double)cvGetTickCount();
for
(
int
i
=
0
;
i
<
loopcount
;
i
++
)
{
sz
=
sizev
[
i
];
...
...
@@ -992,7 +967,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
resizeroi
=
gimg1
(
roi2
);
gimgroi
=
gsum
(
roi
);
gimgroisq
=
gsqsum
(
roi
);
//scaleinfo[i].rows = gimgroi.rows;
int
width
=
gimgroi
.
cols
-
1
-
cascade
->
orig_window_size
.
width
;
int
height
=
gimgroi
.
rows
-
1
-
cascade
->
orig_window_size
.
height
;
scaleinfo
[
i
].
width_height
=
(
width
<<
16
)
|
height
;
...
...
@@ -1000,76 +974,40 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
int
grpnumperline
=
(
width
+
localThreads
[
0
]
-
1
)
/
localThreads
[
0
];
int
totalgrp
=
((
height
+
localThreads
[
1
]
-
1
)
/
localThreads
[
1
])
*
grpnumperline
;
//outputsz +=width*height;
scaleinfo
[
i
].
grpnumperline_totalgrp
=
(
grpnumperline
<<
16
)
|
totalgrp
;
scaleinfo
[
i
].
imgoff
=
gimgroi
.
offset
>>
2
;
scaleinfo
[
i
].
factor
=
factor
;
//printf("rows = %d,ystep = %d,width = %d,height = %d,grpnumperline = %d,totalgrp = %d,imgoff = %d,factor = %f\n",
// scaleinfo[i].rows,scaleinfo[i].ystep,scaleinfo[i].width,scaleinfo[i].height,scaleinfo[i].grpnumperline,
// scaleinfo[i].totalgrp,scaleinfo[i].imgoff,scaleinfo[i].factor);
cv
::
ocl
::
resize
(
gimg
,
resizeroi
,
Size
(
sz
.
width
-
1
,
sz
.
height
-
1
),
0
,
0
,
INTER_LINEAR
);
//cv::imwrite("D:\\1.jpg",gimg1);
cv
::
ocl
::
integral
(
resizeroi
,
gimgroi
,
gimgroisq
);
//cv::ocl::oclMat chk(sz.height,sz.width,CV_32SC1),chksq(sz.height,sz.width,CV_32FC1);
//cv::ocl::integral(gimg1, chk, chksq);
//double r = cv::norm(chk,gimgroi,NORM_INF);
//if(r > std::numeric_limits<double>::epsilon())
//{
// printf("failed");
//}
indexy
+=
sz
.
height
;
}
//int ystep = factor > 2 ? 1 : 2;
// t = (double)cvGetTickCount() - t;
//printf( "resize integral time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
//t = (double)cvGetTickCount();
gcascade
=
(
GpuHidHaarClassifierCascade
*
)
cascade
->
hid_cascade
;
stage
=
(
GpuHidHaarStageClassifier
*
)(
gcascade
+
1
);
classifier
=
(
GpuHidHaarClassifier
*
)(
stage
+
gcascade
->
count
);
node
=
(
GpuHidHaarTreeNode
*
)(
classifier
->
node
);
//int m,n;
//m = (gsum.cols - 1 - cascade->orig_window_size.width + ystep - 1)/ystep;
//n = (gsum.rows - 1 - cascade->orig_window_size.height + ystep - 1)/ystep;
//int counter = m*n;
int
nodenum
=
(
datasize
-
sizeof
(
GpuHidHaarClassifierCascade
)
-
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
-
sizeof
(
GpuHidHaarClassifier
)
*
totalclassifier
)
/
sizeof
(
GpuHidHaarTreeNode
);
//if(flag == 0){
candidate
=
(
int
*
)
malloc
(
4
*
sizeof
(
int
)
*
outputsz
);
//memset((char*)candidate,0,4*sizeof(int)*outputsz);
gpuSetImagesForHaarClassifierCascade
(
cascade
,
/* &sum1, &sqsum1, _tilted,*/
1.
,
gsum
.
step
/
4
);
//cascadebuffer = clCreateBuffer(gsum.clCxt->clContext,CL_MEM_READ_ONLY,sizeof(GpuHidHaarClassifierCascade),NULL,&status);
//openCLVerifyCall(status);
//openCLSafeCall(clEnqueueWriteBuffer(gsum.clCxt->clCmdQueue,cascadebuffer,1,0,sizeof(GpuHidHaarClassifierCascade),gcascade,0,NULL,NULL));
gpuSetImagesForHaarClassifierCascade
(
cascade
,
1.
,
gsum
.
step
/
4
);
stagebuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
);
//openCLVerifyCall(status);
openCLSafeCall
(
clEnqueueWriteBuffer
((
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
(),
stagebuffer
,
1
,
0
,
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
,
stage
,
0
,
NULL
,
NULL
));
//classifierbuffer = clCreateBuffer(gsum.clCxt->clContext,CL_MEM_READ_ONLY,sizeof(GpuHidHaarClassifier)*totalclassifier,NULL,&status);
//status = clEnqueueWriteBuffer(gsum.clCxt->clCmdQueue,classifierbuffer,1,0,sizeof(GpuHidHaarClassifier)*totalclassifier,classifier,0,NULL,NULL);
cl_command_queue
qu
=
(
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
();
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
stagebuffer
,
1
,
0
,
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
,
stage
,
0
,
NULL
,
NULL
));
nodebuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
nodenum
*
sizeof
(
GpuHidHaarTreeNode
));
//openCLVerifyCall(status);
openCLSafeCall
(
clEnqueueWriteBuffer
((
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
(),
nodebuffer
,
1
,
0
,
nodenum
*
sizeof
(
GpuHidHaarTreeNode
),
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
nodebuffer
,
1
,
0
,
nodenum
*
sizeof
(
GpuHidHaarTreeNode
),
node
,
0
,
NULL
,
NULL
));
candidatebuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_WRITE_ONLY
,
4
*
sizeof
(
int
)
*
outputsz
);
//openCLVerifyCall(status);
scaleinfobuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
sizeof
(
detect_piramid_info
)
*
loopcount
);
//openCLVerifyCall(status);
openCLSafeCall
(
clEnqueueWriteBuffer
((
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
(),
scaleinfobuffer
,
1
,
0
,
sizeof
(
detect_piramid_info
)
*
loopcount
,
scaleinfo
,
0
,
NULL
,
NULL
));
//flag = 1;
//}
//t = (double)cvGetTickCount() - t
;
//printf( "update time = %g ms\n", t/((double)cvGetTickFrequency()*1000.)
);
scaleinfobuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
sizeof
(
detect_piramid_info
)
*
loopcount
)
;
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
scaleinfobuffer
,
1
,
0
,
sizeof
(
detect_piramid_info
)
*
loopcount
,
scaleinfo
,
0
,
NULL
,
NULL
)
);
//size_t globalThreads[3] = { counter+blocksize*blocksize-counter%(blocksize*blocksize),1,1};
//t = (double)cvGetTickCount();
int
startstage
=
0
;
int
endstage
=
gcascade
->
count
;
int
startnode
=
0
;
...
...
@@ -1087,11 +1025,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
pq
.
s
[
3
]
=
gcascade
->
pq3
;
float
correction
=
gcascade
->
inv_window_area
;
//int grpnumperline = ((m + localThreads[0] - 1) / localThreads[0]);
//int totalgrp = ((n + localThreads[1] - 1) / localThreads[1])*grpnumperline;
// openCLVerifyKernel(gsum.clCxt, kernel, &blocksize, globalThreads, localThreads);
//openCLSafeCall(clSetKernelArg(kernel,argcount++,sizeof(cl_mem),(void*)&cascadebuffer));
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
stagebuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
scaleinfobuffer
));
...
...
@@ -1111,28 +1044,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
)
,
(
void
*
)
&
correction
));
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect
,
"gpuRunHaarClassifierCascade"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
//t = (double)cvGetTickCount() - t;
//printf( "detection time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
//t = (double)cvGetTickCount();
//openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->impl->clCmdQueue, candidatebuffer, 1, 0, 4 * sizeof(int)*outputsz, candidate, 0, NULL, NULL));
openCLReadBuffer
(
gsum
.
clCxt
,
candidatebuffer
,
candidate
,
4
*
sizeof
(
int
)
*
outputsz
);
for
(
int
i
=
0
;
i
<
outputsz
;
i
++
)
if
(
candidate
[
4
*
i
+
2
]
!=
0
)
allCandidates
.
push_back
(
Rect
(
candidate
[
4
*
i
],
candidate
[
4
*
i
+
1
],
candidate
[
4
*
i
+
2
],
candidate
[
4
*
i
+
3
]));
// t = (double)cvGetTickCount() - t;
//printf( "post time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
//t = (double)cvGetTickCount();
allCandidates
.
push_back
(
Rect
(
candidate
[
4
*
i
],
candidate
[
4
*
i
+
1
],
candidate
[
4
*
i
+
2
],
candidate
[
4
*
i
+
3
]));
free
(
scaleinfo
);
free
(
candidate
);
//openCLSafeCall(clReleaseMemObject(cascadebuffer));
openCLSafeCall
(
clReleaseMemObject
(
stagebuffer
));
openCLSafeCall
(
clReleaseMemObject
(
scaleinfobuffer
));
openCLSafeCall
(
clReleaseMemObject
(
nodebuffer
));
openCLSafeCall
(
clReleaseMemObject
(
candidatebuffer
));
// openCLSafeCall(clReleaseKernel(kernel));
//t = (double)cvGetTickCount() - t;
//printf( "release time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) );
}
else
{
...
...
@@ -1150,7 +1075,6 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
classifier
=
(
GpuHidHaarClassifier
*
)(
stage
+
gcascade
->
count
);
node
=
(
GpuHidHaarTreeNode
*
)(
classifier
->
node
);
cl_mem
stagebuffer
;
//cl_mem classifierbuffer;
cl_mem
nodebuffer
;
cl_mem
candidatebuffer
;
cl_mem
scaleinfobuffer
;
...
...
@@ -1187,24 +1111,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
size_t
blocksize
=
8
;
size_t
localThreads
[
3
]
=
{
blocksize
,
blocksize
,
1
};
size_t
globalThreads
[
3
]
=
{
grp_per_CU
*
gsum
.
clCxt
->
computeUnits
()
*
localThreads
[
0
],
localThreads
[
1
],
1
};
localThreads
[
1
],
1
};
int
outputsz
=
256
*
globalThreads
[
0
]
/
localThreads
[
0
];
int
nodenum
=
(
datasize
-
sizeof
(
GpuHidHaarClassifierCascade
)
-
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
-
sizeof
(
GpuHidHaarClassifier
)
*
totalclassifier
)
/
sizeof
(
GpuHidHaarTreeNode
);
nodebuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
nodenum
*
sizeof
(
GpuHidHaarTreeNode
));
//openCLVerifyCall(status
);
openCLSafeCall
(
clEnqueueWriteBuffer
(
(
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
()
,
nodebuffer
,
1
,
0
,
cl_command_queue
qu
=
(
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
(
);
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
nodebuffer
,
1
,
0
,
nodenum
*
sizeof
(
GpuHidHaarTreeNode
),
node
,
0
,
NULL
,
NULL
));
cl_mem
newnodebuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_WRITE
,
loopcount
*
nodenum
*
sizeof
(
GpuHidHaarTreeNode
));
int
startstage
=
0
;
int
endstage
=
gcascade
->
count
;
//cl_kernel kernel;
//kernel = openCLGetKernelFromSource(gsum.clCxt, &haarobjectdetect_scaled2, "gpuRunHaarClassifierCascade_scaled2");
//cl_kernel kernel2 = openCLGetKernelFromSource(gimg.clCxt, &haarobjectdetect_scaled2, "gpuscaleclassifier");
for
(
int
i
=
0
;
i
<
loopcount
;
i
++
)
{
sz
=
sizev
[
i
];
...
...
@@ -1223,7 +1143,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
int
height
=
(
gsum
.
rows
-
1
-
sz
.
height
+
ystep
-
1
)
/
ystep
;
int
grpnumperline
=
(
width
+
localThreads
[
0
]
-
1
)
/
localThreads
[
0
];
int
totalgrp
=
((
height
+
localThreads
[
1
]
-
1
)
/
localThreads
[
1
])
*
grpnumperline
;
//outputsz +=width*height;
scaleinfo
[
i
].
width_height
=
(
width
<<
16
)
|
height
;
scaleinfo
[
i
].
grpnumperline_totalgrp
=
(
grpnumperline
<<
16
)
|
totalgrp
;
scaleinfo
[
i
].
imgoff
=
0
;
...
...
@@ -1241,28 +1161,20 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
size_t
globalThreads2
[
3
]
=
{
nodenum
,
1
,
1
};
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect_scaled2
,
"gpuscaleclassifier"
,
globalThreads2
,
NULL
/*localThreads2*/
,
args1
,
-
1
,
-
1
);
//clEnqueueNDRangeKernel(gsum.clCxt->impl->clCmdQueue, kernel2, 1, NULL, globalThreads2, 0, 0, NULL, NULL);
//clFinish(gsum.clCxt->impl->clCmdQueue);
}
//clReleaseKernel(kernel2);
int
step
=
gsum
.
step
/
4
;
int
startnode
=
0
;
int
splitstage
=
3
;
int
splitnode
=
stage
[
0
].
count
+
stage
[
1
].
count
+
stage
[
2
].
count
;
stagebuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
);
//openCLVerifyCall(status);
openCLSafeCall
(
clEnqueueWriteBuffer
((
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
(),
stagebuffer
,
1
,
0
,
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
,
stage
,
0
,
NULL
,
NULL
));
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
stagebuffer
,
1
,
0
,
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
,
stage
,
0
,
NULL
,
NULL
));
candidatebuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_WRITE_ONLY
|
CL_MEM_ALLOC_HOST_PTR
,
4
*
sizeof
(
int
)
*
outputsz
);
//openCLVerifyCall(status);
scaleinfobuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
sizeof
(
detect_piramid_info
)
*
loopcount
);
//openCLVerifyCall(status);
openCLSafeCall
(
clEnqueueWriteBuffer
((
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
(),
scaleinfobuffer
,
1
,
0
,
sizeof
(
detect_piramid_info
)
*
loopcount
,
scaleinfo
,
0
,
NULL
,
NULL
));
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
scaleinfobuffer
,
1
,
0
,
sizeof
(
detect_piramid_info
)
*
loopcount
,
scaleinfo
,
0
,
NULL
,
NULL
));
pbuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
sizeof
(
cl_int4
)
*
loopcount
);
openCLSafeCall
(
clEnqueueWriteBuffer
(
(
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
()
,
pbuffer
,
1
,
0
,
sizeof
(
cl_int4
)
*
loopcount
,
p
,
0
,
NULL
,
NULL
));
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
pbuffer
,
1
,
0
,
sizeof
(
cl_int4
)
*
loopcount
,
p
,
0
,
NULL
,
NULL
));
correctionbuffer
=
openCLCreateBuffer
(
gsum
.
clCxt
,
CL_MEM_READ_ONLY
,
sizeof
(
cl_float
)
*
loopcount
);
openCLSafeCall
(
clEnqueueWriteBuffer
((
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
(),
correctionbuffer
,
1
,
0
,
sizeof
(
cl_float
)
*
loopcount
,
correction
,
0
,
NULL
,
NULL
));
//int argcount = 0;
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
correctionbuffer
,
1
,
0
,
sizeof
(
cl_float
)
*
loopcount
,
correction
,
0
,
NULL
,
NULL
));
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
stagebuffer
));
...
...
@@ -1271,22 +1183,21 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
gsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
gsqsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
candidatebuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
gsum
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
gsum
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
loopcount
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
startstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
splitstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
endstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
startnode
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
splitnode
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
pbuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
correctionbuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
nodenum
));
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect_scaled2
,
"gpuRunHaarClassifierCascade_scaled2"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
//openCLSafeCall(clEnqueueReadBuffer(gsum.clCxt->clCmdQueue,candidatebuffer,1,0,4*sizeof(int)*outputsz,candidate,0,NULL,NULL));
candidate
=
(
int
*
)
clEnqueueMapBuffer
((
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
(),
candidatebuffer
,
1
,
CL_MAP_READ
,
0
,
4
*
sizeof
(
int
),
0
,
0
,
0
,
&
status
);
candidate
=
(
int
*
)
clEnqueueMapBuffer
(
qu
,
candidatebuffer
,
1
,
CL_MAP_READ
,
0
,
4
*
sizeof
(
int
)
*
outputsz
,
0
,
0
,
0
,
&
status
);
for
(
int
i
=
0
;
i
<
outputsz
;
i
++
)
{
...
...
@@ -1297,7 +1208,7 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
free
(
scaleinfo
);
free
(
p
);
free
(
correction
);
clEnqueueUnmapMemObject
(
(
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
()
,
candidatebuffer
,
candidate
,
0
,
0
,
0
);
clEnqueueUnmapMemObject
(
qu
,
candidatebuffer
,
candidate
,
0
,
0
,
0
);
openCLSafeCall
(
clReleaseMemObject
(
stagebuffer
));
openCLSafeCall
(
clReleaseMemObject
(
scaleinfobuffer
));
openCLSafeCall
(
clReleaseMemObject
(
nodebuffer
));
...
...
@@ -1306,20 +1217,547 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
openCLSafeCall
(
clReleaseMemObject
(
pbuffer
));
openCLSafeCall
(
clReleaseMemObject
(
correctionbuffer
));
}
//t = (double)cvGetTickCount() ;
cvFree
(
&
cascade
->
hid_cascade
);
// printf("%d\n",globalcounter);
rectList
.
resize
(
allCandidates
.
size
());
if
(
!
allCandidates
.
empty
())
std
::
copy
(
allCandidates
.
begin
(),
allCandidates
.
end
(),
rectList
.
begin
());
//cout << "count = " << rectList.size()<< endl;
if
(
minNeighbors
!=
0
||
findBiggestObject
)
groupRectangles
(
rectList
,
rweights
,
std
::
max
(
minNeighbors
,
1
),
GROUP_EPS
);
else
rweights
.
resize
(
rectList
.
size
(),
0
);
if
(
findBiggestObject
&&
rectList
.
size
()
)
{
CvAvgComp
result_comp
=
{{
0
,
0
,
0
,
0
},
0
};
for
(
size_t
i
=
0
;
i
<
rectList
.
size
();
i
++
)
{
cv
::
Rect
r
=
rectList
[
i
];
if
(
r
.
area
()
>
cv
::
Rect
(
result_comp
.
rect
).
area
()
)
{
result_comp
.
rect
=
r
;
result_comp
.
neighbors
=
rweights
[
i
];
}
}
cvSeqPush
(
result_seq
,
&
result_comp
);
}
else
{
for
(
size_t
i
=
0
;
i
<
rectList
.
size
();
i
++
)
{
CvAvgComp
c
;
c
.
rect
=
rectList
[
i
];
c
.
neighbors
=
rweights
[
i
];
cvSeqPush
(
result_seq
,
&
c
);
}
}
return
result_seq
;
}
// Bundle of OpenCL memory objects used by OclCascadeClassifierBuf. The class
// stores a pointer to this struct as an opaque `void *buffers` and casts it
// back to OclBuffers* wherever a buffer is needed.
struct OclBuffers
{
    cl_mem stagebuffer;       // receives the GpuHidHaarStageClassifier array
    cl_mem nodebuffer;        // receives the GpuHidHaarTreeNode array
    cl_mem candidatebuffer;   // kernel output; read back as 4 ints per candidate
    cl_mem scaleinfobuffer;   // passed to the detection kernels as the scale-info argument
    cl_mem pbuffer;           // receives one cl_int4 per scale
    cl_mem correctionbuffer;  // receives one float correction value per scale
    cl_mem newnodebuffer;     // second buffer argument of "gpuscaleclassifier"; later an input of the scaled detection kernel
};
// Function object that maps a CvAvgComp to its `rect` member, returned as a
// Rect.
struct getRect
{
    Rect operator()(const CvAvgComp &comp) const
    {
        const Rect bounds = comp.rect;
        return bounds;
    }
};
void
cv
::
ocl
::
OclCascadeClassifierBuf
::
detectMultiScale
(
oclMat
&
gimg
,
CV_OUT
std
::
vector
<
cv
::
Rect
>&
faces
,
double
scaleFactor
,
int
minNeighbors
,
int
flags
,
Size
minSize
,
Size
maxSize
)
{
int
blocksize
=
8
;
int
grp_per_CU
=
12
;
size_t
localThreads
[
3
]
=
{
blocksize
,
blocksize
,
1
};
size_t
globalThreads
[
3
]
=
{
grp_per_CU
*
Context
::
getContext
()
->
computeUnits
()
*
localThreads
[
0
],
localThreads
[
1
],
1
};
int
outputsz
=
256
*
globalThreads
[
0
]
/
localThreads
[
0
];
Init
(
gimg
.
rows
,
gimg
.
cols
,
scaleFactor
,
flags
,
outputsz
,
localThreads
,
minSize
,
maxSize
);
const
double
GROUP_EPS
=
0.2
;
cv
::
ConcurrentRectVector
allCandidates
;
std
::
vector
<
cv
::
Rect
>
rectList
;
std
::
vector
<
int
>
rweights
;
CvHaarClassifierCascade
*
cascade
=
oldCascade
;
GpuHidHaarClassifierCascade
*
gcascade
;
GpuHidHaarStageClassifier
*
stage
;
GpuHidHaarClassifier
*
classifier
;
GpuHidHaarTreeNode
*
node
;
if
(
CV_MAT_DEPTH
(
gimg
.
type
())
!=
CV_8U
)
CV_Error
(
CV_StsUnsupportedFormat
,
"Only 8-bit images are supported"
);
if
(
CV_MAT_CN
(
gimg
.
type
())
>
1
)
{
oclMat
gtemp
;
cvtColor
(
gimg
,
gtemp
,
CV_BGR2GRAY
);
gimg
=
gtemp
;
}
int
*
candidate
;
if
(
(
flags
&
CV_HAAR_SCALE_IMAGE
)
)
{
int
indexy
=
0
;
CvSize
sz
;
cv
::
Rect
roi
,
roi2
;
cv
::
Mat
imgroi
,
imgroisq
;
cv
::
ocl
::
oclMat
resizeroi
,
gimgroi
,
gimgroisq
;
for
(
int
i
=
0
;
i
<
m_loopcount
;
i
++
)
{
sz
=
sizev
[
i
];
roi
=
Rect
(
0
,
indexy
,
sz
.
width
,
sz
.
height
);
roi2
=
Rect
(
0
,
0
,
sz
.
width
-
1
,
sz
.
height
-
1
);
resizeroi
=
gimg1
(
roi2
);
gimgroi
=
gsum
(
roi
);
gimgroisq
=
gsqsum
(
roi
);
cv
::
ocl
::
resize
(
gimg
,
resizeroi
,
Size
(
sz
.
width
-
1
,
sz
.
height
-
1
),
0
,
0
,
INTER_LINEAR
);
cv
::
ocl
::
integral
(
resizeroi
,
gimgroi
,
gimgroisq
);
indexy
+=
sz
.
height
;
}
gcascade
=
(
GpuHidHaarClassifierCascade
*
)(
cascade
->
hid_cascade
);
stage
=
(
GpuHidHaarStageClassifier
*
)(
gcascade
+
1
);
classifier
=
(
GpuHidHaarClassifier
*
)(
stage
+
gcascade
->
count
);
node
=
(
GpuHidHaarTreeNode
*
)(
classifier
->
node
);
gpuSetImagesForHaarClassifierCascade
(
cascade
,
1.
,
gsum
.
step
/
4
);
cl_command_queue
qu
=
(
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
();
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
((
OclBuffers
*
)
buffers
)
->
stagebuffer
,
1
,
0
,
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
,
stage
,
0
,
NULL
,
NULL
));
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
((
OclBuffers
*
)
buffers
)
->
nodebuffer
,
1
,
0
,
m_nodenum
*
sizeof
(
GpuHidHaarTreeNode
),
node
,
0
,
NULL
,
NULL
));
int
startstage
=
0
;
int
endstage
=
gcascade
->
count
;
int
startnode
=
0
;
int
pixelstep
=
gsum
.
step
/
4
;
int
splitstage
=
3
;
int
splitnode
=
stage
[
0
].
count
+
stage
[
1
].
count
+
stage
[
2
].
count
;
cl_int4
p
,
pq
;
p
.
s
[
0
]
=
gcascade
->
p0
;
p
.
s
[
1
]
=
gcascade
->
p1
;
p
.
s
[
2
]
=
gcascade
->
p2
;
p
.
s
[
3
]
=
gcascade
->
p3
;
pq
.
s
[
0
]
=
gcascade
->
pq0
;
pq
.
s
[
1
]
=
gcascade
->
pq1
;
pq
.
s
[
2
]
=
gcascade
->
pq2
;
pq
.
s
[
3
]
=
gcascade
->
pq3
;
float
correction
=
gcascade
->
inv_window_area
;
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
stagebuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
scaleinfobuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
nodebuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
gsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
gsqsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
candidatebuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
pixelstep
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
m_loopcount
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
startstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
splitstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
endstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
startnode
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
splitnode
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int4
)
,
(
void
*
)
&
p
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int4
)
,
(
void
*
)
&
pq
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_float
)
,
(
void
*
)
&
correction
));
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect
,
"gpuRunHaarClassifierCascade"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
candidate
=
(
int
*
)
malloc
(
4
*
sizeof
(
int
)
*
outputsz
);
memset
(
candidate
,
0
,
4
*
sizeof
(
int
)
*
outputsz
);
openCLReadBuffer
(
gsum
.
clCxt
,
((
OclBuffers
*
)
buffers
)
->
candidatebuffer
,
candidate
,
4
*
sizeof
(
int
)
*
outputsz
);
for
(
int
i
=
0
;
i
<
outputsz
;
i
++
)
if
(
candidate
[
4
*
i
+
2
]
!=
0
)
allCandidates
.
push_back
(
Rect
(
candidate
[
4
*
i
],
candidate
[
4
*
i
+
1
],
candidate
[
4
*
i
+
2
],
candidate
[
4
*
i
+
3
]));
free
((
void
*
)
candidate
);
candidate
=
NULL
;
}
else
{
cv
::
ocl
::
integral
(
gimg
,
gsum
,
gsqsum
);
gpuSetHaarClassifierCascade
(
cascade
);
gcascade
=
(
GpuHidHaarClassifierCascade
*
)
cascade
->
hid_cascade
;
stage
=
(
GpuHidHaarStageClassifier
*
)(
gcascade
+
1
);
classifier
=
(
GpuHidHaarClassifier
*
)(
stage
+
gcascade
->
count
);
node
=
(
GpuHidHaarTreeNode
*
)(
classifier
->
node
);
cl_command_queue
qu
=
(
cl_command_queue
)
gsum
.
clCxt
->
oclCommandQueue
();
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
((
OclBuffers
*
)
buffers
)
->
nodebuffer
,
1
,
0
,
m_nodenum
*
sizeof
(
GpuHidHaarTreeNode
),
node
,
0
,
NULL
,
NULL
));
cl_int4
*
p
=
(
cl_int4
*
)
malloc
(
sizeof
(
cl_int4
)
*
m_loopcount
);
float
*
correction
=
(
float
*
)
malloc
(
sizeof
(
float
)
*
m_loopcount
);
int
startstage
=
0
;
int
endstage
=
gcascade
->
count
;
double
factor
;
for
(
int
i
=
0
;
i
<
m_loopcount
;
i
++
)
{
factor
=
scalev
[
i
];
int
equRect_x
=
(
int
)(
factor
*
gcascade
->
p0
+
0.5
);
int
equRect_y
=
(
int
)(
factor
*
gcascade
->
p1
+
0.5
);
int
equRect_w
=
(
int
)(
factor
*
gcascade
->
p3
+
0.5
);
int
equRect_h
=
(
int
)(
factor
*
gcascade
->
p2
+
0.5
);
p
[
i
].
s
[
0
]
=
equRect_x
;
p
[
i
].
s
[
1
]
=
equRect_y
;
p
[
i
].
s
[
2
]
=
equRect_x
+
equRect_w
;
p
[
i
].
s
[
3
]
=
equRect_y
+
equRect_h
;
correction
[
i
]
=
1.
/
(
equRect_w
*
equRect_h
);
int
startnodenum
=
m_nodenum
*
i
;
float
factor2
=
(
float
)
factor
;
vector
<
pair
<
size_t
,
const
void
*>
>
args1
;
args1
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
nodebuffer
));
args1
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
newnodebuffer
));
args1
.
push_back
(
make_pair
(
sizeof
(
cl_float
)
,
(
void
*
)
&
factor2
));
args1
.
push_back
(
make_pair
(
sizeof
(
cl_float
)
,
(
void
*
)
&
correction
[
i
]
));
args1
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
startnodenum
));
size_t
globalThreads2
[
3
]
=
{
m_nodenum
,
1
,
1
};
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect_scaled2
,
"gpuscaleclassifier"
,
globalThreads2
,
NULL
/*localThreads2*/
,
args1
,
-
1
,
-
1
);
}
int
step
=
gsum
.
step
/
4
;
int
startnode
=
0
;
int
splitstage
=
3
;
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
((
OclBuffers
*
)
buffers
)
->
stagebuffer
,
1
,
0
,
sizeof
(
GpuHidHaarStageClassifier
)
*
gcascade
->
count
,
stage
,
0
,
NULL
,
NULL
));
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
((
OclBuffers
*
)
buffers
)
->
pbuffer
,
1
,
0
,
sizeof
(
cl_int4
)
*
m_loopcount
,
p
,
0
,
NULL
,
NULL
));
openCLSafeCall
(
clEnqueueWriteBuffer
(
qu
,
((
OclBuffers
*
)
buffers
)
->
correctionbuffer
,
1
,
0
,
sizeof
(
cl_float
)
*
m_loopcount
,
correction
,
0
,
NULL
,
NULL
));
vector
<
pair
<
size_t
,
const
void
*>
>
args
;
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
stagebuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
scaleinfobuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
newnodebuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
gsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
gsqsum
.
data
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
candidatebuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
gsum
.
rows
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
gsum
.
cols
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
step
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
m_loopcount
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
startstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
splitstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
endstage
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
startnode
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
pbuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_mem
)
,
(
void
*
)
&
((
OclBuffers
*
)
buffers
)
->
correctionbuffer
));
args
.
push_back
(
make_pair
(
sizeof
(
cl_int
)
,
(
void
*
)
&
m_nodenum
));
openCLExecuteKernel
(
gsum
.
clCxt
,
&
haarobjectdetect_scaled2
,
"gpuRunHaarClassifierCascade_scaled2"
,
globalThreads
,
localThreads
,
args
,
-
1
,
-
1
);
candidate
=
(
int
*
)
clEnqueueMapBuffer
(
qu
,
((
OclBuffers
*
)
buffers
)
->
candidatebuffer
,
1
,
CL_MAP_READ
,
0
,
4
*
sizeof
(
int
)
*
outputsz
,
0
,
0
,
0
,
NULL
);
for
(
int
i
=
0
;
i
<
outputsz
;
i
++
)
{
if
(
candidate
[
4
*
i
+
2
]
!=
0
)
allCandidates
.
push_back
(
Rect
(
candidate
[
4
*
i
],
candidate
[
4
*
i
+
1
],
candidate
[
4
*
i
+
2
],
candidate
[
4
*
i
+
3
]));
}
free
(
p
);
free
(
correction
);
clEnqueueUnmapMemObject
(
qu
,
((
OclBuffers
*
)
buffers
)
->
candidatebuffer
,
candidate
,
0
,
0
,
0
);
}
rectList
.
resize
(
allCandidates
.
size
());
if
(
!
allCandidates
.
empty
())
std
::
copy
(
allCandidates
.
begin
(),
allCandidates
.
end
(),
rectList
.
begin
());
if
(
minNeighbors
!=
0
||
findBiggestObject
)
groupRectangles
(
rectList
,
rweights
,
std
::
max
(
minNeighbors
,
1
),
GROUP_EPS
);
else
rweights
.
resize
(
rectList
.
size
(),
0
);
GenResult
(
faces
,
rectList
,
rweights
);
}
void
cv
::
ocl
::
OclCascadeClassifierBuf
::
Init
(
const
int
rows
,
const
int
cols
,
double
scaleFactor
,
int
flags
,
const
int
outputsz
,
const
size_t
localThreads
[],
CvSize
minSize
,
CvSize
maxSize
)
{
CvHaarClassifierCascade
*
cascade
=
oldCascade
;
if
(
!
CV_IS_HAAR_CLASSIFIER
(
cascade
)
)
CV_Error
(
!
cascade
?
CV_StsNullPtr
:
CV_StsBadArg
,
"Invalid classifier cascade"
);
if
(
scaleFactor
<=
1
)
CV_Error
(
CV_StsOutOfRange
,
"scale factor must be > 1"
);
if
(
cols
<
minSize
.
width
||
rows
<
minSize
.
height
)
CV_Error
(
CV_StsError
,
"Image too small"
);
int
datasize
=
0
;
int
totalclassifier
=
0
;
if
(
!
cascade
->
hid_cascade
)
gpuCreateHidHaarClassifierCascade
(
cascade
,
&
datasize
,
&
totalclassifier
);
if
(
maxSize
.
height
==
0
||
maxSize
.
width
==
0
)
{
maxSize
.
height
=
rows
;
maxSize
.
width
=
cols
;
}
findBiggestObject
=
(
flags
&
CV_HAAR_FIND_BIGGEST_OBJECT
)
!=
0
;
if
(
findBiggestObject
)
flags
&=
~
(
CV_HAAR_SCALE_IMAGE
|
CV_HAAR_DO_CANNY_PRUNING
);
CreateBaseBufs
(
datasize
,
totalclassifier
,
flags
,
outputsz
);
CreateFactorRelatedBufs
(
rows
,
cols
,
flags
,
scaleFactor
,
localThreads
,
minSize
,
maxSize
);
m_scaleFactor
=
scaleFactor
;
m_rows
=
rows
;
m_cols
=
cols
;
m_flags
=
flags
;
m_minSize
=
minSize
;
m_maxSize
=
maxSize
;
initialized
=
true
;
}
// Creates the OpenCL buffers that do not depend on the scale pyramid.
//
// First call (`initialized == false`): allocates the OclBuffers holder,
// derives m_nodenum from the hidden-cascade data size and creates the stage
// and node buffers.
// Every call: (re)creates the candidate buffer, sized for `outputsz`
// candidates of four ints each. Without CV_HAAR_SCALE_IMAGE the buffer is
// additionally created with CL_MEM_ALLOC_HOST_PTR.
//
// datasize        - byte size of the hidden cascade blob (used on first call only)
// totalclassifier - classifier count inside that blob (used on first call only)
// flags           - CV_HAAR_* flags selecting the detection mode
// outputsz        - number of candidate slots
void cv::ocl::OclCascadeClassifierBuf::CreateBaseBufs(const int datasize, const int totalclassifier,
        const int flags, const int outputsz)
{
    if (!initialized)
    {
        buffers = malloc(sizeof(OclBuffers));

        size_t tempSize =
            sizeof(GpuHidHaarStageClassifier) * ((GpuHidHaarClassifierCascade *)oldCascade->hid_cascade)->count;
        // The blob is laid out as: cascade header, stage array, classifier
        // array, then tree nodes. What remains after the first three parts is
        // node storage.
        m_nodenum = (datasize - sizeof(GpuHidHaarClassifierCascade) - tempSize
                     - sizeof(GpuHidHaarClassifier) * totalclassifier)
                    / sizeof(GpuHidHaarTreeNode);

        ((OclBuffers *)buffers)->stagebuffer =
            openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY, tempSize);
        ((OclBuffers *)buffers)->nodebuffer =
            openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY,
                               m_nodenum * sizeof(GpuHidHaarTreeNode));
    }
    else
    {
        // The candidate buffer is recreated unconditionally below, so the one
        // from the previous initialisation must always be released first.
        // Releasing it only when the SCALE_IMAGE mode flipped leaked a cl_mem
        // on every same-mode re-initialisation.
        openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->candidatebuffer));
    }

    if (flags & CV_HAAR_SCALE_IMAGE)
    {
        ((OclBuffers *)buffers)->candidatebuffer =
            openCLCreateBuffer(cv::ocl::Context::getContext(),
                               CL_MEM_WRITE_ONLY,
                               4 * sizeof(int) * outputsz);
    }
    else
    {
        ((OclBuffers *)buffers)->candidatebuffer =
            openCLCreateBuffer(cv::ocl::Context::getContext(),
                               CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                               4 * sizeof(int) * outputsz);
    }
}
// Builds everything that depends on the scale pyramid: the per-scale size and
// factor lists (sizev / scalev), the per-scale detect_piramid_info table that
// is uploaded to scaleinfobuffer, and the mode-specific storage.
//
//  * CV_HAAR_SCALE_IMAGE set: allocates gimg1 / gsum / gsqsum large enough to
//    stack every scaled image vertically.
//  * CV_HAAR_SCALE_IMAGE clear: creates pbuffer, correctionbuffer and
//    newnodebuffer, each sized by the number of scales.
//
// When already initialised with the same mode and identical scale factor,
// image size and min/max window sizes, the function returns without touching
// anything. Otherwise resources belonging to the previous configuration are
// released before the new ones are created.
//
// rows, cols      - image size
// flags           - CV_HAAR_* flags selecting the detection mode
// scaleFactor     - multiplicative step between pyramid levels
// localThreads    - work-group dimensions, used to compute group counts
// minSize/maxSize - detection window limits
void cv::ocl::OclCascadeClassifierBuf::CreateFactorRelatedBufs(
    const int rows, const int cols, const int flags,
    const double scaleFactor, const size_t localThreads[],
    CvSize minSize, CvSize maxSize)
{
    if (initialized)
    {
        const bool wasScaleImage = (m_flags & CV_HAAR_SCALE_IMAGE) != 0;
        const bool isScaleImage = (flags & CV_HAAR_SCALE_IMAGE) != 0;

        if (wasScaleImage == isScaleImage)
        {
            const bool sameConfig =
                fabs(m_scaleFactor - scaleFactor) < 1e-6
                && (rows == m_rows && cols == m_cols)
                && (minSize.width == m_minSize.width)
                && (minSize.height == m_minSize.height)
                && (maxSize.width == m_maxSize.width)
                && (maxSize.height == m_maxSize.height);
            if (sameConfig)
            {
                // Existing buffers already match this configuration.
                return;
            }
        }

        if (wasScaleImage && !isScaleImage)
        {
            // Leaving scale-image mode: drop the stacked pyramid images.
            gimg1.release();
            gsum.release();
            gsqsum.release();
        }
        else if (!wasScaleImage)
        {
            // The previous configuration owned the scaled-classifier buffers.
            // They are either no longer needed (switching to scale-image
            // mode) or are recreated below with a new size.
            openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->newnodebuffer));
            openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->correctionbuffer));
            openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->pbuffer));
        }
        // Scale-image -> scale-image with new parameters needs no explicit
        // release here; the images are re-created below.
    }

    // Start from empty lists: entries left over from a previous configuration
    // would otherwise be kept and inflate loopcount on every rebuild.
    sizev.clear();
    scalev.clear();

    int loopcount;
    int indexy = 0;
    int totalheight = 0;
    double factor;
    Rect roi;
    CvSize sz;
    CvSize winSize0 = oldCascade->orig_window_size;
    detect_piramid_info *scaleinfo;

    if (flags & CV_HAAR_SCALE_IMAGE)
    {
        // Enumerate the pyramid levels whose detection window lies within
        // [minSize, maxSize] and whose scaled image can still hold a window.
        for (factor = 1.f;; factor *= scaleFactor)
        {
            CvSize winSize = { cvRound(winSize0.width * factor), cvRound(winSize0.height * factor) };
            sz.width = cvRound(cols / factor) + 1;
            sz.height = cvRound(rows / factor) + 1;
            CvSize sz1 = { sz.width - winSize0.width - 1, sz.height - winSize0.height - 1 };

            if (sz1.width <= 0 || sz1.height <= 0)
                break;
            if (winSize.width > maxSize.width || winSize.height > maxSize.height)
                break;
            if (winSize.width < minSize.width || winSize.height < minSize.height)
                continue;

            totalheight += sz.height;
            sizev.push_back(sz);
            scalev.push_back(static_cast<float>(factor));
        }

        // NOTE(review): loopcount can be 0 here (no level qualifies); the
        // zero-sized allocations below are kept as in the original - confirm
        // callers never hit this case.
        loopcount = sizev.size();
        gimg1.create(rows, cols, CV_8UC1);
        gsum.create(totalheight + 4, cols + 1, CV_32SC1);
        gsqsum.create(totalheight + 4, cols + 1, CV_32FC1);

        scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount);
        for (int i = 0; i < loopcount; i++)
        {
            sz = sizev[i];
            roi = Rect(0, indexy, sz.width, sz.height);
            int width = sz.width - 1 - oldCascade->orig_window_size.width;
            int height = sz.height - 1 - oldCascade->orig_window_size.height;
            int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
            int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;

            // Two 16-bit values are packed into each int field.
            scaleinfo[i].width_height = (width << 16) | height;
            scaleinfo[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
            // Offset of this level inside the stacked integral image, in
            // 4-byte elements.
            scaleinfo[i].imgoff = gsum(roi).offset >> 2;
            scaleinfo[i].factor = scalev[i];

            indexy += sz.height;
        }
    }
    else
    {
        // Enumerate window scales that still fit inside the image with a
        // 10-pixel margin.
        for (factor = 1;
             cvRound(factor * winSize0.width) < cols - 10 && cvRound(factor * winSize0.height) < rows - 10;
             factor *= scaleFactor)
        {
            CvSize winSize = { cvRound(winSize0.width * factor), cvRound(winSize0.height * factor) };
            if (winSize.width < minSize.width || winSize.height < minSize.height)
            {
                continue;
            }
            sizev.push_back(winSize);
            scalev.push_back(factor);
        }

        loopcount = scalev.size();
        if (loopcount == 0)
        {
            // No scale qualified: fall back to a single level derived from
            // minSize.
            loopcount = 1;
            sizev.push_back(minSize);
            scalev.push_back(min(cvRound(minSize.width / winSize0.width),
                                 cvRound(minSize.height / winSize0.height)));
        }

        ((OclBuffers *)buffers)->pbuffer =
            openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY,
                               sizeof(cl_int4) * loopcount);
        ((OclBuffers *)buffers)->correctionbuffer =
            openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY,
                               sizeof(cl_float) * loopcount);
        ((OclBuffers *)buffers)->newnodebuffer =
            openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_WRITE,
                               loopcount * m_nodenum * sizeof(GpuHidHaarTreeNode));

        scaleinfo = (detect_piramid_info *)malloc(sizeof(detect_piramid_info) * loopcount);
        for (int i = 0; i < loopcount; i++)
        {
            sz = sizev[i];
            factor = scalev[i];
            int ystep = cvRound(std::max(2., factor));
            int width = (cols - 1 - sz.width + ystep - 1) / ystep;
            int height = (rows - 1 - sz.height + ystep - 1) / ystep;
            int grpnumperline = (width + localThreads[0] - 1) / localThreads[0];
            int totalgrp = ((height + localThreads[1] - 1) / localThreads[1]) * grpnumperline;

            scaleinfo[i].width_height = (width << 16) | height;
            scaleinfo[i].grpnumperline_totalgrp = (grpnumperline << 16) | totalgrp;
            scaleinfo[i].imgoff = 0;
            scaleinfo[i].factor = factor;
        }
    }

    // m_loopcount has no meaningful value before the first initialisation, so
    // the buffer must always be created then; afterwards it is only replaced
    // when the number of scales changed.
    if (!initialized || loopcount != m_loopcount)
    {
        if (initialized)
        {
            openCLSafeCall(clReleaseMemObject(((OclBuffers *)buffers)->scaleinfobuffer));
        }
        ((OclBuffers *)buffers)->scaleinfobuffer =
            openCLCreateBuffer(cv::ocl::Context::getContext(), CL_MEM_READ_ONLY,
                               sizeof(detect_piramid_info) * loopcount);
    }

    // Blocking write (third argument 1), so the host copy can be freed right
    // after the call returns.
    openCLSafeCall(clEnqueueWriteBuffer((cl_command_queue)cv::ocl::Context::getContext()->oclCommandQueue(),
                                        ((OclBuffers *)buffers)->scaleinfobuffer, 1, 0,
                                        sizeof(detect_piramid_info) * loopcount,
                                        scaleinfo, 0, NULL, NULL));
    free(scaleinfo);

    m_loopcount = loopcount;
}
void
cv
::
ocl
::
OclCascadeClassifierBuf
::
GenResult
(
CV_OUT
std
::
vector
<
cv
::
Rect
>&
faces
,
const
std
::
vector
<
cv
::
Rect
>
&
rectList
,
const
std
::
vector
<
int
>
&
rweights
)
{
CvSeq
*
result_seq
=
cvCreateSeq
(
0
,
sizeof
(
CvSeq
),
sizeof
(
CvAvgComp
),
cvCreateMemStorage
(
0
)
);
if
(
findBiggestObject
&&
rectList
.
size
()
)
{
...
...
@@ -1346,13 +1784,34 @@ CvSeq *cv::ocl::OclCascadeClassifier::oclHaarDetectObjects( oclMat &gimg, CvMemS
cvSeqPush
(
result_seq
,
&
c
);
}
}
//t = (double)cvGetTickCount() - t;
//printf( "get face time = %g ms\n", t/((double)cvGetTickFrequency()*1000.) )
;
//alltime = (double)cvGetTickCount() - alltime
;
//printf( "all time = %g ms\n", alltime/((double)cvGetTickFrequency()*1000.)
);
return
result_seq
;
vector
<
CvAvgComp
>
vecAvgComp
;
Seq
<
CvAvgComp
>
(
result_seq
).
copyTo
(
vecAvgComp
)
;
faces
.
resize
(
vecAvgComp
.
size
()
);
std
::
transform
(
vecAvgComp
.
begin
(),
vecAvgComp
.
end
(),
faces
.
begin
(),
getRect
())
;
}
void
cv
::
ocl
::
OclCascadeClassifierBuf
::
release
()
{
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
stagebuffer
));
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
scaleinfobuffer
));
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
nodebuffer
));
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
candidatebuffer
));
if
(
(
m_flags
&
CV_HAAR_SCALE_IMAGE
)
)
{
cvFree
(
&
oldCascade
->
hid_cascade
);
}
else
{
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
newnodebuffer
));
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
correctionbuffer
));
openCLSafeCall
(
clReleaseMemObject
(((
OclBuffers
*
)
buffers
)
->
pbuffer
));
}
free
(
buffers
);
buffers
=
NULL
;
}
#ifndef _MAX_PATH
#define _MAX_PATH 1024
...
...
modules/ocl/src/opencl/haarobjectdetect.cl
浏览文件 @
69a0b5dd
...
...
@@ -112,7 +112,7 @@ typedef struct __attribute__((aligned (64))) GpuHidHaarClassifierCascade
}
GpuHidHaarClassifierCascade
;
__kernel
void
__attribute__
((
reqd_work_group_size
(
8
,
8
,
1
)))
gpuRunHaarClassifierCascade
(
//constant
GpuHidHaarClassifierCascade
*
cascade,
__kernel
void
__attribute__
((
reqd_work_group_size
(
8
,
8
,
1
)))
gpuRunHaarClassifierCascade
(
global
GpuHidHaarStageClassifier
*
stagecascadeptr,
global
int4
*
info,
global
GpuHidHaarTreeNode
*
nodeptr,
...
...
@@ -128,12 +128,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
const
int
splitnode,
const
int4
p,
const
int4
pq,
const
float
correction
//const
int
width,
//const
int
height,
//const
int
grpnumperline,
//const
int
totalgrp
)
const
float
correction
)
{
int
grpszx
=
get_local_size
(
0
)
;
int
grpszy
=
get_local_size
(
1
)
;
...
...
@@ -145,13 +140,8 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int
lcl_sz
=
mul24
(
grpszx,grpszy
)
;
int
lcl_id
=
mad24
(
lclidy,grpszx,lclidx
)
;
//assume
lcl_sz
==
256
or
128
or
64
//int
lcl_sz_shift
=
(
lcl_sz
==
256
)
?
8
:
7
;
//lcl_sz_shift
=
(
lcl_sz
==
64
)
?
6
:
lcl_sz_shift
;
__local
int
lclshare[1024]
;
#
define
OFF
0
__local
int*
lcldata
=
lclshare
+
OFF
;//for save win data
__local
int*
lcldata
=
lclshare
;//for save win data
__local
int*
glboutindex
=
lcldata
+
28*28
;//for save global out index
__local
int*
lclcount
=
glboutindex
+
1
;//for save the numuber of temp pass pixel
__local
int*
lcloutindex
=
lclcount
+
1
;//for save info of temp pass pixel
...
...
@@ -181,7 +171,6 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int
totalgrp
=
scaleinfo1.y
&
0xffff
;
int
imgoff
=
scaleinfo1.z
;
float
factor
=
as_float
(
scaleinfo1.w
)
;
//int
ystep
=1
;// factor > 2.0 ? 1 : 2;
__global
const
int
*
sum
=
sum1
+
imgoff
;
__global
const
float
*
sqsum
=
sqsum1
+
imgoff
;
...
...
@@ -191,8 +180,6 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int
grpidx
=
grploop
-
mul24
(
grpidy,
grpnumperline
)
;
int
x
=
mad24
(
grpidx,grpszx,lclidx
)
;
int
y
=
mad24
(
grpidy,grpszy,lclidy
)
;
//candidate_result.x
=
convert_int_rtn
(
x*factor
)
;
//candidate_result.y
=
convert_int_rtn
(
y*factor
)
;
int
grpoffx
=
x-lclidx
;
int
grpoffy
=
y-lclidy
;
...
...
@@ -211,14 +198,7 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int4
data
=
*
(
__global
int4*
)
&sum[glb_off]
;
int
lcl_off
=
mad24
(
lcl_y,
readwidth,
lcl_x<<2
)
;
#
if
OFF
lcldata[lcl_off]
=
data.x
;
lcldata[lcl_off+1]
=
data.y
;
lcldata[lcl_off+2]
=
data.z
;
lcldata[lcl_off+3]
=
data.w
;
#
else
vstore4
(
data,
0
,
&lcldata[lcl_off]
)
;
#
endif
}
lcloutindex[lcl_id]
=
0
;
...
...
@@ -231,184 +211,170 @@ __kernel void __attribute__((reqd_work_group_size(8,8,1)))gpuRunHaarClassifierCa
int
lcl_off
=
mad24
(
lclidy,readwidth,lclidx
)
;
int4
cascadeinfo1,
cascadeinfo2
;
cascadeinfo1
=
p
;
cascadeinfo2
=
pq
;
// + mad24(y, pixelstep, x);
cascadeinfo2
=
pq
;
cascadeinfo1.x
+=lcl_off
;
cascadeinfo1.z
+=lcl_off
;
mean
=
(
lcldata[mad24
(
cascadeinfo1.y,readwidth,cascadeinfo1.x
)
]
-
lcldata[mad24
(
cascadeinfo1.y,readwidth,cascadeinfo1.z
)
]
-
lcldata[mad24
(
cascadeinfo1.w,readwidth,cascadeinfo1.x
)
]
+
lcldata[mad24
(
cascadeinfo1.w,readwidth,cascadeinfo1.z
)
]
)
*correction
;
//if
((
x
<
width
)
&&
(
y
<
height
))
{
cascadeinfo1.x
+=lcl_off
;
cascadeinfo1.z
+=lcl_off
;
mean
=
(
lcldata[mad24
(
cascadeinfo1.y,readwidth,cascadeinfo1.x
)
]
-
lcldata[mad24
(
cascadeinfo1.y,readwidth,cascadeinfo1.z
)
]
-
lcldata[mad24
(
cascadeinfo1.w,readwidth,cascadeinfo1.x
)
]
+
lcldata[mad24
(
cascadeinfo1.w,readwidth,cascadeinfo1.z
)
]
)
*correction
;
int
p_offset
=
mad24
(
y,
pixelstep,
x
)
;
cascadeinfo2.x
+=p_offset
;
cascadeinfo2.z
+=p_offset
;
variance_norm_factor
=sqsum[mad24
(
cascadeinfo2.y,
pixelstep,
cascadeinfo2.x
)
]
-
sqsum[mad24
(
cascadeinfo2.y,
pixelstep,
cascadeinfo2.z
)
]
-
sqsum[mad24
(
cascadeinfo2.w,
pixelstep,
cascadeinfo2.x
)
]
+
sqsum[mad24
(
cascadeinfo2.w,
pixelstep,
cascadeinfo2.z
)
]
;
variance_norm_factor
=
variance_norm_factor
*
correction
-
mean
*
mean
;
variance_norm_factor
=
variance_norm_factor
>=0.f
?
sqrt
(
variance_norm_factor
)
:
1.f
;
//if
(
cascade->is_stump_based
)
//{
for
(
int
stageloop
=
start_stage
; (stageloop < split_stage) && result; stageloop++ )
{
float
stage_sum
=
0.f
;
int2
stageinfo
=
*
(
global
int2*
)(
stagecascadeptr+stageloop
)
;
float
stagethreshold
=
as_float
(
stageinfo.y
)
;
for
(
int
nodeloop
=
0
; nodeloop < stageinfo.x; nodeloop++ )
{
__global
GpuHidHaarTreeNode*
currentnodeptr
=
(
nodeptr
+
nodecounter
)
;
int
p_offset
=
mad24
(
y,
pixelstep,
x
)
;
int4
info1
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[0][0]
))
;
int4
info2
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[1][0]
))
;
int4
info3
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[2][0]
))
;
float4
w
=
*
(
__global
float4*
)(
&
(
currentnodeptr->weight[0]
))
;
float2
alpha2
=
*
(
__global
float2*
)(
&
(
currentnodeptr->alpha[0]
))
;
float
nodethreshold
=
w.w
*
variance_norm_factor
;
cascadeinfo2.x
+=p_offset
;
cascadeinfo2.z
+=p_offset
;
variance_norm_factor
=sqsum[mad24
(
cascadeinfo2.y,
pixelstep,
cascadeinfo2.x
)
]
-
sqsum[mad24
(
cascadeinfo2.y,
pixelstep,
cascadeinfo2.z
)
]
-
sqsum[mad24
(
cascadeinfo2.w,
pixelstep,
cascadeinfo2.x
)
]
+
sqsum[mad24
(
cascadeinfo2.w,
pixelstep,
cascadeinfo2.z
)
]
;
info1.x
+=lcl_off
;
info1.z
+=lcl_off
;
info2.x
+=lcl_off
;
info2.z
+=lcl_off
;
variance_norm_factor
=
variance_norm_factor
*
correction
-
mean
*
mean
;
variance_norm_factor
=
variance_norm_factor
>=0.f
?
sqrt
(
variance_norm_factor
)
:
1.f
;
float
classsum
=
(
lcldata[mad24
(
info1.y,readwidth,info1.x
)
]
-
lcldata[mad24
(
info1.y,readwidth,info1.z
)
]
-
lcldata[mad24
(
info1.w,readwidth,info1.x
)
]
+
lcldata[mad24
(
info1.w,readwidth,info1.z
)
]
)
*
w.x
;
for
(
int
stageloop
=
start_stage
; (stageloop < split_stage) && result; stageloop++ )
{
float
stage_sum
=
0.f
;
int2
stageinfo
=
*
(
global
int2*
)(
stagecascadeptr+stageloop
)
;
float
stagethreshold
=
as_float
(
stageinfo.y
)
;
for
(
int
nodeloop
=
0
; nodeloop < stageinfo.x; nodeloop++ )
{
__global
GpuHidHaarTreeNode*
currentnodeptr
=
(
nodeptr
+
nodecounter
)
;
int4
info1
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[0][0]
))
;
int4
info2
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[1][0]
))
;
int4
info3
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[2][0]
))
;
float4
w
=
*
(
__global
float4*
)(
&
(
currentnodeptr->weight[0]
))
;
float2
alpha2
=
*
(
__global
float2*
)(
&
(
currentnodeptr->alpha[0]
))
;
float
nodethreshold
=
w.w
*
variance_norm_factor
;
classsum
+=
(
lcldata[mad24
(
info2.y,readwidth,info2.x
)
]
-
lcldata[mad24
(
info2.y,readwidth,info2.z
)
]
-
lcldata[mad24
(
info2.w,readwidth,info2.x
)
]
+
lcldata[mad24
(
info2.w,readwidth,info2.z
)
]
)
*
w.y
;
info1.x
+=lcl_off
;
info1.z
+=lcl_off
;
info2.x
+=lcl_off
;
info2.z
+=lcl_off
;
float
classsum
=
(
lcldata[mad24
(
info1.y,readwidth,info1.x
)
]
-
lcldata[mad24
(
info1.y,readwidth,info1.z
)
]
-
lcldata[mad24
(
info1.w,readwidth,info1.x
)
]
+
lcldata[mad24
(
info1.w,readwidth,info1.z
)
]
)
*
w.x
;
//if
((
info3.z
-
info3.x
)
&&
(
!stageinfo.z
))
//{
info3.x
+=lcl_off
;
info3.z
+=lcl_off
;
classsum
+=
(
lcldata[mad24
(
info3.y,readwidth,info3.x
)
]
-
lcldata[mad24
(
info3.y,readwidth,info3.z
)
]
-
lcldata[mad24
(
info3.w,readwidth,info3.x
)
]
+
lcldata[mad24
(
info3.w,readwidth,info3.z
)
]
)
*
w.z
;
//}
stage_sum
+=
classsum
>=
nodethreshold
?
alpha2.y
:
alpha2.x
;
nodecounter++
;
}
classsum
+=
(
lcldata[mad24
(
info2.y,readwidth,info2.x
)
]
-
lcldata[mad24
(
info2.y,readwidth,info2.z
)
]
-
lcldata[mad24
(
info2.w,readwidth,info2.x
)
]
+
lcldata[mad24
(
info2.w,readwidth,info2.z
)
]
)
*
w.y
;
result
=
(
stage_sum
>=
stagethreshold
)
;
}
info3.x
+=lcl_off
;
info3.z
+=lcl_off
;
classsum
+=
(
lcldata[mad24
(
info3.y,readwidth,info3.x
)
]
-
lcldata[mad24
(
info3.y,readwidth,info3.z
)
]
-
lcldata[mad24
(
info3.w,readwidth,info3.x
)
]
+
lcldata[mad24
(
info3.w,readwidth,info3.z
)
]
)
*
w.z
;
if
(
result
&&
(
x
<
width
)
&&
(
y
<
height
))
{
int
queueindex
=
atomic_inc
(
lclcount
)
;
lcloutindex[queueindex<<1]
=
(
lclidy
<<
16
)
|
lclidx
;
lcloutindex[
(
queueindex<<1
)
+1]
=
as_int
(
variance_norm_factor
)
;
stage_sum
+=
classsum
>=
nodethreshold
?
alpha2.y
:
alpha2.x
;
nodecounter++
;
}
result
=
(
stage_sum
>=
stagethreshold
)
;
}
if
(
result
&&
(
x
<
width
)
&&
(
y
<
height
))
{
int
queueindex
=
atomic_inc
(
lclcount
)
;
lcloutindex[queueindex<<1]
=
(
lclidy
<<
16
)
|
lclidx
;
lcloutindex[
(
queueindex<<1
)
+1]
=
as_int
(
variance_norm_factor
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
queuecount
=
lclcount[0]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
nodecounter
=
splitnode
;
for
(
int
stageloop
=
split_stage
; stageloop< end_stage && queuecount>0; stageloop++)
{
lclcount[0]=0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
queuecount
=
lclcount[0]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
nodecounter
=
splitnode
;
for
(
int
stageloop
=
split_stage
; stageloop< end_stage && queuecount>0; stageloop++)
{
//barrier
(
CLK_LOCAL_MEM_FENCE
)
;
//if
(
lcl_id
==
0
)
lclcount[0]=0
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int2
stageinfo
=
*
(
global
int2*
)(
stagecascadeptr+stageloop
)
;
float
stagethreshold
=
as_float
(
stageinfo.y
)
;
int2
stageinfo
=
*
(
global
int2*
)(
stagecascadeptr+stageloop
)
;
float
stagethreshold
=
as_float
(
stageinfo.y
)
;
int
perfscale
=
queuecount
>
4
?
3
:
2
;
int
queuecount_loop
=
(
queuecount
+
(
1<<perfscale
)
-1
)
>>
perfscale
;
int
lcl_compute_win
=
lcl_sz
>>
perfscale
;
int
lcl_compute_win_id
=
(
lcl_id
>>
(
6-perfscale
))
;
int
lcl_loops
=
(
stageinfo.x
+
lcl_compute_win
-1
)
>>
(
6-perfscale
)
;
int
lcl_compute_id
=
lcl_id
-
(
lcl_compute_win_id
<<
(
6-perfscale
))
;
for
(
int
queueloop=0
; queueloop<queuecount_loop/* && lcl_compute_win_id < queuecount*/; queueloop++)
int
perfscale
=
queuecount
>
4
?
3
:
2
;
int
queuecount_loop
=
(
queuecount
+
(
1<<perfscale
)
-1
)
>>
perfscale
;
int
lcl_compute_win
=
lcl_sz
>>
perfscale
;
int
lcl_compute_win_id
=
(
lcl_id
>>
(
6-perfscale
))
;
int
lcl_loops
=
(
stageinfo.x
+
lcl_compute_win
-1
)
>>
(
6-perfscale
)
;
int
lcl_compute_id
=
lcl_id
-
(
lcl_compute_win_id
<<
(
6-perfscale
))
;
for
(
int
queueloop=0
; queueloop<queuecount_loop; queueloop++)
{
float
stage_sum
=
0.f
;
int
temp_coord
=
lcloutindex[lcl_compute_win_id<<1]
;
float
variance_norm_factor
=
as_float
(
lcloutindex[
(
lcl_compute_win_id<<1
)
+1]
)
;
int
queue_pixel
=
mad24
(((
temp_coord
&
(
int
)
0xffff0000
)
>>16
)
,
readwidth,temp_coord
&
0xffff
)
;
if
(
lcl_compute_win_id
<
queuecount
)
{
float
stage_sum
=
0.f
;
int
temp_coord
=
lcloutindex[lcl_compute_win_id<<1]
;
float
variance_norm_factor
=
as_float
(
lcloutindex[
(
lcl_compute_win_id<<1
)
+1]
)
;
int
queue_pixel
=
mad24
(((
temp_coord
&
(
int
)
0xffff0000
)
>>16
)
,
readwidth,temp_coord
&
0xffff
)
;
//barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lcl_compute_win_id
<
queuecount
)
int
tempnodecounter
=
lcl_compute_id
;
float
part_sum
=
0.f
;
for
(
int
lcl_loop=0
; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
{
__global
GpuHidHaarTreeNode*
currentnodeptr
=
(
nodeptr
+
nodecounter
+
tempnodecounter
)
;
int4
info1
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[0][0]
))
;
int4
info2
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[1][0]
))
;
int4
info3
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[2][0]
))
;
float4
w
=
*
(
__global
float4*
)(
&
(
currentnodeptr->weight[0]
))
;
float2
alpha2
=
*
(
__global
float2*
)(
&
(
currentnodeptr->alpha[0]
))
;
float
nodethreshold
=
w.w
*
variance_norm_factor
;
info1.x
+=queue_pixel
;
info1.z
+=queue_pixel
;
info2.x
+=queue_pixel
;
info2.z
+=queue_pixel
;
float
classsum
=
(
lcldata[mad24
(
info1.y,readwidth,info1.x
)
]
-
lcldata[mad24
(
info1.y,readwidth,info1.z
)
]
-
lcldata[mad24
(
info1.w,readwidth,info1.x
)
]
+
lcldata[mad24
(
info1.w,readwidth,info1.z
)
]
)
*
w.x
;
int
tempnodecounter
=
lcl_compute_id
;
float
part_sum
=
0.f
;
for
(
int
lcl_loop=0
; lcl_loop<lcl_loops && tempnodecounter<stageinfo.x; lcl_loop++)
{
__global
GpuHidHaarTreeNode*
currentnodeptr
=
(
nodeptr
+
nodecounter
+
tempnodecounter
)
;
int4
info1
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[0][0]
))
;
int4
info2
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[1][0]
))
;
int4
info3
=
*
(
__global
int4*
)(
&
(
currentnodeptr->p[2][0]
))
;
float4
w
=
*
(
__global
float4*
)(
&
(
currentnodeptr->weight[0]
))
;
float2
alpha2
=
*
(
__global
float2*
)(
&
(
currentnodeptr->alpha[0]
))
;
float
nodethreshold
=
w.w
*
variance_norm_factor
;
info1.x
+=queue_pixel
;
info1.z
+=queue_pixel
;
info2.x
+=queue_pixel
;
info2.z
+=queue_pixel
;
float
classsum
=
(
lcldata[mad24
(
info1.y,readwidth,info1.x
)
]
-
lcldata[mad24
(
info1.y,readwidth,info1.z
)
]
-
lcldata[mad24
(
info1.w,readwidth,info1.x
)
]
+
lcldata[mad24
(
info1.w,readwidth,info1.z
)
]
)
*
w.x
;
classsum
+=
(
lcldata[mad24
(
info2.y,readwidth,info2.x
)
]
-
lcldata[mad24
(
info2.y,readwidth,info2.z
)
]
-
lcldata[mad24
(
info2.w,readwidth,info2.x
)
]
+
lcldata[mad24
(
info2.w,readwidth,info2.z
)
]
)
*
w.y
;
//if
((
info3.z
-
info3.x
)
&&
(
!stageinfo.z
))
//{
info3.x
+=queue_pixel
;
info3.z
+=queue_pixel
;
classsum
+=
(
lcldata[mad24
(
info3.y,readwidth,info3.x
)
]
-
lcldata[mad24
(
info3.y,readwidth,info3.z
)
]
-
lcldata[mad24
(
info3.w,readwidth,info3.x
)
]
+
lcldata[mad24
(
info3.w,readwidth,info3.z
)
]
)
*
w.z
;
//}
part_sum
+=
classsum
>=
nodethreshold
?
alpha2.y
:
alpha2.x
;
tempnodecounter
+=lcl_compute_win
;
}//end
for
(
int
lcl_loop=0
;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum
;
classsum
+=
(
lcldata[mad24
(
info2.y,readwidth,info2.x
)
]
-
lcldata[mad24
(
info2.y,readwidth,info2.z
)
]
-
lcldata[mad24
(
info2.w,readwidth,info2.x
)
]
+
lcldata[mad24
(
info2.w,readwidth,info2.z
)
]
)
*
w.y
;
info3.x
+=queue_pixel
;
info3.z
+=queue_pixel
;
classsum
+=
(
lcldata[mad24
(
info3.y,readwidth,info3.x
)
]
-
lcldata[mad24
(
info3.y,readwidth,info3.z
)
]
-
lcldata[mad24
(
info3.w,readwidth,info3.x
)
]
+
lcldata[mad24
(
info3.w,readwidth,info3.z
)
]
)
*
w.z
;
part_sum
+=
classsum
>=
nodethreshold
?
alpha2.y
:
alpha2.x
;
tempnodecounter
+=lcl_compute_win
;
}//end
for
(
int
lcl_loop=0
;lcl_loop<lcl_loops;lcl_loop++)
partialsum[lcl_id]=part_sum
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lcl_compute_win_id
<
queuecount
)
{
for
(
int
i=0
; i<lcl_compute_win && (lcl_compute_id==0); i++)
{
stage_sum
+=
partialsum[lcl_id+i]
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lcl_compute_win_id
<
queuecount
)
if
(
stage_sum
>=
stagethreshold
&&
(
lcl_compute_id==0
))
{
for
(
int
i=0
; i<lcl_compute_win && (lcl_compute_id==0); i++)
{
stage_sum
+=
partialsum[lcl_id+i]
;
}
if
(
stage_sum
>=
stagethreshold
&&
(
lcl_compute_id==0
))
{
int
queueindex
=
atomic_inc
(
lclcount
)
;
lcloutindex[queueindex<<1]
=
temp_coord
;
lcloutindex[
(
queueindex<<1
)
+1]
=
as_int
(
variance_norm_factor
)
;
}
lcl_compute_win_id
+=
(
1<<perfscale
)
;
int
queueindex
=
atomic_inc
(
lclcount
)
;
lcloutindex[queueindex<<1]
=
temp_coord
;
lcloutindex[
(
queueindex<<1
)
+1]
=
as_int
(
variance_norm_factor
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}//end
for
(
int
queueloop=0
;queueloop<queuecount_loop;queueloop++)
//barrier
(
CLK_LOCAL_MEM_FENCE
)
;
queuecount
=
lclcount[0]
;
lcl_compute_win_id
+=
(
1<<perfscale
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
nodecounter
+=
stageinfo.x
;
}//end
for
(
int
stageloop
=
splitstage
; stageloop< endstage && queuecount>0;stageloop++)
//barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lcl_id<queuecount
)
{
int
temp
=
lcloutindex[lcl_id<<1]
;
int
x
=
mad24
(
grpidx,grpszx,temp
&
0xffff
)
;
int
y
=
mad24
(
grpidy,grpszy,
((
temp
&
(
int
)
0xffff0000
)
>>
16
))
;
temp
=
glboutindex[0]
;
int4
candidate_result
;
candidate_result.zw
=
(
int2
)
convert_int_rtn
(
factor*20.f
)
;
candidate_result.x
=
convert_int_rtn
(
x*factor
)
;
candidate_result.y
=
convert_int_rtn
(
y*factor
)
;
atomic_inc
(
glboutindex
)
;
candidate[outputoff+temp+lcl_id]
=
candidate_result
;
}
}//end
for
(
int
queueloop=0
;queueloop<queuecount_loop;queueloop++)
queuecount
=
lclcount[0]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}//end
if
((
x
<
width
)
&&
(
y
<
height
))
nodecounter
+=
stageinfo.x
;
}//end
for
(
int
stageloop
=
splitstage
; stageloop< endstage && queuecount>0;stageloop++)
if
(
lcl_id<queuecount
)
{
int
temp
=
lcloutindex[lcl_id<<1]
;
int
x
=
mad24
(
grpidx,grpszx,temp
&
0xffff
)
;
int
y
=
mad24
(
grpidy,grpszy,
((
temp
&
(
int
)
0xffff0000
)
>>
16
))
;
temp
=
glboutindex[0]
;
int4
candidate_result
;
candidate_result.zw
=
(
int2
)
convert_int_rtn
(
factor*20.f
)
;
candidate_result.x
=
convert_int_rtn
(
x*factor
)
;
candidate_result.y
=
convert_int_rtn
(
y*factor
)
;
atomic_inc
(
glboutindex
)
;
candidate[outputoff+temp+lcl_id]
=
candidate_result
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}//end
for
(
int
grploop=grpidx
;grploop<totalgrp;grploop+=grpnumx)
//outputoff
+=mul24
(
width,height
)
;
}//end
for
(
int
scalei
=
0
; scalei <loopcount; scalei++)
}
...
...
modules/ocl/src/opencl/haarobjectdetect_scaled2.cl
浏览文件 @
69a0b5dd
...
...
@@ -16,6 +16,7 @@
//
//
@Authors
//
Wu
Xinglong,
wxl370@126.com
//
Sen
Liu,
swjtuls1987@126.com
//
//
Redistribution
and
use
in
source
and
binary
forms,
with
or
without
modification,
//
are
permitted
provided
that
the
following
conditions
are
met:
...
...
@@ -52,11 +53,11 @@ typedef struct __attribute__((aligned(128))) GpuHidHaarFeature
{
struct
__attribute__
((
aligned
(
32
)))
{
int
p0
__attribute__
((
aligned
(
4
)))
;
int
p1
__attribute__
((
aligned
(
4
)))
;
int
p2
__attribute__
((
aligned
(
4
)))
;
int
p3
__attribute__
((
aligned
(
4
)))
;
float
weight
__attribute__
((
aligned
(
4
)))
;
int
p0
__attribute__
((
aligned
(
4
)))
;
int
p1
__attribute__
((
aligned
(
4
)))
;
int
p2
__attribute__
((
aligned
(
4
)))
;
int
p3
__attribute__
((
aligned
(
4
)))
;
float
weight
__attribute__
((
aligned
(
4
)))
;
}
rect[CV_HAAR_FEATURE_MAX]
__attribute__
((
aligned
(
32
)))
;
}
...
...
@@ -113,173 +114,168 @@ __kernel void gpuRunHaarClassifierCascade_scaled2(
global
const
int
*restrict
sum,
global
const
float
*restrict
sqsum,
global
int4
*candidate,
const
int
rows,
const
int
cols,
const
int
step,
const
int
loopcount,
const
int
start_stage,
const
int
split_stage,
const
int
end_stage,
const
int
startnode,
const
int
splitnode,
global
int4
*p,
//const
int4
*
pq,
global
float
*correction,
const
int
nodecount
)
{
int
grpszx
=
get_local_size
(
0
)
;
int
grpszy
=
get_local_size
(
1
)
;
int
grpnumx
=
get_num_groups
(
0
)
;
int
grpidx
=
get_group_id
(
0
)
;
int
lclidx
=
get_local_id
(
0
)
;
int
lclidy
=
get_local_id
(
1
)
;
int
lcl_sz
=
mul24
(
grpszx,
grpszy
)
;
int
lcl_id
=
mad24
(
lclidy,
grpszx,
lclidx
)
;
__local
int
lclshare[1024]
;
__local
int
*glboutindex
=
lclshare
+
0
;
__local
int
*lclcount
=
glboutindex
+
1
;
__local
int
*lcloutindex
=
lclcount
+
1
;
__local
float
*partialsum
=
(
__local
float
*
)(
lcloutindex
+
(
lcl_sz
<<
1
))
;
glboutindex[0]
=
0
;
int
outputoff
=
mul24
(
grpidx,
256
)
;
candidate[outputoff
+
(
lcl_id
<<
2
)
]
=
(
int4
)
0
;
candidate[outputoff
+
(
lcl_id
<<
2
)
+
1]
=
(
int4
)
0
;
candidate[outputoff
+
(
lcl_id
<<
2
)
+
2]
=
(
int4
)
0
;
candidate[outputoff
+
(
lcl_id
<<
2
)
+
3]
=
(
int4
)
0
;
int
grpszx
=
get_local_size
(
0
)
;
int
grpszy
=
get_local_size
(
1
)
;
int
grpnumx
=
get_num_groups
(
0
)
;
int
grpidx
=
get_group_id
(
0
)
;
int
lclidx
=
get_local_id
(
0
)
;
int
lclidy
=
get_local_id
(
1
)
;
int
lcl_sz
=
mul24
(
grpszx,
grpszy
)
;
int
lcl_id
=
mad24
(
lclidy,
grpszx,
lclidx
)
;
__local
int
glboutindex[1]
;
__local
int
lclcount[1]
;
__local
int
lcloutindex[64]
;
glboutindex[0]
=
0
;
int
outputoff
=
mul24
(
grpidx,
256
)
;
candidate[outputoff
+
(
lcl_id
<<
2
)
]
=
(
int4
)
0
;
candidate[outputoff
+
(
lcl_id
<<
2
)
+
1]
=
(
int4
)
0
;
candidate[outputoff
+
(
lcl_id
<<
2
)
+
2]
=
(
int4
)
0
;
candidate[outputoff
+
(
lcl_id
<<
2
)
+
3]
=
(
int4
)
0
;
int
max_idx
=
rows
*
cols
-
1
;
for
(
int
scalei
=
0
; scalei < loopcount; scalei++)
{
int4
scaleinfo1
;
scaleinfo1
=
info[scalei]
;
int
width
=
(
scaleinfo1.x
&
0xffff0000
)
>>
16
;
int
height
=
scaleinfo1.x
&
0xffff
;
int
grpnumperline
=
(
scaleinfo1.y
&
0xffff0000
)
>>
16
;
int
totalgrp
=
scaleinfo1.y
&
0xffff
;
float
factor
=
as_float
(
scaleinfo1.w
)
;
float
correction_t
=
correction[scalei]
;
int
ystep
=
(
int
)(
max
(
2.0f,
factor
)
+
0.5f
)
;
for
(
int
scalei
=
0
; scalei < loopcount; scalei++
)
for
(
int
grploop
=
get_group_id
(
0
)
; grploop < totalgrp; grploop += grpnumx
)
{
int4
scaleinfo1
;
scaleinfo1
=
info[scalei]
;
int
width
=
(
scaleinfo1.x
&
0xffff0000
)
>>
16
;
int
height
=
scaleinfo1.x
&
0xffff
;
int
grpnumperline
=
(
scaleinfo1.y
&
0xffff0000
)
>>
16
;
int
totalgrp
=
scaleinfo1.y
&
0xffff
;
float
factor
=
as_float
(
scaleinfo1.w
)
;
float
correction_t
=
correction[scalei]
;
int
ystep
=
(
int
)(
max
(
2.0f,
factor
)
+
0.5f
)
;
int4
cascadeinfo
=
p[scalei]
;
int
grpidy
=
grploop
/
grpnumperline
;
int
grpidx
=
grploop
-
mul24
(
grpidy,
grpnumperline
)
;
int
ix
=
mad24
(
grpidx,
grpszx,
lclidx
)
;
int
iy
=
mad24
(
grpidy,
grpszy,
lclidy
)
;
int
x
=
ix
*
ystep
;
int
y
=
iy
*
ystep
;
lcloutindex[lcl_id]
=
0
;
lclcount[0]
=
0
;
int
nodecounter
;
float
mean,
variance_norm_factor
;
//if
((
ix
<
width
)
&&
(
iy
<
height
))
{
const
int
p_offset
=
mad24
(
y,
step,
x
)
;
cascadeinfo.x
+=
p_offset
;
cascadeinfo.z
+=
p_offset
;
mean
=
(
sum[clamp
(
mad24
(
cascadeinfo.y,
step,
cascadeinfo.x
)
,
0
,
max_idx
)
]
-
sum[clamp
(
mad24
(
cascadeinfo.y,
step,
cascadeinfo.z
)
,
0
,
max_idx
)
]
-
sum[clamp
(
mad24
(
cascadeinfo.w,
step,
cascadeinfo.x
)
,
0
,
max_idx
)
]
+
sum[clamp
(
mad24
(
cascadeinfo.w,
step,
cascadeinfo.z
)
,
0
,
max_idx
)
]
)
*
correction_t
;
variance_norm_factor
=
sqsum[clamp
(
mad24
(
cascadeinfo.y,
step,
cascadeinfo.x
)
,
0
,
max_idx
)
]
-
sqsum[clamp
(
mad24
(
cascadeinfo.y,
step,
cascadeinfo.z
)
,
0
,
max_idx
)
]
-
sqsum[clamp
(
mad24
(
cascadeinfo.w,
step,
cascadeinfo.x
)
,
0
,
max_idx
)
]
+
sqsum[clamp
(
mad24
(
cascadeinfo.w,
step,
cascadeinfo.z
)
,
0
,
max_idx
)
]
;
variance_norm_factor
=
variance_norm_factor
*
correction_t
-
mean
*
mean
;
variance_norm_factor
=
variance_norm_factor
>=
0.f
?
sqrt
(
variance_norm_factor
)
:
1.f
;
bool
result
=
true
;
nodecounter
=
startnode
+
nodecount
*
scalei
;
for
(
int
grploop
=
get_group_id
(
0
)
; grploop < totalgrp; grploop += grpnumx
)
for
(
int
stageloop
=
start_stage
; (stageloop < end_stage) && result; stageloop++
)
{
int4
cascadeinfo
=
p[scalei]
;
int
grpidy
=
grploop
/
grpnumperline
;
int
grpidx
=
grploop
-
mul24
(
grpidy,
grpnumperline
)
;
int
ix
=
mad24
(
grpidx,
grpszx,
lclidx
)
;
int
iy
=
mad24
(
grpidy,
grpszy,
lclidy
)
;
int
x
=
ix
*
ystep
;
int
y
=
iy
*
ystep
;
lcloutindex[lcl_id]
=
0
;
lclcount[0]
=
0
;
int
result
=
1
,
nodecounter
;
float
mean,
variance_norm_factor
;
//if
((
ix
<
width
)
&&
(
iy
<
height
))
{
const
int
p_offset
=
mad24
(
y,
step,
x
)
;
cascadeinfo.x
+=
p_offset
;
cascadeinfo.z
+=
p_offset
;
mean
=
(
sum[mad24
(
cascadeinfo.y,
step,
cascadeinfo.x
)
]
-
sum[mad24
(
cascadeinfo.y,
step,
cascadeinfo.z
)
]
-
sum[mad24
(
cascadeinfo.w,
step,
cascadeinfo.x
)
]
+
sum[mad24
(
cascadeinfo.w,
step,
cascadeinfo.z
)
]
)
*
correction_t
;
variance_norm_factor
=
sqsum[mad24
(
cascadeinfo.y,
step,
cascadeinfo.x
)
]
-
sqsum[mad24
(
cascadeinfo.y,
step,
cascadeinfo.z
)
]
-
sqsum[mad24
(
cascadeinfo.w,
step,
cascadeinfo.x
)
]
+
sqsum[mad24
(
cascadeinfo.w,
step,
cascadeinfo.z
)
]
;
variance_norm_factor
=
variance_norm_factor
*
correction_t
-
mean
*
mean
;
variance_norm_factor
=
variance_norm_factor
>=
0.f
?
sqrt
(
variance_norm_factor
)
:
1.f
;
result
=
1
;
nodecounter
=
startnode
+
nodecount
*
scalei
;
for
(
int
stageloop
=
start_stage
; stageloop < end_stage && result; stageloop++)
{
float
stage_sum
=
0.f
;
int4
stageinfo
=
*
(
global
int4
*
)(
stagecascadeptr
+
stageloop
)
;
float
stagethreshold
=
as_float
(
stageinfo.y
)
;
for
(
int
nodeloop
=
0
; nodeloop < stageinfo.x; nodeloop++)
{
__global
GpuHidHaarTreeNode
*currentnodeptr
=
(
nodeptr
+
nodecounter
)
;
int4
info1
=
*
(
__global
int4
*
)(
&
(
currentnodeptr->p[0][0]
))
;
int4
info2
=
*
(
__global
int4
*
)(
&
(
currentnodeptr->p[1][0]
))
;
int4
info3
=
*
(
__global
int4
*
)(
&
(
currentnodeptr->p[2][0]
))
;
float4
w
=
*
(
__global
float4
*
)(
&
(
currentnodeptr->weight[0]
))
;
float2
alpha2
=
*
(
__global
float2
*
)(
&
(
currentnodeptr->alpha[0]
))
;
float
nodethreshold
=
w.w
*
variance_norm_factor
;
info1.x
+=
p_offset
;
info1.z
+=
p_offset
;
info2.x
+=
p_offset
;
info2.z
+=
p_offset
;
float
classsum
=
(
sum[mad24
(
info1.y,
step,
info1.x
)
]
-
sum[mad24
(
info1.y,
step,
info1.z
)
]
-
sum[mad24
(
info1.w,
step,
info1.x
)
]
+
sum[mad24
(
info1.w,
step,
info1.z
)
]
)
*
w.x
;
classsum
+=
(
sum[mad24
(
info2.y,
step,
info2.x
)
]
-
sum[mad24
(
info2.y,
step,
info2.z
)
]
-
sum[mad24
(
info2.w,
step,
info2.x
)
]
+
sum[mad24
(
info2.w,
step,
info2.z
)
]
)
*
w.y
;
info3.x
+=
p_offset
;
info3.z
+=
p_offset
;
classsum
+=
(
sum[mad24
(
info3.y,
step,
info3.x
)
]
-
sum[mad24
(
info3.y,
step,
info3.z
)
]
-
sum[mad24
(
info3.w,
step,
info3.x
)
]
+
sum[mad24
(
info3.w,
step,
info3.z
)
]
)
*
w.z
;
stage_sum
+=
classsum
>=
nodethreshold
?
alpha2.y
:
alpha2.x
;
nodecounter++
;
}
result
=
(
stage_sum
>=
stagethreshold
)
;
}
float
stage_sum
=
0.f
;
int
stagecount
=
stagecascadeptr[stageloop].count
;
for
(
int
nodeloop
=
0
; nodeloop < stagecount; nodeloop++)
{
__global
GpuHidHaarTreeNode
*currentnodeptr
=
(
nodeptr
+
nodecounter
)
;
int4
info1
=
*
(
__global
int4
*
)(
&
(
currentnodeptr->p[0][0]
))
;
int4
info2
=
*
(
__global
int4
*
)(
&
(
currentnodeptr->p[1][0]
))
;
int4
info3
=
*
(
__global
int4
*
)(
&
(
currentnodeptr->p[2][0]
))
;
float4
w
=
*
(
__global
float4
*
)(
&
(
currentnodeptr->weight[0]
))
;
float2
alpha2
=
*
(
__global
float2
*
)(
&
(
currentnodeptr->alpha[0]
))
;
float
nodethreshold
=
w.w
*
variance_norm_factor
;
info1.x
+=
p_offset
;
info1.z
+=
p_offset
;
info2.x
+=
p_offset
;
info2.z
+=
p_offset
;
float
classsum
=
(
sum[clamp
(
mad24
(
info1.y,
step,
info1.x
)
,
0
,
max_idx
)
]
-
sum[clamp
(
mad24
(
info1.y,
step,
info1.z
)
,
0
,
max_idx
)
]
-
sum[clamp
(
mad24
(
info1.w,
step,
info1.x
)
,
0
,
max_idx
)
]
+
sum[clamp
(
mad24
(
info1.w,
step,
info1.z
)
,
0
,
max_idx
)
]
)
*
w.x
;
classsum
+=
(
sum[clamp
(
mad24
(
info2.y,
step,
info2.x
)
,
0
,
max_idx
)
]
-
sum[clamp
(
mad24
(
info2.y,
step,
info2.z
)
,
0
,
max_idx
)
]
-
sum[clamp
(
mad24
(
info2.w,
step,
info2.x
)
,
0
,
max_idx
)
]
+
sum[clamp
(
mad24
(
info2.w,
step,
info2.z
)
,
0
,
max_idx
)
]
)
*
w.y
;
info3.x
+=
p_offset
;
info3.z
+=
p_offset
;
classsum
+=
(
sum[clamp
(
mad24
(
info3.y,
step,
info3.x
)
,
0
,
max_idx
)
]
-
sum[clamp
(
mad24
(
info3.y,
step,
info3.z
)
,
0
,
max_idx
)
]
-
sum[clamp
(
mad24
(
info3.w,
step,
info3.x
)
,
0
,
max_idx
)
]
+
sum[clamp
(
mad24
(
info3.w,
step,
info3.z
)
,
0
,
max_idx
)
]
)
*
w.z
;
stage_sum
+=
classsum
>=
nodethreshold
?
alpha2.y
:
alpha2.x
;
nodecounter++
;
}
result
=
(
bool
)(
stage_sum
>=
stagecascadeptr[stageloop].threshold
)
;
}
if
(
result
&&
(
ix
<
width
)
&&
(
iy
<
height
))
{
int
queueindex
=
atomic_inc
(
lclcount
)
;
lcloutindex[queueindex
<<
1]
=
(
y
<<
16
)
|
x
;
lcloutindex[
(
queueindex
<<
1
)
+
1]
=
as_int
(
variance_norm_factor
)
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
queuecount
=
lclcount[0]
;
nodecounter
=
splitnode
+
nodecount
*
scalei
;
if
(
result
&&
(
ix
<
width
)
&&
(
iy
<
height
))
{
int
queueindex
=
atomic_inc
(
lclcount
)
;
lcloutindex[queueindex]
=
(
y
<<
16
)
|
x
;
}
if
(
lcl_id
<
queuecount
)
{
int
temp
=
lcloutindex[lcl_id
<<
1]
;
int
x
=
temp
&
0xffff
;
int
y
=
(
temp
&
(
int
)
0xffff0000
)
>>
16
;
temp
=
glboutindex[0]
;
int4
candidate_result
;
candidate_result.zw
=
(
int2
)
convert_int_rtn
(
factor
*
20.f
)
;
candidate_result.x
=
x
;
candidate_result.y
=
y
;
atomic_inc
(
glboutindex
)
;
candidate[outputoff
+
temp
+
lcl_id]
=
candidate_result
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
queuecount
=
lclcount[0]
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
if
(
lcl_id
<
queuecount
)
{
int
temp
=
lcloutindex[lcl_id]
;
int
x
=
temp
&
0xffff
;
int
y
=
(
temp
&
(
int
)
0xffff0000
)
>>
16
;
temp
=
atomic_inc
(
glboutindex
)
;
int4
candidate_result
;
candidate_result.zw
=
(
int2
)
convert_int_rtn
(
factor
*
20.f
)
;
candidate_result.x
=
x
;
candidate_result.y
=
y
;
candidate[outputoff
+
temp
+
lcl_id]
=
candidate_result
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
}
}
}
__kernel
void
gpuscaleclassifier
(
global
GpuHidHaarTreeNode
*orinode,
global
GpuHidHaarTreeNode
*newnode,
float
scale,
float
weight_scale,
int
nodenum
)
{
int
counter
=
get_global_id
(
0
)
;
int
tr_x[3],
tr_y[3],
tr_h[3],
tr_w[3],
i
=
0
;
GpuHidHaarTreeNode
t1
=
*
(
orinode
+
counter
)
;
int
counter
=
get_global_id
(
0
)
;
int
tr_x[3],
tr_y[3],
tr_h[3],
tr_w[3],
i
=
0
;
GpuHidHaarTreeNode
t1
=
*
(
orinode
+
counter
)
;
#
pragma
unroll
for
(
i
=
0
; i < 3; i++)
{
tr_x[i]
=
(
int
)(
t1.p[i][0]
*
scale
+
0.5f
)
;
tr_y[i]
=
(
int
)(
t1.p[i][1]
*
scale
+
0.5f
)
;
tr_w[i]
=
(
int
)(
t1.p[i][2]
*
scale
+
0.5f
)
;
tr_h[i]
=
(
int
)(
t1.p[i][3]
*
scale
+
0.5f
)
;
}
for
(
i
=
0
; i < 3; i++)
{
tr_x[i]
=
(
int
)(
t1.p[i][0]
*
scale
+
0.5f
)
;
tr_y[i]
=
(
int
)(
t1.p[i][1]
*
scale
+
0.5f
)
;
tr_w[i]
=
(
int
)(
t1.p[i][2]
*
scale
+
0.5f
)
;
tr_h[i]
=
(
int
)(
t1.p[i][3]
*
scale
+
0.5f
)
;
}
t1.weight[0]
=
t1.p[2][0]
?
-
(
t1.weight[1]
*
tr_h[1]
*
tr_w[1]
+
t1.weight[2]
*
tr_h[2]
*
tr_w[2]
)
/
(
tr_h[0]
*
tr_w[0]
)
:
-t1.weight[1]
*
tr_h[1]
*
tr_w[1]
/
(
tr_h[0]
*
tr_w[0]
)
;
counter
+=
nodenum
;
t1.weight[0]
=
t1.p[2][0]
?
-
(
t1.weight[1]
*
tr_h[1]
*
tr_w[1]
+
t1.weight[2]
*
tr_h[2]
*
tr_w[2]
)
/
(
tr_h[0]
*
tr_w[0]
)
:
-t1.weight[1]
*
tr_h[1]
*
tr_w[1]
/
(
tr_h[0]
*
tr_w[0]
)
;
counter
+=
nodenum
;
#
pragma
unroll
for
(
i
=
0
; i < 3; i++)
{
newnode[counter].p[i][0]
=
tr_x[i]
;
newnode[counter].p[i][1]
=
tr_y[i]
;
newnode[counter].p[i][2]
=
tr_x[i]
+
tr_w[i]
;
newnode[counter].p[i][3]
=
tr_y[i]
+
tr_h[i]
;
newnode[counter].weight[i]
=
t1.weight[i]
*
weight_scale
;
}
for
(
i
=
0
; i < 3; i++)
{
newnode[counter].p[i][0]
=
tr_x[i]
;
newnode[counter].p[i][1]
=
tr_y[i]
;
newnode[counter].p[i][2]
=
tr_x[i]
+
tr_w[i]
;
newnode[counter].p[i][3]
=
tr_y[i]
+
tr_h[i]
;
newnode[counter].weight[i]
=
t1.weight[i]
*
weight_scale
;
}
newnode[counter].left
=
t1.left
;
newnode[counter].right
=
t1.right
;
newnode[counter].threshold
=
t1.threshold
;
newnode[counter].alpha[0]
=
t1.alpha[0]
;
newnode[counter].alpha[1]
=
t1.alpha[1]
;
newnode[counter].left
=
t1.left
;
newnode[counter].right
=
t1.right
;
newnode[counter].threshold
=
t1.threshold
;
newnode[counter].alpha[0]
=
t1.alpha[0]
;
newnode[counter].alpha[1]
=
t1.alpha[1]
;
}
modules/ocl/test/test_haar.cpp
浏览文件 @
69a0b5dd
...
...
@@ -16,6 +16,7 @@
//
// @Authors
// Jia Haipeng, jiahaipeng95@gmail.com
// Sen Liu, swjutls1987@126.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
...
...
@@ -61,40 +62,31 @@ struct getRect
}
};
PARAM_TEST_CASE
(
Haar
TestBase
,
int
,
int
)
PARAM_TEST_CASE
(
Haar
,
double
,
int
)
{
//std::vector<cv::ocl::Info> oclinfo;
cv
::
ocl
::
OclCascadeClassifier
cascade
,
nestedCascade
;
cv
::
ocl
::
OclCascadeClassifierBuf
cascadebuf
;
cv
::
CascadeClassifier
cpucascade
,
cpunestedCascade
;
// Mat img;
double
scale
;
int
index
;
int
flags
;
virtual
void
SetUp
()
{
scale
=
1.0
;
index
=
0
;
scale
=
GET_PARAM
(
0
)
;
flags
=
GET_PARAM
(
1
)
;
string
cascadeName
=
workdir
+
"../../data/haarcascades/haarcascade_frontalface_alt.xml"
;
if
(
(
!
cascade
.
load
(
cascadeName
))
||
(
!
cpucascade
.
load
(
cascadeName
)))
if
(
(
!
cascade
.
load
(
cascadeName
))
||
(
!
cpucascade
.
load
(
cascadeName
))
||
(
!
cascadebuf
.
load
(
cascadeName
))
)
{
cout
<<
"ERROR: Could not load classifier cascade"
<<
endl
;
return
;
}
//int devnums = getDevice(oclinfo);
//CV_Assert(devnums>0);
////if you want to use undefault device, set it here
////setDevice(oclinfo[0]);
//cv::ocl::setBinpath("E:\\");
}
};
////////////////////////////////faceDetect/////////////////////////////////////////////////
struct
Haar
:
HaarTestBase
{};
TEST_F
(
Haar
,
FaceDetect
)
TEST_P
(
Haar
,
FaceDetect
)
{
string
imgName
=
workdir
+
"lena.jpg"
;
Mat
img
=
imread
(
imgName
,
1
);
...
...
@@ -105,59 +97,65 @@ TEST_F(Haar, FaceDetect)
return
;
}
//int i = 0;
//double t = 0;
vector
<
Rect
>
faces
,
oclfaces
;
// const static Scalar colors[] = { CV_RGB(0, 0, 255),
// CV_RGB(0, 128, 255),
// CV_RGB(0, 255, 255),
// CV_RGB(0, 255, 0),
// CV_RGB(255, 128, 0),
// CV_RGB(255, 255, 0),
// CV_RGB(255, 0, 0),
// CV_RGB(255, 0, 255)
// } ;
Mat
gray
,
smallImg
(
cvRound
(
img
.
rows
/
scale
),
cvRound
(
img
.
cols
/
scale
),
CV_8UC1
);
MemStorage
storage
(
cvCreateMemStorage
(
0
));
cvtColor
(
img
,
gray
,
CV_BGR2GRAY
);
resize
(
gray
,
smallImg
,
smallImg
.
size
(),
0
,
0
,
INTER_LINEAR
);
equalizeHist
(
smallImg
,
smallImg
);
cv
::
ocl
::
oclMat
image
;
CvSeq
*
_objects
;
image
.
upload
(
smallImg
);
_objects
=
cascade
.
oclHaarDetectObjects
(
image
,
storage
,
1.1
,
3
,
0
|
CV_HAAR_SCALE_IMAGE
,
Size
(
30
,
30
),
Size
(
0
,
0
)
);
3
,
flags
,
Size
(
30
,
30
),
Size
(
0
,
0
)
);
vector
<
CvAvgComp
>
vecAvgComp
;
Seq
<
CvAvgComp
>
(
_objects
).
copyTo
(
vecAvgComp
);
oclfaces
.
resize
(
vecAvgComp
.
size
());
std
::
transform
(
vecAvgComp
.
begin
(),
vecAvgComp
.
end
(),
oclfaces
.
begin
(),
getRect
());
cpucascade
.
detectMultiScale
(
smallImg
,
faces
,
1.1
,
3
,
0
|
CV_HAAR_SCALE_IMAGE
,
Size
(
30
,
30
),
Size
(
0
,
0
)
);
cpucascade
.
detectMultiScale
(
smallImg
,
faces
,
1.1
,
3
,
flags
,
Size
(
30
,
30
),
Size
(
0
,
0
)
);
EXPECT_EQ
(
faces
.
size
(),
oclfaces
.
size
());
/* for( vector<Rect>::const_iterator r = faces.begin(); r != faces.end(); r++, i++ )
}
TEST_P
(
Haar
,
FaceDetectUseBuf
)
{
string
imgName
=
workdir
+
"lena.jpg"
;
Mat
img
=
imread
(
imgName
,
1
);
if
(
img
.
empty
())
{
Mat smallImgROI;
Point center;
Scalar color = colors[i%8];
int radius;
center.x = cvRound((r->x + r->width*0.5)*scale);
center.y = cvRound((r->y + r->height*0.5)*scale);
radius = cvRound((r->width + r->height)*0.25*scale);
circle( img, center, radius, color, 3, 8, 0 );
} */
//namedWindow("result");
//imshow("result",img);
//waitKey(0);
//destroyAllWindows();
std
::
cout
<<
"Couldn't read "
<<
imgName
<<
std
::
endl
;
return
;
}
vector
<
Rect
>
faces
,
oclfaces
;
Mat
gray
,
smallImg
(
cvRound
(
img
.
rows
/
scale
),
cvRound
(
img
.
cols
/
scale
),
CV_8UC1
);
MemStorage
storage
(
cvCreateMemStorage
(
0
));
cvtColor
(
img
,
gray
,
CV_BGR2GRAY
);
resize
(
gray
,
smallImg
,
smallImg
.
size
(),
0
,
0
,
INTER_LINEAR
);
equalizeHist
(
smallImg
,
smallImg
);
cv
::
ocl
::
oclMat
image
;
image
.
upload
(
smallImg
);
cascadebuf
.
detectMultiScale
(
image
,
oclfaces
,
1.1
,
3
,
flags
,
Size
(
30
,
30
),
Size
(
0
,
0
)
);
cascadebuf
.
release
();
cpucascade
.
detectMultiScale
(
smallImg
,
faces
,
1.1
,
3
,
flags
,
Size
(
30
,
30
),
Size
(
0
,
0
)
);
EXPECT_EQ
(
faces
.
size
(),
oclfaces
.
size
());
}
INSTANTIATE_TEST_CASE_P
(
FaceDetect
,
Haar
,
Combine
(
Values
(
1.0
),
Values
(
CV_HAAR_SCALE_IMAGE
,
0
)));
#endif // HAVE_OPENCL
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录