Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
6550c4f6
O
Opencv
项目概览
Greenplum
/
Opencv
11 个月 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
6550c4f6
编写于
6月 16, 2014
作者:
V
vbystricky
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Join kernel code for int and float destination types
上级
504bc763
变更
1
隐藏空白更改
内联
并排
Showing
1 changed file
with
45 addition
and
210 deletion
+45
-210
modules/imgproc/src/opencl/integral_sum.cl
modules/imgproc/src/opencl/integral_sum.cl
+45
-210
未找到文件。
modules/imgproc/src/opencl/integral_sum.cl
浏览文件 @
6550c4f6
...
...
@@ -61,24 +61,34 @@
/* Extra offset added to a local-memory index: the index shifted right by
 * LOG_NUM_BANKS (defined outside this view).
 * NOTE(review): presumably padding to avoid local-memory bank conflicts --
 * confirm against the definition of LOG_NUM_BANKS. */
#define GET_CONFLICT_OFFSET(idx) ((idx) >> LOG_NUM_BANKS)

/* Accumulator types are selected by the build-time symbol `sdepth`:
 *   sumT          - scalar accumulator type
 *   vecSumT       - 4-component vector of sumT
 *   convertToSum4 - conversion of a 4-component source vector to vecSumT
 * sdepth == 4 selects int, sdepth == 5 selects float. Any other value leaves
 * all three names undefined. */
#if sdepth == 4
#  define sumT          int
#  define vecSumT       int4
#  define convertToSum4 convert_int4
#elif sdepth == 5
#  define sumT          float
#  define vecSumT       float4
#  define convertToSum4 convert_float4
#endif
kernel
void
integral_sum_cols
(
__global
uchar4
*src,
__global
int
*sum
,
kernel
void
integral_sum_cols
(
__global
uchar4
*src,
__global
uchar
*sum_ptr
,
int
src_offset,
int
rows,
int
cols,
int
src_step,
int
dst_step
)
{
sumT
*sum
=
(
sumT
*
)
sum_ptr
;
int
lid
=
get_local_id
(
0
)
;
int
gid
=
get_group_id
(
0
)
;
int4
src_t[2],
sum_t[2]
;
__local
int4
lm_sum[2][LSIZE
+
LOG_LSIZE]
;
__local
int
*
sum_p
;
vecSumT
src_t[2],
sum_t[2]
;
__local
vecSumT
lm_sum[2][LSIZE
+
LOG_LSIZE]
;
__local
sumT
*
sum_p
;
src_step
=
src_step
>>
2
;
gid
=
gid
<<
1
;
for
(
int
i
=
0
; i < rows; i =i + LSIZE_1)
for
(
int
i
=
0
; i < rows; i =i + LSIZE_1)
{
src_t[0]
=
(
i
+
lid
<
rows
?
convert
_int4
(
src[src_offset
+
(
lid+i
)
*
src_step
+
gid]
)
:
0
)
;
src_t[1]
=
(
i
+
lid
<
rows
?
convert
_int4
(
src[src_offset
+
(
lid+i
)
*
src_step
+
gid
+
1]
)
:
0
)
;
src_t[0]
=
(
i
+
lid
<
rows
?
convert
ToSum4
(
src[mad24
((
lid+i
)
,
src_step,
src_offset
+
gid
)
]
)
:
(
vecSumT
)
0
)
;
src_t[1]
=
(
i
+
lid
<
rows
?
convert
ToSum4
(
src[mad24
((
lid+i
)
,
src_step,
src_offset
+
gid
+
1
)
]
)
:
(
vecSumT
)
0
)
;
sum_t[0]
=
(
i
==
0
?
0
:
lm_sum[0][LSIZE_2
+
LOG_LSIZE]
)
;
sum_t[1]
=
(
i
==
0
?
0
:
lm_sum[1][LSIZE_2
+
LOG_LSIZE]
)
;
sum_t[0]
=
(
i
==
0
?
(
vecSumT
)
0
:
lm_sum[0][LSIZE_2
+
LOG_LSIZE]
)
;
sum_t[1]
=
(
i
==
0
?
(
vecSumT
)
0
:
lm_sum[1][LSIZE_2
+
LOG_LSIZE]
)
;
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
bf_loc
=
lid
+
GET_CONFLICT_OFFSET
(
lid
)
;
...
...
@@ -87,7 +97,7 @@ kernel void integral_sum_cols(__global uchar4 *src, __global int *sum,
lm_sum[1][bf_loc]
=
src_t[1]
;
int
offset
=
1
;
for
(
int
d
=
LSIZE
>>
1
; d > 0; d>>=1)
for
(
int
d
=
LSIZE
>>
1
; d > 0; d>>=1)
{
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
ai
=
offset
*
(((
lid
&
127
)
<<1
)
+1
)
-
1
,
bi
=
ai
+
offset
;
...
...
@@ -101,11 +111,11 @@ kernel void integral_sum_cols(__global uchar4 *src, __global int *sum,
offset
<<=
1
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lid
<
2
)
if
(
lid
<
2
)
{
lm_sum[lid][LSIZE_2
+
LOG_LSIZE]
=
0
;
}
for
(
int
d
=
1
; d < LSIZE; d <<= 1)
for
(
int
d
=
1
; d < LSIZE; d <<= 1)
{
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
offset
>>=
1
;
...
...
@@ -120,19 +130,19 @@ kernel void integral_sum_cols(__global uchar4 *src, __global int *sum,
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lid
>
0
&&
(
i+lid
)
<=
rows
)
if
(
lid
>
0
&&
(
i+lid
)
<=
rows
)
{
int
loc_s0
=
gid
*
dst_step
+
i
+
lid
-
1
,
loc_s1
=
loc_s0
+
dst_step
;
int
loc_s0
=
mad24
(
gid,
dst_step,
i
+
lid
-
1
)
,
loc_s1
=
loc_s0
+
dst_step
;
lm_sum[0][bf_loc]
+=
sum_t[0]
;
lm_sum[1][bf_loc]
+=
sum_t[1]
;
sum_p
=
(
__local
int
*
)(
&
(
lm_sum[0][bf_loc]
))
;
for
(
int
k
=
0
; k < 4; k++)
sum_p
=
(
__local
sumT
*
)(
&
(
lm_sum[0][bf_loc]
))
;
for
(
int
k
=
0
; k < 4; k++)
{
if
(
gid
*
4
+
k
>=
cols
)
continue
;
sum[loc_s0
+
k
*
dst_step
/
4]
=
sum_p[k]
;
}
sum_p
=
(
__local
int
*
)(
&
(
lm_sum[1][bf_loc]
))
;
for
(
int
k
=
0
; k < 4; k++)
sum_p
=
(
__local
sumT
*
)(
&
(
lm_sum[1][bf_loc]
))
;
for
(
int
k
=
0
; k < 4; k++)
{
if
(
gid
*
4
+
k
+
4
>=
cols
)
break
;
sum[loc_s1
+
k
*
dst_step
/
4]
=
sum_p[k]
;
...
...
@@ -142,19 +152,21 @@ kernel void integral_sum_cols(__global uchar4 *src, __global int *sum,
}
}
kernel
void
integral_sum_rows
(
__global
int4
*srcsum,
__global
int
*sum
,
kernel
void
integral_sum_rows
(
__global
uchar
*srcsum_ptr,
__global
uchar
*sum_ptr
,
int
rows,
int
cols,
int
src_step,
int
sum_step,
int
sum_offset
)
{
vecSumT
*srcsum
=
(
vecSumT
*
)
srcsum_ptr
;
sumT
*sum
=
(
sumT
*
)
sum_ptr
;
int
lid
=
get_local_id
(
0
)
;
int
gid
=
get_group_id
(
0
)
;
int4
src_t[2],
sum_t[2]
;
__local
int4
lm_sum[2][LSIZE
+
LOG_LSIZE]
;
__local
int
*sum_p
;
vecSumT
src_t[2],
sum_t[2]
;
__local
vecSumT
lm_sum[2][LSIZE
+
LOG_LSIZE]
;
__local
sumT
*sum_p
;
src_step
=
src_step
>>
4
;
for
(
int
i
=
0
; i < rows; i =i + LSIZE_1)
for
(
int
i
=
0
; i < rows; i =i + LSIZE_1)
{
src_t[0]
=
i
+
lid
<
rows
?
srcsum[
(
lid+i
)
*
src_step
+
gid
*
2
]
:
0
;
src_t[1]
=
i
+
lid
<
rows
?
srcsum[
(
lid+i
)
*
src_step
+
gid
*
2
+
1
]
:
0
;
src_t[0]
=
i
+
lid
<
rows
?
srcsum[
mad24
((
lid+i
)
,
src_step,
gid
*
2
)
]
:
0
;
src_t[1]
=
i
+
lid
<
rows
?
srcsum[
mad24
((
lid+i
)
,
src_step,
gid
*
2
+
1
)
]
:
0
;
sum_t[0]
=
(
i
==
0
?
0
:
lm_sum[0][LSIZE_2
+
LOG_LSIZE]
)
;
sum_t[1]
=
(
i
==
0
?
0
:
lm_sum[1][LSIZE_2
+
LOG_LSIZE]
)
;
...
...
@@ -166,7 +178,7 @@ kernel void integral_sum_rows(__global int4 *srcsum, __global int *sum,
lm_sum[1][bf_loc]
=
src_t[1]
;
int
offset
=
1
;
for
(
int
d
=
LSIZE
>>
1
; d > 0; d>>=1)
for
(
int
d
=
LSIZE
>>
1
; d > 0; d>>=1)
{
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
int
ai
=
offset
*
(((
lid
&
127
)
<<1
)
+1
)
-
1
,
bi
=
ai
+
offset
;
...
...
@@ -180,11 +192,11 @@ kernel void integral_sum_rows(__global int4 *srcsum, __global int *sum,
offset
<<=
1
;
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
lid
<
2
)
if
(
lid
<
2
)
{
lm_sum[lid][LSIZE_2
+
LOG_LSIZE]
=
0
;
}
for
(
int
d
=
1
; d < LSIZE; d <<= 1)
for
(
int
d
=
1
; d < LSIZE; d <<= 1)
{
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
offset
>>=
1
;
...
...
@@ -199,11 +211,11 @@ kernel void integral_sum_rows(__global int4 *srcsum, __global int *sum,
}
}
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
if
(
gid
==
0
&&
(
i
+
lid
)
<=
rows
)
if
(
gid
==
0
&&
(
i
+
lid
)
<=
rows
)
{
sum[sum_offset
+
i
+
lid]
=
0
;
}
if
(
i
+
lid
==
0
)
if
(
i
+
lid
==
0
)
{
int
loc0
=
gid
*
2
*
sum_step
;
for
(
int
k
=
1
; k <= 8; k++)
...
...
@@ -213,18 +225,18 @@ kernel void integral_sum_rows(__global int4 *srcsum, __global int *sum,
}
}
if
(
lid
>
0
&&
(
i+lid
)
<=
rows
)
if
(
lid
>
0
&&
(
i+lid
)
<=
rows
)
{
int
loc_s0
=
sum_offset
+
gid
*
2
*
sum_step
+
sum_step
/
4
+
i
+
lid,
loc_s1
=
loc_s0
+
sum_step
;
lm_sum[0][bf_loc]
+=
sum_t[0]
;
lm_sum[1][bf_loc]
+=
sum_t[1]
;
sum_p
=
(
__local
int
*
)(
&
(
lm_sum[0][bf_loc]
))
;
sum_p
=
(
__local
sumT
*
)(
&
(
lm_sum[0][bf_loc]
))
;
for
(
int
k
=
0
; k < 4; k++)
{
if
(
gid
*
8
+
k
>=
cols
)
break
;
sum[loc_s0
+
k
*
sum_step
/
4]
=
sum_p[k]
;
}
sum_p
=
(
__local
int
*
)(
&
(
lm_sum[1][bf_loc]
))
;
sum_p
=
(
__local
sumT
*
)(
&
(
lm_sum[1][bf_loc]
))
;
for
(
int
k
=
0
; k < 4; k++)
{
if
(
gid
*
8
+
4
+
k
>=
cols
)
break
;
...
...
@@ -234,180 +246,3 @@ kernel void integral_sum_rows(__global int4 *srcsum, __global int *sum,
barrier
(
CLK_LOCAL_MEM_FENCE
)
;
}
}
#
elif
sdepth
==
5
/*
 * First pass of the float integral image: prefix sums down each column.
 *
 * Each work-group handles two adjacent uchar4 columns of `src` (vector columns
 * 2*group_id and 2*group_id + 1, i.e. 8 scalar channels/columns) and walks the
 * rows in chunks of LSIZE_1, carrying the running total between chunks.
 *
 *   src        - source pixels, read as uchar4 and widened with convert_float4
 *   sum        - float output; component k of a scanned vector is stored
 *                k * dst_step / 4 elements apart, with the row index running
 *                along the contiguous direction
 *                (NOTE(review): looks like a transposed layout consumed by
 *                integral_sum_rows -- confirm against the host code)
 *   src_offset - element offset (in uchar4 units) added to every src index
 *   rows       - number of source rows to scan
 *   cols       - bound used to skip stores for scalar columns >= cols
 *   src_step   - source row step; divided by 4 before use
 *                (presumably bytes -> uchar4 elements; TODO confirm)
 *   dst_step   - output step; used both directly and as dst_step / 4
 *
 * NOTE(review): the `& 127` / `>> 7` constants split the work-items into two
 * halves of 128, one per lm_sum buffer; this presumably assumes a work-group
 * size of 256 -- confirm against LSIZE and the host launch parameters.
 */
kernel void integral_sum_cols(__global uchar4 *src, __global float *sum,
                              int src_offset, int rows, int cols, int src_step, int dst_step)
{
    int lid = get_local_id(0);
    int gid = get_group_id(0);
    float4 src_t[2], sum_t[2];
    __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
    __local float *sum_p;

    src_step = src_step >> 2;
    gid = gid << 1; /* first of the two uchar4 columns owned by this group */

    for (int i = 0; i < rows; i = i + LSIZE_1)
    {
        /* Load one row of each column; rows past the end contribute zero. */
        src_t[0] = (i + lid < rows ? convert_float4(src[src_offset + (lid + i) * src_step + gid]) : (float4)0);
        src_t[1] = (i + lid < rows ? convert_float4(src[src_offset + (lid + i) * src_step + gid + 1]) : (float4)0);

        /* Carry from the previous chunk; must be read before the barrier
         * below, after which lm_sum is overwritten. */
        sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
        sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
        barrier(CLK_LOCAL_MEM_FENCE);

        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
        lm_sum[0][bf_loc] = src_t[0];
        lm_sum[1][bf_loc] = src_t[1];

        /* Reduction (up-sweep) phase: pairwise sums at doubling strides.
         * Work-items 0..127 operate on lm_sum[0], 128..255 on lm_sum[1]. */
        int offset = 1;
        for (int d = LSIZE >> 1; d > 0; d >>= 1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            int ai = offset * (((lid & 127) << 1) + 1) - 1, bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if ((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
            }
            offset <<= 1;
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        /* Clear the slot the down-sweep starts from, one per buffer. */
        if (lid < 2)
        {
            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
        }

        /* Down-sweep phase: push partial prefixes back down the tree. */
        for (int d = 1; d < LSIZE; d <<= 1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            offset >>= 1;
            int ai = offset * (((lid & 127) << 1) + 1) - 1, bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if ((lid & 127) < d)
            {
                /* Swap-and-add through a temporary. Recovering the old right
                 * value as (bi + ai) - ai is not exact in floating point once
                 * the sums exceed the 24-bit float mantissa. */
                float4 left = lm_sum[lid >> 7][ai];
                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi];
                lm_sum[lid >> 7][bi] += left;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        if (lid > 0 && (i + lid) <= rows)
        {
            int loc_s0 = gid * dst_step + i + lid - 1, loc_s1 = loc_s0 + dst_step;

            /* Add the carry from earlier chunks. The updated value stays in
             * local memory so the next chunk can pick it up as its carry. */
            lm_sum[0][bf_loc] += sum_t[0];
            lm_sum[1][bf_loc] += sum_t[1];

            /* View each float4 as four scalars and store only the components
             * that map to scalar columns below `cols`. The guard is monotonic
             * in k, so stopping at the first miss skips the same stores. */
            sum_p = (__local float *)(&(lm_sum[0][bf_loc]));
            for (int k = 0; k < 4; k++)
            {
                if (gid * 4 + k >= cols)
                    break;
                sum[loc_s0 + k * dst_step / 4] = sum_p[k];
            }
            sum_p = (__local float *)(&(lm_sum[1][bf_loc]));
            for (int k = 0; k < 4; k++)
            {
                if (gid * 4 + k + 4 >= cols)
                    break;
                sum[loc_s1 + k * dst_step / 4] = sum_p[k];
            }
        }
        /* Keep the next iteration's loads into lm_sum from racing with the
         * reads above. */
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
/*
 * Second pass of the float integral image: prefix sums over the intermediate
 * buffer `srcsum`, written to `sum` with a zero border.
 *
 * Each work-group handles two adjacent float4 vectors per input row
 * (indices 2*group_id and 2*group_id + 1, i.e. 8 scalar lanes) and walks the
 * rows in chunks of LSIZE_1, carrying the running total between chunks.
 *
 *   srcsum     - intermediate sums, read as float4
 *                (presumably the output of integral_sum_cols; TODO confirm)
 *   sum        - float output of the integral image
 *   rows       - number of input rows to scan
 *   cols       - bound used to skip stores for scalar lanes >= cols
 *   src_step   - input row step; divided by 16 before use
 *                (presumably bytes -> float4 elements; TODO confirm)
 *   sum_step   - output step; used both directly and as sum_step / 4
 *   sum_offset - element offset added to every output index
 *
 * NOTE(review): the `& 127` / `>> 7` constants split the work-items into two
 * halves of 128, one per lm_sum buffer; this presumably assumes a work-group
 * size of 256 -- confirm against LSIZE and the host launch parameters.
 */
kernel void integral_sum_rows(__global float4 *srcsum, __global float *sum,
                              int rows, int cols, int src_step, int sum_step, int sum_offset)
{
    int lid = get_local_id(0);
    int gid = get_group_id(0);
    float4 src_t[2], sum_t[2];
    __local float4 lm_sum[2][LSIZE + LOG_LSIZE];
    __local float *sum_p;

    src_step = src_step >> 4;

    for (int i = 0; i < rows; i = i + LSIZE_1)
    {
        /* Load one row of each vector; rows past the end contribute zero. */
        src_t[0] = i + lid < rows ? srcsum[(lid + i) * src_step + gid * 2] : (float4)0;
        src_t[1] = i + lid < rows ? srcsum[(lid + i) * src_step + gid * 2 + 1] : (float4)0;

        /* Carry from the previous chunk; must be read before the barrier
         * below, after which lm_sum is overwritten. */
        sum_t[0] = (i == 0 ? (float4)0 : lm_sum[0][LSIZE_2 + LOG_LSIZE]);
        sum_t[1] = (i == 0 ? (float4)0 : lm_sum[1][LSIZE_2 + LOG_LSIZE]);
        barrier(CLK_LOCAL_MEM_FENCE);

        int bf_loc = lid + GET_CONFLICT_OFFSET(lid);
        lm_sum[0][bf_loc] = src_t[0];
        lm_sum[1][bf_loc] = src_t[1];

        /* Reduction (up-sweep) phase: pairwise sums at doubling strides.
         * Work-items 0..127 operate on lm_sum[0], 128..255 on lm_sum[1]. */
        int offset = 1;
        for (int d = LSIZE >> 1; d > 0; d >>= 1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            int ai = offset * (((lid & 127) << 1) + 1) - 1, bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if ((lid & 127) < d)
            {
                lm_sum[lid >> 7][bi] += lm_sum[lid >> 7][ai];
            }
            offset <<= 1;
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        /* Clear the slot the down-sweep starts from, one per buffer. */
        if (lid < 2)
        {
            lm_sum[lid][LSIZE_2 + LOG_LSIZE] = 0;
        }

        /* Down-sweep phase: push partial prefixes back down the tree. */
        for (int d = 1; d < LSIZE; d <<= 1)
        {
            barrier(CLK_LOCAL_MEM_FENCE);
            offset >>= 1;
            int ai = offset * (((lid & 127) << 1) + 1) - 1, bi = ai + offset;
            ai += GET_CONFLICT_OFFSET(ai);
            bi += GET_CONFLICT_OFFSET(bi);

            if ((lid & 127) < d)
            {
                /* Swap-and-add through a temporary. Recovering the old right
                 * value as (bi + ai) - ai is not exact in floating point once
                 * the sums exceed the 24-bit float mantissa, which the
                 * accumulated inputs of this pass can do. */
                float4 left = lm_sum[lid >> 7][ai];
                lm_sum[lid >> 7][ai] = lm_sum[lid >> 7][bi];
                lm_sum[lid >> 7][bi] += left;
            }
        }
        barrier(CLK_LOCAL_MEM_FENCE);

        /* Zero border, part 1: group 0 clears the first output line. */
        if (gid == 0 && (i + lid) <= rows)
        {
            sum[sum_offset + i + lid] = 0;
        }
        /* Zero border, part 2: one work-item (first chunk, lid 0) clears the
         * leading element of each of this group's up-to-8 output lines. */
        if (i + lid == 0)
        {
            int loc0 = gid * 2 * sum_step;
            for (int k = 1; k <= 8; k++)
            {
                if (gid * 8 + k > cols)
                    break;
                sum[sum_offset + loc0 + k * sum_step / 4] = 0;
            }
        }

        if (lid > 0 && (i + lid) <= rows)
        {
            int loc_s0 = sum_offset + gid * 2 * sum_step + sum_step / 4 + i + lid,
                loc_s1 = loc_s0 + sum_step;

            /* Add the carry from earlier chunks. The updated value stays in
             * local memory so the next chunk can pick it up as its carry. */
            lm_sum[0][bf_loc] += sum_t[0];
            lm_sum[1][bf_loc] += sum_t[1];

            /* View each float4 as four scalars and store only the components
             * that map to scalar lanes below `cols`. */
            sum_p = (__local float *)(&(lm_sum[0][bf_loc]));
            for (int k = 0; k < 4; k++)
            {
                if (gid * 8 + k >= cols)
                    break;
                sum[loc_s0 + k * sum_step / 4] = sum_p[k];
            }
            sum_p = (__local float *)(&(lm_sum[1][bf_loc]));
            for (int k = 0; k < 4; k++)
            {
                if (gid * 8 + 4 + k >= cols)
                    break;
                sum[loc_s1 + k * sum_step / 4] = sum_p[k];
            }
        }
        /* Keep the next iteration's loads into lm_sum from racing with the
         * reads above. */
        barrier(CLK_LOCAL_MEM_FENCE);
    }
}
#
endif
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录