Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
bcc086ba
O
Opencv
项目概览
Greenplum
/
Opencv
11 个月 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
bcc086ba
编写于
4月 05, 2013
作者:
Y
yao
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix all redefine build errors on some Intel OCL
上级
656594ad
变更
23
展开全部
隐藏空白更改
内联
并排
Showing
23 changed file
with
3082 addition
and
2186 deletion
+3082
-2186
modules/ocl/src/opencl/arithm_absdiff.cl
modules/ocl/src/opencl/arithm_absdiff.cl
+78
-44
modules/ocl/src/opencl/arithm_add.cl
modules/ocl/src/opencl/arithm_add.cl
+65
-31
modules/ocl/src/opencl/arithm_addWeighted.cl
modules/ocl/src/opencl/arithm_addWeighted.cl
+131
-109
modules/ocl/src/opencl/arithm_add_scalar.cl
modules/ocl/src/opencl/arithm_add_scalar.cl
+60
-35
modules/ocl/src/opencl/arithm_add_scalar_mask.cl
modules/ocl/src/opencl/arithm_add_scalar_mask.cl
+60
-35
modules/ocl/src/opencl/arithm_bitwise_and.cl
modules/ocl/src/opencl/arithm_bitwise_and.cl
+107
-91
modules/ocl/src/opencl/arithm_bitwise_and_mask.cl
modules/ocl/src/opencl/arithm_bitwise_and_mask.cl
+246
-189
modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
+193
-134
modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl
modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl
+217
-157
modules/ocl/src/opencl/arithm_bitwise_not.cl
modules/ocl/src/opencl/arithm_bitwise_not.cl
+40
-25
modules/ocl/src/opencl/arithm_bitwise_or.cl
modules/ocl/src/opencl/arithm_bitwise_or.cl
+59
-43
modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
+245
-188
modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
+189
-129
modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
+218
-158
modules/ocl/src/opencl/arithm_bitwise_xor.cl
modules/ocl/src/opencl/arithm_bitwise_xor.cl
+101
-87
modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
+245
-188
modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl
modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl
+194
-135
modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl
modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl
+217
-157
modules/ocl/src/opencl/arithm_compare_eq.cl
modules/ocl/src/opencl/arithm_compare_eq.cl
+150
-89
modules/ocl/src/opencl/arithm_compare_ne.cl
modules/ocl/src/opencl/arithm_compare_ne.cl
+146
-85
modules/ocl/src/opencl/arithm_div.cl
modules/ocl/src/opencl/arithm_div.cl
+69
-56
modules/ocl/src/opencl/arithm_flip.cl
modules/ocl/src/opencl/arithm_flip.cl
+24
-8
modules/ocl/src/opencl/arithm_mul.cl
modules/ocl/src/opencl/arithm_mul.cl
+28
-13
未找到文件。
modules/ocl/src/opencl/arithm_absdiff.cl
浏览文件 @
bcc086ba
...
...
@@ -44,7 +44,11 @@
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
...
...
@@ -62,7 +66,10 @@ __kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_o
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
...
...
@@ -110,8 +117,11 @@ __kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
...
@@ -144,8 +154,11 @@ __kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_o
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
...
@@ -248,8 +261,11 @@ __kernel void arithm_s_absdiff_C1_D0 (__global uchar *src1, int src1_step, int
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
...
@@ -287,8 +303,11 @@ __kernel void arithm_s_absdiff_C1_D2 (__global ushort *src1, int src1_step, in
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
...
@@ -318,8 +337,11 @@ __kernel void arithm_s_absdiff_C1_D3 (__global short *src1, int src1_step, int
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
...
@@ -387,8 +409,8 @@ __kernel void arithm_s_absdiff_C1_D5 (__global float *src1, int src1_step, int
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
arithm_s_absdiff_C1_D6
(
__global
double
*src1,
int
src1_step,
int
src1_offset,
__global
double
*dst,
int
dst_step,
int
dst_offset,
double4
src2,
int
rows,
int
cols,
int
dst_step1
)
__global
double
*dst,
int
dst_step,
int
dst_offset,
double4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
...
...
@@ -421,8 +443,11 @@ __kernel void arithm_s_absdiff_C2_D0 (__global uchar *src1, int src1_step, int
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
...
@@ -465,7 +490,7 @@ __kernel void arithm_s_absdiff_C2_D2 (__global ushort *src1, int src1_step, in
}
__kernel
void
arithm_s_absdiff_C2_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
int4
src2,
int
rows,
int
cols,
int
dst_step1
)
int4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
...
...
@@ -509,7 +534,7 @@ __kernel void arithm_s_absdiff_C2_D4 (__global int *src1, int src1_step, int s
}
__kernel
void
arithm_s_absdiff_C2_D5
(
__global
float
*src1,
int
src1_step,
int
src1_offset,
__global
float
*dst,
int
dst_step,
int
dst_offset,
float4
src2,
int
rows,
int
cols,
int
dst_step1
)
float4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
...
...
@@ -563,8 +588,11 @@ __kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
3
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
3
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
3
)
+
src1_offset
-
(
dst_align
*
3
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
...
@@ -617,8 +645,11 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
...
@@ -644,16 +675,16 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in
data_0.xy
=
((
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_1.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data_1.x
:
data_1.x
;
?
tmp_data_1.x
:
data_1.x
;
data_1.y
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_1.y
:
data_1.y
;
?
tmp_data_1.y
:
data_1.y
;
data_2.xy
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_2.xy
:
data_2.xy
;
?
tmp_data_2.xy
:
data_2.xy
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
__kernel
void
arithm_s_absdiff_C3_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
...
...
@@ -667,8 +698,11 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
...
@@ -694,16 +728,16 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int
data_0.xy
=
((
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_1.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data_1.x
:
data_1.x
;
?
tmp_data_1.x
:
data_1.x
;
data_1.y
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_1.y
:
data_1.y
;
?
tmp_data_1.y
:
data_1.y
;
data_2.xy
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_2.xy
:
data_2.xy
;
?
tmp_data_2.xy
:
data_2.xy
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
__kernel
void
arithm_s_absdiff_C3_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
...
...
@@ -735,9 +769,9 @@ __kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int s
int
tmp_data_1
=
convert_int_sat
(
abs_diff
(
src1_data_1,
src2_data_1
))
;
int
tmp_data_2
=
convert_int_sat
(
abs_diff
(
src1_data_2,
src2_data_2
))
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
tmp_data_1
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_2
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
tmp_data_1
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_2
;
}
}
__kernel
void
arithm_s_absdiff_C3_D5
(
__global
float
*src1,
int
src1_step,
int
src1_offset,
...
...
@@ -769,9 +803,9 @@ __kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int
float
tmp_data_1
=
fabs
(
src1_data_1
-
src2_data_1
)
;
float
tmp_data_2
=
fabs
(
src1_data_2
-
src2_data_2
)
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
tmp_data_1
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_2
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
tmp_data_1
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_2
;
}
}
...
...
@@ -805,9 +839,9 @@ __kernel void arithm_s_absdiff_C3_D6 (__global double *src1, int src1_step, in
double
tmp_data_1
=
fabs
(
src1_data_1
-
src2_data_1
)
;
double
tmp_data_2
=
fabs
(
src1_data_2
-
src2_data_2
)
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_1
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
tmp_data_2
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_1
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
tmp_data_2
;
}
}
#
endif
...
...
modules/ocl/src/opencl/arithm_add.cl
浏览文件 @
bcc086ba
...
...
@@ -45,7 +45,11 @@
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
...
...
@@ -64,7 +68,10 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
{
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
...
...
@@ -112,7 +119,10 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
{
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
...
@@ -147,7 +157,10 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
{
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
...
@@ -252,7 +265,10 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
{
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
...
@@ -311,7 +327,10 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
{
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
...
@@ -348,7 +367,10 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
{
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
...
@@ -477,7 +499,10 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
{
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
...
@@ -664,7 +689,10 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
{
x
=
x
<<
2
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
3
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
3
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
3
)
+
src1_offset
-
(
dst_align
*
3
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
*
3
)
+
src2_offset
-
(
dst_align
*
3
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
...
@@ -724,7 +752,10 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
{
x
=
x
<<
1
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
*
6
)
+
src2_offset
-
(
dst_align
*
6
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
...
@@ -754,16 +785,16 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
data_0.xy
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_1.x
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data_1.x
:
data_1.x
;
?
tmp_data_1.x
:
data_1.x
;
data_1.y
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_1.y
:
data_1.y
;
?
tmp_data_1.y
:
data_1.y
;
data_2.xy
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_2.xy
:
data_2.xy
;
?
tmp_data_2.xy
:
data_2.xy
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
__kernel
void
arithm_add_with_mask_C3_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
...
...
@@ -780,7 +811,10 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
{
x
=
x
<<
1
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
*
6
)
+
src2_offset
-
(
dst_align
*
6
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
...
@@ -810,16 +844,16 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
data_0.xy
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_1.x
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data_1.x
:
data_1.x
;
?
tmp_data_1.x
:
data_1.x
;
data_1.y
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_1.y
:
data_1.y
;
?
tmp_data_1.y
:
data_1.y
;
data_2.xy
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_2.xy
:
data_2.xy
;
?
tmp_data_2.xy
:
data_2.xy
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
__kernel
void
arithm_add_with_mask_C3_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
...
...
@@ -861,9 +895,9 @@ __kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, i
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
__kernel
void
arithm_add_with_mask_C3_D5
(
__global
float
*src1,
int
src1_step,
int
src1_offset,
...
...
@@ -905,9 +939,9 @@ __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, i
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
...
...
@@ -951,9 +985,9 @@ __kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step,
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_1
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
data_2
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_1
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
data_2
;
}
}
#
endif
...
...
modules/ocl/src/opencl/arithm_addWeighted.cl
浏览文件 @
bcc086ba
...
...
@@ -42,8 +42,12 @@
//
the
use
of
this
software,
even
if
advised
of
the
possibility
of
such
damage.
//
//M*/
#
if
defined
DOUBLE_SUPPORT
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
typedef
double
F
;
#
else
typedef
float
F
;
...
...
@@ -52,10 +56,10 @@ typedef float F;
/////////////////////////////////////////////addWeighted//////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
void
addWeighted_D0
(
__global
uchar
*src1,int
src1_step,int
src1_offset,
__global
uchar
*src2,
int
src2_step,int
src2_offset,
F
alpha,F
beta,F
gama,
__global
uchar
*dst,
int
dst_step,int
dst_offset,
int
rows,
int
cols,int
dst_step1
)
__global
uchar
*src2,
int
src2_step,int
src2_offset,
F
alpha,F
beta,F
gama,
__global
uchar
*dst,
int
dst_step,int
dst_offset,
int
rows,
int
cols,int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
@@ -65,7 +69,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
{
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
...
...
@@ -87,7 +94,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
//
short4
tmp
=
convert_short4_sat
(
src1_data
)
*
alpha
+
convert_short4_sat
(
src2_data
)
*
beta
+
gama
;
short4
tmp
;
short4
tmp
;
tmp.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
...
...
@@ -100,7 +107,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
dst_data.w
=
((
dst_index
+
3
>=
dst_start
)
&&
(
dst_index
+
3
<
dst_end
))
?
tmp_data.w
:
dst_data.w
;
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
=
dst_data
;
//
dst[x
+
y
*
dst_step]
=
src1[x
+
y
*
src1_step]
*
alpha
+
src2[x
+
y
*
src2_step]
*
beta
+
gama
;
//
dst[x
+
y
*
dst_step]
=
src1[x
+
y
*
src1_step]
*
alpha
+
src2[x
+
y
*
src2_step]
*
beta
+
gama
;
}
}
...
...
@@ -108,10 +115,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
__kernel
void
addWeighted_D2
(
__global
ushort
*src1,
int
src1_step,int
src1_offset,
__global
ushort
*src2,
int
src2_step,int
src2_offset,
F
alpha,F
beta,F
gama,
__global
ushort
*dst,
int
dst_step,int
dst_offset,
int
rows,
int
cols,int
dst_step1
)
__global
ushort
*src2,
int
src2_step,int
src2_offset,
F
alpha,F
beta,F
gama,
__global
ushort
*dst,
int
dst_step,int
dst_offset,
int
rows,
int
cols,int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
@@ -121,35 +128,38 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
{
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x<<
1
)
&
(
int
)
0xfffffff8
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
ushort4
src1_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
ushort4
src2_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
if
(
src1_index
<
0
)
{
ushort4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
ushort4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
if
(
src1_index
<
0
)
{
ushort4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
ushort4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
ushort4
dst_data
=
*
((
__global
ushort4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
//
int4
tmp
=
convert_int4_sat
(
src1_data
)
*
alpha
+
convert_int4_sat
(
src2_data
)
*
beta
+
gama
;
int4
tmp
;
//
int4
tmp
=
convert_int4_sat
(
src1_data
)
*
alpha
+
convert_int4_sat
(
src2_data
)
*
beta
+
gama
;
int4
tmp
;
tmp.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
...
...
@@ -181,8 +191,11 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
{
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
...
@@ -190,26 +203,26 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x<<
1
)
-
(
dst_align
<<
1
))
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
short4
src1_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
short4
src2_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
if
(
src1_index
<
0
)
{
short4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
short4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
if
(
src1_index
<
0
)
{
short4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
short4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
short4
dst_data
=
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
//
int4
tmp
=
convert_int4_sat
(
src1_data
)
*
alpha
+
convert_int4_sat
(
src2_data
)
*
beta
+
gama
;
int4
tmp
;
//
int4
tmp
=
convert_int4_sat
(
src1_data
)
*
alpha
+
convert_int4_sat
(
src2_data
)
*
beta
+
gama
;
int4
tmp
;
tmp.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
...
...
@@ -228,7 +241,7 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
__kernel
void
addWeighted_D4
(
__global
int
*src1,
int
src1_step,int
src1_offset,
__global
int
*src2,
int
src2_step,int
src2_offset,
F
alpha,F
beta,
F
gama,
F
alpha,F
beta,
F
gama,
__global
int
*dst,
int
dst_step,int
dst_offset,
int
rows,
int
cols,int
dst_step1
)
{
...
...
@@ -241,9 +254,12 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
x
=
x
<<
2
;
#
define
bitOfInt
(
sizeof
(
int
)
==
4
?
2:
3
)
#
define
dst_align
((
dst_offset
>>
bitOfInt
)
&
3
)
#
define
bitOfInt
(
sizeof
(
int
)
==
4
?
2:
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
bitOfInt
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
bitOfInt
)
+
src1_offset
-
(
dst_align
<<
bitOfInt
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
bitOfInt
)
+
src2_offset
-
(
dst_align
<<
bitOfInt
))
;
...
...
@@ -252,26 +268,26 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
bitOfInt
)
-
(
dst_align
<<
bitOfInt
))
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int4
src1_data
=
vload4
(
0
,
(
__global
int
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
int4
src2_data
=
vload4
(
0
,
(
__global
int
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
if
(
src1_index
<
0
)
{
int4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
int4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
if
(
src1_index
<
0
)
{
int4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
int4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
int4
dst_data
=
*
((
__global
int4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
//
double4
tmp
=
convert_double4
(
src1_data
)
*
alpha
+
convert_double4
(
src2_data
)
*
beta
+
gama
;
float4
tmp
;
//
double4
tmp
=
convert_double4
(
src1_data
)
*
alpha
+
convert_double4
(
src2_data
)
*
beta
+
gama
;
float4
tmp
;
tmp.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
...
...
@@ -291,7 +307,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
__kernel
void
addWeighted_D5
(
__global
float
*src1,int
src1_step,int
src1_offset,
__global
float
*src2,
int
src2_step,int
src2_offset,
F
alpha,F
beta,
F
gama,
F
alpha,F
beta,
F
gama,
__global
float
*dst,
int
dst_step,int
dst_offset,
int
rows,
int
cols,int
dst_step1
)
{
...
...
@@ -303,8 +319,11 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
{
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
2
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
2
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
-
(
dst_align
<<
2
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
2
)
+
src2_offset
-
(
dst_align
<<
2
))
;
...
...
@@ -313,32 +332,32 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
2
)
-
(
dst_align
<<
2
))
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
float4
src1_data
=
vload4
(
0
,
(
__global
float
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
float4
src2_data
=
vload4
(
0
,
(
__global
float
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
float4
dst_data
=
*
((
__global
float4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
if
(
src1_index
<
0
)
{
float4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
float4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
//
double4
tmp
=
convert_double4
(
src1_data
)
*
alpha
+
convert_double4
(
src2_data
)
*
beta
+
gama
;
//
float4
tmp_data
=
(
src1_data
)
*
alpha
+
(
src2_data
)
*
beta
+
gama
;
float4
tmp_data
;
if
(
src1_index
<
0
)
{
float4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
float4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
//
double4
tmp
=
convert_double4
(
src1_data
)
*
alpha
+
convert_double4
(
src2_data
)
*
beta
+
gama
;
//
float4
tmp_data
=
(
src1_data
)
*
alpha
+
(
src2_data
)
*
beta
+
gama
;
float4
tmp_data
;
tmp_data.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp_data.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp_data.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
tmp_data.w
=
src1_data.w
*
alpha
+
src2_data.w
*
beta
+
gama
;
//
float4
tmp_data
=
convert_float4
(
tmp
)
;
//
float4
tmp_data
=
convert_float4
(
tmp
)
;
dst_data.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
dst_data.x
;
dst_data.y
=
((
dst_index
+
4
>=
dst_start
)
&&
(
dst_index
+
4
<
dst_end
))
?
tmp_data.y
:
dst_data.y
;
...
...
@@ -353,7 +372,7 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
addWeighted_D6
(
__global
double
*src1,
int
src1_step,int
src1_offset,
__global
double
*src2,
int
src2_step,int
src2_offset,
F
alpha,F
beta,
F
gama,
F
alpha,F
beta,
F
gama,
__global
double
*dst,
int
dst_step,int
dst_offset,
int
rows,
int
cols,int
dst_step1
)
{
...
...
@@ -365,8 +384,11 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
{
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
3
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
3
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
-
(
dst_align
<<
3
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
3
)
+
src2_offset
-
(
dst_align
<<
3
))
;
...
...
@@ -375,25 +397,25 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
3
)
-
(
dst_align
<<
3
))
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
double4
src1_data
=
vload4
(
0
,
(
__global
double
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
double4
src2_data
=
vload4
(
0
,
(
__global
double
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
double4
dst_data
=
*
((
__global
double4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
if
(
src1_index
<
0
)
{
double4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
double4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
//
double4
tmp_data
=
(
src1_data
)
*
alpha
+
(
src2_data
)
*
beta
+
gama
;
double4
tmp_data
;
if
(
src1_index
<
0
)
{
double4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
double4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
//
double4
tmp_data
=
(
src1_data
)
*
alpha
+
(
src2_data
)
*
beta
+
gama
;
double4
tmp_data
;
tmp_data.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp_data.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp_data.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
...
...
modules/ocl/src/opencl/arithm_add_scalar.cl
浏览文件 @
bcc086ba
...
...
@@ -44,9 +44,13 @@
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
/**************************************add
with
scalar
without
mask**************************************/
__kernel
void
arithm_s_add_C1_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
...
...
@@ -58,8 +62,11 @@ __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
...
@@ -98,8 +105,11 @@ __kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
...
@@ -130,8 +140,11 @@ __kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
...
@@ -232,8 +245,11 @@ __kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
...
@@ -377,8 +393,11 @@ __kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
3
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
3
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
3
)
+
src1_offset
-
(
dst_align
*
3
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
...
@@ -431,8 +450,11 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
...
@@ -458,16 +480,16 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
data_0.xy
=
((
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_1.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data_1.x
:
data_1.x
;
?
tmp_data_1.x
:
data_1.x
;
data_1.y
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_1.y
:
data_1.y
;
?
tmp_data_1.y
:
data_1.y
;
data_2.xy
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_2.xy
:
data_2.xy
;
?
tmp_data_2.xy
:
data_2.xy
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
__kernel
void
arithm_s_add_C3_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
...
...
@@ -481,8 +503,11 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
...
@@ -508,16 +533,16 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src
data_0.xy
=
((
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_1.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data_1.x
:
data_1.x
;
?
tmp_data_1.x
:
data_1.x
;
data_1.y
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_1.y
:
data_1.y
;
?
tmp_data_1.y
:
data_1.y
;
data_2.xy
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_2.xy
:
data_2.xy
;
?
tmp_data_2.xy
:
data_2.xy
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
__kernel
void
arithm_s_add_C3_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
...
...
@@ -549,9 +574,9 @@ __kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_
int
tmp_data_1
=
convert_int_sat
((
long
)
src1_data_1
+
(
long
)
src2_data_1
)
;
int
tmp_data_2
=
convert_int_sat
((
long
)
src1_data_2
+
(
long
)
src2_data_2
)
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
tmp_data_1
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_2
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
tmp_data_1
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_2
;
}
}
__kernel
void
arithm_s_add_C3_D5
(
__global
float
*src1,
int
src1_step,
int
src1_offset,
...
...
@@ -583,9 +608,9 @@ __kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src
float
tmp_data_1
=
src1_data_1
+
src2_data_1
;
float
tmp_data_2
=
src1_data_2
+
src2_data_2
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
tmp_data_1
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_2
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
tmp_data_1
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_2
;
}
}
...
...
@@ -619,9 +644,9 @@ __kernel void arithm_s_add_C3_D6 (__global double *src1, int src1_step, int sr
double
tmp_data_1
=
src1_data_1
+
src2_data_1
;
double
tmp_data_2
=
src1_data_2
+
src2_data_2
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_1
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
tmp_data_2
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_1
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
tmp_data_2
;
}
}
#
endif
...
...
modules/ocl/src/opencl/arithm_add_scalar_mask.cl
浏览文件 @
bcc086ba
...
...
@@ -44,7 +44,11 @@
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
/**************************************add
with
scalar
with
mask**************************************/
...
...
@@ -60,8 +64,11 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_ste
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
...
@@ -110,8 +117,11 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_st
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
...
@@ -145,8 +155,11 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_ste
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
...
@@ -266,8 +279,11 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_ste
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
...
@@ -442,8 +458,11 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_ste
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
3
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
3
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
3
)
+
src1_offset
-
(
dst_align
*
3
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
...
@@ -500,8 +519,11 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
...
@@ -530,16 +552,16 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
data_0.xy
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_1.x
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data_1.x
:
data_1.x
;
?
tmp_data_1.x
:
data_1.x
;
data_1.y
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_1.y
:
data_1.y
;
?
tmp_data_1.y
:
data_1.y
;
data_2.xy
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_2.xy
:
data_2.xy
;
?
tmp_data_2.xy
:
data_2.xy
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
__kernel
void
arithm_s_add_with_mask_C3_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
...
...
@@ -554,8 +576,11 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
1
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
...
@@ -584,16 +609,16 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
data_0.xy
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_1.x
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data_1.x
:
data_1.x
;
?
tmp_data_1.x
:
data_1.x
;
data_1.y
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_1.y
:
data_1.y
;
?
tmp_data_1.y
:
data_1.y
;
data_2.xy
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_2.xy
:
data_2.xy
;
?
tmp_data_2.xy
:
data_2.xy
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
__kernel
void
arithm_s_add_with_mask_C3_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
...
...
@@ -633,9 +658,9 @@ __kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step,
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
__kernel
void
arithm_s_add_with_mask_C3_D5
(
__global
float
*src1,
int
src1_step,
int
src1_offset,
...
...
@@ -675,9 +700,9 @@ __kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_ste
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
...
...
@@ -719,9 +744,9 @@ __kernel void arithm_s_add_with_mask_C3_D6 (__global double *src1, int src1_st
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_1
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
data_2
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_1
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
data_2
;
}
}
#
endif
...
...
modules/ocl/src/opencl/arithm_bitwise_and.cl
浏览文件 @
bcc086ba
...
...
@@ -43,7 +43,11 @@
//
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
...
...
@@ -51,9 +55,9 @@
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_and
without
mask**************************************/
__kernel
void
arithm_bitwise_and_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
uchar
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
@@ -61,31 +65,34 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
uchar4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
if
(
src1_index
<
0
)
{
uchar4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
uchar4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
uchar4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
if
(
src1_index
<
0
)
{
uchar4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
uchar4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
tmp_data
=
src1_data
&
src2_data
;
...
...
@@ -101,9 +108,9 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
__kernel
void
arithm_bitwise_and_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
@@ -111,8 +118,11 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
...
...
@@ -120,23 +130,23 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
char4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
char4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
if
(
src1_index
<
0
)
{
char4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
char4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
char4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
char4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
if
(
src1_index
<
0
)
{
char4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
char4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
char4
dst_data
=
*
((
__global
char4
*
)(
dst
+
dst_index
))
;
char4
tmp_data
=
src1_data
&
src2_data
;
...
...
@@ -151,9 +161,9 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
__kernel
void
arithm_bitwise_and_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*src2,
int
src2_step,
int
src2_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
ushort
*src2,
int
src2_step,
int
src2_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
...
...
@@ -162,8 +172,11 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
...
@@ -171,23 +184,23 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffff8
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
ushort4
src1_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
ushort4
src2_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
if
(
src1_index
<
0
)
{
ushort4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
ushort4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
if
(
src1_index
<
0
)
{
ushort4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
ushort4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
ushort4
dst_data
=
*
((
__global
ushort4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
ushort4
tmp_data
=
src1_data
&
src2_data
;
...
...
@@ -203,9 +216,9 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
__kernel
void
arithm_bitwise_and_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*src2,
int
src2_step,
int
src2_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
short
*src2,
int
src2_step,
int
src2_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
...
...
@@ -214,8 +227,11 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
...
@@ -223,23 +239,23 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffff8
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
short4
src1_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
short4
src2_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
if
(
src1_index
<
0
)
{
short4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
short4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
if
(
src1_index
<
0
)
{
short4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
short4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
short4
dst_data
=
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
short4
tmp_data
=
src1_data
&
src2_data
;
...
...
@@ -255,9 +271,9 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
__kernel
void
arithm_bitwise_and_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*src2,
int
src2_step,
int
src2_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
int
*src2,
int
src2_step,
int
src2_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
@@ -277,9 +293,9 @@ __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1
}
__kernel
void
arithm_bitwise_and_D5
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
@@ -300,9 +316,9 @@ __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
arithm_bitwise_and_D6
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
modules/ocl/src/opencl/arithm_bitwise_and_mask.cl
浏览文件 @
bcc086ba
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
浏览文件 @
bcc086ba
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl
浏览文件 @
bcc086ba
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_bitwise_not.cl
浏览文件 @
bcc086ba
...
...
@@ -43,9 +43,12 @@
//
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_NOT////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
...
...
@@ -60,26 +63,29 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
tmp_data
=
~
src1_data
;
/*
if
(
src1_index
<
0
)
{
uchar4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
*/
/*
if
(
src1_index
<
0
)
{
uchar4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
*/
dst_data.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
dst_data.x
;
dst_data.y
=
((
dst_index
+
1
>=
dst_start
)
&&
(
dst_index
+
1
<
dst_end
))
?
tmp_data.y
:
dst_data.y
;
dst_data.z
=
((
dst_index
+
2
>=
dst_start
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.z
:
dst_data.z
;
...
...
@@ -91,8 +97,8 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
__kernel
void
arithm_bitwise_not_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
@@ -100,8 +106,11 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
...
@@ -124,8 +133,8 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
__kernel
void
arithm_bitwise_not_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
...
...
@@ -134,8 +143,11 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
...
@@ -159,8 +171,8 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
__kernel
void
arithm_bitwise_not_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
short
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
...
...
@@ -169,8 +181,11 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
...
@@ -194,8 +209,8 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
__kernel
void
arithm_bitwise_not_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
int
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
modules/ocl/src/opencl/arithm_bitwise_or.cl
浏览文件 @
bcc086ba
...
...
@@ -43,7 +43,11 @@
//
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
...
...
@@ -51,9 +55,9 @@
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_or
without
mask**************************************/
__kernel
void
arithm_bitwise_or_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
uchar
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
@@ -61,30 +65,33 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
uchar4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
if
(
src1_index
<
0
)
{
uchar4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
uchar4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
if
(
src1_index
<
0
)
{
uchar4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
uchar4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
tmp_data
=
src1_data
| src2_data;
...
...
@@ -99,9 +106,9 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
__kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
...
...
@@ -109,8 +116,11 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
...
...
@@ -135,9 +145,9 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
__kernel
void
arithm_bitwise_or_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*src2,
int
src2_step,
int
src2_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
ushort
*src2,
int
src2_step,
int
src2_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
...
...
@@ -146,8 +156,11 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
...
@@ -173,9 +186,9 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
__kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
...
...
@@ -184,8 +197,11 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...
...
@@ -211,9 +227,9 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
__kernel
void
arithm_bitwise_or_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*src2,
int
src2_step,
int
src2_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
int
*src2,
int
src2_step,
int
src2_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
@@ -233,9 +249,9 @@ __kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_
}
__kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
{
int x = get_global_id(0);
int y = get_global_id(1);
...
...
@@ -256,9 +272,9 @@ __kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
arithm_bitwise_or_D6
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
浏览文件 @
bcc086ba
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
浏览文件 @
bcc086ba
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
浏览文件 @
bcc086ba
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_bitwise_xor.cl
浏览文件 @
bcc086ba
...
...
@@ -43,17 +43,20 @@
//
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_xor
without
mask**************************************/
__kernel
void
arithm_bitwise_xor_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
uchar
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
@@ -61,8 +64,11 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
...
...
@@ -70,23 +76,23 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
uchar4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
if
(
src1_index
<
0
)
{
uchar4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
uchar4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
if
(
src1_index
<
0
)
{
uchar4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
uchar4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
tmp_data
=
src1_data
^
src2_data
;
...
...
@@ -101,9 +107,9 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
__kernel
void
arithm_bitwise_xor_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
@@ -111,8 +117,11 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
...
...
@@ -120,23 +129,23 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
char4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
char4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
if
(
src1_index
<
0
)
{
char4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
char4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
if
(
src1_index
<
0
)
{
char4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
char4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
char4
dst_data
=
*
((
__global
char4
*
)(
dst
+
dst_index
))
;
char4
tmp_data
=
src1_data
^
src2_data
;
...
...
@@ -151,9 +160,9 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
__kernel
void
arithm_bitwise_xor_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*src2,
int
src2_step,
int
src2_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
ushort
*src2,
int
src2_step,
int
src2_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
...
...
@@ -162,8 +171,11 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
...
@@ -171,23 +183,23 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffff8
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
ushort4
src1_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
ushort4
src2_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
if
(
src1_index
<
0
)
{
ushort4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
ushort4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
if
(
src1_index
<
0
)
{
ushort4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
ushort4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
ushort4
dst_data
=
*
((
__global
ushort4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
ushort4
tmp_data
=
src1_data
^
src2_data
;
...
...
@@ -203,9 +215,9 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
__kernel
void
arithm_bitwise_xor_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*src2,
int
src2_step,
int
src2_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
short
*src2,
int
src2_step,
int
src2_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
...
...
@@ -214,8 +226,11 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
{
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
...
@@ -223,25 +238,25 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffff8
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
short4
src1_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
short4
src2_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
short4
dst_data
=
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
if
(
src1_index
<
0
)
{
short4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
short4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
if
(
src1_index
<
0
)
{
short4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
if
(
src2_index
<
0
)
{
short4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
...
...
@@ -259,9 +274,9 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
__kernel
void
arithm_bitwise_xor_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*src2,
int
src2_step,
int
src2_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
int
*src2,
int
src2_step,
int
src2_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
@@ -281,9 +296,9 @@ __kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1
}
__kernel
void
arithm_bitwise_xor_D5
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
@@ -301,12 +316,11 @@ __kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src
*
((
__global
char4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
tmp
;
}
}
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
arithm_bitwise_xor_D6
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
...
...
modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
浏览文件 @
bcc086ba
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl
浏览文件 @
bcc086ba
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl
浏览文件 @
bcc086ba
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_compare_eq.cl
浏览文件 @
bcc086ba
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_compare_ne.cl
浏览文件 @
bcc086ba
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_div.cl
浏览文件 @
bcc086ba
...
...
@@ -44,7 +44,11 @@
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
typedef
double
F
;
typedef
double4
F4
;
#
define
convert_F4
convert_double4
...
...
@@ -56,34 +60,24 @@ typedef float4 F4;
#
define
convert_F
float
#
endif
uchar
round2_uchar
(
F
v
)
{
uchar
v1
=
convert_uchar_sat
(
round
(
v
))
;
//uchar
v2
=
convert_uchar_sat
(
v+
(
v>=0
?
0.5
:
-0.5
))
;
return
v1
;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
inline
uchar
round2_uchar
(
F
v
)
{
return
convert_uchar_sat
(
round
(
v
))
;
}
ushort
round2_ushort
(
F
v
)
{
ushort
v1
=
convert_ushort_sat
(
round
(
v
))
;
//ushort
v2
=
convert_ushort_sat
(
v+
(
v>=0
?
0.5
:
-0.5
))
;
return
v1
;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
inline
ushort
round2_ushort
(
F
v
)
{
return
convert_ushort_sat
(
round
(
v
))
;
}
short
round2_short
(
F
v
)
{
short
v1
=
convert_short_sat
(
round
(
v
))
;
//short
v2
=
convert_short_sat
(
v+
(
v>=0
?
0.5
:
-0.5
))
;
return
v1
;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
inline
short
round2_short
(
F
v
)
{
return
convert_short_sat
(
round
(
v
))
;
}
int
round2_int
(
F
v
)
{
int
v1
=
convert_int_sat
(
round
(
v
))
;
//int
v2
=
convert_int_sat
(
v+
(
v>=0
?
0.5
:
-0.5
))
;
return
v1
;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
inline
int
round2_int
(
F
v
)
{
return
convert_int_sat
(
round
(
v
))
;
}
///////////////////////////////////////////////////////////////////////////////////////
////////////////////////////divide///////////////////////////////////////////////////
...
...
@@ -94,39 +88,41 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1,
F
scalar
)
{
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int2
coor
=
(
int2
)(
get_global_id
(
0
)
,
get_global_id
(
1
))
;
if
(
x
<
cols
&&
y
<
rows
)
if
(
coor.x
<
cols
&&
coor.
y
<
rows
)
{
x
=
x
<<
2
;
coor.x
=
coor.x
<<
2
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int2
src_index
=
(
int2
)(
mad24
(
coor.y,
src1_step,
coor.x
+
src1_offset
-
dst_align
)
,
mad24
(
coor.y,
src2_step,
coor.x
+
src2_offset
-
dst_align
))
;
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int4
dst_args
=
(
int4
)(
mad24
(
coor.y,
dst_step,
dst_offset
)
,
mad24
(
coor.y,
dst_step,
dst_offset
+
dst_step1
)
,
mad24
(
coor.y,
dst_step,
dst_offset
+
coor.x
&
(
int
)
0xfffffffc
)
,
0
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index
)
;
uchar4
src2_data
=
vload4
(
0
,
src2
+
src2_index
)
;
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src_index.x
)
;
uchar4
src2_data
=
vload4
(
0
,
src2
+
src_index.y
)
;
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_args.z
))
;
F4
tmp
=
convert_F4
(
src1_data
)
*
scalar
;
uchar4
tmp_data
;
tmp_data.x
=
((
tmp.x
==
0
)
|
| (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x /
(F)
src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y /
(F)
src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z /
(F)
src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w /
(F)
src2_data.w);
tmp_data.x
=
((
tmp.x
==
0
)
|
| (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / src2_data.w);
dst_data.x = ((dst_
index + 0 >= dst_start) && (dst_index + 0 < dst_end
)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_
index + 1 >= dst_start) && (dst_index + 1 < dst_end
)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_
index + 2 >= dst_start) && (dst_index + 2 < dst_end
)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_
index + 3 >= dst_start) && (dst_index + 3 < dst_end
)) ? tmp_data.w : dst_data.w;
dst_data.x = ((dst_
args.z + 0 >= dst_args.x) && (dst_args.z + 0 < dst_args.y
)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_
args.z + 1 >= dst_args.x) && (dst_args.z + 1 < dst_args.y
)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_
args.z + 2 >= dst_args.x) && (dst_args.z + 2 < dst_args.y
)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_
args.z + 3 >= dst_args.x) && (dst_args.z + 3 < dst_args.y
)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_
index
)) = dst_data;
*((__global uchar4 *)(dst + dst_
args.z
)) = dst_data;
}
}
...
...
@@ -141,8 +137,11 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...
...
@@ -181,8 +180,11 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...
...
@@ -296,8 +298,11 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src_index = mad24(y, src_step, x + src_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
...
...
@@ -332,8 +337,11 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
...
...
@@ -367,8 +375,11 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset
if (x < cols && y < rows)
{
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
...
...
@@ -455,3 +466,5 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse
}
}
#endif
modules/ocl/src/opencl/arithm_flip.cl
浏览文件 @
bcc086ba
...
...
@@ -44,7 +44,11 @@
//M*/
#if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
...
...
@@ -60,8 +64,11 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of
if (x < cols && y < thread_rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
...
...
@@ -115,8 +122,11 @@ __kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_off
if (x < cols && y < thread_rows)
{
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
...
...
@@ -157,8 +167,11 @@ __kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_o
if (x < cols && y < thread_rows)
{
x = x << 2;
#define dst_align (((dst_offset >> 1) & 3) << 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset >> 1) & 3) << 1)
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
...
...
@@ -199,8 +212,11 @@ __kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_of
if (x < cols && y < thread_rows)
{
x = x << 2;
#define dst_align (((dst_offset >> 1) & 3) << 1)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset >> 1) & 3) << 1)
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
...
...
modules/ocl/src/opencl/arithm_mul.cl
浏览文件 @
bcc086ba
此差异已折叠。
点击以展开。
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录