Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
a2d27429
O
Opencv
项目概览
Greenplum
/
Opencv
11 个月 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
a2d27429
编写于
4月 05, 2013
作者:
A
Andrey Kamaev
提交者:
OpenCV Buildbot
4月 05, 2013
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #775 from bitwangyaoyao:2.4_fixerr
上级
977562b6
bcc086ba
变更
23
展开全部
隐藏空白更改
内联
并排
Showing
23 changed file
with
3082 addition
and
2186 deletion
+3082
-2186
modules/ocl/src/opencl/arithm_absdiff.cl
modules/ocl/src/opencl/arithm_absdiff.cl
+78
-44
modules/ocl/src/opencl/arithm_add.cl
modules/ocl/src/opencl/arithm_add.cl
+65
-31
modules/ocl/src/opencl/arithm_addWeighted.cl
modules/ocl/src/opencl/arithm_addWeighted.cl
+131
-109
modules/ocl/src/opencl/arithm_add_scalar.cl
modules/ocl/src/opencl/arithm_add_scalar.cl
+60
-35
modules/ocl/src/opencl/arithm_add_scalar_mask.cl
modules/ocl/src/opencl/arithm_add_scalar_mask.cl
+60
-35
modules/ocl/src/opencl/arithm_bitwise_and.cl
modules/ocl/src/opencl/arithm_bitwise_and.cl
+107
-91
modules/ocl/src/opencl/arithm_bitwise_and_mask.cl
modules/ocl/src/opencl/arithm_bitwise_and_mask.cl
+246
-189
modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
+193
-134
modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl
modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl
+217
-157
modules/ocl/src/opencl/arithm_bitwise_not.cl
modules/ocl/src/opencl/arithm_bitwise_not.cl
+40
-25
modules/ocl/src/opencl/arithm_bitwise_or.cl
modules/ocl/src/opencl/arithm_bitwise_or.cl
+59
-43
modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
+245
-188
modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
+189
-129
modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
+218
-158
modules/ocl/src/opencl/arithm_bitwise_xor.cl
modules/ocl/src/opencl/arithm_bitwise_xor.cl
+101
-87
modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
+245
-188
modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl
modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl
+194
-135
modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl
modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl
+217
-157
modules/ocl/src/opencl/arithm_compare_eq.cl
modules/ocl/src/opencl/arithm_compare_eq.cl
+150
-89
modules/ocl/src/opencl/arithm_compare_ne.cl
modules/ocl/src/opencl/arithm_compare_ne.cl
+146
-85
modules/ocl/src/opencl/arithm_div.cl
modules/ocl/src/opencl/arithm_div.cl
+69
-56
modules/ocl/src/opencl/arithm_flip.cl
modules/ocl/src/opencl/arithm_flip.cl
+24
-8
modules/ocl/src/opencl/arithm_mul.cl
modules/ocl/src/opencl/arithm_mul.cl
+28
-13
未找到文件。
modules/ocl/src/opencl/arithm_absdiff.cl
浏览文件 @
a2d27429
...
@@ -44,7 +44,11 @@
...
@@ -44,7 +44,11 @@
//M*/
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
#
endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
...
@@ -62,7 +66,10 @@ __kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_o
...
@@ -62,7 +66,10 @@ __kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_o
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
...
@@ -110,8 +117,11 @@ __kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_
...
@@ -110,8 +117,11 @@ __kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
@@ -144,8 +154,11 @@ __kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_o
...
@@ -144,8 +154,11 @@ __kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_o
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
@@ -248,8 +261,11 @@ __kernel void arithm_s_absdiff_C1_D0 (__global uchar *src1, int src1_step, int
...
@@ -248,8 +261,11 @@ __kernel void arithm_s_absdiff_C1_D0 (__global uchar *src1, int src1_step, int
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
@@ -287,8 +303,11 @@ __kernel void arithm_s_absdiff_C1_D2 (__global ushort *src1, int src1_step, in
...
@@ -287,8 +303,11 @@ __kernel void arithm_s_absdiff_C1_D2 (__global ushort *src1, int src1_step, in
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
@@ -318,8 +337,11 @@ __kernel void arithm_s_absdiff_C1_D3 (__global short *src1, int src1_step, int
...
@@ -318,8 +337,11 @@ __kernel void arithm_s_absdiff_C1_D3 (__global short *src1, int src1_step, int
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
@@ -387,8 +409,8 @@ __kernel void arithm_s_absdiff_C1_D5 (__global float *src1, int src1_step, int
...
@@ -387,8 +409,8 @@ __kernel void arithm_s_absdiff_C1_D5 (__global float *src1, int src1_step, int
#
if
defined
(
DOUBLE_SUPPORT
)
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
arithm_s_absdiff_C1_D6
(
__global
double
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_s_absdiff_C1_D6
(
__global
double
*src1,
int
src1_step,
int
src1_offset,
__global
double
*dst,
int
dst_step,
int
dst_offset,
__global
double
*dst,
int
dst_step,
int
dst_offset,
double4
src2,
int
rows,
int
cols,
int
dst_step1
)
double4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
...
@@ -421,8 +443,11 @@ __kernel void arithm_s_absdiff_C2_D0 (__global uchar *src1, int src1_step, int
...
@@ -421,8 +443,11 @@ __kernel void arithm_s_absdiff_C2_D0 (__global uchar *src1, int src1_step, int
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
@@ -465,7 +490,7 @@ __kernel void arithm_s_absdiff_C2_D2 (__global ushort *src1, int src1_step, in
...
@@ -465,7 +490,7 @@ __kernel void arithm_s_absdiff_C2_D2 (__global ushort *src1, int src1_step, in
}
}
__kernel
void
arithm_s_absdiff_C2_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_s_absdiff_C2_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
int4
src2,
int
rows,
int
cols,
int
dst_step1
)
int4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
...
@@ -509,7 +534,7 @@ __kernel void arithm_s_absdiff_C2_D4 (__global int *src1, int src1_step, int s
...
@@ -509,7 +534,7 @@ __kernel void arithm_s_absdiff_C2_D4 (__global int *src1, int src1_step, int s
}
}
__kernel
void
arithm_s_absdiff_C2_D5
(
__global
float
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_s_absdiff_C2_D5
(
__global
float
*src1,
int
src1_step,
int
src1_offset,
__global
float
*dst,
int
dst_step,
int
dst_offset,
__global
float
*dst,
int
dst_step,
int
dst_offset,
float4
src2,
int
rows,
int
cols,
int
dst_step1
)
float4
src2,
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
...
@@ -563,8 +588,11 @@ __kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int
...
@@ -563,8 +588,11 @@ __kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
3
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
3
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
3
)
+
src1_offset
-
(
dst_align
*
3
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
3
)
+
src1_offset
-
(
dst_align
*
3
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
@@ -617,8 +645,11 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in
...
@@ -617,8 +645,11 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
@@ -644,16 +675,16 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in
...
@@ -644,16 +675,16 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in
data_0.xy
=
((
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_0.xy
=
((
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_1.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
data_1.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data_1.x
:
data_1.x
;
?
tmp_data_1.x
:
data_1.x
;
data_1.y
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
data_1.y
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_1.y
:
data_1.y
;
?
tmp_data_1.y
:
data_1.y
;
data_2.xy
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
data_2.xy
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_2.xy
:
data_2.xy
;
?
tmp_data_2.xy
:
data_2.xy
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
}
}
__kernel
void
arithm_s_absdiff_C3_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_s_absdiff_C3_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
...
@@ -667,8 +698,11 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int
...
@@ -667,8 +698,11 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
@@ -694,16 +728,16 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int
...
@@ -694,16 +728,16 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int
data_0.xy
=
((
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_0.xy
=
((
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_1.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
data_1.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data_1.x
:
data_1.x
;
?
tmp_data_1.x
:
data_1.x
;
data_1.y
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
data_1.y
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_1.y
:
data_1.y
;
?
tmp_data_1.y
:
data_1.y
;
data_2.xy
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
data_2.xy
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_2.xy
:
data_2.xy
;
?
tmp_data_2.xy
:
data_2.xy
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
}
}
__kernel
void
arithm_s_absdiff_C3_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_s_absdiff_C3_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
...
@@ -735,9 +769,9 @@ __kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int s
...
@@ -735,9 +769,9 @@ __kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int s
int
tmp_data_1
=
convert_int_sat
(
abs_diff
(
src1_data_1,
src2_data_1
))
;
int
tmp_data_1
=
convert_int_sat
(
abs_diff
(
src1_data_1,
src2_data_1
))
;
int
tmp_data_2
=
convert_int_sat
(
abs_diff
(
src1_data_2,
src2_data_2
))
;
int
tmp_data_2
=
convert_int_sat
(
abs_diff
(
src1_data_2,
src2_data_2
))
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
tmp_data_1
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
tmp_data_1
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_2
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_2
;
}
}
}
}
__kernel
void
arithm_s_absdiff_C3_D5
(
__global
float
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_s_absdiff_C3_D5
(
__global
float
*src1,
int
src1_step,
int
src1_offset,
...
@@ -769,9 +803,9 @@ __kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int
...
@@ -769,9 +803,9 @@ __kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int
float
tmp_data_1
=
fabs
(
src1_data_1
-
src2_data_1
)
;
float
tmp_data_1
=
fabs
(
src1_data_1
-
src2_data_1
)
;
float
tmp_data_2
=
fabs
(
src1_data_2
-
src2_data_2
)
;
float
tmp_data_2
=
fabs
(
src1_data_2
-
src2_data_2
)
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
tmp_data_1
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
tmp_data_1
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_2
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_2
;
}
}
}
}
...
@@ -805,9 +839,9 @@ __kernel void arithm_s_absdiff_C3_D6 (__global double *src1, int src1_step, in
...
@@ -805,9 +839,9 @@ __kernel void arithm_s_absdiff_C3_D6 (__global double *src1, int src1_step, in
double
tmp_data_1
=
fabs
(
src1_data_1
-
src2_data_1
)
;
double
tmp_data_1
=
fabs
(
src1_data_1
-
src2_data_1
)
;
double
tmp_data_2
=
fabs
(
src1_data_2
-
src2_data_2
)
;
double
tmp_data_2
=
fabs
(
src1_data_2
-
src2_data_2
)
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_1
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_1
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
tmp_data_2
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
tmp_data_2
;
}
}
}
}
#
endif
#
endif
...
...
modules/ocl/src/opencl/arithm_add.cl
浏览文件 @
a2d27429
...
@@ -45,7 +45,11 @@
...
@@ -45,7 +45,11 @@
//M*/
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
#
endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
...
@@ -64,7 +68,10 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
...
@@ -64,7 +68,10 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
...
@@ -112,7 +119,10 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
...
@@ -112,7 +119,10 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
@@ -147,7 +157,10 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
...
@@ -147,7 +157,10 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
@@ -252,7 +265,10 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
...
@@ -252,7 +265,10 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
@@ -311,7 +327,10 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
...
@@ -311,7 +327,10 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
@@ -348,7 +367,10 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
...
@@ -348,7 +367,10 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
@@ -477,7 +499,10 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
...
@@ -477,7 +499,10 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
@@ -664,7 +689,10 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
...
@@ -664,7 +689,10 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
3
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
3
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
3
)
+
src1_offset
-
(
dst_align
*
3
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
3
)
+
src1_offset
-
(
dst_align
*
3
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
*
3
)
+
src2_offset
-
(
dst_align
*
3
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
*
3
)
+
src2_offset
-
(
dst_align
*
3
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
@@ -724,7 +752,10 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
...
@@ -724,7 +752,10 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
*
6
)
+
src2_offset
-
(
dst_align
*
6
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
*
6
)
+
src2_offset
-
(
dst_align
*
6
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
@@ -754,16 +785,16 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
...
@@ -754,16 +785,16 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
data_0.xy
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_0.xy
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_1.x
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
data_1.x
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data_1.x
:
data_1.x
;
?
tmp_data_1.x
:
data_1.x
;
data_1.y
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
data_1.y
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_1.y
:
data_1.y
;
?
tmp_data_1.y
:
data_1.y
;
data_2.xy
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
data_2.xy
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_2.xy
:
data_2.xy
;
?
tmp_data_2.xy
:
data_2.xy
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
}
}
__kernel
void
arithm_add_with_mask_C3_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_add_with_mask_C3_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
...
@@ -780,7 +811,10 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
...
@@ -780,7 +811,10 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
*
6
)
+
src2_offset
-
(
dst_align
*
6
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
*
6
)
+
src2_offset
-
(
dst_align
*
6
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
@@ -810,16 +844,16 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
...
@@ -810,16 +844,16 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
data_0.xy
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_0.xy
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_1.x
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
data_1.x
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data_1.x
:
data_1.x
;
?
tmp_data_1.x
:
data_1.x
;
data_1.y
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
data_1.y
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_1.y
:
data_1.y
;
?
tmp_data_1.y
:
data_1.y
;
data_2.xy
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
data_2.xy
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_2.xy
:
data_2.xy
;
?
tmp_data_2.xy
:
data_2.xy
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
}
}
__kernel
void
arithm_add_with_mask_C3_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_add_with_mask_C3_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
...
@@ -861,9 +895,9 @@ __kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, i
...
@@ -861,9 +895,9 @@ __kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, i
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
}
}
__kernel
void
arithm_add_with_mask_C3_D5
(
__global
float
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_add_with_mask_C3_D5
(
__global
float
*src1,
int
src1_step,
int
src1_offset,
...
@@ -905,9 +939,9 @@ __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, i
...
@@ -905,9 +939,9 @@ __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, i
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
}
}
...
@@ -951,9 +985,9 @@ __kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step,
...
@@ -951,9 +985,9 @@ __kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step,
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_1
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_1
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
data_2
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
data_2
;
}
}
}
}
#
endif
#
endif
...
...
modules/ocl/src/opencl/arithm_addWeighted.cl
浏览文件 @
a2d27429
...
@@ -42,8 +42,12 @@
...
@@ -42,8 +42,12 @@
//
the
use
of
this
software,
even
if
advised
of
the
possibility
of
such
damage.
//
the
use
of
this
software,
even
if
advised
of
the
possibility
of
such
damage.
//
//
//M*/
//M*/
#
if
defined
DOUBLE_SUPPORT
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
typedef
double
F
;
typedef
double
F
;
#
else
#
else
typedef
float
F
;
typedef
float
F
;
...
@@ -52,10 +56,10 @@ typedef float F;
...
@@ -52,10 +56,10 @@ typedef float F;
/////////////////////////////////////////////addWeighted//////////////////////////////////////////////
/////////////////////////////////////////////addWeighted//////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel
void
addWeighted_D0
(
__global
uchar
*src1,int
src1_step,int
src1_offset,
__kernel
void
addWeighted_D0
(
__global
uchar
*src1,int
src1_step,int
src1_offset,
__global
uchar
*src2,
int
src2_step,int
src2_offset,
__global
uchar
*src2,
int
src2_step,int
src2_offset,
F
alpha,F
beta,F
gama,
F
alpha,F
beta,F
gama,
__global
uchar
*dst,
int
dst_step,int
dst_offset,
__global
uchar
*dst,
int
dst_step,int
dst_offset,
int
rows,
int
cols,int
dst_step1
)
int
rows,
int
cols,int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
;
...
@@ -65,7 +69,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
...
@@ -65,7 +69,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
...
@@ -87,7 +94,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
...
@@ -87,7 +94,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
//
short4
tmp
=
convert_short4_sat
(
src1_data
)
*
alpha
+
convert_short4_sat
(
src2_data
)
*
beta
+
gama
;
//
short4
tmp
=
convert_short4_sat
(
src1_data
)
*
alpha
+
convert_short4_sat
(
src2_data
)
*
beta
+
gama
;
short4
tmp
;
short4
tmp
;
tmp.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
tmp.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
...
@@ -100,7 +107,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
...
@@ -100,7 +107,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
dst_data.w
=
((
dst_index
+
3
>=
dst_start
)
&&
(
dst_index
+
3
<
dst_end
))
?
tmp_data.w
:
dst_data.w
;
dst_data.w
=
((
dst_index
+
3
>=
dst_start
)
&&
(
dst_index
+
3
<
dst_end
))
?
tmp_data.w
:
dst_data.w
;
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
=
dst_data
;
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
=
dst_data
;
//
dst[x
+
y
*
dst_step]
=
src1[x
+
y
*
src1_step]
*
alpha
+
src2[x
+
y
*
src2_step]
*
beta
+
gama
;
//
dst[x
+
y
*
dst_step]
=
src1[x
+
y
*
src1_step]
*
alpha
+
src2[x
+
y
*
src2_step]
*
beta
+
gama
;
}
}
}
}
...
@@ -108,10 +115,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
...
@@ -108,10 +115,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
__kernel
void
addWeighted_D2
(
__global
ushort
*src1,
int
src1_step,int
src1_offset,
__kernel
void
addWeighted_D2
(
__global
ushort
*src1,
int
src1_step,int
src1_offset,
__global
ushort
*src2,
int
src2_step,int
src2_offset,
__global
ushort
*src2,
int
src2_step,int
src2_offset,
F
alpha,F
beta,F
gama,
F
alpha,F
beta,F
gama,
__global
ushort
*dst,
int
dst_step,int
dst_offset,
__global
ushort
*dst,
int
dst_step,int
dst_offset,
int
rows,
int
cols,int
dst_step1
)
int
rows,
int
cols,int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
;
...
@@ -121,35 +128,38 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
...
@@ -121,35 +128,38 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x<<
1
)
&
(
int
)
0xfffffff8
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x<<
1
)
&
(
int
)
0xfffffff8
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
ushort4
src1_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
ushort4
src1_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
ushort4
src2_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
ushort4
src2_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
if
(
src1_index
<
0
)
if
(
src1_index
<
0
)
{
{
ushort4
tmp
;
ushort4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
}
if
(
src2_index
<
0
)
if
(
src2_index
<
0
)
{
{
ushort4
tmp
;
ushort4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
}
ushort4
dst_data
=
*
((
__global
ushort4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
ushort4
dst_data
=
*
((
__global
ushort4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
//
int4
tmp
=
convert_int4_sat
(
src1_data
)
*
alpha
+
convert_int4_sat
(
src2_data
)
*
beta
+
gama
;
//
int4
tmp
=
convert_int4_sat
(
src1_data
)
*
alpha
+
convert_int4_sat
(
src2_data
)
*
beta
+
gama
;
int4
tmp
;
int4
tmp
;
tmp.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
tmp.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
...
@@ -181,8 +191,11 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
...
@@ -181,8 +191,11 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
@@ -190,26 +203,26 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
...
@@ -190,26 +203,26 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x<<
1
)
-
(
dst_align
<<
1
))
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x<<
1
)
-
(
dst_align
<<
1
))
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
short4
src1_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
short4
src1_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
short4
src2_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
short4
src2_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
if
(
src1_index
<
0
)
if
(
src1_index
<
0
)
{
{
short4
tmp
;
short4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
}
if
(
src2_index
<
0
)
if
(
src2_index
<
0
)
{
{
short4
tmp
;
short4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
}
short4
dst_data
=
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
short4
dst_data
=
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
//
int4
tmp
=
convert_int4_sat
(
src1_data
)
*
alpha
+
convert_int4_sat
(
src2_data
)
*
beta
+
gama
;
//
int4
tmp
=
convert_int4_sat
(
src1_data
)
*
alpha
+
convert_int4_sat
(
src2_data
)
*
beta
+
gama
;
int4
tmp
;
int4
tmp
;
tmp.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
tmp.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
...
@@ -228,7 +241,7 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
...
@@ -228,7 +241,7 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
__kernel
void
addWeighted_D4
(
__global
int
*src1,
int
src1_step,int
src1_offset,
__kernel
void
addWeighted_D4
(
__global
int
*src1,
int
src1_step,int
src1_offset,
__global
int
*src2,
int
src2_step,int
src2_offset,
__global
int
*src2,
int
src2_step,int
src2_offset,
F
alpha,F
beta,
F
gama,
F
alpha,F
beta,
F
gama,
__global
int
*dst,
int
dst_step,int
dst_offset,
__global
int
*dst,
int
dst_step,int
dst_offset,
int
rows,
int
cols,int
dst_step1
)
int
rows,
int
cols,int
dst_step1
)
{
{
...
@@ -241,9 +254,12 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
...
@@ -241,9 +254,12 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
bitOfInt
(
sizeof
(
int
)
==
4
?
2:
3
)
#
define
bitOfInt
(
sizeof
(
int
)
==
4
?
2:
3
)
#
define
dst_align
((
dst_offset
>>
bitOfInt
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
bitOfInt
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
bitOfInt
)
+
src1_offset
-
(
dst_align
<<
bitOfInt
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
bitOfInt
)
+
src1_offset
-
(
dst_align
<<
bitOfInt
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
bitOfInt
)
+
src2_offset
-
(
dst_align
<<
bitOfInt
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
bitOfInt
)
+
src2_offset
-
(
dst_align
<<
bitOfInt
))
;
...
@@ -252,26 +268,26 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
...
@@ -252,26 +268,26 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
bitOfInt
)
-
(
dst_align
<<
bitOfInt
))
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
bitOfInt
)
-
(
dst_align
<<
bitOfInt
))
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int4
src1_data
=
vload4
(
0
,
(
__global
int
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
int4
src1_data
=
vload4
(
0
,
(
__global
int
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
int4
src2_data
=
vload4
(
0
,
(
__global
int
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
int4
src2_data
=
vload4
(
0
,
(
__global
int
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
if
(
src1_index
<
0
)
if
(
src1_index
<
0
)
{
{
int4
tmp
;
int4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
}
if
(
src2_index
<
0
)
if
(
src2_index
<
0
)
{
{
int4
tmp
;
int4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
}
int4
dst_data
=
*
((
__global
int4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
int4
dst_data
=
*
((
__global
int4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
//
double4
tmp
=
convert_double4
(
src1_data
)
*
alpha
+
convert_double4
(
src2_data
)
*
beta
+
gama
;
//
double4
tmp
=
convert_double4
(
src1_data
)
*
alpha
+
convert_double4
(
src2_data
)
*
beta
+
gama
;
float4
tmp
;
float4
tmp
;
tmp.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
tmp.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
...
@@ -291,7 +307,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
...
@@ -291,7 +307,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
__kernel
void
addWeighted_D5
(
__global
float
*src1,int
src1_step,int
src1_offset,
__kernel
void
addWeighted_D5
(
__global
float
*src1,int
src1_step,int
src1_offset,
__global
float
*src2,
int
src2_step,int
src2_offset,
__global
float
*src2,
int
src2_step,int
src2_offset,
F
alpha,F
beta,
F
gama,
F
alpha,F
beta,
F
gama,
__global
float
*dst,
int
dst_step,int
dst_offset,
__global
float
*dst,
int
dst_step,int
dst_offset,
int
rows,
int
cols,int
dst_step1
)
int
rows,
int
cols,int
dst_step1
)
{
{
...
@@ -303,8 +319,11 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
...
@@ -303,8 +319,11 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
2
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
2
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
-
(
dst_align
<<
2
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
2
)
+
src1_offset
-
(
dst_align
<<
2
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
2
)
+
src2_offset
-
(
dst_align
<<
2
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
2
)
+
src2_offset
-
(
dst_align
<<
2
))
;
...
@@ -313,32 +332,32 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
...
@@ -313,32 +332,32 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
2
)
-
(
dst_align
<<
2
))
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
2
)
-
(
dst_align
<<
2
))
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
float4
src1_data
=
vload4
(
0
,
(
__global
float
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
float4
src1_data
=
vload4
(
0
,
(
__global
float
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
float4
src2_data
=
vload4
(
0
,
(
__global
float
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
float4
src2_data
=
vload4
(
0
,
(
__global
float
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
float4
dst_data
=
*
((
__global
float4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
float4
dst_data
=
*
((
__global
float4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
if
(
src1_index
<
0
)
if
(
src1_index
<
0
)
{
{
float4
tmp
;
float4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
}
if
(
src2_index
<
0
)
if
(
src2_index
<
0
)
{
{
float4
tmp
;
float4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
}
//
double4
tmp
=
convert_double4
(
src1_data
)
*
alpha
+
convert_double4
(
src2_data
)
*
beta
+
gama
;
//
double4
tmp
=
convert_double4
(
src1_data
)
*
alpha
+
convert_double4
(
src2_data
)
*
beta
+
gama
;
//
float4
tmp_data
=
(
src1_data
)
*
alpha
+
(
src2_data
)
*
beta
+
gama
;
//
float4
tmp_data
=
(
src1_data
)
*
alpha
+
(
src2_data
)
*
beta
+
gama
;
float4
tmp_data
;
float4
tmp_data
;
tmp_data.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp_data.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp_data.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp_data.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp_data.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
tmp_data.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
tmp_data.w
=
src1_data.w
*
alpha
+
src2_data.w
*
beta
+
gama
;
tmp_data.w
=
src1_data.w
*
alpha
+
src2_data.w
*
beta
+
gama
;
//
float4
tmp_data
=
convert_float4
(
tmp
)
;
//
float4
tmp_data
=
convert_float4
(
tmp
)
;
dst_data.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
dst_data.x
;
dst_data.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
dst_data.x
;
dst_data.y
=
((
dst_index
+
4
>=
dst_start
)
&&
(
dst_index
+
4
<
dst_end
))
?
tmp_data.y
:
dst_data.y
;
dst_data.y
=
((
dst_index
+
4
>=
dst_start
)
&&
(
dst_index
+
4
<
dst_end
))
?
tmp_data.y
:
dst_data.y
;
...
@@ -353,7 +372,7 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
...
@@ -353,7 +372,7 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
#
if
defined
(
DOUBLE_SUPPORT
)
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
addWeighted_D6
(
__global
double
*src1,
int
src1_step,int
src1_offset,
__kernel
void
addWeighted_D6
(
__global
double
*src1,
int
src1_step,int
src1_offset,
__global
double
*src2,
int
src2_step,int
src2_offset,
__global
double
*src2,
int
src2_step,int
src2_offset,
F
alpha,F
beta,
F
gama,
F
alpha,F
beta,
F
gama,
__global
double
*dst,
int
dst_step,int
dst_offset,
__global
double
*dst,
int
dst_step,int
dst_offset,
int
rows,
int
cols,int
dst_step1
)
int
rows,
int
cols,int
dst_step1
)
{
{
...
@@ -365,8 +384,11 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
...
@@ -365,8 +384,11 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
3
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
3
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
-
(
dst_align
<<
3
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
3
)
+
src1_offset
-
(
dst_align
<<
3
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
3
)
+
src2_offset
-
(
dst_align
<<
3
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
3
)
+
src2_offset
-
(
dst_align
<<
3
))
;
...
@@ -375,25 +397,25 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
...
@@ -375,25 +397,25 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
3
)
-
(
dst_align
<<
3
))
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
3
)
-
(
dst_align
<<
3
))
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
double4
src1_data
=
vload4
(
0
,
(
__global
double
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
double4
src1_data
=
vload4
(
0
,
(
__global
double
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
double4
src2_data
=
vload4
(
0
,
(
__global
double
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
double4
src2_data
=
vload4
(
0
,
(
__global
double
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
double4
dst_data
=
*
((
__global
double4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
double4
dst_data
=
*
((
__global
double4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
if
(
src1_index
<
0
)
if
(
src1_index
<
0
)
{
{
double4
tmp
;
double4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
}
if
(
src2_index
<
0
)
if
(
src2_index
<
0
)
{
{
double4
tmp
;
double4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
}
//
double4
tmp_data
=
(
src1_data
)
*
alpha
+
(
src2_data
)
*
beta
+
gama
;
//
double4
tmp_data
=
(
src1_data
)
*
alpha
+
(
src2_data
)
*
beta
+
gama
;
double4
tmp_data
;
double4
tmp_data
;
tmp_data.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp_data.x
=
src1_data.x
*
alpha
+
src2_data.x
*
beta
+
gama
;
tmp_data.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp_data.y
=
src1_data.y
*
alpha
+
src2_data.y
*
beta
+
gama
;
tmp_data.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
tmp_data.z
=
src1_data.z
*
alpha
+
src2_data.z
*
beta
+
gama
;
...
...
modules/ocl/src/opencl/arithm_add_scalar.cl
浏览文件 @
a2d27429
...
@@ -44,9 +44,13 @@
...
@@ -44,9 +44,13 @@
//M*/
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
#
endif
/**************************************add
with
scalar
without
mask**************************************/
/**************************************add
with
scalar
without
mask**************************************/
__kernel
void
arithm_s_add_C1_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_s_add_C1_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
...
@@ -58,8 +62,11 @@ __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src
...
@@ -58,8 +62,11 @@ __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
@@ -98,8 +105,11 @@ __kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int sr
...
@@ -98,8 +105,11 @@ __kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
@@ -130,8 +140,11 @@ __kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src
...
@@ -130,8 +140,11 @@ __kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
@@ -232,8 +245,11 @@ __kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src
...
@@ -232,8 +245,11 @@ __kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
@@ -377,8 +393,11 @@ __kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src
...
@@ -377,8 +393,11 @@ __kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
3
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
3
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
3
)
+
src1_offset
-
(
dst_align
*
3
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
3
)
+
src1_offset
-
(
dst_align
*
3
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
@@ -431,8 +450,11 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
...
@@ -431,8 +450,11 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
@@ -458,16 +480,16 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
...
@@ -458,16 +480,16 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
data_0.xy
=
((
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_0.xy
=
((
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_1.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
data_1.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data_1.x
:
data_1.x
;
?
tmp_data_1.x
:
data_1.x
;
data_1.y
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
data_1.y
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_1.y
:
data_1.y
;
?
tmp_data_1.y
:
data_1.y
;
data_2.xy
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
data_2.xy
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_2.xy
:
data_2.xy
;
?
tmp_data_2.xy
:
data_2.xy
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
}
}
__kernel
void
arithm_s_add_C3_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_s_add_C3_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
...
@@ -481,8 +503,11 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src
...
@@ -481,8 +503,11 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
@@ -508,16 +533,16 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src
...
@@ -508,16 +533,16 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src
data_0.xy
=
((
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_0.xy
=
((
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_1.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
data_1.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data_1.x
:
data_1.x
;
?
tmp_data_1.x
:
data_1.x
;
data_1.y
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
data_1.y
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_1.y
:
data_1.y
;
?
tmp_data_1.y
:
data_1.y
;
data_2.xy
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
data_2.xy
=
((
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_2.xy
:
data_2.xy
;
?
tmp_data_2.xy
:
data_2.xy
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
}
}
__kernel
void
arithm_s_add_C3_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_s_add_C3_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
...
@@ -549,9 +574,9 @@ __kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_
...
@@ -549,9 +574,9 @@ __kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_
int
tmp_data_1
=
convert_int_sat
((
long
)
src1_data_1
+
(
long
)
src2_data_1
)
;
int
tmp_data_1
=
convert_int_sat
((
long
)
src1_data_1
+
(
long
)
src2_data_1
)
;
int
tmp_data_2
=
convert_int_sat
((
long
)
src1_data_2
+
(
long
)
src2_data_2
)
;
int
tmp_data_2
=
convert_int_sat
((
long
)
src1_data_2
+
(
long
)
src2_data_2
)
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
tmp_data_1
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
tmp_data_1
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_2
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_2
;
}
}
}
}
__kernel
void
arithm_s_add_C3_D5
(
__global
float
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_s_add_C3_D5
(
__global
float
*src1,
int
src1_step,
int
src1_offset,
...
@@ -583,9 +608,9 @@ __kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src
...
@@ -583,9 +608,9 @@ __kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src
float
tmp_data_1
=
src1_data_1
+
src2_data_1
;
float
tmp_data_1
=
src1_data_1
+
src2_data_1
;
float
tmp_data_2
=
src1_data_2
+
src2_data_2
;
float
tmp_data_2
=
src1_data_2
+
src2_data_2
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
tmp_data_1
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
tmp_data_1
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_2
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_2
;
}
}
}
}
...
@@ -619,9 +644,9 @@ __kernel void arithm_s_add_C3_D6 (__global double *src1, int src1_step, int sr
...
@@ -619,9 +644,9 @@ __kernel void arithm_s_add_C3_D6 (__global double *src1, int src1_step, int sr
double
tmp_data_1
=
src1_data_1
+
src2_data_1
;
double
tmp_data_1
=
src1_data_1
+
src2_data_1
;
double
tmp_data_2
=
src1_data_2
+
src2_data_2
;
double
tmp_data_2
=
src1_data_2
+
src2_data_2
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
tmp_data_0
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_1
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
tmp_data_1
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
tmp_data_2
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
tmp_data_2
;
}
}
}
}
#
endif
#
endif
...
...
modules/ocl/src/opencl/arithm_add_scalar_mask.cl
浏览文件 @
a2d27429
...
@@ -44,7 +44,11 @@
...
@@ -44,7 +44,11 @@
//M*/
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
#
endif
/**************************************add
with
scalar
with
mask**************************************/
/**************************************add
with
scalar
with
mask**************************************/
...
@@ -60,8 +64,11 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_ste
...
@@ -60,8 +64,11 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_ste
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
@@ -110,8 +117,11 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_st
...
@@ -110,8 +117,11 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_st
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
@@ -145,8 +155,11 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_ste
...
@@ -145,8 +155,11 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_ste
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
@@ -266,8 +279,11 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_ste
...
@@ -266,8 +279,11 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_ste
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
@@ -442,8 +458,11 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_ste
...
@@ -442,8 +458,11 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_ste
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
3
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
3
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
3
)
+
src1_offset
-
(
dst_align
*
3
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
3
)
+
src1_offset
-
(
dst_align
*
3
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
@@ -500,8 +519,11 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
...
@@ -500,8 +519,11 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
@@ -530,16 +552,16 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
...
@@ -530,16 +552,16 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
data_0.xy
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_0.xy
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_1.x
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
data_1.x
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data_1.x
:
data_1.x
;
?
tmp_data_1.x
:
data_1.x
;
data_1.y
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
data_1.y
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_1.y
:
data_1.y
;
?
tmp_data_1.y
:
data_1.y
;
data_2.xy
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
data_2.xy
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_2.xy
:
data_2.xy
;
?
tmp_data_2.xy
:
data_2.xy
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
ushort2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
}
}
__kernel
void
arithm_s_add_with_mask_C3_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_s_add_with_mask_C3_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
...
@@ -554,8 +576,11 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
...
@@ -554,8 +576,11 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
1
;
x
=
x
<<
1
;
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
%
dst_step
)
/
6
)
&
1
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
*
6
)
+
src1_offset
-
(
dst_align
*
6
))
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
int
mask_index
=
mad24
(
y,
mask_step,
x
+
mask_offset
-
dst_align
)
;
...
@@ -584,16 +609,16 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
...
@@ -584,16 +609,16 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
data_0.xy
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_0.xy
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
))
?
tmp_data_0.xy
:
data_0.xy
;
data_1.x
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
data_1.x
=
((
mask_data.x
)
&&
(
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data_1.x
:
data_1.x
;
?
tmp_data_1.x
:
data_1.x
;
data_1.y
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
data_1.y
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_1.y
:
data_1.y
;
?
tmp_data_1.y
:
data_1.y
;
data_2.xy
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
data_2.xy
=
((
mask_data.y
)
&&
(
dst_index
+
6
>=
dst_start
)
&&
(
dst_index
+
6
<
dst_end
))
?
tmp_data_2.xy
:
data_2.xy
;
?
tmp_data_2.xy
:
data_2.xy
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
short2
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
}
}
__kernel
void
arithm_s_add_with_mask_C3_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_s_add_with_mask_C3_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
...
@@ -633,9 +658,9 @@ __kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step,
...
@@ -633,9 +658,9 @@ __kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step,
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
int
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
}
}
__kernel
void
arithm_s_add_with_mask_C3_D5
(
__global
float
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_s_add_with_mask_C3_D5
(
__global
float
*src1,
int
src1_step,
int
src1_offset,
...
@@ -675,9 +700,9 @@ __kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_ste
...
@@ -675,9 +700,9 @@ __kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_ste
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
4
))
=
data_1
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
*
((
__global
float
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_2
;
}
}
}
}
...
@@ -719,9 +744,9 @@ __kernel void arithm_s_add_with_mask_C3_D6 (__global double *src1, int src1_st
...
@@ -719,9 +744,9 @@ __kernel void arithm_s_add_with_mask_C3_D6 (__global double *src1, int src1_st
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_1
=
mask_data
?
tmp_data_1
:
data_1
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
data_2
=
mask_data
?
tmp_data_2
:
data_2
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
0
))
=
data_0
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_1
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
8
))
=
data_1
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
data_2
;
*
((
__global
double
*
)((
__global
char
*
)
dst
+
dst_index
+
16
))
=
data_2
;
}
}
}
}
#
endif
#
endif
...
...
modules/ocl/src/opencl/arithm_bitwise_and.cl
浏览文件 @
a2d27429
...
@@ -43,7 +43,11 @@
...
@@ -43,7 +43,11 @@
//
//
//M*/
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
#
endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
...
@@ -51,9 +55,9 @@
...
@@ -51,9 +55,9 @@
///////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_and
without
mask**************************************/
/**************************************bitwise_and
without
mask**************************************/
__kernel
void
arithm_bitwise_and_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_and_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
;
...
@@ -61,31 +65,34 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
...
@@ -61,31 +65,34 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
uchar4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
uchar4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
if
(
src1_index
<
0
)
if
(
src1_index
<
0
)
{
{
uchar4
tmp
;
uchar4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
}
if
(
src2_index
<
0
)
if
(
src2_index
<
0
)
{
{
uchar4
tmp
;
uchar4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
}
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
tmp_data
=
src1_data
&
src2_data
;
uchar4
tmp_data
=
src1_data
&
src2_data
;
...
@@ -101,9 +108,9 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
...
@@ -101,9 +108,9 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
__kernel
void
arithm_bitwise_and_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_and_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
;
...
@@ -111,8 +118,11 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
...
@@ -111,8 +118,11 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
...
@@ -120,23 +130,23 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
...
@@ -120,23 +130,23 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
char4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
char4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
char4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
char4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
if
(
src1_index
<
0
)
if
(
src1_index
<
0
)
{
{
char4
tmp
;
char4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
}
if
(
src2_index
<
0
)
if
(
src2_index
<
0
)
{
{
char4
tmp
;
char4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
}
char4
dst_data
=
*
((
__global
char4
*
)(
dst
+
dst_index
))
;
char4
dst_data
=
*
((
__global
char4
*
)(
dst
+
dst_index
))
;
char4
tmp_data
=
src1_data
&
src2_data
;
char4
tmp_data
=
src1_data
&
src2_data
;
...
@@ -151,9 +161,9 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
...
@@ -151,9 +161,9 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
__kernel
void
arithm_bitwise_and_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_and_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*src2,
int
src2_step,
int
src2_offset,
__global
ushort
*src2,
int
src2_step,
int
src2_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
...
@@ -162,8 +172,11 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
...
@@ -162,8 +172,11 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
@@ -171,23 +184,23 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
...
@@ -171,23 +184,23 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffff8
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffff8
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
ushort4
src1_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
ushort4
src1_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
ushort4
src2_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
ushort4
src2_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
if
(
src1_index
<
0
)
if
(
src1_index
<
0
)
{
{
ushort4
tmp
;
ushort4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
}
if
(
src2_index
<
0
)
if
(
src2_index
<
0
)
{
{
ushort4
tmp
;
ushort4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
}
ushort4
dst_data
=
*
((
__global
ushort4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
ushort4
dst_data
=
*
((
__global
ushort4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
ushort4
tmp_data
=
src1_data
&
src2_data
;
ushort4
tmp_data
=
src1_data
&
src2_data
;
...
@@ -203,9 +216,9 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
...
@@ -203,9 +216,9 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
__kernel
void
arithm_bitwise_and_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_and_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*src2,
int
src2_step,
int
src2_offset,
__global
short
*src2,
int
src2_step,
int
src2_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
...
@@ -214,8 +227,11 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
...
@@ -214,8 +227,11 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
@@ -223,23 +239,23 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
...
@@ -223,23 +239,23 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffff8
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffff8
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
short4
src1_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
short4
src1_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
short4
src2_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
short4
src2_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
if
(
src1_index
<
0
)
if
(
src1_index
<
0
)
{
{
short4
tmp
;
short4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
}
if
(
src2_index
<
0
)
if
(
src2_index
<
0
)
{
{
short4
tmp
;
short4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
}
short4
dst_data
=
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
short4
dst_data
=
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
short4
tmp_data
=
src1_data
&
src2_data
;
short4
tmp_data
=
src1_data
&
src2_data
;
...
@@ -255,9 +271,9 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
...
@@ -255,9 +271,9 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
__kernel
void
arithm_bitwise_and_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_and_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*src2,
int
src2_step,
int
src2_offset,
__global
int
*src2,
int
src2_step,
int
src2_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
;
...
@@ -277,9 +293,9 @@ __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1
...
@@ -277,9 +293,9 @@ __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1
}
}
__kernel
void
arithm_bitwise_and_D5
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_and_D5
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
;
...
@@ -300,9 +316,9 @@ __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src
...
@@ -300,9 +316,9 @@ __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src
#
if
defined
(
DOUBLE_SUPPORT
)
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
arithm_bitwise_and_D6
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_and_D6
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
;
...
...
modules/ocl/src/opencl/arithm_bitwise_and_mask.cl
浏览文件 @
a2d27429
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
浏览文件 @
a2d27429
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl
浏览文件 @
a2d27429
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_bitwise_not.cl
浏览文件 @
a2d27429
...
@@ -43,9 +43,12 @@
...
@@ -43,9 +43,12 @@
//
//
//M*/
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
#
endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_NOT////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_NOT////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
...
@@ -60,26 +63,29 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
...
@@ -60,26 +63,29 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
tmp_data
=
~
src1_data
;
uchar4
tmp_data
=
~
src1_data
;
/*
if
(
src1_index
<
0
)
/*
if
(
src1_index
<
0
)
{
{
uchar4
tmp
;
uchar4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
}
*/
*/
dst_data.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
dst_data.x
;
dst_data.x
=
((
dst_index
+
0
>=
dst_start
)
&&
(
dst_index
+
0
<
dst_end
))
?
tmp_data.x
:
dst_data.x
;
dst_data.y
=
((
dst_index
+
1
>=
dst_start
)
&&
(
dst_index
+
1
<
dst_end
))
?
tmp_data.y
:
dst_data.y
;
dst_data.y
=
((
dst_index
+
1
>=
dst_start
)
&&
(
dst_index
+
1
<
dst_end
))
?
tmp_data.y
:
dst_data.y
;
dst_data.z
=
((
dst_index
+
2
>=
dst_start
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.z
:
dst_data.z
;
dst_data.z
=
((
dst_index
+
2
>=
dst_start
)
&&
(
dst_index
+
2
<
dst_end
))
?
tmp_data.z
:
dst_data.z
;
...
@@ -91,8 +97,8 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
...
@@ -91,8 +97,8 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
__kernel
void
arithm_bitwise_not_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_not_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
;
...
@@ -100,8 +106,11 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
...
@@ -100,8 +106,11 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
@@ -124,8 +133,8 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
...
@@ -124,8 +133,8 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
__kernel
void
arithm_bitwise_not_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_not_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
...
@@ -134,8 +143,11 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
...
@@ -134,8 +143,11 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
@@ -159,8 +171,8 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
...
@@ -159,8 +171,8 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
__kernel
void
arithm_bitwise_not_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_not_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
...
@@ -169,8 +181,11 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
...
@@ -169,8 +181,11 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
...
@@ -194,8 +209,8 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
...
@@ -194,8 +209,8 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
__kernel
void
arithm_bitwise_not_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_not_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
;
...
...
modules/ocl/src/opencl/arithm_bitwise_or.cl
浏览文件 @
a2d27429
...
@@ -43,7 +43,11 @@
...
@@ -43,7 +43,11 @@
//
//
//M*/
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
#
endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
...
@@ -51,9 +55,9 @@
...
@@ -51,9 +55,9 @@
///////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_or
without
mask**************************************/
/**************************************bitwise_or
without
mask**************************************/
__kernel
void
arithm_bitwise_or_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_or_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
;
...
@@ -61,30 +65,33 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
...
@@ -61,30 +65,33 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
uchar4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
uchar4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
if
(
src1_index
<
0
)
if
(
src1_index
<
0
)
{
{
uchar4
tmp
;
uchar4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
}
if
(
src2_index
<
0
)
if
(
src2_index
<
0
)
{
{
uchar4
tmp
;
uchar4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
}
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
tmp_data
=
src1_data
| src2_data;
uchar4
tmp_data
=
src1_data
| src2_data;
...
@@ -99,9 +106,9 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
...
@@ -99,9 +106,9 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
__kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1_offset,
__kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
int rows, int cols, int dst_step1)
{
{
int x = get_global_id(0);
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
...
@@ -109,8 +116,11 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
...
@@ -109,8 +116,11 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
if (x < cols && y < rows)
if (x < cols && y < rows)
{
{
x = x << 2;
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
...
@@ -135,9 +145,9 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
...
@@ -135,9 +145,9 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
__kernel
void
arithm_bitwise_or_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_or_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*src2,
int
src2_step,
int
src2_offset,
__global
ushort
*src2,
int
src2_step,
int
src2_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
...
@@ -146,8 +156,11 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
...
@@ -146,8 +156,11 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
@@ -173,9 +186,9 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
...
@@ -173,9 +186,9 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
__kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src1_offset,
__kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset,
__global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
int rows, int cols, int dst_step1)
{
{
int x = get_global_id(0);
int x = get_global_id(0);
...
@@ -184,8 +197,11 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
...
@@ -184,8 +197,11 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
if (x < cols && y < rows)
if (x < cols && y < rows)
{
{
x = x << 2;
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...
@@ -211,9 +227,9 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
...
@@ -211,9 +227,9 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
__kernel
void
arithm_bitwise_or_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_or_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*src2,
int
src2_step,
int
src2_offset,
__global
int
*src2,
int
src2_step,
int
src2_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
;
...
@@ -233,9 +249,9 @@ __kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_
...
@@ -233,9 +249,9 @@ __kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_
}
}
__kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1_offset,
__kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset,
__global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1)
int rows, int cols, int dst_step1)
{
{
int x = get_global_id(0);
int x = get_global_id(0);
int y = get_global_id(1);
int y = get_global_id(1);
...
@@ -256,9 +272,9 @@ __kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1
...
@@ -256,9 +272,9 @@ __kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1
#
if
defined
(
DOUBLE_SUPPORT
)
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
arithm_bitwise_or_D6
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_or_D6
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
;
...
...
modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
浏览文件 @
a2d27429
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
浏览文件 @
a2d27429
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
浏览文件 @
a2d27429
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_bitwise_xor.cl
浏览文件 @
a2d27429
...
@@ -43,17 +43,20 @@
...
@@ -43,17 +43,20 @@
//
//
//M*/
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
#
endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_xor
without
mask**************************************/
/**************************************bitwise_xor
without
mask**************************************/
__kernel
void
arithm_bitwise_xor_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_xor_D0
(
__global
uchar
*src1,
int
src1_step,
int
src1_offset,
__global
uchar
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*src2,
int
src2_step,
int
src2_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
;
...
@@ -61,8 +64,11 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
...
@@ -61,8 +64,11 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
...
@@ -70,23 +76,23 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
...
@@ -70,23 +76,23 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
uchar4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
uchar4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
if
(
src1_index
<
0
)
if
(
src1_index
<
0
)
{
{
uchar4
tmp
;
uchar4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
}
if
(
src2_index
<
0
)
if
(
src2_index
<
0
)
{
{
uchar4
tmp
;
uchar4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
}
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
uchar4
tmp_data
=
src1_data
^
src2_data
;
uchar4
tmp_data
=
src1_data
^
src2_data
;
...
@@ -101,9 +107,9 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
...
@@ -101,9 +107,9 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
__kernel
void
arithm_bitwise_xor_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_xor_D1
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
;
...
@@ -111,8 +117,11 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
...
@@ -111,8 +117,11 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
...
@@ -120,23 +129,23 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
...
@@ -120,23 +129,23 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
char4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
char4
src1_data
=
vload4
(
0
,
src1
+
src1_index_fix
)
;
char4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
char4
src2_data
=
vload4
(
0
,
src2
+
src2_index_fix
)
;
if
(
src1_index
<
0
)
if
(
src1_index
<
0
)
{
{
char4
tmp
;
char4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
}
if
(
src2_index
<
0
)
if
(
src2_index
<
0
)
{
{
char4
tmp
;
char4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
}
char4
dst_data
=
*
((
__global
char4
*
)(
dst
+
dst_index
))
;
char4
dst_data
=
*
((
__global
char4
*
)(
dst
+
dst_index
))
;
char4
tmp_data
=
src1_data
^
src2_data
;
char4
tmp_data
=
src1_data
^
src2_data
;
...
@@ -151,9 +160,9 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
...
@@ -151,9 +160,9 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
__kernel
void
arithm_bitwise_xor_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_xor_D2
(
__global
ushort
*src1,
int
src1_step,
int
src1_offset,
__global
ushort
*src2,
int
src2_step,
int
src2_offset,
__global
ushort
*src2,
int
src2_step,
int
src2_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
__global
ushort
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
...
@@ -162,8 +171,11 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
...
@@ -162,8 +171,11 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
@@ -171,23 +183,23 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
...
@@ -171,23 +183,23 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffff8
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffff8
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
ushort4
src1_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
ushort4
src1_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
ushort4
src2_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
ushort4
src2_data
=
vload4
(
0
,
(
__global
ushort
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
if
(
src1_index
<
0
)
if
(
src1_index
<
0
)
{
{
ushort4
tmp
;
ushort4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
}
if
(
src2_index
<
0
)
if
(
src2_index
<
0
)
{
{
ushort4
tmp
;
ushort4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
}
ushort4
dst_data
=
*
((
__global
ushort4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
ushort4
dst_data
=
*
((
__global
ushort4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
ushort4
tmp_data
=
src1_data
^
src2_data
;
ushort4
tmp_data
=
src1_data
^
src2_data
;
...
@@ -203,9 +215,9 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
...
@@ -203,9 +215,9 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
__kernel
void
arithm_bitwise_xor_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_xor_D3
(
__global
short
*src1,
int
src1_step,
int
src1_offset,
__global
short
*src2,
int
src2_step,
int
src2_offset,
__global
short
*src2,
int
src2_step,
int
src2_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
__global
short
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
...
@@ -214,8 +226,11 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
...
@@ -214,8 +226,11 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
if
(
x
<
cols
&&
y
<
rows
)
if
(
x
<
cols
&&
y
<
rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
((
dst_offset
>>
1
)
&
3
)
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src1_index
=
mad24
(
y,
src1_step,
(
x
<<
1
)
+
src1_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
int
src2_index
=
mad24
(
y,
src2_step,
(
x
<<
1
)
+
src2_offset
-
(
dst_align
<<
1
))
;
...
@@ -223,25 +238,25 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
...
@@ -223,25 +238,25 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffff8
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
(
x
<<
1
)
&
(
int
)
0xfffffff8
)
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src1_index_fix
=
src1_index
<
0
?
0
:
src1_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
int
src2_index_fix
=
src2_index
<
0
?
0
:
src2_index
;
short4
src1_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
short4
src1_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src1
+
src1_index_fix
))
;
short4
src2_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
short4
src2_data
=
vload4
(
0
,
(
__global
short
*
)((
__global
char
*
)
src2
+
src2_index_fix
))
;
short4
dst_data
=
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
short4
dst_data
=
*
((
__global
short4
*
)((
__global
char
*
)
dst
+
dst_index
))
;
if
(
src1_index
<
0
)
if
(
src1_index
<
0
)
{
{
short4
tmp
;
short4
tmp
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
tmp.xyzw
=
(
src1_index
==
-2
)
?
src1_data.zwxy:src1_data.yzwx
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
src1_data.xyzw
=
(
src1_index
==
-1
)
?
src1_data.wxyz:tmp.xyzw
;
}
}
if
(
src2_index
<
0
)
if
(
src2_index
<
0
)
{
{
short4
tmp
;
short4
tmp
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
tmp.xyzw
=
(
src2_index
==
-2
)
?
src2_data.zwxy:src2_data.yzwx
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
src2_data.xyzw
=
(
src2_index
==
-1
)
?
src2_data.wxyz:tmp.xyzw
;
}
}
...
@@ -259,9 +274,9 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
...
@@ -259,9 +274,9 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
__kernel
void
arithm_bitwise_xor_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_xor_D4
(
__global
int
*src1,
int
src1_step,
int
src1_offset,
__global
int
*src2,
int
src2_step,
int
src2_offset,
__global
int
*src2,
int
src2_step,
int
src2_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
__global
int
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
;
...
@@ -281,9 +296,9 @@ __kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1
...
@@ -281,9 +296,9 @@ __kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1
}
}
__kernel
void
arithm_bitwise_xor_D5
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_xor_D5
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
;
...
@@ -301,12 +316,11 @@ __kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src
...
@@ -301,12 +316,11 @@ __kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src
*
((
__global
char4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
tmp
;
*
((
__global
char4
*
)((
__global
char
*
)
dst
+
dst_index
))
=
tmp
;
}
}
}
}
#
if
defined
(
DOUBLE_SUPPORT
)
#
if
defined
(
DOUBLE_SUPPORT
)
__kernel
void
arithm_bitwise_xor_D6
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__kernel
void
arithm_bitwise_xor_D6
(
__global
char
*src1,
int
src1_step,
int
src1_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*src2,
int
src2_step,
int
src2_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
__global
char
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1
)
int
rows,
int
cols,
int
dst_step1
)
{
{
int
x
=
get_global_id
(
0
)
;
int
x
=
get_global_id
(
0
)
;
int
y
=
get_global_id
(
1
)
;
int
y
=
get_global_id
(
1
)
;
...
...
modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
浏览文件 @
a2d27429
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl
浏览文件 @
a2d27429
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl
浏览文件 @
a2d27429
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_compare_eq.cl
浏览文件 @
a2d27429
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_compare_ne.cl
浏览文件 @
a2d27429
此差异已折叠。
点击以展开。
modules/ocl/src/opencl/arithm_div.cl
浏览文件 @
a2d27429
...
@@ -44,7 +44,11 @@
...
@@ -44,7 +44,11 @@
//M*/
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
typedef
double
F
;
typedef
double
F
;
typedef
double4
F4
;
typedef
double4
F4
;
#
define
convert_F4
convert_double4
#
define
convert_F4
convert_double4
...
@@ -56,34 +60,24 @@ typedef float4 F4;
...
@@ -56,34 +60,24 @@ typedef float4 F4;
#
define
convert_F
float
#
define
convert_F
float
#
endif
#
endif
uchar
round2_uchar
(
F
v
)
{
inline
uchar
round2_uchar
(
F
v
)
{
uchar
v1
=
convert_uchar_sat
(
round
(
v
))
;
return
convert_uchar_sat
(
round
(
v
))
;
//uchar
v2
=
convert_uchar_sat
(
v+
(
v>=0
?
0.5
:
-0.5
))
;
return
v1
;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
}
}
ushort
round2_ushort
(
F
v
)
{
inline
ushort
round2_ushort
(
F
v
)
{
ushort
v1
=
convert_ushort_sat
(
round
(
v
))
;
return
convert_ushort_sat
(
round
(
v
))
;
//ushort
v2
=
convert_ushort_sat
(
v+
(
v>=0
?
0.5
:
-0.5
))
;
return
v1
;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
}
}
short
round2_short
(
F
v
)
{
short
v1
=
convert_short_sat
(
round
(
v
))
;
//short
v2
=
convert_short_sat
(
v+
(
v>=0
?
0.5
:
-0.5
))
;
return
v1
;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
inline
short
round2_short
(
F
v
)
{
return
convert_short_sat
(
round
(
v
))
;
}
}
int
round2_int
(
F
v
)
{
int
v1
=
convert_int_sat
(
round
(
v
))
;
//int
v2
=
convert_int_sat
(
v+
(
v>=0
?
0.5
:
-0.5
))
;
return
v1
;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
inline
int
round2_int
(
F
v
)
{
return
convert_int_sat
(
round
(
v
))
;
}
}
///////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////
////////////////////////////divide///////////////////////////////////////////////////
////////////////////////////divide///////////////////////////////////////////////////
...
@@ -94,39 +88,41 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse
...
@@ -94,39 +88,41 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
__global
uchar
*dst,
int
dst_step,
int
dst_offset,
int
rows,
int
cols,
int
dst_step1,
F
scalar
)
int
rows,
int
cols,
int
dst_step1,
F
scalar
)
{
{
int
x
=
get_global_id
(
0
)
;
int2
coor
=
(
int2
)(
get_global_id
(
0
)
,
get_global_id
(
1
))
;
int
y
=
get_global_id
(
1
)
;
if
(
x
<
cols
&&
y
<
rows
)
if
(
coor.x
<
cols
&&
coor.
y
<
rows
)
{
{
x
=
x
<<
2
;
coor.x
=
coor.x
<<
2
;
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int2
src_index
=
(
int2
)(
mad24
(
coor.y,
src1_step,
coor.x
+
src1_offset
-
dst_align
)
,
mad24
(
coor.y,
src2_step,
coor.x
+
src2_offset
-
dst_align
))
;
#
define
dst_align
(
dst_offset
&
3
)
int4
dst_args
=
(
int4
)(
mad24
(
coor.y,
dst_step,
dst_offset
)
,
int
src1_index
=
mad24
(
y,
src1_step,
x
+
src1_offset
-
dst_align
)
;
mad24
(
coor.y,
dst_step,
dst_offset
+
dst_step1
)
,
int
src2_index
=
mad24
(
y,
src2_step,
x
+
src2_offset
-
dst_align
)
;
mad24
(
coor.y,
dst_step,
dst_offset
+
coor.x
&
(
int
)
0xfffffffc
)
,
0
)
;
int
dst_start
=
mad24
(
y,
dst_step,
dst_offset
)
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src_index.x
)
;
int
dst_end
=
mad24
(
y,
dst_step,
dst_offset
+
dst_step1
)
;
uchar4
src2_data
=
vload4
(
0
,
src2
+
src_index.y
)
;
int
dst_index
=
mad24
(
y,
dst_step,
dst_offset
+
x
&
(
int
)
0xfffffffc
)
;
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_args.z
))
;
uchar4
src1_data
=
vload4
(
0
,
src1
+
src1_index
)
;
uchar4
src2_data
=
vload4
(
0
,
src2
+
src2_index
)
;
uchar4
dst_data
=
*
((
__global
uchar4
*
)(
dst
+
dst_index
))
;
F4
tmp
=
convert_F4
(
src1_data
)
*
scalar
;
F4
tmp
=
convert_F4
(
src1_data
)
*
scalar
;
uchar4
tmp_data
;
uchar4
tmp_data
;
tmp_data.x
=
((
tmp.x
==
0
)
|
| (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x /
(F)
src2_data.x);
tmp_data.x
=
((
tmp.x
==
0
)
|
| (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y /
(F)
src2_data.y);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z /
(F)
src2_data.z);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w /
(F)
src2_data.w);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / src2_data.w);
dst_data.x = ((dst_
index + 0 >= dst_start) && (dst_index + 0 < dst_end
)) ? tmp_data.x : dst_data.x;
dst_data.x = ((dst_
args.z + 0 >= dst_args.x) && (dst_args.z + 0 < dst_args.y
)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_
index + 1 >= dst_start) && (dst_index + 1 < dst_end
)) ? tmp_data.y : dst_data.y;
dst_data.y = ((dst_
args.z + 1 >= dst_args.x) && (dst_args.z + 1 < dst_args.y
)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_
index + 2 >= dst_start) && (dst_index + 2 < dst_end
)) ? tmp_data.z : dst_data.z;
dst_data.z = ((dst_
args.z + 2 >= dst_args.x) && (dst_args.z + 2 < dst_args.y
)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_
index + 3 >= dst_start) && (dst_index + 3 < dst_end
)) ? tmp_data.w : dst_data.w;
dst_data.w = ((dst_
args.z + 3 >= dst_args.x) && (dst_args.z + 3 < dst_args.y
)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_
index
)) = dst_data;
*((__global uchar4 *)(dst + dst_
args.z
)) = dst_data;
}
}
}
}
...
@@ -141,8 +137,11 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs
...
@@ -141,8 +137,11 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs
if (x < cols && y < rows)
if (x < cols && y < rows)
{
{
x = x << 2;
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...
@@ -181,8 +180,11 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse
...
@@ -181,8 +180,11 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse
if (x < cols && y < rows)
if (x < cols && y < rows)
{
{
x = x << 2;
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...
@@ -296,8 +298,11 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset
...
@@ -296,8 +298,11 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset
if (x < cols && y < rows)
if (x < cols && y < rows)
{
{
x = x << 2;
x = x << 2;
#define dst_align (dst_offset & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src_index = mad24(y, src_step, x + src_offset - dst_align);
int src_index = mad24(y, src_step, x + src_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset);
int dst_start = mad24(y, dst_step, dst_offset);
...
@@ -332,8 +337,11 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse
...
@@ -332,8 +337,11 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse
if (x < cols && y < rows)
if (x < cols && y < rows)
{
{
x = x << 2;
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_start = mad24(y, dst_step, dst_offset);
...
@@ -367,8 +375,11 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset
...
@@ -367,8 +375,11 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset
if (x < cols && y < rows)
if (x < cols && y < rows)
{
{
x = x << 2;
x = x << 2;
#define dst_align ((dst_offset >> 1) & 3)
#ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset);
int dst_start = mad24(y, dst_step, dst_offset);
...
@@ -455,3 +466,5 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse
...
@@ -455,3 +466,5 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse
}
}
}
}
#
endif
#
endif
modules/ocl/src/opencl/arithm_flip.cl
浏览文件 @
a2d27429
...
@@ -44,7 +44,11 @@
...
@@ -44,7 +44,11 @@
//M*/
//M*/
#
if
defined
(
DOUBLE_SUPPORT
)
#
if
defined
(
DOUBLE_SUPPORT
)
#
ifdef
cl_khr_fp64
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
pragma
OPENCL
EXTENSION
cl_khr_fp64:enable
#
elif
defined
(
cl_amd_fp64
)
#
pragma
OPENCL
EXTENSION
cl_amd_fp64:enable
#
endif
#
endif
#
endif
//////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////////////////
...
@@ -60,8 +64,11 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of
...
@@ -60,8 +64,11 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of
if
(
x
<
cols
&&
y
<
thread_rows
)
if
(
x
<
cols
&&
y
<
thread_rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src_index_0
=
mad24
(
y,
src_step,
x
+
src_offset
-
dst_align
)
;
int
src_index_0
=
mad24
(
y,
src_step,
x
+
src_offset
-
dst_align
)
;
int
src_index_1
=
mad24
(
rows
-
y
-
1
,
src_step,
x
+
src_offset
-
dst_align
)
;
int
src_index_1
=
mad24
(
rows
-
y
-
1
,
src_step,
x
+
src_offset
-
dst_align
)
;
...
@@ -115,8 +122,11 @@ __kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_off
...
@@ -115,8 +122,11 @@ __kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_off
if
(
x
<
cols
&&
y
<
thread_rows
)
if
(
x
<
cols
&&
y
<
thread_rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(
dst_offset
&
3
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(
dst_offset
&
3
)
int
src_index_0
=
mad24
(
y,
src_step,
x
+
src_offset
-
dst_align
)
;
int
src_index_0
=
mad24
(
y,
src_step,
x
+
src_offset
-
dst_align
)
;
int
src_index_1
=
mad24
(
rows
-
y
-
1
,
src_step,
x
+
src_offset
-
dst_align
)
;
int
src_index_1
=
mad24
(
rows
-
y
-
1
,
src_step,
x
+
src_offset
-
dst_align
)
;
...
@@ -157,8 +167,11 @@ __kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_o
...
@@ -157,8 +167,11 @@ __kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_o
if
(
x
<
cols
&&
y
<
thread_rows
)
if
(
x
<
cols
&&
y
<
thread_rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(((
dst_offset
>>
1
)
&
3
)
<<
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
>>
1
)
&
3
)
<<
1
)
int
src_index_0
=
mad24
(
y,
src_step,
(
x
<<
1
)
+
src_offset
-
dst_align
)
;
int
src_index_0
=
mad24
(
y,
src_step,
(
x
<<
1
)
+
src_offset
-
dst_align
)
;
int
src_index_1
=
mad24
(
rows
-
y
-
1
,
src_step,
(
x
<<
1
)
+
src_offset
-
dst_align
)
;
int
src_index_1
=
mad24
(
rows
-
y
-
1
,
src_step,
(
x
<<
1
)
+
src_offset
-
dst_align
)
;
...
@@ -199,8 +212,11 @@ __kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_of
...
@@ -199,8 +212,11 @@ __kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_of
if
(
x
<
cols
&&
y
<
thread_rows
)
if
(
x
<
cols
&&
y
<
thread_rows
)
{
{
x
=
x
<<
2
;
x
=
x
<<
2
;
#
define
dst_align
(((
dst_offset
>>
1
)
&
3
)
<<
1
)
#
ifdef
dst_align
#
undef
dst_align
#
endif
#
define
dst_align
(((
dst_offset
>>
1
)
&
3
)
<<
1
)
int
src_index_0
=
mad24
(
y,
src_step,
(
x
<<
1
)
+
src_offset
-
dst_align
)
;
int
src_index_0
=
mad24
(
y,
src_step,
(
x
<<
1
)
+
src_offset
-
dst_align
)
;
int
src_index_1
=
mad24
(
rows
-
y
-
1
,
src_step,
(
x
<<
1
)
+
src_offset
-
dst_align
)
;
int
src_index_1
=
mad24
(
rows
-
y
-
1
,
src_step,
(
x
<<
1
)
+
src_offset
-
dst_align
)
;
...
...
modules/ocl/src/opencl/arithm_mul.cl
浏览文件 @
a2d27429
此差异已折叠。
点击以展开。
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录