提交 a2d27429 编写于 作者: A Andrey Kamaev 提交者: OpenCV Buildbot

Merge pull request #775 from bitwangyaoyao:2.4_fixerr

...@@ -44,7 +44,11 @@ ...@@ -44,7 +44,11 @@
//M*/ //M*/
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif #endif
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
...@@ -62,7 +66,10 @@ __kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_o ...@@ -62,7 +66,10 @@ __kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_o
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3) #define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
...@@ -110,8 +117,11 @@ __kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_ ...@@ -110,8 +117,11 @@ __kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...@@ -144,8 +154,11 @@ __kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_o ...@@ -144,8 +154,11 @@ __kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_o
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...@@ -248,8 +261,11 @@ __kernel void arithm_s_absdiff_C1_D0 (__global uchar *src1, int src1_step, int ...@@ -248,8 +261,11 @@ __kernel void arithm_s_absdiff_C1_D0 (__global uchar *src1, int src1_step, int
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -287,8 +303,11 @@ __kernel void arithm_s_absdiff_C1_D2 (__global ushort *src1, int src1_step, in ...@@ -287,8 +303,11 @@ __kernel void arithm_s_absdiff_C1_D2 (__global ushort *src1, int src1_step, in
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 1; x = x << 1;
#define dst_align ((dst_offset >> 1) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -318,8 +337,11 @@ __kernel void arithm_s_absdiff_C1_D3 (__global short *src1, int src1_step, int ...@@ -318,8 +337,11 @@ __kernel void arithm_s_absdiff_C1_D3 (__global short *src1, int src1_step, int
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 1; x = x << 1;
#define dst_align ((dst_offset >> 1) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -387,8 +409,8 @@ __kernel void arithm_s_absdiff_C1_D5 (__global float *src1, int src1_step, int ...@@ -387,8 +409,8 @@ __kernel void arithm_s_absdiff_C1_D5 (__global float *src1, int src1_step, int
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
__kernel void arithm_s_absdiff_C1_D6 (__global double *src1, int src1_step, int src1_offset, __kernel void arithm_s_absdiff_C1_D6 (__global double *src1, int src1_step, int src1_offset,
__global double *dst, int dst_step, int dst_offset, __global double *dst, int dst_step, int dst_offset,
double4 src2, int rows, int cols, int dst_step1) double4 src2, int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
...@@ -421,8 +443,11 @@ __kernel void arithm_s_absdiff_C2_D0 (__global uchar *src1, int src1_step, int ...@@ -421,8 +443,11 @@ __kernel void arithm_s_absdiff_C2_D0 (__global uchar *src1, int src1_step, int
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 1; x = x << 1;
#define dst_align ((dst_offset >> 1) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -465,7 +490,7 @@ __kernel void arithm_s_absdiff_C2_D2 (__global ushort *src1, int src1_step, in ...@@ -465,7 +490,7 @@ __kernel void arithm_s_absdiff_C2_D2 (__global ushort *src1, int src1_step, in
} }
__kernel void arithm_s_absdiff_C2_D3 (__global short *src1, int src1_step, int src1_offset, __kernel void arithm_s_absdiff_C2_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset, __global short *dst, int dst_step, int dst_offset,
int4 src2, int rows, int cols, int dst_step1) int4 src2, int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
...@@ -509,7 +534,7 @@ __kernel void arithm_s_absdiff_C2_D4 (__global int *src1, int src1_step, int s ...@@ -509,7 +534,7 @@ __kernel void arithm_s_absdiff_C2_D4 (__global int *src1, int src1_step, int s
} }
__kernel void arithm_s_absdiff_C2_D5 (__global float *src1, int src1_step, int src1_offset, __kernel void arithm_s_absdiff_C2_D5 (__global float *src1, int src1_step, int src1_offset,
__global float *dst, int dst_step, int dst_offset, __global float *dst, int dst_step, int dst_offset,
float4 src2, int rows, int cols, int dst_step1) float4 src2, int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
...@@ -563,8 +588,11 @@ __kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int ...@@ -563,8 +588,11 @@ __kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -617,8 +645,11 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in ...@@ -617,8 +645,11 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 1; x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -644,16 +675,16 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in ...@@ -644,16 +675,16 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x; ? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y; ? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy; ? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
} }
} }
__kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int src1_offset, __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int src1_offset,
...@@ -667,8 +698,11 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int ...@@ -667,8 +698,11 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 1; x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -694,16 +728,16 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int ...@@ -694,16 +728,16 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x; ? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y; ? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy; ? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
} }
} }
__kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int src1_offset, __kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int src1_offset,
...@@ -735,9 +769,9 @@ __kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int s ...@@ -735,9 +769,9 @@ __kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int s
int tmp_data_1 = convert_int_sat(abs_diff(src1_data_1, src2_data_1)); int tmp_data_1 = convert_int_sat(abs_diff(src1_data_1, src2_data_1));
int tmp_data_2 = convert_int_sat(abs_diff(src1_data_2, src2_data_2)); int tmp_data_2 = convert_int_sat(abs_diff(src1_data_2, src2_data_2));
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
} }
} }
__kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int src1_offset, __kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int src1_offset,
...@@ -769,9 +803,9 @@ __kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int ...@@ -769,9 +803,9 @@ __kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int
float tmp_data_1 = fabs(src1_data_1 - src2_data_1); float tmp_data_1 = fabs(src1_data_1 - src2_data_1);
float tmp_data_2 = fabs(src1_data_2 - src2_data_2); float tmp_data_2 = fabs(src1_data_2 - src2_data_2);
*((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0; *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1; *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2; *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
} }
} }
...@@ -805,9 +839,9 @@ __kernel void arithm_s_absdiff_C3_D6 (__global double *src1, int src1_step, in ...@@ -805,9 +839,9 @@ __kernel void arithm_s_absdiff_C3_D6 (__global double *src1, int src1_step, in
double tmp_data_1 = fabs(src1_data_1 - src2_data_1); double tmp_data_1 = fabs(src1_data_1 - src2_data_1);
double tmp_data_2 = fabs(src1_data_2 - src2_data_2); double tmp_data_2 = fabs(src1_data_2 - src2_data_2);
*((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2; *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
} }
} }
#endif #endif
......
...@@ -45,7 +45,11 @@ ...@@ -45,7 +45,11 @@
//M*/ //M*/
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif #endif
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
...@@ -64,7 +68,10 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse ...@@ -64,7 +68,10 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
...@@ -112,7 +119,10 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs ...@@ -112,7 +119,10 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...@@ -147,7 +157,10 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse ...@@ -147,7 +157,10 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...@@ -252,7 +265,10 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i ...@@ -252,7 +265,10 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
...@@ -311,7 +327,10 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step, ...@@ -311,7 +327,10 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
{ {
x = x << 1; x = x << 1;
#define dst_align ((dst_offset >> 1) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
...@@ -348,7 +367,10 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i ...@@ -348,7 +367,10 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
{ {
x = x << 1; x = x << 1;
#define dst_align ((dst_offset >> 1) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
...@@ -477,7 +499,10 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i ...@@ -477,7 +499,10 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
{ {
x = x << 1; x = x << 1;
#define dst_align ((dst_offset >> 1) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
...@@ -664,7 +689,10 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i ...@@ -664,7 +689,10 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
{ {
x = x << 2; x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
...@@ -724,7 +752,10 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step, ...@@ -724,7 +752,10 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
{ {
x = x << 1; x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
...@@ -754,16 +785,16 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step, ...@@ -754,16 +785,16 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x; ? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y; ? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy; ? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
} }
} }
__kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
...@@ -780,7 +811,10 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i ...@@ -780,7 +811,10 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
{ {
x = x << 1; x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
...@@ -810,16 +844,16 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i ...@@ -810,16 +844,16 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x; ? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y; ? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy; ? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
} }
} }
__kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, __kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
...@@ -861,9 +895,9 @@ __kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, i ...@@ -861,9 +895,9 @@ __kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, i
data_1 = mask_data ? tmp_data_1 : data_1; data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2; data_2 = mask_data ? tmp_data_2 : data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0; *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1; *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2; *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
} }
} }
__kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset, __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
...@@ -905,9 +939,9 @@ __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, i ...@@ -905,9 +939,9 @@ __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, i
data_1 = mask_data ? tmp_data_1 : data_1; data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2; data_2 = mask_data ? tmp_data_2 : data_2;
*((__global float *)((__global char *)dst + dst_index + 0))= data_0; *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= data_1; *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= data_2; *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
} }
} }
...@@ -951,9 +985,9 @@ __kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step, ...@@ -951,9 +985,9 @@ __kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step,
data_1 = mask_data ? tmp_data_1 : data_1; data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2; data_2 = mask_data ? tmp_data_2 : data_2;
*((__global double *)((__global char *)dst + dst_index + 0 ))= data_0; *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= data_1; *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= data_2; *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
} }
} }
#endif #endif
......
...@@ -42,8 +42,12 @@ ...@@ -42,8 +42,12 @@
// the use of this software, even if advised of the possibility of such damage. // the use of this software, even if advised of the possibility of such damage.
// //
//M*/ //M*/
#if defined DOUBLE_SUPPORT #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
typedef double F; typedef double F;
#else #else
typedef float F; typedef float F;
...@@ -52,10 +56,10 @@ typedef float F; ...@@ -52,10 +56,10 @@ typedef float F;
/////////////////////////////////////////////addWeighted////////////////////////////////////////////// /////////////////////////////////////////////addWeighted//////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
__kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset, __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset,
__global uchar *src2, int src2_step,int src2_offset, __global uchar *src2, int src2_step,int src2_offset,
F alpha,F beta,F gama, F alpha,F beta,F gama,
__global uchar *dst, int dst_step,int dst_offset, __global uchar *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1) int rows, int cols,int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
...@@ -65,7 +69,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset ...@@ -65,7 +69,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
...@@ -87,7 +94,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset ...@@ -87,7 +94,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
// short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama; // short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
short4 tmp; short4 tmp;
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama; tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama; tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama; tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
...@@ -100,7 +107,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset ...@@ -100,7 +107,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data; *((__global uchar4 *)(dst + dst_index)) = dst_data;
// dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama; // dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
} }
} }
...@@ -108,10 +115,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset ...@@ -108,10 +115,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
__kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset, __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset,
__global ushort *src2, int src2_step,int src2_offset, __global ushort *src2, int src2_step,int src2_offset,
F alpha,F beta,F gama, F alpha,F beta,F gama,
__global ushort *dst, int dst_step,int dst_offset, __global ushort *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1) int rows, int cols,int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
...@@ -121,35 +128,38 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs ...@@ -121,35 +128,38 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8); int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8);
int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix)); ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0) if(src1_index < 0)
{ {
ushort4 tmp; ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
ushort4 tmp; ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama; // int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
int4 tmp; int4 tmp;
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama; tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama; tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama; tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
...@@ -181,8 +191,11 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse ...@@ -181,8 +191,11 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
...@@ -190,26 +203,26 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse ...@@ -190,26 +203,26 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 )); int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 ));
int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix)); short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix)); short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0) if(src1_index < 0)
{ {
short4 tmp; short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
short4 tmp; short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
// int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama; // int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
int4 tmp; int4 tmp;
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama; tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama; tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama; tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
...@@ -228,7 +241,7 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse ...@@ -228,7 +241,7 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
__kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset, __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
__global int *src2, int src2_step,int src2_offset, __global int *src2, int src2_step,int src2_offset,
F alpha,F beta, F gama, F alpha,F beta, F gama,
__global int *dst, int dst_step,int dst_offset, __global int *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1) int rows, int cols,int dst_step1)
{ {
...@@ -241,9 +254,12 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset, ...@@ -241,9 +254,12 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
x = x << 2; x = x << 2;
#define bitOfInt (sizeof(int)== 4 ? 2: 3) #define bitOfInt (sizeof(int)== 4 ? 2: 3)
#define dst_align ((dst_offset >> bitOfInt) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> bitOfInt) & 3)
int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt)); int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt)); int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
...@@ -252,26 +268,26 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset, ...@@ -252,26 +268,26 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt)); int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix)); int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0) if(src1_index < 0)
{ {
int4 tmp; int4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
int4 tmp; int4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ; // double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
float4 tmp; float4 tmp;
tmp.x = src1_data.x * alpha + src2_data.x * beta + gama; tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp.y = src1_data.y * alpha + src2_data.y * beta + gama; tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp.z = src1_data.z * alpha + src2_data.z * beta + gama; tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
...@@ -291,7 +307,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset, ...@@ -291,7 +307,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
__kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset, __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset,
__global float *src2, int src2_step,int src2_offset, __global float *src2, int src2_step,int src2_offset,
F alpha,F beta, F gama, F alpha,F beta, F gama,
__global float *dst, int dst_step,int dst_offset, __global float *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1) int rows, int cols,int dst_step1)
{ {
...@@ -303,8 +319,11 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset ...@@ -303,8 +319,11 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 2) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 2) & 3)
int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
...@@ -313,32 +332,32 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset ...@@ -313,32 +332,32 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix));
float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
if(src1_index < 0) if(src1_index < 0)
{ {
float4 tmp; float4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
float4 tmp; float4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
// double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ; // double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
// float4 tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ; // float4 tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
float4 tmp_data; float4 tmp_data;
tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama; tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama; tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama; tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama; tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
// float4 tmp_data = convert_float4(tmp); // float4 tmp_data = convert_float4(tmp);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y; dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
...@@ -353,7 +372,7 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset ...@@ -353,7 +372,7 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
__kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset, __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset,
__global double *src2, int src2_step,int src2_offset, __global double *src2, int src2_step,int src2_offset,
F alpha,F beta, F gama, F alpha,F beta, F gama,
__global double *dst, int dst_step,int dst_offset, __global double *dst, int dst_step,int dst_offset,
int rows, int cols,int dst_step1) int rows, int cols,int dst_step1)
{ {
...@@ -365,8 +384,11 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs ...@@ -365,8 +384,11 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 3) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 3) & 3)
int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
...@@ -375,25 +397,25 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs ...@@ -375,25 +397,25 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3)); int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix));
double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix));
double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index)); double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
if(src1_index < 0) if(src1_index < 0)
{ {
double4 tmp; double4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
double4 tmp; double4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
// double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ; // double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
double4 tmp_data; double4 tmp_data;
tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama; tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama; tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama; tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
......
...@@ -44,9 +44,13 @@ ...@@ -44,9 +44,13 @@
//M*/ //M*/
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif #endif
#endif
/**************************************add with scalar without mask**************************************/ /**************************************add with scalar without mask**************************************/
__kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *dst, int dst_step, int dst_offset, __global uchar *dst, int dst_step, int dst_offset,
...@@ -58,8 +62,11 @@ __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src ...@@ -58,8 +62,11 @@ __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -98,8 +105,11 @@ __kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int sr ...@@ -98,8 +105,11 @@ __kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int sr
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 1; x = x << 1;
#define dst_align ((dst_offset >> 1) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -130,8 +140,11 @@ __kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src ...@@ -130,8 +140,11 @@ __kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 1; x = x << 1;
#define dst_align ((dst_offset >> 1) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -232,8 +245,11 @@ __kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src ...@@ -232,8 +245,11 @@ __kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 1; x = x << 1;
#define dst_align ((dst_offset >> 1) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -377,8 +393,11 @@ __kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src ...@@ -377,8 +393,11 @@ __kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -431,8 +450,11 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr ...@@ -431,8 +450,11 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 1; x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -458,16 +480,16 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr ...@@ -458,16 +480,16 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x; ? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y; ? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy; ? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
} }
} }
__kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src1_offset, __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src1_offset,
...@@ -481,8 +503,11 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src ...@@ -481,8 +503,11 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 1; x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -508,16 +533,16 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src ...@@ -508,16 +533,16 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src
data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x; ? tmp_data_1.x : data_1.x;
data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y; ? tmp_data_1.y : data_1.y;
data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy; ? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
} }
} }
__kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_offset, __kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_offset,
...@@ -549,9 +574,9 @@ __kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_ ...@@ -549,9 +574,9 @@ __kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_
int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1); int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1);
int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2); int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2);
*((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
} }
} }
__kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src1_offset, __kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src1_offset,
...@@ -583,9 +608,9 @@ __kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src ...@@ -583,9 +608,9 @@ __kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src
float tmp_data_1 = src1_data_1 + src2_data_1; float tmp_data_1 = src1_data_1 + src2_data_1;
float tmp_data_2 = src1_data_2 + src2_data_2; float tmp_data_2 = src1_data_2 + src2_data_2;
*((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0; *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1; *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2; *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
} }
} }
...@@ -619,9 +644,9 @@ __kernel void arithm_s_add_C3_D6 (__global double *src1, int src1_step, int sr ...@@ -619,9 +644,9 @@ __kernel void arithm_s_add_C3_D6 (__global double *src1, int src1_step, int sr
double tmp_data_1 = src1_data_1 + src2_data_1; double tmp_data_1 = src1_data_1 + src2_data_1;
double tmp_data_2 = src1_data_2 + src2_data_2; double tmp_data_2 = src1_data_2 + src2_data_2;
*((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2; *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
} }
} }
#endif #endif
......
...@@ -44,7 +44,11 @@ ...@@ -44,7 +44,11 @@
//M*/ //M*/
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif #endif
/**************************************add with scalar with mask**************************************/ /**************************************add with scalar with mask**************************************/
...@@ -60,8 +64,11 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_ste ...@@ -60,8 +64,11 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_ste
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
...@@ -110,8 +117,11 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_st ...@@ -110,8 +117,11 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_st
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 1; x = x << 1;
#define dst_align ((dst_offset >> 1) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
...@@ -145,8 +155,11 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_ste ...@@ -145,8 +155,11 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_ste
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 1; x = x << 1;
#define dst_align ((dst_offset >> 1) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
...@@ -266,8 +279,11 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_ste ...@@ -266,8 +279,11 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_ste
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 1; x = x << 1;
#define dst_align ((dst_offset >> 1) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 1)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
...@@ -442,8 +458,11 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_ste ...@@ -442,8 +458,11 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_ste
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (((dst_offset % dst_step) / 3 ) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
...@@ -500,8 +519,11 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st ...@@ -500,8 +519,11 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 1; x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
...@@ -530,16 +552,16 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st ...@@ -530,16 +552,16 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x; ? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y; ? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy; ? tmp_data_2.xy : data_2.xy;
*((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
} }
} }
__kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
...@@ -554,8 +576,11 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste ...@@ -554,8 +576,11 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 1; x = x << 1;
#define dst_align (((dst_offset % dst_step) / 6 ) & 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
...@@ -584,16 +609,16 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste ...@@ -584,16 +609,16 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste
data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
? tmp_data_1.x : data_1.x; ? tmp_data_1.x : data_1.x;
data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_1.y : data_1.y; ? tmp_data_1.y : data_1.y;
data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
? tmp_data_2.xy : data_2.xy; ? tmp_data_2.xy : data_2.xy;
*((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
*((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
*((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
} }
} }
__kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, __kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset,
...@@ -633,9 +658,9 @@ __kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step, ...@@ -633,9 +658,9 @@ __kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step,
data_1 = mask_data ? tmp_data_1 : data_1; data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2; data_2 = mask_data ? tmp_data_2 : data_2;
*((__global int *)((__global char *)dst + dst_index + 0))= data_0; *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
*((__global int *)((__global char *)dst + dst_index + 4))= data_1; *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
*((__global int *)((__global char *)dst + dst_index + 8))= data_2; *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
} }
} }
__kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset, __kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
...@@ -675,9 +700,9 @@ __kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_ste ...@@ -675,9 +700,9 @@ __kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_ste
data_1 = mask_data ? tmp_data_1 : data_1; data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2; data_2 = mask_data ? tmp_data_2 : data_2;
*((__global float *)((__global char *)dst + dst_index + 0))= data_0; *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
*((__global float *)((__global char *)dst + dst_index + 4))= data_1; *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
*((__global float *)((__global char *)dst + dst_index + 8))= data_2; *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
} }
} }
...@@ -719,9 +744,9 @@ __kernel void arithm_s_add_with_mask_C3_D6 (__global double *src1, int src1_st ...@@ -719,9 +744,9 @@ __kernel void arithm_s_add_with_mask_C3_D6 (__global double *src1, int src1_st
data_1 = mask_data ? tmp_data_1 : data_1; data_1 = mask_data ? tmp_data_1 : data_1;
data_2 = mask_data ? tmp_data_2 : data_2; data_2 = mask_data ? tmp_data_2 : data_2;
*((__global double *)((__global char *)dst + dst_index + 0 ))= data_0; *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
*((__global double *)((__global char *)dst + dst_index + 8 ))= data_1; *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
*((__global double *)((__global char *)dst + dst_index + 16))= data_2; *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
} }
} }
#endif #endif
......
...@@ -43,7 +43,11 @@ ...@@ -43,7 +43,11 @@
// //
//M*/ //M*/
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif #endif
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
...@@ -51,9 +55,9 @@ ...@@ -51,9 +55,9 @@
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_and without mask**************************************/ /**************************************bitwise_and without mask**************************************/
__kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset, __global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset, __global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
...@@ -61,31 +65,34 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr ...@@ -61,31 +65,34 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix); uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix); uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0) if(src1_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data & src2_data; uchar4 tmp_data = src1_data & src2_data;
...@@ -101,9 +108,9 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr ...@@ -101,9 +108,9 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
__kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset, __global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset, __global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
...@@ -111,8 +118,11 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src ...@@ -111,8 +118,11 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
...@@ -120,23 +130,23 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src ...@@ -120,23 +130,23 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
char4 src1_data = vload4(0, src1 + src1_index_fix); char4 src1_data = vload4(0, src1 + src1_index_fix);
char4 src2_data = vload4(0, src2 + src2_index_fix); char4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0) if(src1_index < 0)
{ {
char4 tmp; char4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
char4 tmp; char4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
char4 dst_data = *((__global char4 *)(dst + dst_index)); char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data & src2_data; char4 tmp_data = src1_data & src2_data;
...@@ -151,9 +161,9 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src ...@@ -151,9 +161,9 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
__kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset, __global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset, __global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
...@@ -162,8 +172,11 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s ...@@ -162,8 +172,11 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...@@ -171,23 +184,23 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s ...@@ -171,23 +184,23 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix)); ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0) if(src1_index < 0)
{ {
ushort4 tmp; ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
ushort4 tmp; ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data = src1_data & src2_data; ushort4 tmp_data = src1_data & src2_data;
...@@ -203,9 +216,9 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s ...@@ -203,9 +216,9 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
__kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset, __global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset, __global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
...@@ -214,8 +227,11 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr ...@@ -214,8 +227,11 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...@@ -223,23 +239,23 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr ...@@ -223,23 +239,23 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix)); short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix)); short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0) if(src1_index < 0)
{ {
short4 tmp; short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
short4 tmp; short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
short4 tmp_data = src1_data & src2_data; short4 tmp_data = src1_data & src2_data;
...@@ -255,9 +271,9 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr ...@@ -255,9 +271,9 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
__kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset, __global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset, __global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
...@@ -277,9 +293,9 @@ __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1 ...@@ -277,9 +293,9 @@ __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1
} }
__kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset, __global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset, __global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
...@@ -300,9 +316,9 @@ __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src ...@@ -300,9 +316,9 @@ __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset, __global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset, __global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
......
...@@ -43,9 +43,12 @@ ...@@ -43,9 +43,12 @@
// //
//M*/ //M*/
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif #endif
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_NOT//////////////////////////////////////////////////// ////////////////////////////////////////////BITWISE_NOT////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
...@@ -60,26 +63,29 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr ...@@ -60,26 +63,29 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src1_index_fix = src1_index < 0 ? 0 : src1_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix); uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = ~ src1_data; uchar4 tmp_data = ~ src1_data;
/* if(src1_index < 0) /* if(src1_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
*/ */
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
...@@ -91,8 +97,8 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr ...@@ -91,8 +97,8 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
__kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *dst, int dst_step, int dst_offset, __global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
...@@ -100,8 +106,11 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src ...@@ -100,8 +106,11 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -124,8 +133,8 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src ...@@ -124,8 +133,8 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
__kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *dst, int dst_step, int dst_offset, __global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
...@@ -134,8 +143,11 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s ...@@ -134,8 +143,11 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -159,8 +171,8 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s ...@@ -159,8 +171,8 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
__kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *dst, int dst_step, int dst_offset, __global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
...@@ -169,8 +181,11 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr ...@@ -169,8 +181,11 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -194,8 +209,8 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr ...@@ -194,8 +209,8 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
__kernel void arithm_bitwise_not_D4 (__global int *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_not_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *dst, int dst_step, int dst_offset, __global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
......
...@@ -43,7 +43,11 @@ ...@@ -43,7 +43,11 @@
// //
//M*/ //M*/
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif #endif
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
...@@ -51,9 +55,9 @@ ...@@ -51,9 +55,9 @@
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_or without mask**************************************/ /**************************************bitwise_or without mask**************************************/
__kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset, __global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset, __global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
...@@ -61,30 +65,33 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src ...@@ -61,30 +65,33 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix); uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix); uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0) if(src1_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data | src2_data; uchar4 tmp_data = src1_data | src2_data;
...@@ -99,9 +106,9 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src ...@@ -99,9 +106,9 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
__kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset, __global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset, __global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
...@@ -109,8 +116,11 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1 ...@@ -109,8 +116,11 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
...@@ -135,9 +145,9 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1 ...@@ -135,9 +145,9 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
__kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset, __global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset, __global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
...@@ -146,8 +156,11 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr ...@@ -146,8 +156,11 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...@@ -173,9 +186,9 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr ...@@ -173,9 +186,9 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
__kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset, __global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset, __global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
...@@ -184,8 +197,11 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src ...@@ -184,8 +197,11 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...@@ -211,9 +227,9 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src ...@@ -211,9 +227,9 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
__kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset, __global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset, __global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
...@@ -233,9 +249,9 @@ __kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_ ...@@ -233,9 +249,9 @@ __kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_
} }
__kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset, __global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset, __global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
...@@ -256,9 +272,9 @@ __kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1 ...@@ -256,9 +272,9 @@ __kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset, __global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset, __global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
......
...@@ -43,17 +43,20 @@ ...@@ -43,17 +43,20 @@
// //
//M*/ //M*/
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif #endif
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////BITWISE_XOR//////////////////////////////////////////////////// ////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////
/**************************************bitwise_xor without mask**************************************/ /**************************************bitwise_xor without mask**************************************/
__kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int src1_offset,
__global uchar *src2, int src2_step, int src2_offset, __global uchar *src2, int src2_step, int src2_offset,
__global uchar *dst, int dst_step, int dst_offset, __global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
...@@ -61,8 +64,11 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr ...@@ -61,8 +64,11 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
...@@ -70,23 +76,23 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr ...@@ -70,23 +76,23 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
uchar4 src1_data = vload4(0, src1 + src1_index_fix); uchar4 src1_data = vload4(0, src1 + src1_index_fix);
uchar4 src2_data = vload4(0, src2 + src2_index_fix); uchar4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0) if(src1_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
uchar4 tmp; uchar4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
uchar4 tmp_data = src1_data ^ src2_data; uchar4 tmp_data = src1_data ^ src2_data;
...@@ -101,9 +107,9 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr ...@@ -101,9 +107,9 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
__kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset, __global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset, __global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
...@@ -111,8 +117,11 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src ...@@ -111,8 +117,11 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
...@@ -120,23 +129,23 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src ...@@ -120,23 +129,23 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
char4 src1_data = vload4(0, src1 + src1_index_fix); char4 src1_data = vload4(0, src1 + src1_index_fix);
char4 src2_data = vload4(0, src2 + src2_index_fix); char4 src2_data = vload4(0, src2 + src2_index_fix);
if(src1_index < 0) if(src1_index < 0)
{ {
char4 tmp; char4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
char4 tmp; char4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
char4 dst_data = *((__global char4 *)(dst + dst_index)); char4 dst_data = *((__global char4 *)(dst + dst_index));
char4 tmp_data = src1_data ^ src2_data; char4 tmp_data = src1_data ^ src2_data;
...@@ -151,9 +160,9 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src ...@@ -151,9 +160,9 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
__kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int src1_offset,
__global ushort *src2, int src2_step, int src2_offset, __global ushort *src2, int src2_step, int src2_offset,
__global ushort *dst, int dst_step, int dst_offset, __global ushort *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
...@@ -162,8 +171,11 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s ...@@ -162,8 +171,11 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...@@ -171,23 +183,23 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s ...@@ -171,23 +183,23 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix)); ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
if(src1_index < 0) if(src1_index < 0)
{ {
ushort4 tmp; ushort4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
ushort4 tmp; ushort4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
ushort4 tmp_data = src1_data ^ src2_data; ushort4 tmp_data = src1_data ^ src2_data;
...@@ -203,9 +215,9 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s ...@@ -203,9 +215,9 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
__kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int src1_offset,
__global short *src2, int src2_step, int src2_offset, __global short *src2, int src2_step, int src2_offset,
__global short *dst, int dst_step, int dst_offset, __global short *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
...@@ -214,8 +226,11 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr ...@@ -214,8 +226,11 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...@@ -223,25 +238,25 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr ...@@ -223,25 +238,25 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_end = mad24(y, dst_step, dst_offset + dst_step1);
int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src1_index_fix = src1_index < 0 ? 0 : src1_index;
int src2_index_fix = src2_index < 0 ? 0 : src2_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index;
short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix)); short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix)); short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
if(src1_index < 0) if(src1_index < 0)
{ {
short4 tmp; short4 tmp;
tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
} }
if(src2_index < 0) if(src2_index < 0)
{ {
short4 tmp; short4 tmp;
tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
} }
...@@ -259,9 +274,9 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr ...@@ -259,9 +274,9 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
__kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1_offset,
__global int *src2, int src2_step, int src2_offset, __global int *src2, int src2_step, int src2_offset,
__global int *dst, int dst_step, int dst_offset, __global int *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
...@@ -281,9 +296,9 @@ __kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1 ...@@ -281,9 +296,9 @@ __kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1
} }
__kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset, __global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset, __global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
...@@ -301,12 +316,11 @@ __kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src ...@@ -301,12 +316,11 @@ __kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src
*((__global char4 *)((__global char *)dst + dst_index)) = tmp; *((__global char4 *)((__global char *)dst + dst_index)) = tmp;
} }
} }
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
__kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src1_offset, __kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src1_offset,
__global char *src2, int src2_step, int src2_offset, __global char *src2, int src2_step, int src2_offset,
__global char *dst, int dst_step, int dst_offset, __global char *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1) int rows, int cols, int dst_step1)
{ {
int x = get_global_id(0); int x = get_global_id(0);
int y = get_global_id(1); int y = get_global_id(1);
......
...@@ -44,7 +44,11 @@ ...@@ -44,7 +44,11 @@
//M*/ //M*/
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
typedef double F ; typedef double F ;
typedef double4 F4; typedef double4 F4;
#define convert_F4 convert_double4 #define convert_F4 convert_double4
...@@ -56,34 +60,24 @@ typedef float4 F4; ...@@ -56,34 +60,24 @@ typedef float4 F4;
#define convert_F float #define convert_F float
#endif #endif
uchar round2_uchar(F v){ inline uchar round2_uchar(F v)
{
uchar v1 = convert_uchar_sat(round(v)); return convert_uchar_sat(round(v));
//uchar v2 = convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5));
return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
} }
ushort round2_ushort(F v){ inline ushort round2_ushort(F v)
{
ushort v1 = convert_ushort_sat(round(v)); return convert_ushort_sat(round(v));
//ushort v2 = convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5));
return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
} }
short round2_short(F v){
short v1 = convert_short_sat(round(v));
//short v2 = convert_short_sat(v+(v>=0 ? 0.5 : -0.5));
return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; inline short round2_short(F v)
{
return convert_short_sat(round(v));
} }
int round2_int(F v){
int v1 = convert_int_sat(round(v));
//int v2 = convert_int_sat(v+(v>=0 ? 0.5 : -0.5));
return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; inline int round2_int(F v)
{
return convert_int_sat(round(v));
} }
/////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////
////////////////////////////divide/////////////////////////////////////////////////// ////////////////////////////divide///////////////////////////////////////////////////
...@@ -94,39 +88,41 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse ...@@ -94,39 +88,41 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse
__global uchar *dst, int dst_step, int dst_offset, __global uchar *dst, int dst_step, int dst_offset,
int rows, int cols, int dst_step1, F scalar) int rows, int cols, int dst_step1, F scalar)
{ {
int x = get_global_id(0); int2 coor = (int2)(get_global_id(0), get_global_id(1));
int y = get_global_id(1);
if (x < cols && y < rows) if (coor.x < cols && coor.y < rows)
{ {
x = x << 2; coor.x = coor.x << 2;
#ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int2 src_index = (int2)(mad24(coor.y, src1_step, coor.x + src1_offset - dst_align),
mad24(coor.y, src2_step, coor.x + src2_offset - dst_align));
#define dst_align (dst_offset & 3) int4 dst_args = (int4)(mad24(coor.y, dst_step, dst_offset),
int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); mad24(coor.y, dst_step, dst_offset + dst_step1),
int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); mad24(coor.y, dst_step, dst_offset + coor.x & (int)0xfffffffc),
0);
int dst_start = mad24(y, dst_step, dst_offset); uchar4 src1_data = vload4(0, src1 + src_index.x);
int dst_end = mad24(y, dst_step, dst_offset + dst_step1); uchar4 src2_data = vload4(0, src2 + src_index.y);
int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); uchar4 dst_data = *((__global uchar4 *)(dst + dst_args.z));
uchar4 src1_data = vload4(0, src1 + src1_index);
uchar4 src2_data = vload4(0, src2 + src2_index);
uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
F4 tmp = convert_F4(src1_data) * scalar; F4 tmp = convert_F4(src1_data) * scalar;
uchar4 tmp_data; uchar4 tmp_data;
tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / (F)src2_data.x); tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / src2_data.x);
tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / (F)src2_data.y); tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / src2_data.y);
tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / (F)src2_data.z); tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / src2_data.z);
tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / (F)src2_data.w); tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / src2_data.w);
dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.x = ((dst_args.z + 0 >= dst_args.x) && (dst_args.z + 0 < dst_args.y)) ? tmp_data.x : dst_data.x;
dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; dst_data.y = ((dst_args.z + 1 >= dst_args.x) && (dst_args.z + 1 < dst_args.y)) ? tmp_data.y : dst_data.y;
dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; dst_data.z = ((dst_args.z + 2 >= dst_args.x) && (dst_args.z + 2 < dst_args.y)) ? tmp_data.z : dst_data.z;
dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; dst_data.w = ((dst_args.z + 3 >= dst_args.x) && (dst_args.z + 3 < dst_args.y)) ? tmp_data.w : dst_data.w;
*((__global uchar4 *)(dst + dst_index)) = dst_data; *((__global uchar4 *)(dst + dst_args.z)) = dst_data;
} }
} }
...@@ -141,8 +137,11 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs ...@@ -141,8 +137,11 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...@@ -181,8 +180,11 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse ...@@ -181,8 +180,11 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
...@@ -296,8 +298,11 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset ...@@ -296,8 +298,11 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src_index = mad24(y, src_step, x + src_offset - dst_align); int src_index = mad24(y, src_step, x + src_offset - dst_align);
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -332,8 +337,11 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse ...@@ -332,8 +337,11 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1)); int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -367,8 +375,11 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset ...@@ -367,8 +375,11 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset
if (x < cols && y < rows) if (x < cols && y < rows)
{ {
x = x << 2; x = x << 2;
#define dst_align ((dst_offset >> 1) & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align ((dst_offset >> 1) & 3)
int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1)); int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
int dst_start = mad24(y, dst_step, dst_offset); int dst_start = mad24(y, dst_step, dst_offset);
...@@ -455,3 +466,5 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse ...@@ -455,3 +466,5 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse
} }
} }
#endif #endif
...@@ -44,7 +44,11 @@ ...@@ -44,7 +44,11 @@
//M*/ //M*/
#if defined (DOUBLE_SUPPORT) #if defined (DOUBLE_SUPPORT)
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64:enable #pragma OPENCL EXTENSION cl_khr_fp64:enable
#elif defined (cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#endif
#endif #endif
////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////
...@@ -60,8 +64,11 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of ...@@ -60,8 +64,11 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of
if (x < cols && y < thread_rows) if (x < cols && y < thread_rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align); int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align); int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
...@@ -115,8 +122,11 @@ __kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_off ...@@ -115,8 +122,11 @@ __kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_off
if (x < cols && y < thread_rows) if (x < cols && y < thread_rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (dst_offset & 3) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (dst_offset & 3)
int src_index_0 = mad24(y, src_step, x + src_offset - dst_align); int src_index_0 = mad24(y, src_step, x + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align); int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
...@@ -157,8 +167,11 @@ __kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_o ...@@ -157,8 +167,11 @@ __kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_o
if (x < cols && y < thread_rows) if (x < cols && y < thread_rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (((dst_offset >> 1) & 3) << 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset >> 1) & 3) << 1)
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align); int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align); int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
...@@ -199,8 +212,11 @@ __kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_of ...@@ -199,8 +212,11 @@ __kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_of
if (x < cols && y < thread_rows) if (x < cols && y < thread_rows)
{ {
x = x << 2; x = x << 2;
#define dst_align (((dst_offset >> 1) & 3) << 1) #ifdef dst_align
#undef dst_align
#endif
#define dst_align (((dst_offset >> 1) & 3) << 1)
int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align); int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align);
int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align); int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册