diff --git a/modules/ocl/src/opencl/arithm_absdiff.cl b/modules/ocl/src/opencl/arithm_absdiff.cl index 37f154216a3a3cdcb1d67f0630fb9f560f1b5f0a..6ae869d61cfc59005aede30f98732bb637ce1a22 100644 --- a/modules/ocl/src/opencl/arithm_absdiff.cl +++ b/modules/ocl/src/opencl/arithm_absdiff.cl @@ -44,7 +44,11 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif ////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -62,7 +66,10 @@ __kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_o if (x < cols && y < rows) { x = x << 2; - + +#ifdef dst_align +#undef dst_align +#endif #define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); @@ -110,8 +117,11 @@ __kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_ if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -144,8 +154,11 @@ __kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_o if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -248,8 +261,11 @@ __kernel void arithm_s_absdiff_C1_D0 (__global uchar *src1, int src1_step, int if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); @@ -287,8 +303,11 @@ __kernel void arithm_s_absdiff_C1_D2 (__global ushort *src1, int src1_step, in if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -318,8 +337,11 @@ __kernel void arithm_s_absdiff_C1_D3 (__global short *src1, int src1_step, int if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -387,8 +409,8 @@ __kernel void arithm_s_absdiff_C1_D5 (__global float *src1, int src1_step, int #if defined (DOUBLE_SUPPORT) __kernel void arithm_s_absdiff_C1_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *dst, int dst_step, int dst_offset, - double4 src2, int rows, int cols, int dst_step1) + __global double *dst, int dst_step, int dst_offset, + double4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -421,8 +443,11 @@ __kernel void arithm_s_absdiff_C2_D0 (__global uchar *src1, int src1_step, int if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -465,7 +490,7 @@ __kernel void arithm_s_absdiff_C2_D2 (__global ushort *src1, int src1_step, in } __kernel void arithm_s_absdiff_C2_D3 (__global short *src1, int src1_step, int src1_offset, __global short *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -509,7 +534,7 @@ __kernel void arithm_s_absdiff_C2_D4 (__global int *src1, int src1_step, int s } __kernel void arithm_s_absdiff_C2_D5 (__global float *src1, int src1_step, int src1_offset, __global float *dst, int dst_step, int dst_offset, - float4 src2, int rows, int cols, int dst_step1) + float4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -563,8 +588,11 @@ __kernel void arithm_s_absdiff_C3_D0 (__global uchar *src1, int src1_step, int if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int dst_start = mad24(y, dst_step, dst_offset); @@ -617,8 +645,11 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int dst_start = mad24(y, dst_step, dst_offset); @@ -644,16 +675,16 @@ __kernel void arithm_s_absdiff_C3_D2 (__global ushort *src1, int src1_step, in data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; } } __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int src1_offset, @@ -667,8 +698,11 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int dst_start = mad24(y, dst_step, dst_offset); @@ -694,16 +728,16 @@ __kernel void arithm_s_absdiff_C3_D3 (__global short *src1, int src1_step, int data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; } } __kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int src1_offset, @@ -735,9 +769,9 @@ __kernel void arithm_s_absdiff_C3_D4 (__global int *src1, int src1_step, int s int tmp_data_1 = convert_int_sat(abs_diff(src1_data_1, src2_data_1)); int tmp_data_2 = convert_int_sat(abs_diff(src1_data_2, src2_data_2)); - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; + *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; + *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; + *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; } } __kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int src1_offset, @@ -769,9 +803,9 @@ __kernel void arithm_s_absdiff_C3_D5 (__global float *src1, int src1_step, int float tmp_data_1 = fabs(src1_data_1 - src2_data_1); float tmp_data_2 = fabs(src1_data_2 - src2_data_2); - *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2; + *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0; + *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1; + *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2; } } @@ -805,9 +839,9 @@ __kernel void arithm_s_absdiff_C3_D6 (__global double *src1, int src1_step, in double tmp_data_1 = fabs(src1_data_1 - src2_data_1); double tmp_data_2 = fabs(src1_data_2 - src2_data_2); - *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2; + *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; + *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; + *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2; } } #endif diff --git a/modules/ocl/src/opencl/arithm_add.cl b/modules/ocl/src/opencl/arithm_add.cl index 789a42444c5916bb840e36303070f46622867023..647171578dacd2436ada37f5d2841c3fa0e9ddcc 100644 --- a/modules/ocl/src/opencl/arithm_add.cl +++ b/modules/ocl/src/opencl/arithm_add.cl @@ -45,7 +45,11 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif ////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -64,7 +68,10 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse { x = x << 2; - #define dst_align (dst_offset & 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); @@ -112,7 +119,10 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs { x = x << 2; - #define dst_align ((dst_offset >> 1) & 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -147,7 +157,10 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse { x = x << 2; - #define dst_align ((dst_offset >> 1) & 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -252,7 +265,10 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i { x = x << 2; - #define dst_align (dst_offset & 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -311,7 +327,10 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step, { x = x << 1; - #define dst_align ((dst_offset >> 1) & 1) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -348,7 +367,10 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i { x = x << 1; - #define dst_align ((dst_offset >> 1) & 1) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -477,7 +499,10 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i { x = x << 1; - #define dst_align ((dst_offset >> 1) & 1) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -664,7 +689,10 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i { x = x << 2; - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -724,7 +752,10 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step, { x = x << 1; - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -754,16 +785,16 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step, data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; } } __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, @@ -780,7 +811,10 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i { x = x << 1; - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -810,16 +844,16 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; } } __kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, @@ -861,9 +895,9 @@ __kernel void arithm_add_with_mask_C3_D4 (__global int *src1, int src1_step, i data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; + *((__global int *)((__global char *)dst + dst_index + 0))= data_0; + *((__global int *)((__global char *)dst + dst_index + 4))= data_1; + *((__global int *)((__global char *)dst + dst_index + 8))= data_2; } } __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset, @@ -905,9 +939,9 @@ __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, i data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global float *)((__global char *)dst + dst_index + 0))= data_0; - *((__global float *)((__global char *)dst + dst_index + 4))= data_1; - *((__global float *)((__global char *)dst + dst_index + 8))= data_2; + *((__global float *)((__global char *)dst + dst_index + 0))= data_0; + *((__global float *)((__global char *)dst + dst_index + 4))= data_1; + *((__global float *)((__global char *)dst + dst_index + 8))= data_2; } } @@ -951,9 +985,9 @@ __kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step, data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= data_2; + *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0; + *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1; + *((__global double *)((__global char *)dst + dst_index + 16))= data_2; } } #endif diff --git a/modules/ocl/src/opencl/arithm_addWeighted.cl b/modules/ocl/src/opencl/arithm_addWeighted.cl index d76f994aa0fa71e5b5532872fd996bd5ff3509b9..d3a002625d28dcf2e1af5edff6f81b75c45240c3 100644 --- a/modules/ocl/src/opencl/arithm_addWeighted.cl +++ b/modules/ocl/src/opencl/arithm_addWeighted.cl @@ -42,8 +42,12 @@ // the use of this software, even if advised of the possibility of such damage. // //M*/ -#if defined DOUBLE_SUPPORT +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif typedef double F; #else typedef float F; @@ -52,10 +56,10 @@ typedef float F; /////////////////////////////////////////////addWeighted////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset, - __global uchar *src2, int src2_step,int src2_offset, - F alpha,F beta,F gama, - __global uchar *dst, int dst_step,int dst_offset, - int rows, int cols,int dst_step1) + __global uchar *src2, int src2_step,int src2_offset, + F alpha,F beta,F gama, + __global uchar *dst, int dst_step,int dst_offset, + int rows, int cols,int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -65,7 +69,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset { x = x << 2; - #define dst_align (dst_offset & 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); @@ -87,7 +94,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); // short4 tmp = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama; - short4 tmp; + short4 tmp; tmp.x = src1_data.x * alpha + src2_data.x * beta + gama; tmp.y = src1_data.y * alpha + src2_data.y * beta + gama; tmp.z = src1_data.z * alpha + src2_data.z * beta + gama; @@ -100,7 +107,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; *((__global uchar4 *)(dst + dst_index)) = dst_data; - // dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama; + // dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama; } } @@ -108,10 +115,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset, - __global ushort *src2, int src2_step,int src2_offset, - F alpha,F beta,F gama, - __global ushort *dst, int dst_step,int dst_offset, - int rows, int cols,int dst_step1) + __global ushort *src2, int src2_step,int src2_offset, + F alpha,F beta,F gama, + __global ushort *dst, int dst_step,int dst_offset, + int rows, int cols,int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -121,35 +128,38 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); - // int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama; - int4 tmp; + // int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama; + int4 tmp; tmp.x = src1_data.x * alpha + src2_data.x * beta + gama; tmp.y = src1_data.y * alpha + src2_data.y * beta + gama; tmp.z = src1_data.z * alpha + src2_data.z * beta + gama; @@ -181,8 +191,11 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1)); @@ -190,26 +203,26 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 )); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix)); short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - // int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama; - int4 tmp; + // int4 tmp = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama; + int4 tmp; tmp.x = src1_data.x * alpha + src2_data.x * beta + gama; tmp.y = src1_data.y * alpha + src2_data.y * beta + gama; tmp.z = src1_data.z * alpha + src2_data.z * beta + gama; @@ -228,7 +241,7 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset, __global int *src2, int src2_step,int src2_offset, - F alpha,F beta, F gama, + F alpha,F beta, F gama, __global int *dst, int dst_step,int dst_offset, int rows, int cols,int dst_step1) { @@ -241,9 +254,12 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset, x = x << 2; - #define bitOfInt (sizeof(int)== 4 ? 2: 3) - - #define dst_align ((dst_offset >> bitOfInt) & 3) +#define bitOfInt (sizeof(int)== 4 ? 2: 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> bitOfInt) & 3) int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt)); int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt)); @@ -252,26 +268,26 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset, int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt)); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - int4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - int4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + if(src1_index < 0) + { + int4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + int4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index)); - // double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ; - float4 tmp; + // double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ; + float4 tmp; tmp.x = src1_data.x * alpha + src2_data.x * beta + gama; tmp.y = src1_data.y * alpha + src2_data.y * beta + gama; tmp.z = src1_data.z * alpha + src2_data.z * beta + gama; @@ -291,7 +307,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset, __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset, __global float *src2, int src2_step,int src2_offset, - F alpha,F beta, F gama, + F alpha,F beta, F gama, __global float *dst, int dst_step,int dst_offset, int rows, int cols,int dst_step1) { @@ -303,8 +319,11 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset { x = x << 2; - - #define dst_align ((dst_offset >> 2) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 2) & 3) int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); @@ -313,32 +332,32 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2)); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index)); - if(src1_index < 0) - { - float4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - float4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - // double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ; - - // float4 tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ; - float4 tmp_data; + if(src1_index < 0) + { + float4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + float4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + // double4 tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ; + + // float4 tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ; + float4 tmp_data; tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama; tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama; tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama; tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama; - // float4 tmp_data = convert_float4(tmp); + // float4 tmp_data = convert_float4(tmp); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y; @@ -353,7 +372,7 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset #if defined (DOUBLE_SUPPORT) __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset, __global double *src2, int src2_step,int src2_offset, - F alpha,F beta, F gama, + F alpha,F beta, F gama, __global double *dst, int dst_step,int dst_offset, int rows, int cols,int dst_step1) { @@ -365,8 +384,11 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs { x = x << 2; - - #define dst_align ((dst_offset >> 3) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 3) & 3) int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); @@ -375,25 +397,25 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3)); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; double4 src1_data = vload4(0, (__global double *)((__global char *)src1 + src1_index_fix)); double4 src2_data = vload4(0, (__global double *)((__global char *)src2 + src2_index_fix)); double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index)); - if(src1_index < 0) - { - double4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - double4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } - // double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ; - double4 tmp_data; + if(src1_index < 0) + { + double4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + double4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } + // double4 tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ; + double4 tmp_data; tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama; tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama; tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama; diff --git a/modules/ocl/src/opencl/arithm_add_scalar.cl b/modules/ocl/src/opencl/arithm_add_scalar.cl index 05b813dc8cb7db5a757b517e9a6891d63c8bbf0f..15ae95df25927e9e741e510cffc0287c9b7b6ff1 100644 --- a/modules/ocl/src/opencl/arithm_add_scalar.cl +++ b/modules/ocl/src/opencl/arithm_add_scalar.cl @@ -44,9 +44,13 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable #endif +#endif /**************************************add with scalar without mask**************************************/ __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, __global uchar *dst, int dst_step, int dst_offset, @@ -58,8 +62,11 @@ __kernel void arithm_s_add_C1_D0 (__global uchar *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); @@ -98,8 +105,11 @@ __kernel void arithm_s_add_C1_D2 (__global ushort *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -130,8 +140,11 @@ __kernel void arithm_s_add_C1_D3 (__global short *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -232,8 +245,11 @@ __kernel void arithm_s_add_C2_D0 (__global uchar *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -377,8 +393,11 @@ __kernel void arithm_s_add_C3_D0 (__global uchar *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int dst_start = mad24(y, dst_step, dst_offset); @@ -431,8 +450,11 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int dst_start = mad24(y, dst_step, dst_offset); @@ -458,16 +480,16 @@ __kernel void arithm_s_add_C3_D2 (__global ushort *src1, int src1_step, int sr data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; } } __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src1_offset, @@ -481,8 +503,11 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int dst_start = mad24(y, dst_step, dst_offset); @@ -508,16 +533,16 @@ __kernel void arithm_s_add_C3_D3 (__global short *src1, int src1_step, int src data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; } } __kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_offset, @@ -549,9 +574,9 @@ __kernel void arithm_s_add_C3_D4 (__global int *src1, int src1_step, int src1_ int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1); int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2); - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; + *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; + *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; + *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; } } __kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src1_offset, @@ -583,9 +608,9 @@ __kernel void arithm_s_add_C3_D5 (__global float *src1, int src1_step, int src float tmp_data_1 = src1_data_1 + src2_data_1; float tmp_data_2 = src1_data_2 + src2_data_2; - *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2; + *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0; + *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1; + *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2; } } @@ -619,9 +644,9 @@ __kernel void arithm_s_add_C3_D6 (__global double *src1, int src1_step, int sr double tmp_data_1 = src1_data_1 + src2_data_1; double tmp_data_2 = src1_data_2 + src2_data_2; - *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2; + *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; + *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; + *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2; } } #endif diff --git a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl index 4acb5be6a231cb64db7d21755f2d06a50a8d6d83..1e2ae71af6c11156a255793686c54052c834011a 100644 --- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl @@ -44,7 +44,11 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif /**************************************add with scalar with mask**************************************/ @@ -60,8 +64,11 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global uchar *src1, int src1_ste if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -110,8 +117,11 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global ushort *src1, int src1_st if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -145,8 +155,11 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global short *src1, int src1_ste if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -266,8 +279,11 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global uchar *src1, int src1_ste if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -442,8 +458,11 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global uchar *src1, int src1_ste if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -500,8 +519,11 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -530,16 +552,16 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global ushort *src1, int src1_st data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; } } __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, @@ -554,8 +576,11 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -584,16 +609,16 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global short *src1, int src1_ste data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; } } __kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, @@ -633,9 +658,9 @@ __kernel void arithm_s_add_with_mask_C3_D4 (__global int *src1, int src1_step, data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; + *((__global int *)((__global char *)dst + dst_index + 0))= data_0; + *((__global int *)((__global char *)dst + dst_index + 4))= data_1; + *((__global int *)((__global char *)dst + dst_index + 8))= data_2; } } __kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset, @@ -675,9 +700,9 @@ __kernel void arithm_s_add_with_mask_C3_D5 (__global float *src1, int src1_ste data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global float *)((__global char *)dst + dst_index + 0))= data_0; - *((__global float *)((__global char *)dst + dst_index + 4))= data_1; - *((__global float *)((__global char *)dst + dst_index + 8))= data_2; + *((__global float *)((__global char *)dst + dst_index + 0))= data_0; + *((__global float *)((__global char *)dst + dst_index + 4))= data_1; + *((__global float *)((__global char *)dst + dst_index + 8))= data_2; } } @@ -719,9 +744,9 @@ __kernel void arithm_s_add_with_mask_C3_D6 (__global double *src1, int src1_st data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global double *)((__global char *)dst + dst_index + 16))= data_2; + *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0; + *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1; + *((__global double *)((__global char *)dst + dst_index + 16))= data_2; } } #endif diff --git a/modules/ocl/src/opencl/arithm_bitwise_and.cl b/modules/ocl/src/opencl/arithm_bitwise_and.cl index 8adc56de5f04308aae046e05a4e734473008606d..a369d8743955d6f4ba515e42108207336322e2ea 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_and.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_and.cl @@ -43,7 +43,11 @@ // //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif ////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -51,9 +55,9 @@ /////////////////////////////////////////////////////////////////////////////////////////////////////// /**************************************bitwise_and without mask**************************************/ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -61,31 +65,34 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - uchar4 src1_data = vload4(0, src1 + src1_index_fix); - uchar4 src2_data = vload4(0, src2 + src2_index_fix); - - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + uchar4 src1_data = vload4(0, src1 + src1_index_fix); + uchar4 src2_data = vload4(0, src2 + src2_index_fix); + + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = src1_data & src2_data; @@ -101,9 +108,9 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global char *src2, int src2_step, int src2_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -111,8 +118,11 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); @@ -120,23 +130,23 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; - char4 src1_data = vload4(0, src1 + src1_index_fix); - char4 src2_data = vload4(0, src2 + src2_index_fix); - - if(src1_index < 0) - { - char4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - char4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; + char4 src1_data = vload4(0, src1 + src1_index_fix); + char4 src2_data = vload4(0, src2 + src2_index_fix); + + if(src1_index < 0) + { + char4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + char4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } char4 dst_data = *((__global char4 *)(dst + dst_index)); char4 tmp_data = src1_data & src2_data; @@ -151,9 +161,9 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global ushort *src2, int src2_step, int src2_offset, + __global ushort *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -162,8 +172,11 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -171,23 +184,23 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); ushort4 tmp_data = src1_data & src2_data; @@ -203,9 +216,9 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global short *src2, int src2_step, int src2_offset, + __global short *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -214,8 +227,11 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -223,23 +239,23 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix)); short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); short4 tmp_data = src1_data & src2_data; @@ -255,9 +271,9 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global int *src2, int src2_step, int src2_offset, + __global int *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -277,9 +293,9 @@ __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1 } __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global char *src2, int src2_step, int src2_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -300,9 +316,9 @@ __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src #if defined (DOUBLE_SUPPORT) __kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global char *src2, int src2_step, int src2_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); diff --git a/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl index 595fb2ceb7250f80347f3148dbc8fc9ef000ac2b..fbc42364acf95bab834d2e6721018bcc3a796358 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl @@ -43,18 +43,22 @@ // //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif - ////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////BITWISE_AND//////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// /**************************************bitwise_and with mask**************************************/ -__kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C1_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -63,8 +67,11 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1 if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -91,11 +98,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1 -__kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C1_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -104,8 +112,11 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_ if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -132,11 +143,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_ -__kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C1_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global ushort *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -145,8 +157,11 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -171,11 +186,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src -__kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C1_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global short *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -184,8 +200,11 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1 if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -198,8 +217,8 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1 short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index)); uchar2 mask_data = vload2(0, mask + mask_index); - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - short2 tmp_data = src1_data & src2_data; + short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); + short2 tmp_data = src1_data & src2_data; data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y; @@ -210,11 +229,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1 -__kernel void arithm_bitwise_and_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C1_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global int *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -242,11 +262,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D4 (__global int *src1, int src1 -__kernel void arithm_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C1_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -274,12 +295,12 @@ __kernel void arithm_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_ -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_and_with_mask_C1_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C1_D6 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -305,15 +326,15 @@ __kernel void arithm_bitwise_and_with_mask_C1_D6 (__global char *src1, int src1_ } } -#endif -__kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C2_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -322,8 +343,11 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1 if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -347,11 +371,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1 } -__kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C2_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -360,8 +385,11 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_ if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -384,11 +412,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_ } } -__kernel void arithm_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C2_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global ushort *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -413,11 +442,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int src *((__global ushort2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_and_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C2_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global short *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -442,11 +472,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D3 (__global short *src1, int src1 *((__global short2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_and_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C2_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global int *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -471,11 +502,12 @@ __kernel void arithm_bitwise_and_with_mask_C2_D4 (__global int *src1, int src1 *((__global int2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_and_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C2_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -500,12 +532,13 @@ __kernel void arithm_bitwise_and_with_mask_C2_D5 (__global char *src1, int src1_ *((__global char8 *)((__global char *)dst + dst_index)) = data; } } -#if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_and_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + +__kernel void arithm_bitwise_and_with_mask_C2_D6 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -530,15 +563,15 @@ __kernel void arithm_bitwise_and_with_mask_C2_D6 (__global char *src1, int src1_ *((__global char16 *)((__global char *)dst + dst_index)) = data; } } -#endif -__kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C3_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -547,8 +580,11 @@ __kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1 if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -596,11 +632,12 @@ __kernel void arithm_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1 } -__kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C3_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -609,8 +646,11 @@ __kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_ if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -657,11 +697,12 @@ __kernel void arithm_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_ } } -__kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C3_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global ushort *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -670,8 +711,11 @@ __kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -701,23 +745,24 @@ __kernel void arithm_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C3_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global short *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -726,8 +771,11 @@ __kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1 if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -757,23 +805,24 @@ __kernel void arithm_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1 data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_bitwise_and_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C3_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global int *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -808,16 +857,17 @@ __kernel void arithm_bitwise_and_with_mask_C3_D4 (__global int *src1, int src1 data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; + *((__global int *)((__global char *)dst + dst_index + 0))= data_0; + *((__global int *)((__global char *)dst + dst_index + 4))= data_1; + *((__global int *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_bitwise_and_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C3_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -852,17 +902,18 @@ __kernel void arithm_bitwise_and_with_mask_C3_D5 (__global char *src1, int src1_ data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_and_with_mask_C3_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C3_D6 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -897,20 +948,20 @@ __kernel void arithm_bitwise_and_with_mask_C3_D6 (__global char *src1, int src1_ data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2; + *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0; + *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1; + *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2; } } #endif - -__kernel void arithm_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C4_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -937,11 +988,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int src1 } -__kernel void arithm_bitwise_and_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C4_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -967,11 +1019,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D1 (__global char *src1, int src1_ } } -__kernel void arithm_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C4_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global ushort *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -996,11 +1049,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int src *((__global ushort4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_and_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C4_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global short *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -1025,11 +1079,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D3 (__global short *src1, int src1 *((__global short4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_and_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C4_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global int *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -1054,11 +1109,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D4 (__global int *src1, int src1 *((__global int4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_and_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C4_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -1084,11 +1140,12 @@ __kernel void arithm_bitwise_and_with_mask_C4_D5 (__global char *src1, int src1_ } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_and_with_mask_C4_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_and_with_mask_C4_D6 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); diff --git a/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl index a5152ce0bff9e3ea19fa6de28efcd6e33159f25b..5058d318e0ba319cec37625a7dc908c3670d1bbe 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl @@ -42,19 +42,22 @@ // the use of this software, even if advised of the possibility of such damage. // // -#if defined (__ATI__) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#elif defined (__NVIDIA__) +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif ////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////BITWISE_AND//////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// /**************************************and with scalar without mask**************************************/ -__kernel void arithm_s_bitwise_and_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C1_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + uchar4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -62,8 +65,11 @@ __kernel void arithm_s_bitwise_and_C1_D0 (__global uchar *src1, int src1_step, if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); @@ -86,9 +92,10 @@ __kernel void arithm_s_bitwise_and_C1_D0 (__global uchar *src1, int src1_step, } -__kernel void arithm_s_bitwise_and_C1_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C1_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -96,8 +103,11 @@ __kernel void arithm_s_bitwise_and_C1_D1 (__global char *src1, int src1_step, if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); @@ -119,9 +129,10 @@ __kernel void arithm_s_bitwise_and_C1_D1 (__global char *src1, int src1_step, } } -__kernel void arithm_s_bitwise_and_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C1_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + ushort4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -130,8 +141,11 @@ __kernel void arithm_s_bitwise_and_C1_D2 (__global ushort *src1, int src1_step if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -150,9 +164,10 @@ __kernel void arithm_s_bitwise_and_C1_D2 (__global ushort *src1, int src1_step *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C1_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -161,8 +176,11 @@ __kernel void arithm_s_bitwise_and_C1_D3 (__global short *src1, int src1_step, if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -181,9 +199,10 @@ __kernel void arithm_s_bitwise_and_C1_D3 (__global short *src1, int src1_step, *((__global short2 *)((__global uchar *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C1_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -202,9 +221,10 @@ __kernel void arithm_s_bitwise_and_C1_D4 (__global int *src1, int src1_step, i *((__global int *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_C1_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C1_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -232,11 +252,11 @@ __kernel void arithm_s_bitwise_and_C1_D5 (__global char *src1, int src1_step, *((__global char4 *)((__global char *)dst + dst_index)) = data; } } - #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_C1_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C1_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -256,9 +276,10 @@ __kernel void arithm_s_bitwise_and_C1_D6 (__global short *src1, int src1_step, i } } #endif -__kernel void arithm_s_bitwise_and_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C2_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + uchar4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -267,8 +288,11 @@ __kernel void arithm_s_bitwise_and_C2_D0 (__global uchar *src1, int src1_step, if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -290,9 +314,10 @@ __kernel void arithm_s_bitwise_and_C2_D0 (__global uchar *src1, int src1_step, } -__kernel void arithm_s_bitwise_and_C2_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C2_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -301,8 +326,11 @@ __kernel void arithm_s_bitwise_and_C2_D1 (__global char *src1, int src1_step, if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -322,9 +350,10 @@ __kernel void arithm_s_bitwise_and_C2_D1 (__global char *src1, int src1_step, } } -__kernel void arithm_s_bitwise_and_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C2_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + ushort4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -343,9 +372,10 @@ __kernel void arithm_s_bitwise_and_C2_D2 (__global ushort *src1, int src1_step *((__global ushort2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C2_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -364,9 +394,10 @@ __kernel void arithm_s_bitwise_and_C2_D3 (__global short *src1, int src1_step, *((__global short2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C2_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -384,9 +415,10 @@ __kernel void arithm_s_bitwise_and_C2_D4 (__global int *src1, int src1_step, i *((__global int2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_C2_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C2_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -403,12 +435,13 @@ __kernel void arithm_s_bitwise_and_C2_D5 (__global char *src1, int src1_step, char8 tmp_data = src1_data & src2_data; *((__global char8 *)((__global char *)dst + dst_index)) = tmp_data; - } + } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_C2_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C2_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -428,9 +461,10 @@ __kernel void arithm_s_bitwise_and_C2_D6 (__global short *src1, int src1_step, i } } #endif -__kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C3_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + uchar4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -439,8 +473,11 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step, if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int dst_start = mad24(y, dst_step, dst_offset); @@ -484,9 +521,10 @@ __kernel void arithm_s_bitwise_and_C3_D0 (__global uchar *src1, int src1_step, } -__kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C3_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -495,8 +533,11 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step, if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int dst_start = mad24(y, dst_step, dst_offset); @@ -539,9 +580,10 @@ __kernel void arithm_s_bitwise_and_C3_D1 (__global char *src1, int src1_step, } } -__kernel void arithm_s_bitwise_and_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C3_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + ushort4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -550,8 +592,11 @@ __kernel void arithm_s_bitwise_and_C3_D2 (__global ushort *src1, int src1_step if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int dst_start = mad24(y, dst_step, dst_offset); @@ -577,21 +622,22 @@ __kernel void arithm_s_bitwise_and_C3_D2 (__global ushort *src1, int src1_step data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_s_bitwise_and_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C3_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -600,8 +646,11 @@ __kernel void arithm_s_bitwise_and_C3_D3 (__global short *src1, int src1_step, if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int dst_start = mad24(y, dst_step, dst_offset); @@ -627,21 +676,22 @@ __kernel void arithm_s_bitwise_and_C3_D3 (__global short *src1, int src1_step, data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_s_bitwise_and_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C3_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -668,14 +718,15 @@ __kernel void arithm_s_bitwise_and_C3_D4 (__global int *src1, int src1_step, i int tmp_data_1 = src1_data_1 & src2_data_1; int tmp_data_2 = src1_data_2 & src2_data_2; - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; + *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; + *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; + *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; } } -__kernel void arithm_s_bitwise_and_C3_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C3_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -702,15 +753,16 @@ __kernel void arithm_s_bitwise_and_C3_D5 (__global char *src1, int src1_step, char4 tmp_data_1 = src1_data_1 & src2_data_1; char4 tmp_data_2 = src1_data_2 & src2_data_2; - *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2; + *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0; + *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1; + *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2; } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_C3_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C3_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -737,15 +789,16 @@ __kernel void arithm_s_bitwise_and_C3_D6 (__global short *src1, int src1_step, i short4 tmp_data_1 = src1_data_1 & src2_data_1; short4 tmp_data_2 = src1_data_2 & src2_data_2; - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; + *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; + *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; + *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; } } #endif -__kernel void arithm_s_bitwise_and_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C4_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + uchar4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -765,9 +818,10 @@ __kernel void arithm_s_bitwise_and_C4_D0 (__global uchar *src1, int src1_step, } -__kernel void arithm_s_bitwise_and_C4_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C4_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -786,9 +840,10 @@ __kernel void arithm_s_bitwise_and_C4_D1 (__global char *src1, int src1_step, } } -__kernel void arithm_s_bitwise_and_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C4_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + ushort4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -806,9 +861,10 @@ __kernel void arithm_s_bitwise_and_C4_D2 (__global ushort *src1, int src1_step *((__global ushort4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C4_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -826,9 +882,10 @@ __kernel void arithm_s_bitwise_and_C4_D3 (__global short *src1, int src1_step, *((__global short4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C4_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -846,9 +903,10 @@ __kernel void arithm_s_bitwise_and_C4_D4 (__global int *src1, int src1_step, i *((__global int4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_C4_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C4_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -869,9 +927,10 @@ __kernel void arithm_s_bitwise_and_C4_D5 (__global char *src1, int src1_step, } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_C4_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_C4_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -897,10 +956,10 @@ __kernel void arithm_s_bitwise_and_C4_D6 (__global short *src1, int src1_step, i short4 tmp_data_2 = src1_data_2 & src2_data_2; short4 tmp_data_3 = src1_data_3 & src2_data_3; - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; - *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3; + *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; + *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; + *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; + *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3; } } diff --git a/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl index beafd7e0a703cf253980cd4c4722200cfb48f843..71371737da24ef4aee6bd8c2094ddc1bd857d6b0 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl @@ -42,20 +42,22 @@ // the use of this software, even if advised of the possibility of such damage. // //M*/ -#if defined (__ATI__) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#elif defined (__NVIDIA__) +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif - ////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////BITWISE_AND//////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// /**************************************bitwise_and with scalar with mask**************************************/ -__kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C1_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + uchar4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -64,8 +66,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -90,10 +95,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D0 (__global uchar *src1, int } -__kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C1_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -102,8 +108,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global char *src1, int s if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -127,10 +136,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D1 (__global char *src1, int s } } -__kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C1_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + ushort4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -139,8 +149,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -161,10 +174,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D2 (__global ushort *src1, int *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C1_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -173,8 +187,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global short *src1, int if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -195,10 +212,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D3 (__global short *src1, int *((__global short2 *)((__global uchar *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C1_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -223,10 +241,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D4 (__global int *src1, int } } -__kernel void arithm_s_bitwise_and_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C1_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -252,10 +271,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D5 (__global char *src1, int src } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_with_mask_C1_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C1_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -280,10 +300,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C1_D6 (__global short *src1, int sr } } #endif -__kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C2_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + uchar4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -292,8 +313,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -316,10 +340,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D0 (__global uchar *src1, int } -__kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C2_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -328,8 +353,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global char *src1, int s if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -351,10 +379,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D1 (__global char *src1, int s } } -__kernel void arithm_s_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C2_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + ushort4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -378,10 +407,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D2 (__global ushort *src1, int *((__global ushort2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C2_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -405,10 +435,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D3 (__global short *src1, int *((__global short2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C2_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -432,10 +463,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D4 (__global int *src1, int sr *((__global int2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C2_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -461,10 +493,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D5 (__global char *src1, int s } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_with_mask_C2_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C2_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -489,10 +522,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C2_D6 (__global short *src1, int sr } } #endif -__kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C3_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + uchar4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -501,8 +535,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -549,10 +586,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D0 (__global uchar *src1, int } -__kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C3_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -561,8 +599,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int s if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -608,10 +649,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D1 (__global char *src1, int s } } -__kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C3_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + ushort4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -620,8 +662,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -650,22 +695,23 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D2 (__global ushort *src1, int data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C3_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -674,8 +720,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global short *src1, int if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -704,22 +753,23 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D3 (__global short *src1, int data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_s_bitwise_and_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C3_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -753,15 +803,16 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D4 (__global int *src1, int sr data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; + *((__global int *)((__global char *)dst + dst_index + 0))= data_0; + *((__global int *)((__global char *)dst + dst_index + 4))= data_1; + *((__global int *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_s_bitwise_and_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C3_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -795,16 +846,17 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D5 (__global char *src1, int s data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_with_mask_C3_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C3_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -838,16 +890,17 @@ __kernel void arithm_s_bitwise_and_with_mask_C3_D6 (__global short *src1, int sr data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2; + *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0; + *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1; + *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2; } } #endif -__kernel void arithm_s_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C4_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + uchar4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -872,10 +925,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D0 (__global uchar *src1, int } -__kernel void arithm_s_bitwise_and_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C4_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -899,10 +953,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D1 (__global char *src1, int s } } -__kernel void arithm_s_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C4_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + ushort4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -925,10 +980,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D2 (__global ushort *src1, int *((__global ushort4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C4_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -951,10 +1007,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D3 (__global short *src1, int *((__global short4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C4_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -977,10 +1034,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D4 (__global int *src1, int sr *((__global int4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_and_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C4_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -1006,10 +1064,11 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D5 (__global char *src1, int s } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_and_with_mask_C4_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -1055,3 +1114,4 @@ __kernel void arithm_s_bitwise_and_with_mask_C4_D6 (__global short *src1, int sr } } #endif + diff --git a/modules/ocl/src/opencl/arithm_bitwise_not.cl b/modules/ocl/src/opencl/arithm_bitwise_not.cl index fd9d2ccf99d2d314f17b4c40623a13ed7aaf01e1..8eb9ece75d50ecf99121cd143d816b2ad86a94ce 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_not.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_not.cl @@ -43,9 +43,12 @@ // //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif - ////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////BITWISE_NOT//////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -60,26 +63,29 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; uchar4 src1_data = vload4(0, src1 + src1_index_fix); uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = ~ src1_data; - /* if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - */ + /* if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + */ dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; @@ -91,8 +97,8 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -100,8 +106,11 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); @@ -124,8 +133,8 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global ushort *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -134,8 +143,11 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -159,8 +171,8 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global short *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -169,8 +181,11 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -194,8 +209,8 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr __kernel void arithm_bitwise_not_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global int *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); diff --git a/modules/ocl/src/opencl/arithm_bitwise_or.cl b/modules/ocl/src/opencl/arithm_bitwise_or.cl index a95e59e0caa281a9ef7e18c8908175680fee210b..4d47b21271f8180da6ad02a3266d7efa58172051 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_or.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_or.cl @@ -43,7 +43,11 @@ // //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif ////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -51,9 +55,9 @@ /////////////////////////////////////////////////////////////////////////////////////////////////////// /**************************************bitwise_or without mask**************************************/ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -61,30 +65,33 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; uchar4 src1_data = vload4(0, src1 + src1_index_fix); uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = src1_data | src2_data; @@ -99,9 +106,9 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global char *src2, int src2_step, int src2_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -109,8 +116,11 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1 if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); @@ -135,9 +145,9 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1 __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global ushort *src2, int src2_step, int src2_offset, + __global ushort *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -146,8 +156,11 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -173,9 +186,9 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global short *src2, int src2_step, int src2_offset, + __global short *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -184,8 +197,11 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -211,9 +227,9 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src __kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global int *src2, int src2_step, int src2_offset, + __global int *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -233,9 +249,9 @@ __kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_ } __kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global char *src2, int src2_step, int src2_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -256,9 +272,9 @@ __kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1 #if defined (DOUBLE_SUPPORT) __kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global char *src2, int src2_step, int src2_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); diff --git a/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl index aedb68c47429c45b3897476d9a09e9cbd7fb29cb..2523eddcd9e6b1eaf3e47292a67653c3e423fc41 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl @@ -43,18 +43,22 @@ // //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif - ////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////BITWISE_OR//////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// /**************************************bitwise_or with mask**************************************/ -__kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C1_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -63,8 +67,11 @@ __kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_ if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -91,11 +98,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_ -__kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C1_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -104,8 +112,11 @@ __kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_s if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -132,11 +143,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_s -__kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C1_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global ushort *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -145,8 +157,11 @@ __kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1 if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -171,11 +186,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1 -__kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C1_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global short *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -184,8 +200,11 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_ if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -198,8 +217,8 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_ short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index)); uchar2 mask_data = vload2(0, mask + mask_index); - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - short2 tmp_data = src1_data | src2_data; + short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); + short2 tmp_data = src1_data | src2_data; data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y; @@ -210,11 +229,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_ -__kernel void arithm_bitwise_or_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C1_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global int *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -242,11 +262,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D4 (__global int *src1, int src1_ -__kernel void arithm_bitwise_or_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C1_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -273,13 +294,13 @@ __kernel void arithm_bitwise_or_with_mask_C1_D5 (__global char *src1, int src1_s } - #if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_or_with_mask_C1_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C1_D6 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -308,12 +329,12 @@ __kernel void arithm_bitwise_or_with_mask_C1_D6 (__global char *src1, int src1_s #endif - -__kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C2_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -322,8 +343,11 @@ __kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_ if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -347,11 +371,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_ } -__kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C2_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -360,8 +385,11 @@ __kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_s if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -384,11 +412,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_s } } -__kernel void arithm_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C2_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global ushort *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -413,11 +442,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int src1 *((__global ushort2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_or_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C2_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global short *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -442,11 +472,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D3 (__global short *src1, int src1_ *((__global short2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_or_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C2_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global int *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -471,11 +502,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D4 (__global int *src1, int src1_ *((__global int2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_or_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C2_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -501,11 +533,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D5 (__global char *src1, int src1_s } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_or_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C2_D6 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -533,12 +566,12 @@ __kernel void arithm_bitwise_or_with_mask_C2_D6 (__global char *src1, int src1_s #endif - -__kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C3_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -547,8 +580,11 @@ __kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_ if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -596,11 +632,12 @@ __kernel void arithm_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_ } -__kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C3_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -609,8 +646,11 @@ __kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_s if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -657,11 +697,12 @@ __kernel void arithm_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_s } } -__kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C3_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global ushort *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -670,8 +711,11 @@ __kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1 if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -701,23 +745,24 @@ __kernel void arithm_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1 data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C3_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global short *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -726,8 +771,11 @@ __kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_ if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -757,23 +805,24 @@ __kernel void arithm_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_ data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_bitwise_or_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C3_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global int *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -808,16 +857,17 @@ __kernel void arithm_bitwise_or_with_mask_C3_D4 (__global int *src1, int src1_ data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; + *((__global int *)((__global char *)dst + dst_index + 0))= data_0; + *((__global int *)((__global char *)dst + dst_index + 4))= data_1; + *((__global int *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_bitwise_or_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C3_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -852,17 +902,18 @@ __kernel void arithm_bitwise_or_with_mask_C3_D5 (__global char *src1, int src1_s data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_or_with_mask_C3_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C3_D6 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -897,20 +948,20 @@ __kernel void arithm_bitwise_or_with_mask_C3_D6 (__global char *src1, int src1_s data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2; + *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0; + *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1; + *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2; } } #endif - -__kernel void arithm_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C4_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -937,11 +988,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int src1_ } -__kernel void arithm_bitwise_or_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C4_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -967,11 +1019,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D1 (__global char *src1, int src1_s } } -__kernel void arithm_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C4_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global ushort *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -996,11 +1049,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int src1 *((__global ushort4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_or_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C4_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global short *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -1025,11 +1079,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D3 (__global short *src1, int src1_ *((__global short4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_or_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C4_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global int *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -1054,11 +1109,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D4 (__global int *src1, int src1_ *((__global int4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_or_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C4_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -1084,11 +1140,12 @@ __kernel void arithm_bitwise_or_with_mask_C4_D5 (__global char *src1, int src1_s } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_or_with_mask_C4_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_or_with_mask_C4_D6 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); diff --git a/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl index 5b94591a30c8e5b03d60ed742617570e1d6c22e9..fdcc00c4ef8616310258a17a8380bcae76c70c02 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl @@ -43,16 +43,21 @@ // //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif ////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////BITWISE_OR//////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// /**************************************and with scalar without mask**************************************/ -__kernel void arithm_s_bitwise_or_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C1_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + uchar4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -60,8 +65,11 @@ __kernel void arithm_s_bitwise_or_C1_D0 (__global uchar *src1, int src1_step, if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); @@ -84,9 +92,10 @@ __kernel void arithm_s_bitwise_or_C1_D0 (__global uchar *src1, int src1_step, } -__kernel void arithm_s_bitwise_or_C1_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C1_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -94,8 +103,11 @@ __kernel void arithm_s_bitwise_or_C1_D1 (__global char *src1, int src1_step, i if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); @@ -117,9 +129,10 @@ __kernel void arithm_s_bitwise_or_C1_D1 (__global char *src1, int src1_step, i } } -__kernel void arithm_s_bitwise_or_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C1_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + ushort4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -128,8 +141,11 @@ __kernel void arithm_s_bitwise_or_C1_D2 (__global ushort *src1, int src1_step, if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -148,9 +164,10 @@ __kernel void arithm_s_bitwise_or_C1_D2 (__global ushort *src1, int src1_step, *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_or_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C1_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -159,8 +176,11 @@ __kernel void arithm_s_bitwise_or_C1_D3 (__global short *src1, int src1_step, if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -179,9 +199,10 @@ __kernel void arithm_s_bitwise_or_C1_D3 (__global short *src1, int src1_step, *((__global short2 *)((__global uchar *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_or_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C1_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -200,9 +221,10 @@ __kernel void arithm_s_bitwise_or_C1_D4 (__global int *src1, int src1_step, in *((__global int *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_or_C1_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C1_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -222,9 +244,10 @@ __kernel void arithm_s_bitwise_or_C1_D5 (__global char *src1, int src1_step, i } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C1_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short16 src2, int rows, int cols, int dst_step1) { @@ -245,10 +268,10 @@ __kernel void arithm_s_bitwise_or_C1_D6 (__global short *src1, int src1_step, in } } #endif - -__kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C2_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + uchar4 src2, int rows, int cols, int dst_step1) { @@ -258,8 +281,11 @@ __kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step, if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -280,9 +306,10 @@ __kernel void arithm_s_bitwise_or_C2_D0 (__global uchar *src1, int src1_step, } -__kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C2_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char4 src2, int rows, int cols, int dst_step1) { @@ -292,8 +319,11 @@ __kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, i if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -313,9 +343,10 @@ __kernel void arithm_s_bitwise_or_C2_D1 (__global char *src1, int src1_step, i } } -__kernel void arithm_s_bitwise_or_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C2_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + ushort4 src2, int rows, int cols, int dst_step1) { @@ -335,9 +366,10 @@ __kernel void arithm_s_bitwise_or_C2_D2 (__global ushort *src1, int src1_step, *((__global ushort2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_or_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C2_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short4 src2, int rows, int cols, int dst_step1) { @@ -358,8 +390,8 @@ __kernel void arithm_s_bitwise_or_C2_D3 (__global short *src1, int src1_step, } } __kernel void arithm_s_bitwise_or_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) + __global int *dst, int dst_step, int dst_offset, + int4 src2, int rows, int cols, int dst_step1) { @@ -378,9 +410,10 @@ __kernel void arithm_s_bitwise_or_C2_D4 (__global int *src1, int src1_step, in *((__global int2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_or_C2_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C2_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char16 src2, int rows, int cols, int dst_step1) { @@ -400,9 +433,10 @@ __kernel void arithm_s_bitwise_or_C2_D5 (__global char *src1, int src1_step, i } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_C2_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C2_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short16 src2, int rows, int cols, int dst_step1) { @@ -423,9 +457,10 @@ __kernel void arithm_s_bitwise_or_C2_D6 (__global short *src1, int src1_step, in } } #endif -__kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C3_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + uchar4 src2, int rows, int cols, int dst_step1) { @@ -435,8 +470,11 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step, if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int dst_start = mad24(y, dst_step, dst_offset); @@ -480,9 +518,10 @@ __kernel void arithm_s_bitwise_or_C3_D0 (__global uchar *src1, int src1_step, } -__kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C3_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char4 src2, int rows, int cols, int dst_step1) { @@ -492,8 +531,11 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int dst_start = mad24(y, dst_step, dst_offset); @@ -536,9 +578,10 @@ __kernel void arithm_s_bitwise_or_C3_D1 (__global char *src1, int src1_step, i } } -__kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C3_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + ushort4 src2, int rows, int cols, int dst_step1) { @@ -548,8 +591,11 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step, if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int dst_start = mad24(y, dst_step, dst_offset); @@ -575,21 +621,22 @@ __kernel void arithm_s_bitwise_or_C3_D2 (__global ushort *src1, int src1_step, data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C3_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short4 src2, int rows, int cols, int dst_step1) { @@ -599,8 +646,11 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step, if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int dst_start = mad24(y, dst_step, dst_offset); @@ -626,21 +676,22 @@ __kernel void arithm_s_bitwise_or_C3_D3 (__global short *src1, int src1_step, data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_s_bitwise_or_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C3_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + int4 src2, int rows, int cols, int dst_step1) { @@ -668,14 +719,15 @@ __kernel void arithm_s_bitwise_or_C3_D4 (__global int *src1, int src1_step, in int tmp_data_1 = src1_data_1 | src2_data_1; int tmp_data_2 = src1_data_2 | src2_data_2; - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; + *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; + *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; + *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; } } -__kernel void arithm_s_bitwise_or_C3_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C3_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char16 src2, int rows, int cols, int dst_step1) { @@ -700,15 +752,16 @@ __kernel void arithm_s_bitwise_or_C3_D5 (__global char *src1, int src1_step, i char4 tmp_data_1 = src1_data_1 | src2_data_1; char4 tmp_data_2 = src1_data_2 | src2_data_2; - *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2; + *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0; + *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1; + *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2; } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C3_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short16 src2, int rows, int cols, int dst_step1) { @@ -736,15 +789,16 @@ __kernel void arithm_s_bitwise_or_C3_D6 (__global short *src1, int src1_step, in short4 tmp_data_1 = src1_data_1 | src2_data_1; short4 tmp_data_2 = src1_data_2 | src2_data_2; - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; + *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; + *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; + *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; } } #endif -__kernel void arithm_s_bitwise_or_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C4_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + uchar4 src2, int rows, int cols, int dst_step1) { @@ -765,9 +819,10 @@ __kernel void arithm_s_bitwise_or_C4_D0 (__global uchar *src1, int src1_step, } -__kernel void arithm_s_bitwise_or_C4_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C4_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char4 src2, int rows, int cols, int dst_step1) { @@ -787,9 +842,10 @@ __kernel void arithm_s_bitwise_or_C4_D1 (__global char *src1, int src1_step, i } } -__kernel void arithm_s_bitwise_or_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C4_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + ushort4 src2, int rows, int cols, int dst_step1) { @@ -808,9 +864,10 @@ __kernel void arithm_s_bitwise_or_C4_D2 (__global ushort *src1, int src1_step, *((__global ushort4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_or_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C4_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short4 src2, int rows, int cols, int dst_step1) { @@ -829,9 +886,10 @@ __kernel void arithm_s_bitwise_or_C4_D3 (__global short *src1, int src1_step, *((__global short4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_or_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C4_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + int4 src2, int rows, int cols, int dst_step1) { @@ -850,9 +908,10 @@ __kernel void arithm_s_bitwise_or_C4_D4 (__global int *src1, int src1_step, in *((__global int4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_or_C4_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C4_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char16 src2, int rows, int cols, int dst_step1) { @@ -874,9 +933,10 @@ __kernel void arithm_s_bitwise_or_C4_D5 (__global char *src1, int src1_step, i } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_C4_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short16 src2, int rows, int cols, int dst_step1) { @@ -903,10 +963,10 @@ __kernel void arithm_s_bitwise_or_C4_D6 (__global short *src1, int src1_step, in short4 tmp_data_2 = src1_data_2 | src2_data_2; short4 tmp_data_3 = src1_data_3 | src2_data_3; - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; - *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3; + *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; + *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; + *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; + *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3; } } diff --git a/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl index 54066c21a04f47924580d585b1630c4060928fa8..8baa9a2ca207409a2668ffe0c000a99e4a7d9730 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl @@ -43,17 +43,21 @@ // //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif - ////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////BITWISE_OR//////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// /**************************************bitwise_or with scalar with mask**************************************/ -__kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C1_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + uchar4 src2, int rows, int cols, int dst_step1) { @@ -63,8 +67,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int s if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -89,10 +96,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D0 (__global uchar *src1, int s } -__kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C1_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char4 src2, int rows, int cols, int dst_step1) { @@ -102,8 +110,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global char *src1, int sr if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -127,10 +138,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D1 (__global char *src1, int sr } } -__kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C1_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + ushort4 src2, int rows, int cols, int dst_step1) { @@ -140,8 +152,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -162,10 +177,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D2 (__global ushort *src1, int *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C1_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short4 src2, int rows, int cols, int dst_step1) { @@ -175,8 +191,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global short *src1, int s if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -197,10 +216,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D3 (__global short *src1, int s *((__global short2 *)((__global uchar *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_or_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C1_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + int4 src2, int rows, int cols, int dst_step1) { @@ -226,10 +246,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D4 (__global int *src1, int s } } -__kernel void arithm_s_bitwise_or_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C1_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char16 src2, int rows, int cols, int dst_step1) { @@ -254,12 +275,12 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D5 (__global char *src1, int *((__global char4 *)((__global char *)dst + dst_index)) = data; } } - #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_with_mask_C1_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C1_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short16 src2, int rows, int cols, int dst_step1) { @@ -285,10 +306,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C1_D6 (__global short *src1, int src } } #endif -__kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C2_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + uchar4 src2, int rows, int cols, int dst_step1) { @@ -298,8 +320,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int s if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -322,10 +347,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D0 (__global uchar *src1, int s } -__kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C2_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char4 src2, int rows, int cols, int dst_step1) { @@ -335,8 +361,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global char *src1, int sr if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -358,10 +387,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D1 (__global char *src1, int sr } } -__kernel void arithm_s_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C2_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + ushort4 src2, int rows, int cols, int dst_step1) { @@ -386,10 +416,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D2 (__global ushort *src1, int *((__global ushort2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_or_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C2_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short4 src2, int rows, int cols, int dst_step1) { @@ -414,10 +445,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D3 (__global short *src1, int s *((__global short2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_or_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C2_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + int4 src2, int rows, int cols, int dst_step1) { @@ -442,10 +474,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D4 (__global int *src1, int src *((__global int2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_or_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C2_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char16 src2, int rows, int cols, int dst_step1) { @@ -463,17 +496,18 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D5 (__global char *src1, int sr char8 src_data1 = *((__global char8 *)((__global char *)src1 + src1_index)); char8 src_data2 = (char8)(src2.s0, src2.s1, src2.s2, src2.s3, src2.s4, src2.s5, src2.s6, src2.s7); char8 dst_data = *((__global char8 *)((__global char *)dst + dst_index)); - char8 data = src_data1 | src_data2; + char8 data = src_data1 | src_data2; data = mask_data ? data : dst_data; *((__global char8 *)((__global char *)dst + dst_index)) = data; - } + } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C2_D6 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short16 src2, int rows, int cols, int dst_step1) { @@ -499,10 +533,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C2_D6 (__global char *src1, int sr } } #endif -__kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C3_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + uchar4 src2, int rows, int cols, int dst_step1) { @@ -512,8 +547,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -560,10 +598,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D0 (__global uchar *src1, int s } -__kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C3_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char4 src2, int rows, int cols, int dst_step1) { @@ -573,8 +612,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -620,10 +662,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D1 (__global char *src1, int sr } } -__kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C3_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + ushort4 src2, int rows, int cols, int dst_step1) { @@ -633,8 +676,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -663,22 +709,23 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D2 (__global ushort *src1, int data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C3_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short4 src2, int rows, int cols, int dst_step1) { @@ -688,8 +735,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int s if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -718,22 +768,23 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D3 (__global short *src1, int s data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_s_bitwise_or_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C3_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + int4 src2, int rows, int cols, int dst_step1) { @@ -768,15 +819,16 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D4 (__global int *src1, int src data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; + *((__global int *)((__global char *)dst + dst_index + 0))= data_0; + *((__global int *)((__global char *)dst + dst_index + 4))= data_1; + *((__global int *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_s_bitwise_or_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C3_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char16 src2, int rows, int cols, int dst_step1) { @@ -811,17 +863,18 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D5 (__global char *src1, int sr data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; - } + } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_with_mask_C3_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C3_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -855,16 +908,17 @@ __kernel void arithm_s_bitwise_or_with_mask_C3_D6 (__global short *src1, int src data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2; + *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0; + *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1; + *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2; } } #endif -__kernel void arithm_s_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C4_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + uchar4 src2, int rows, int cols, int dst_step1) { @@ -890,10 +944,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D0 (__global uchar *src1, int s } -__kernel void arithm_s_bitwise_or_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C4_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char4 src2, int rows, int cols, int dst_step1) { @@ -918,10 +973,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D1 (__global char *src1, int sr } } -__kernel void arithm_s_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C4_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + ushort4 src2, int rows, int cols, int dst_step1) { @@ -945,10 +1001,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D2 (__global ushort *src1, int *((__global ushort4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_or_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C4_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short4 src2, int rows, int cols, int dst_step1) { @@ -972,10 +1029,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D3 (__global short *src1, int s *((__global short4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_or_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C4_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + int4 src2, int rows, int cols, int dst_step1) { @@ -999,10 +1057,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D4 (__global int *src1, int src *((__global int4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_or_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C4_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char16 src2, int rows, int cols, int dst_step1) { @@ -1029,10 +1088,11 @@ __kernel void arithm_s_bitwise_or_with_mask_C4_D5 (__global char *src1, int sr } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_or_with_mask_C4_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_or_with_mask_C4_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor.cl b/modules/ocl/src/opencl/arithm_bitwise_xor.cl index 4f743776a43bc4113c2a8f668e6a4b63b7882b70..c8b00ca39d6592b10220bef5e09145170a5ec18e 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_xor.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_xor.cl @@ -43,17 +43,20 @@ // //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif - ////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////BITWISE_XOR//////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// /**************************************bitwise_xor without mask**************************************/ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -61,8 +64,11 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); @@ -70,23 +76,23 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; uchar4 src1_data = vload4(0, src1 + src1_index_fix); uchar4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - uchar4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + if(src1_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + uchar4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = src1_data ^ src2_data; @@ -101,9 +107,9 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global char *src2, int src2_step, int src2_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -111,8 +117,11 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); @@ -120,23 +129,23 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; char4 src1_data = vload4(0, src1 + src1_index_fix); char4 src2_data = vload4(0, src2 + src2_index_fix); - if(src1_index < 0) - { - char4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - char4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + if(src1_index < 0) + { + char4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + char4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } char4 dst_data = *((__global char4 *)(dst + dst_index)); char4 tmp_data = src1_data ^ src2_data; @@ -151,9 +160,9 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global ushort *src2, int src2_step, int src2_offset, + __global ushort *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -162,8 +171,11 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -171,23 +183,23 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix)); ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix)); - if(src1_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - ushort4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + if(src1_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + ushort4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index)); ushort4 tmp_data = src1_data ^ src2_data; @@ -203,9 +215,9 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global short *src2, int src2_step, int src2_offset, + __global short *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -214,8 +226,11 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -223,25 +238,25 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr int dst_end = mad24(y, dst_step, dst_offset + dst_step1); int dst_index = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8); - int src1_index_fix = src1_index < 0 ? 0 : src1_index; - int src2_index_fix = src2_index < 0 ? 0 : src2_index; + int src1_index_fix = src1_index < 0 ? 0 : src1_index; + int src2_index_fix = src2_index < 0 ? 0 : src2_index; short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix)); short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix)); short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index)); - if(src1_index < 0) - { - short4 tmp; - tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; - src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; - } - if(src2_index < 0) - { - short4 tmp; - tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; - src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } + if(src1_index < 0) + { + short4 tmp; + tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; + src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw; + } + if(src2_index < 0) + { + short4 tmp; + tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; + src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; + } @@ -259,9 +274,9 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr __kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global int *src2, int src2_step, int src2_offset, + __global int *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -281,9 +296,9 @@ __kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1 } __kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global char *src2, int src2_step, int src2_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -301,12 +316,11 @@ __kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src *((__global char4 *)((__global char *)dst + dst_index)) = tmp; } } - #if defined (DOUBLE_SUPPORT) __kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global char *src2, int src2_step, int src2_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl index 4359d860a5b6280e1dfbb04dbf50f7487f9b35b9..48bd3e444a6e4bb78aa6701de2b942634b602341 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl @@ -43,18 +43,22 @@ // //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif - ////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////BITWISE_XOR//////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// /**************************************bitwise_xor with mask**************************************/ -__kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C1_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -63,8 +67,11 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1 if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -91,11 +98,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1 -__kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C1_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -104,8 +112,11 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_ if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -132,11 +143,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_ -__kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C1_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global ushort *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -145,8 +157,11 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -171,11 +186,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src -__kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C1_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global short *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -184,8 +200,11 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1 if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -198,8 +217,8 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1 short2 src2_data = vload2(0, (__global short *)((__global char *)src2 + src2_index)); uchar2 mask_data = vload2(0, mask + mask_index); - short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); - short2 tmp_data = src1_data ^ src2_data; + short2 data = *((__global short2 *)((__global uchar *)dst + dst_index)); + short2 tmp_data = src1_data ^ src2_data; data.x = convert_short((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : data.x; data.y = convert_short((mask_data.y) && (dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.y : data.y; @@ -210,11 +229,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1 -__kernel void arithm_bitwise_xor_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C1_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global int *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -242,11 +262,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D4 (__global int *src1, int src1 -__kernel void arithm_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C1_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -273,13 +294,13 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src1_ } - #if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_xor_with_mask_C1_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C1_D6 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -308,12 +329,12 @@ __kernel void arithm_bitwise_xor_with_mask_C1_D6 (__global char *src1, int src1_ - -__kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C2_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -322,8 +343,11 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1 if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -347,11 +371,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1 } -__kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C2_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -360,8 +385,11 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_ if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -384,11 +412,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_ } } -__kernel void arithm_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C2_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global ushort *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -413,11 +442,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int src *((__global ushort2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_xor_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C2_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global short *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -442,11 +472,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D3 (__global short *src1, int src1 *((__global short2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_xor_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C2_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global int *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -471,11 +502,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D4 (__global int *src1, int src1 *((__global int2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_xor_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C2_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -501,11 +533,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D5 (__global char *src1, int src1_ } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_xor_with_mask_C2_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C2_D6 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -533,12 +566,12 @@ __kernel void arithm_bitwise_xor_with_mask_C2_D6 (__global char *src1, int src1_ #endif - -__kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C3_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -547,8 +580,11 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1 if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -596,11 +632,12 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1 } -__kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C3_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -609,8 +646,11 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_ if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -657,11 +697,12 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_ } } -__kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C3_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global ushort *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -670,8 +711,11 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -701,23 +745,24 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C3_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global short *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -726,8 +771,11 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1 if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -757,23 +805,24 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1 data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_bitwise_xor_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C3_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global int *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -808,16 +857,17 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D4 (__global int *src1, int src1 data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; + *((__global int *)((__global char *)dst + dst_index + 0))= data_0; + *((__global int *)((__global char *)dst + dst_index + 4))= data_1; + *((__global int *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_bitwise_xor_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C3_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -852,17 +902,18 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D5 (__global char *src1, int src1_ data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_xor_with_mask_C3_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C3_D6 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -897,20 +948,20 @@ __kernel void arithm_bitwise_xor_with_mask_C3_D6 (__global char *src1, int src1_ data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2; + *((__global char8 *)((__global char *)dst + dst_index + 0 ))= data_0; + *((__global char8 *)((__global char *)dst + dst_index + 8 ))= data_1; + *((__global char8 *)((__global char *)dst + dst_index + 16))= data_2; } } #endif - -__kernel void arithm_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C4_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -937,11 +988,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int src1 } -__kernel void arithm_bitwise_xor_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C4_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -967,11 +1019,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D1 (__global char *src1, int src1_ } } -__kernel void arithm_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global ushort *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C4_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global ushort *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -996,11 +1049,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int src *((__global ushort4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_xor_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global short *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C4_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global short *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -1025,11 +1079,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D3 (__global short *src1, int src1 *((__global short4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_xor_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global int *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C4_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global int *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -1054,11 +1109,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D4 (__global int *src1, int src1 *((__global int4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_bitwise_xor_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C4_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -1084,11 +1140,12 @@ __kernel void arithm_bitwise_xor_with_mask_C4_D5 (__global char *src1, int src1_ } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_bitwise_xor_with_mask_C4_D6 (__global char *src1, int src1_step, int src1_offset, - __global char *src2, int src2_step, int src2_offset, - __global uchar *mask, int mask_step, int mask_offset, - __global char *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) +__kernel void arithm_bitwise_xor_with_mask_C4_D6 ( + __global char *src1, int src1_step, int src1_offset, + __global char *src2, int src2_step, int src2_offset, + __global uchar *mask, int mask_step, int mask_offset, + __global char *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl index 318432a1861cc90bb8c6bf67323501ba928e61f5..2c6dd50cd431abba13cf50ae4f875e2c8b46e901 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl @@ -42,19 +42,21 @@ // the use of this software, even if advised of the possibility of such damage. // // -#if defined (__ATI__) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#elif defined (__NVIDIA__) +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif - ////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////BITWISE_XOR//////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// /**************************************xor with scalar without mask**************************************/ -__kernel void arithm_s_bitwise_xor_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C1_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + uchar4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -62,8 +64,11 @@ __kernel void arithm_s_bitwise_xor_C1_D0 (__global uchar *src1, int src1_step, if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); @@ -86,9 +91,10 @@ __kernel void arithm_s_bitwise_xor_C1_D0 (__global uchar *src1, int src1_step, } -__kernel void arithm_s_bitwise_xor_C1_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C1_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -96,8 +102,11 @@ __kernel void arithm_s_bitwise_xor_C1_D1 (__global char *src1, int src1_step, if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); @@ -119,9 +128,10 @@ __kernel void arithm_s_bitwise_xor_C1_D1 (__global char *src1, int src1_step, } } -__kernel void arithm_s_bitwise_xor_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C1_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + ushort4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -130,8 +140,11 @@ __kernel void arithm_s_bitwise_xor_C1_D2 (__global ushort *src1, int src1_step if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -150,9 +163,10 @@ __kernel void arithm_s_bitwise_xor_C1_D2 (__global ushort *src1, int src1_step *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C1_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -161,8 +175,11 @@ __kernel void arithm_s_bitwise_xor_C1_D3 (__global short *src1, int src1_step, if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -181,9 +198,10 @@ __kernel void arithm_s_bitwise_xor_C1_D3 (__global short *src1, int src1_step, *((__global short2 *)((__global uchar *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C1_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -202,9 +220,10 @@ __kernel void arithm_s_bitwise_xor_C1_D4 (__global int *src1, int src1_step, i *((__global int *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C1_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C1_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -234,9 +253,10 @@ __kernel void arithm_s_bitwise_xor_C1_D5 (__global char *src1, int src1_step, } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_C1_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C1_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -256,9 +276,10 @@ __kernel void arithm_s_bitwise_xor_C1_D6 (__global short *src1, int src1_step, i } } #endif -__kernel void arithm_s_bitwise_xor_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C2_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + uchar4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -267,8 +288,11 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (__global uchar *src1, int src1_step, if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -290,9 +314,10 @@ __kernel void arithm_s_bitwise_xor_C2_D0 (__global uchar *src1, int src1_step, } -__kernel void arithm_s_bitwise_xor_C2_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C2_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -301,8 +326,11 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (__global char *src1, int src1_step, if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -322,9 +350,10 @@ __kernel void arithm_s_bitwise_xor_C2_D1 (__global char *src1, int src1_step, } } -__kernel void arithm_s_bitwise_xor_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C2_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + ushort4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -343,9 +372,10 @@ __kernel void arithm_s_bitwise_xor_C2_D2 (__global ushort *src1, int src1_step *((__global ushort2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C2_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -364,9 +394,10 @@ __kernel void arithm_s_bitwise_xor_C2_D3 (__global short *src1, int src1_step, *((__global short2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C2_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -384,9 +415,10 @@ __kernel void arithm_s_bitwise_xor_C2_D4 (__global int *src1, int src1_step, i *((__global int2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C2_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C2_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -403,12 +435,13 @@ __kernel void arithm_s_bitwise_xor_C2_D5 (__global char *src1, int src1_step, char8 tmp_data = src1_data ^ src2_data; *((__global char8 *)((__global char *)dst + dst_index)) = tmp_data; - } + } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_C2_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C2_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -428,9 +461,10 @@ __kernel void arithm_s_bitwise_xor_C2_D6 (__global short *src1, int src1_step, i } } #endif -__kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C3_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + uchar4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -439,8 +473,11 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step, if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int dst_start = mad24(y, dst_step, dst_offset); @@ -484,9 +521,10 @@ __kernel void arithm_s_bitwise_xor_C3_D0 (__global uchar *src1, int src1_step, } -__kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C3_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -495,8 +533,11 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step, if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int dst_start = mad24(y, dst_step, dst_offset); @@ -539,9 +580,10 @@ __kernel void arithm_s_bitwise_xor_C3_D1 (__global char *src1, int src1_step, } } -__kernel void arithm_s_bitwise_xor_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C3_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + ushort4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -550,8 +592,11 @@ __kernel void arithm_s_bitwise_xor_C3_D2 (__global ushort *src1, int src1_step if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int dst_start = mad24(y, dst_step, dst_offset); @@ -577,21 +622,22 @@ __kernel void arithm_s_bitwise_xor_C3_D2 (__global ushort *src1, int src1_step data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_s_bitwise_xor_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C3_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -600,8 +646,11 @@ __kernel void arithm_s_bitwise_xor_C3_D3 (__global short *src1, int src1_step, if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int dst_start = mad24(y, dst_step, dst_offset); @@ -627,21 +676,22 @@ __kernel void arithm_s_bitwise_xor_C3_D3 (__global short *src1, int src1_step, data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_s_bitwise_xor_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C3_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -668,14 +718,15 @@ __kernel void arithm_s_bitwise_xor_C3_D4 (__global int *src1, int src1_step, i int tmp_data_1 = src1_data_1 ^ src2_data_1; int tmp_data_2 = src1_data_2 ^ src2_data_2; - *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; + *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0; + *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1; + *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2; } } -__kernel void arithm_s_bitwise_xor_C3_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C3_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -702,15 +753,16 @@ __kernel void arithm_s_bitwise_xor_C3_D5 (__global char *src1, int src1_step, char4 tmp_data_1 = src1_data_1 ^ src2_data_1; char4 tmp_data_2 = src1_data_2 ^ src2_data_2; - *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2; + *((__global char4 *)((__global char *)dst + dst_index + 0))= tmp_data_0; + *((__global char4 *)((__global char *)dst + dst_index + 4))= tmp_data_1; + *((__global char4 *)((__global char *)dst + dst_index + 8))= tmp_data_2; } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_C3_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C3_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -737,15 +789,16 @@ __kernel void arithm_s_bitwise_xor_C3_D6 (__global short *src1, int src1_step, i short4 tmp_data_1 = src1_data_1 ^ src2_data_1; short4 tmp_data_2 = src1_data_2 ^ src2_data_2; - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; + *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; + *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; + *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; } } #endif -__kernel void arithm_s_bitwise_xor_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C4_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + uchar4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -765,9 +818,10 @@ __kernel void arithm_s_bitwise_xor_C4_D0 (__global uchar *src1, int src1_step, } -__kernel void arithm_s_bitwise_xor_C4_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C4_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -786,9 +840,10 @@ __kernel void arithm_s_bitwise_xor_C4_D1 (__global char *src1, int src1_step, } } -__kernel void arithm_s_bitwise_xor_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C4_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + ushort4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -806,9 +861,10 @@ __kernel void arithm_s_bitwise_xor_C4_D2 (__global ushort *src1, int src1_step *((__global ushort4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C4_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -826,9 +882,10 @@ __kernel void arithm_s_bitwise_xor_C4_D3 (__global short *src1, int src1_step, *((__global short4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C4_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -846,9 +903,10 @@ __kernel void arithm_s_bitwise_xor_C4_D4 (__global int *src1, int src1_step, i *((__global int4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_C4_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C4_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + char16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -869,9 +927,10 @@ __kernel void arithm_s_bitwise_xor_C4_D5 (__global char *src1, int src1_step, } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_C4_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_C4_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -897,11 +956,11 @@ __kernel void arithm_s_bitwise_xor_C4_D6 (__global short *src1, int src1_step, i short4 tmp_data_2 = src1_data_2 ^ src2_data_2; short4 tmp_data_3 = src1_data_3 ^ src2_data_3; - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; - *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3; + *((__global short4 *)((__global char *)dst + dst_index + 0 ))= tmp_data_0; + *((__global short4 *)((__global char *)dst + dst_index + 8 ))= tmp_data_1; + *((__global short4 *)((__global char *)dst + dst_index + 16))= tmp_data_2; + *((__global short4 *)((__global char *)dst + dst_index + 24))= tmp_data_3; } } -#endif +#endif \ No newline at end of file diff --git a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl index 57ad9ee7134df83e4513685d2185cdd1ebf4619e..26ca59c3a36ba22416b9fd036fa1f715e621cfa8 100644 --- a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl +++ b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl @@ -42,20 +42,23 @@ // the use of this software, even if advised of the possibility of such damage. // //M*/ -#if defined (__ATI__) -#pragma OPENCL EXTENSION cl_amd_fp64:enable -#elif defined (__NVIDIA__) + +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif - ////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////BITWISE_XOR//////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////// /**************************************bitwise_xor with scalar with mask**************************************/ -__kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C1_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + uchar4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -64,8 +67,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -90,10 +96,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D0 (__global uchar *src1, int } -__kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C1_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -102,8 +109,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (__global char *src1, int s if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -127,10 +137,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D1 (__global char *src1, int s } } -__kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C1_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + ushort4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -139,8 +150,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -161,10 +175,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D2 (__global ushort *src1, int *((__global ushort2 *)((__global uchar *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C1_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -173,8 +188,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (__global short *src1, int if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -195,10 +213,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D3 (__global short *src1, int *((__global short2 *)((__global uchar *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_with_mask_C1_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C1_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -223,10 +242,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D4 (__global int *src1, int } } -__kernel void arithm_s_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C1_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -252,10 +272,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D5 (__global char *src1, int src } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_with_mask_C1_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C1_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -280,10 +301,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C1_D6 (__global short *src1, int sr } } #endif -__kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C2_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + uchar4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -292,8 +314,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -316,10 +341,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D0 (__global uchar *src1, int } -__kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C2_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -328,8 +354,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (__global char *src1, int s if (x < cols && y < rows) { x = x << 1; - - #define dst_align ((dst_offset >> 1) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 1) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -351,10 +380,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D1 (__global char *src1, int s } } -__kernel void arithm_s_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C2_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + ushort4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -378,10 +408,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D2 (__global ushort *src1, int *((__global ushort2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_with_mask_C2_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C2_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -405,10 +436,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D3 (__global short *src1, int *((__global short2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_with_mask_C2_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C2_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -432,10 +464,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D4 (__global int *src1, int sr *((__global int2 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_with_mask_C2_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C2_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -461,10 +494,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D5 (__global char *src1, int s } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_with_mask_C2_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C2_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -489,10 +523,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C2_D6 (__global short *src1, int sr } } #endif -__kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C3_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + uchar4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -501,8 +536,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -549,10 +587,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D0 (__global uchar *src1, int } -__kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C3_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -561,8 +600,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int s if (x < cols && y < rows) { x = x << 2; - - #define dst_align (((dst_offset % dst_step) / 3 ) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 3 ) & 3) int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -608,10 +650,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D1 (__global char *src1, int s } } -__kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C3_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + ushort4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -620,8 +663,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -650,22 +696,23 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D2 (__global ushort *src1, int data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C3_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -674,8 +721,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global short *src1, int if (x < cols && y < rows) { x = x << 1; - - #define dst_align (((dst_offset % dst_step) / 6 ) & 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset % dst_step) / 6 ) & 1) int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6)); int mask_index = mad24(y, mask_step, x + mask_offset - dst_align); @@ -704,22 +754,23 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D3 (__global short *src1, int data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy; data_1.x = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) - ? tmp_data_1.x : data_1.x; + ? tmp_data_1.x : data_1.x; data_1.y = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_1.y : data_1.y; + ? tmp_data_1.y : data_1.y; data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end)) - ? tmp_data_2.xy : data_2.xy; + ? tmp_data_2.xy : data_2.xy; - *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_s_bitwise_xor_with_mask_C3_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C3_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -753,15 +804,16 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D4 (__global int *src1, int sr data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global int *)((__global char *)dst + dst_index + 0))= data_0; - *((__global int *)((__global char *)dst + dst_index + 4))= data_1; - *((__global int *)((__global char *)dst + dst_index + 8))= data_2; + *((__global int *)((__global char *)dst + dst_index + 0))= data_0; + *((__global int *)((__global char *)dst + dst_index + 4))= data_1; + *((__global int *)((__global char *)dst + dst_index + 8))= data_2; } } -__kernel void arithm_s_bitwise_xor_with_mask_C3_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C3_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -795,16 +847,17 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D5 (__global char *src1, int s data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; - *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; - *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; + *((__global char4 *)((__global char *)dst + dst_index + 0))= data_0; + *((__global char4 *)((__global char *)dst + dst_index + 4))= data_1; + *((__global char4 *)((__global char *)dst + dst_index + 8))= data_2; } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C3_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -838,16 +891,17 @@ __kernel void arithm_s_bitwise_xor_with_mask_C3_D6 (__global short *src1, int sr data_1 = mask_data ? tmp_data_1 : data_1; data_2 = mask_data ? tmp_data_2 : data_2; - *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0; - *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1; - *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2; + *((__global short4 *)((__global char *)dst + dst_index + 0 ))= data_0; + *((__global short4 *)((__global char *)dst + dst_index + 8 ))= data_1; + *((__global short4 *)((__global char *)dst + dst_index + 16))= data_2; } } #endif -__kernel void arithm_s_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - uchar4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C4_D0 ( + __global uchar *src1, int src1_step, int src1_offset, + __global uchar *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + uchar4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -872,10 +926,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D0 (__global uchar *src1, int } -__kernel void arithm_s_bitwise_xor_with_mask_C4_D1 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C4_D1 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -899,10 +954,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D1 (__global char *src1, int s } } -__kernel void arithm_s_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - ushort4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C4_D2 ( + __global ushort *src1, int src1_step, int src1_offset, + __global ushort *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + ushort4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -925,10 +981,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D2 (__global ushort *src1, int *((__global ushort4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_with_mask_C4_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C4_D3 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -951,10 +1008,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D3 (__global short *src1, int *((__global short4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_with_mask_C4_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - int4 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C4_D4 ( + __global int *src1, int src1_step, int src1_offset, + __global int *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + int4 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -977,10 +1035,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D4 (__global int *src1, int sr *((__global int4 *)((__global char *)dst + dst_index)) = data; } } -__kernel void arithm_s_bitwise_xor_with_mask_C4_D5 (__global char *src1, int src1_step, int src1_offset, - __global char *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - char16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C4_D5 ( + __global char *src1, int src1_step, int src1_offset, + __global char *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + char16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -1006,10 +1065,11 @@ __kernel void arithm_s_bitwise_xor_with_mask_C4_D5 (__global char *src1, int s } } #if defined (DOUBLE_SUPPORT) -__kernel void arithm_s_bitwise_xor_with_mask_C4_D6 (__global short *src1, int src1_step, int src1_offset, - __global short *dst, int dst_step, int dst_offset, - __global uchar *mask, int mask_step, int mask_offset, - short16 src2, int rows, int cols, int dst_step1) +__kernel void arithm_s_bitwise_xor_with_mask_C4_D6 ( + __global short *src1, int src1_step, int src1_offset, + __global short *dst, int dst_step, int dst_offset, + __global uchar *mask, int mask_step, int mask_offset, + short16 src2, int rows, int cols, int dst_step1) { int x = get_global_id(0); diff --git a/modules/ocl/src/opencl/arithm_compare_eq.cl b/modules/ocl/src/opencl/arithm_compare_eq.cl index f818532ba2ea01409bd3a7f0608ed7e6b7e64adf..a660d41727a2bf13b81f49173d370c8cb4beda0a 100644 --- a/modules/ocl/src/opencl/arithm_compare_eq.cl +++ b/modules/ocl/src/opencl/arithm_compare_eq.cl @@ -43,7 +43,11 @@ // //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif ////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -51,9 +55,9 @@ /////////////////////////////////////////////////////////////////////////////////////////////////////// __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -61,8 +65,11 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); @@ -102,9 +109,9 @@ __kernel void arithm_compare_eq_D0 (__global uchar *src1, int src1_step, int src __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -113,8 +120,11 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1)& 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1)& 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -153,9 +163,9 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global short *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -164,8 +174,11 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -207,9 +220,9 @@ __kernel void arithm_compare_eq_D3 (__global short *src1, int src1_step, int src __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global int *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -217,7 +230,10 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_ if (x < cols && y < rows) { x = x << 2; - #define dst_align ((dst_offset >> 2) & 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 2) & 3) int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); @@ -227,7 +243,7 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_ int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index; - int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); + int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); if(src1_index < 0) { @@ -255,9 +271,9 @@ __kernel void arithm_compare_eq_D4 (__global int *src1, int src1_step, int src1_ } __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global float *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -265,7 +281,10 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - #define dst_align ((dst_offset >> 2) & 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 2) & 3) int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); @@ -275,7 +294,8 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index; float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src2_index < 0) + float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); + if(src2_index < 0) { float4 tmp; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; @@ -297,9 +317,9 @@ __kernel void arithm_compare_eq_D5 (__global float *src1, int src1_step, int src #if defined (DOUBLE_SUPPORT) __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global double *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -307,7 +327,10 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - #define dst_align ((dst_offset >> 3) & 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 3) & 3) int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); @@ -347,9 +370,9 @@ __kernel void arithm_compare_eq_D6 (__global double *src1, int src1_step, int sr /***********************************Compare GT**************************/ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -357,8 +380,11 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); @@ -397,9 +423,9 @@ __kernel void arithm_compare_gt_D0 (__global uchar *src1, int src1_step, int src } __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -408,8 +434,11 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -450,9 +479,9 @@ __kernel void arithm_compare_gt_D2 (__global ushort *src1, int src1_step, int sr __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global short *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -461,8 +490,11 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -501,9 +533,9 @@ __kernel void arithm_compare_gt_D3 (__global short *src1, int src1_step, int src } __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global int *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -511,7 +543,10 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_ if (x < cols && y < rows) { x = x << 2; - #define dst_align ((dst_offset >> 2) & 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 2) & 3) int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); @@ -521,7 +556,7 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_ int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index; - int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); + int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index)); int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index)); if(src1_index < 0) { @@ -550,9 +585,9 @@ __kernel void arithm_compare_gt_D4 (__global int *src1, int src1_step, int src1_ } __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global float *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -560,7 +595,10 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - #define dst_align ((dst_offset >> 2) & 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 2) & 3) int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); @@ -599,9 +637,9 @@ __kernel void arithm_compare_gt_D5 (__global float *src1, int src1_step, int src #if defined (DOUBLE_SUPPORT) __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global double *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -609,7 +647,10 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - #define dst_align ((dst_offset >> 3) & 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 3) & 3) int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); @@ -649,9 +690,9 @@ __kernel void arithm_compare_gt_D6 (__global double *src1, int src1_step, int sr /***********************************Compare GE**************************/ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -659,8 +700,11 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); @@ -702,9 +746,9 @@ __kernel void arithm_compare_ge_D0 (__global uchar *src1, int src1_step, int src __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -713,8 +757,11 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -757,9 +804,9 @@ __kernel void arithm_compare_ge_D2 (__global ushort *src1, int src1_step, int sr __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global short *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -768,8 +815,11 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1)& 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1)& 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -809,9 +859,9 @@ __kernel void arithm_compare_ge_D3 (__global short *src1, int src1_step, int src } __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global int *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -819,8 +869,11 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_ if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 2)& 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 2)& 3) int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); @@ -845,7 +898,7 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_ tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; } - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; @@ -858,9 +911,9 @@ __kernel void arithm_compare_ge_D4 (__global int *src1, int src1_step, int src1_ } __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global float *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -868,8 +921,11 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 2)& 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 2)& 3) int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); @@ -909,9 +965,9 @@ __kernel void arithm_compare_ge_D5 (__global float *src1, int src1_step, int src #if defined (DOUBLE_SUPPORT) __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global double *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -919,8 +975,11 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 3)& 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 3)& 3) int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); @@ -942,7 +1001,8 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr double4 tmp; tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx; src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; - } uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); + } + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data >= src2_data)); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; @@ -954,3 +1014,4 @@ __kernel void arithm_compare_ge_D6 (__global double *src1, int src1_step, int sr } } #endif + diff --git a/modules/ocl/src/opencl/arithm_compare_ne.cl b/modules/ocl/src/opencl/arithm_compare_ne.cl index 713dc1316917da95222f7d2660df8fbddc976075..f0128846b8a30eb583fbd3e527907e7ed58b8c3e 100644 --- a/modules/ocl/src/opencl/arithm_compare_ne.cl +++ b/modules/ocl/src/opencl/arithm_compare_ne.cl @@ -43,13 +43,17 @@ // //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif /***********************************Compare NE*******************************/ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -57,8 +61,11 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); @@ -98,9 +105,9 @@ __kernel void arithm_compare_ne_D0 (__global uchar *src1, int src1_step, int src __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -109,8 +116,11 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1)& 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1)& 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -150,9 +160,9 @@ __kernel void arithm_compare_ne_D2 (__global ushort *src1, int src1_step, int sr __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global short *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -161,8 +171,11 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1)& 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1)& 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -200,9 +213,9 @@ __kernel void arithm_compare_ne_D3 (__global short *src1, int src1_step, int src } __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global int *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -210,7 +223,10 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_ if (x < cols && y < rows) { x = x << 2; - #define dst_align ((dst_offset >> 2)& 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 2)& 3) int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); @@ -249,9 +265,9 @@ __kernel void arithm_compare_ne_D4 (__global int *src1, int src1_step, int src1_ } __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global float *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -259,7 +275,10 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - #define dst_align ((dst_offset >> 2) & 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 2) & 3) int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); @@ -269,7 +288,8 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src int src1_index_fix = src1_index < 0 ? 0 : src1_index; int src2_index_fix = src2_index < 0 ? 0 : src2_index; float4 src1_data = vload4(0, (__global float *)((__global char *)src1 + src1_index_fix)); - float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); if(src1_index < 0) + float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix)); + if(src1_index < 0) { float4 tmp; tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx; @@ -282,7 +302,7 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw; } - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data != src2_data)); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; @@ -296,9 +316,9 @@ __kernel void arithm_compare_ne_D5 (__global float *src1, int src1_step, int src #if defined (DOUBLE_SUPPORT) __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global double *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -306,7 +326,10 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - #define dst_align ((dst_offset >> 3) & 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 3) & 3) int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); @@ -347,9 +370,9 @@ __kernel void arithm_compare_ne_D6 (__global double *src1, int src1_step, int sr /***********************************Compare LT*******************************/ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -357,8 +380,11 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); @@ -398,9 +424,9 @@ __kernel void arithm_compare_lt_D0 (__global uchar *src1, int src1_step, int src __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -409,8 +435,11 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -451,9 +480,9 @@ __kernel void arithm_compare_lt_D2 (__global ushort *src1, int src1_step, int sr __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global short *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -462,8 +491,11 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -502,9 +534,9 @@ __kernel void arithm_compare_lt_D3 (__global short *src1, int src1_step, int src } __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global int *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -512,7 +544,10 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_ if (x < cols && y < rows) { x = x << 2; - #define dst_align ((dst_offset >> 2) & 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 2) & 3) int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); @@ -554,9 +589,9 @@ __kernel void arithm_compare_lt_D4 (__global int *src1, int src1_step, int src1_ } __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global float *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -564,7 +599,10 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - #define dst_align ((dst_offset >> 2) & 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 2) & 3) int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); @@ -589,7 +627,7 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src } - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; @@ -603,9 +641,9 @@ __kernel void arithm_compare_lt_D5 (__global float *src1, int src1_step, int src #if defined (DOUBLE_SUPPORT) __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global double *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -613,7 +651,10 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - #define dst_align ((dst_offset >> 3) & 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 3) & 3) int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); @@ -638,7 +679,7 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr } - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); + uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); uchar4 tmp_data = convert_uchar4((src1_data < src2_data)); dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; @@ -653,9 +694,9 @@ __kernel void arithm_compare_lt_D6 (__global double *src1, int src1_step, int sr /***********************************Compare LE*******************************/ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src1_offset, - __global uchar *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global uchar *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -663,8 +704,11 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); @@ -705,9 +749,9 @@ __kernel void arithm_compare_le_D0 (__global uchar *src1, int src1_step, int src __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int src1_offset, - __global ushort *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global ushort *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -716,8 +760,11 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -758,9 +805,9 @@ __kernel void arithm_compare_le_D2 (__global ushort *src1, int src1_step, int sr __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src1_offset, - __global short *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global short *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); @@ -769,8 +816,11 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -809,9 +859,9 @@ __kernel void arithm_compare_le_D3 (__global short *src1, int src1_step, int src } __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_offset, - __global int *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global int *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -819,7 +869,10 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_ if (x < cols && y < rows) { x = x << 2; - #define dst_align ((dst_offset >> 2)& 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 2)& 3) int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); @@ -857,9 +910,9 @@ __kernel void arithm_compare_le_D4 (__global int *src1, int src1_step, int src1_ } __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global float *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -867,7 +920,10 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src if (x < cols && y < rows) { x = x << 2; - #define dst_align ((dst_offset >> 2)& 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 2)& 3) int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2)); int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2)); @@ -905,9 +961,9 @@ __kernel void arithm_compare_le_D5 (__global float *src1, int src1_step, int src #if defined (DOUBLE_SUPPORT) __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int src1_offset, - __global double *src2, int src2_step, int src2_offset, - __global uchar *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1) + __global double *src2, int src2_step, int src2_offset, + __global uchar *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1) { int x = get_global_id(0); int y = get_global_id(1); @@ -915,7 +971,10 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr if (x < cols && y < rows) { x = x << 2; - #define dst_align ((dst_offset >> 3)& 3) +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 3)& 3) int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3)); int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3)); @@ -952,3 +1011,5 @@ __kernel void arithm_compare_le_D6 (__global double *src1, int src1_step, int sr } } #endif + + diff --git a/modules/ocl/src/opencl/arithm_div.cl b/modules/ocl/src/opencl/arithm_div.cl index dcbe30310647945083b420efc590cc4af910b281..896277cf58e3f19933fc06fb149dd3d0c8ffe0d2 100644 --- a/modules/ocl/src/opencl/arithm_div.cl +++ b/modules/ocl/src/opencl/arithm_div.cl @@ -44,7 +44,11 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif typedef double F ; typedef double4 F4; #define convert_F4 convert_double4 @@ -56,34 +60,24 @@ typedef float4 F4; #define convert_F float #endif -uchar round2_uchar(F v){ - - uchar v1 = convert_uchar_sat(round(v)); - //uchar v2 = convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5)); - - return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; +inline uchar round2_uchar(F v) +{ + return convert_uchar_sat(round(v)); } -ushort round2_ushort(F v){ - - ushort v1 = convert_ushort_sat(round(v)); - //ushort v2 = convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5)); - - return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; +inline ushort round2_ushort(F v) +{ + return convert_ushort_sat(round(v)); } -short round2_short(F v){ - - short v1 = convert_short_sat(round(v)); - //short v2 = convert_short_sat(v+(v>=0 ? 0.5 : -0.5)); - return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; +inline short round2_short(F v) +{ + return convert_short_sat(round(v)); } -int round2_int(F v){ - - int v1 = convert_int_sat(round(v)); - //int v2 = convert_int_sat(v+(v>=0 ? 0.5 : -0.5)); - return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2; +inline int round2_int(F v) +{ + return convert_int_sat(round(v)); } /////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////divide/////////////////////////////////////////////////// @@ -94,39 +88,41 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse __global uchar *dst, int dst_step, int dst_offset, int rows, int cols, int dst_step1, F scalar) { - int x = get_global_id(0); - int y = get_global_id(1); + int2 coor = (int2)(get_global_id(0), get_global_id(1)); - if (x < cols && y < rows) + if (coor.x < cols && coor.y < rows) { - x = x << 2; + coor.x = coor.x << 2; + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) + int2 src_index = (int2)(mad24(coor.y, src1_step, coor.x + src1_offset - dst_align), + mad24(coor.y, src2_step, coor.x + src2_offset - dst_align)); - #define dst_align (dst_offset & 3) - int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); - int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); + int4 dst_args = (int4)(mad24(coor.y, dst_step, dst_offset), + mad24(coor.y, dst_step, dst_offset + dst_step1), + mad24(coor.y, dst_step, dst_offset + coor.x & (int)0xfffffffc), + 0); - int dst_start = mad24(y, dst_step, dst_offset); - int dst_end = mad24(y, dst_step, dst_offset + dst_step1); - int dst_index = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc); - - uchar4 src1_data = vload4(0, src1 + src1_index); - uchar4 src2_data = vload4(0, src2 + src2_index); - uchar4 dst_data = *((__global uchar4 *)(dst + dst_index)); + uchar4 src1_data = vload4(0, src1 + src_index.x); + uchar4 src2_data = vload4(0, src2 + src_index.y); + uchar4 dst_data = *((__global uchar4 *)(dst + dst_args.z)); F4 tmp = convert_F4(src1_data) * scalar; - uchar4 tmp_data; - tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / (F)src2_data.x); - tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / (F)src2_data.y); - tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / (F)src2_data.z); - tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / (F)src2_data.w); + tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / src2_data.x); + tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / src2_data.y); + tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / src2_data.z); + tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / src2_data.w); - dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x; - dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y; - dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z; - dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w; + dst_data.x = ((dst_args.z + 0 >= dst_args.x) && (dst_args.z + 0 < dst_args.y)) ? tmp_data.x : dst_data.x; + dst_data.y = ((dst_args.z + 1 >= dst_args.x) && (dst_args.z + 1 < dst_args.y)) ? tmp_data.y : dst_data.y; + dst_data.z = ((dst_args.z + 2 >= dst_args.x) && (dst_args.z + 2 < dst_args.y)) ? tmp_data.z : dst_data.z; + dst_data.w = ((dst_args.z + 3 >= dst_args.x) && (dst_args.z + 3 < dst_args.y)) ? tmp_data.w : dst_data.w; - *((__global uchar4 *)(dst + dst_index)) = dst_data; + *((__global uchar4 *)(dst + dst_args.z)) = dst_data; } } @@ -141,8 +137,11 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -181,8 +180,11 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -296,8 +298,11 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src_index = mad24(y, src_step, x + src_offset - dst_align); int dst_start = mad24(y, dst_step, dst_offset); @@ -332,8 +337,11 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -367,8 +375,11 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1)); int dst_start = mad24(y, dst_step, dst_offset); @@ -455,3 +466,5 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse } } #endif + + diff --git a/modules/ocl/src/opencl/arithm_flip.cl b/modules/ocl/src/opencl/arithm_flip.cl index f4925244a54678936a485840380ef36f81675636..821a84ab756b91a3ed02ea1c8294a719031fb738 100644 --- a/modules/ocl/src/opencl/arithm_flip.cl +++ b/modules/ocl/src/opencl/arithm_flip.cl @@ -44,7 +44,11 @@ //M*/ #if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif ////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -60,8 +64,11 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of if (x < cols && y < thread_rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src_index_0 = mad24(y, src_step, x + src_offset - dst_align); int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align); @@ -115,8 +122,11 @@ __kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_off if (x < cols && y < thread_rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src_index_0 = mad24(y, src_step, x + src_offset - dst_align); int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align); @@ -157,8 +167,11 @@ __kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_o if (x < cols && y < thread_rows) { x = x << 2; - - #define dst_align (((dst_offset >> 1) & 3) << 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset >> 1) & 3) << 1) int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align); int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align); @@ -199,8 +212,11 @@ __kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_of if (x < cols && y < thread_rows) { x = x << 2; - - #define dst_align (((dst_offset >> 1) & 3) << 1) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (((dst_offset >> 1) & 3) << 1) int src_index_0 = mad24(y, src_step, (x << 1) + src_offset - dst_align); int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align); diff --git a/modules/ocl/src/opencl/arithm_mul.cl b/modules/ocl/src/opencl/arithm_mul.cl index f9f3936a46cf691d2957d3a7a5e0853ebcb77be3..e1cc9f6ab4fcc35c4bf7f5b35bc38c4156fb79b8 100644 --- a/modules/ocl/src/opencl/arithm_mul.cl +++ b/modules/ocl/src/opencl/arithm_mul.cl @@ -16,7 +16,6 @@ // // @Authors // Jia Haipeng, jiahaipeng95@gmail.com -// Dachuan Zhao, dachuan@multicorewareinc.com // // Redistribution and use in source and binary forms, with or without modification, // are permitted provided that the following conditions are met: @@ -44,11 +43,16 @@ // //M*/ -#if defined DOUBLE_SUPPORT +#if defined (DOUBLE_SUPPORT) +#ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64:enable +#elif defined (cl_amd_fp64) +#pragma OPENCL EXTENSION cl_amd_fp64:enable +#endif #endif -int4 round_int4(float4 v){ +int4 round_int4(float4 v) +{ v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5); v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5); v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5); @@ -56,7 +60,8 @@ int4 round_int4(float4 v){ return convert_int4_sat(v); } -uint4 round_uint4(float4 v){ +uint4 round_uint4(float4 v) +{ v.s0 = v.s0 + (v.s0 > 0 ? 0.5 : -0.5); v.s1 = v.s1 + (v.s1 > 0 ? 0.5 : -0.5); v.s2 = v.s2 + (v.s2 > 0 ? 0.5 : -0.5); @@ -64,7 +69,8 @@ uint4 round_uint4(float4 v){ return convert_uint4_sat(v); } -long round_int(float v){ +long round_int(float v) +{ v = v + (v > 0 ? 0.5 : -0.5); return convert_int_sat(v); @@ -84,8 +90,11 @@ __kernel void arithm_mul_D0 (__global uchar *src1, int src1_step, int src1_offse if (x < cols && y < rows) { x = x << 2; - - #define dst_align (dst_offset & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align (dst_offset & 3) int src1_index = mad24(y, src1_step, x + src1_offset - dst_align); int src2_index = mad24(y, src2_step, x + src2_offset - dst_align); @@ -129,8 +138,11 @@ __kernel void arithm_mul_D2 (__global ushort *src1, int src1_step, int src1_offs if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -165,8 +177,11 @@ __kernel void arithm_mul_D3 (__global short *src1, int src1_step, int src1_offse if (x < cols && y < rows) { x = x << 2; - - #define dst_align ((dst_offset >> 1) & 3) + +#ifdef dst_align +#undef dst_align +#endif +#define dst_align ((dst_offset >> 1) & 3) int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1)); int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1)); @@ -263,8 +278,8 @@ __kernel void arithm_mul_D6 (__global double *src1, int src1_step, int src1_offs #endif __kernel void arithm_muls_D5 (__global float *src1, int src1_step, int src1_offset, - __global float *dst, int dst_step, int dst_offset, - int rows, int cols, int dst_step1, float scalar) + __global float *dst, int dst_step, int dst_offset, + int rows, int cols, int dst_step1, float scalar) { int x = get_global_id(0); int y = get_global_id(1);