Merge pull request #775 from bitwangyaoyao:2.4_fixerr

a2d27429 · Andrey Kamaev · OpenCV Buildbot · 977562b6 · bcc086ba · a2d27429
23 changed file
--- a/modules/ocl/src/opencl/arithm_absdiff.cl
+++ b/modules/ocl/src/opencl/arithm_absdiff.cl
@@ -44,7 +44,11 @@
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -62,7 +66,10 @@ __kernel void arithm_absdiff_D0 (__global uchar *src1, int src1_step, int src1_o
    if (x < cols && y < rows)
    {
        x = x << 2;
+#ifdef dst_align
+#undef dst_align
+#endif
        #define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -110,8 +117,11 @@ __kernel void arithm_absdiff_D2 (__global ushort *src1, int src1_step, int src1_
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -144,8 +154,11 @@ __kernel void arithm_absdiff_D3 (__global short *src1, int src1_step, int src1_o
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -248,8 +261,11 @@ __kernel void arithm_s_absdiff_C1_D0 (__global   uchar *src1, int src1_step, int
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -287,8 +303,11 @@ __kernel void arithm_s_absdiff_C1_D2 (__global   ushort *src1, int src1_step, in
    if (x < cols && y < rows)
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -318,8 +337,11 @@ __kernel void arithm_s_absdiff_C1_D3 (__global   short *src1, int src1_step, int
    if (x < cols && y < rows)
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -387,8 +409,8 @@ __kernel void arithm_s_absdiff_C1_D5 (__global   float *src1, int src1_step, int
 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_s_absdiff_C1_D6 (__global   double *src1, int src1_step, int src1_offset,
-                                     __global   double *dst,  int dst_step,  int dst_offset,
+                                      __global   double *dst,  int dst_step,  int dst_offset,
-                                     double4 src2, int rows, int cols, int dst_step1)
+                                      double4 src2, int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
@@ -421,8 +443,11 @@ __kernel void arithm_s_absdiff_C2_D0 (__global   uchar *src1, int src1_step, int
    if (x < cols && y < rows)
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -465,7 +490,7 @@ __kernel void arithm_s_absdiff_C2_D2 (__global   ushort *src1, int src1_step, in
 }
 __kernel void arithm_s_absdiff_C2_D3 (__global   short *src1, int src1_step, int src1_offset,
                                      __global   short *dst,  int dst_step,  int dst_offset,
-                                     int4 src2, int rows, int cols, int dst_step1)
+                                      int4 src2, int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
@@ -509,7 +534,7 @@ __kernel void arithm_s_absdiff_C2_D4 (__global   int *src1, int src1_step, int s
 }
 __kernel void arithm_s_absdiff_C2_D5 (__global   float *src1, int src1_step, int src1_offset,
                                      __global   float *dst,  int dst_step,  int dst_offset,
-                                     float4 src2, int rows, int cols, int dst_step1)
+                                      float4 src2, int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
@@ -563,8 +588,11 @@ __kernel void arithm_s_absdiff_C3_D0 (__global   uchar *src1, int src1_step, int
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -617,8 +645,11 @@ __kernel void arithm_s_absdiff_C3_D2 (__global   ushort *src1, int src1_step, in
    if (x < cols && y < rows)
    {
        x = x << 1;
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -644,16 +675,16 @@ __kernel void arithm_s_absdiff_C3_D2 (__global   ushort *src1, int src1_step, in
        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_s_absdiff_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
@@ -667,8 +698,11 @@ __kernel void arithm_s_absdiff_C3_D3 (__global   short *src1, int src1_step, int
    if (x < cols && y < rows)
    {
        x = x << 1;
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -694,16 +728,16 @@ __kernel void arithm_s_absdiff_C3_D3 (__global   short *src1, int src1_step, int
        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_s_absdiff_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
@@ -735,9 +769,9 @@ __kernel void arithm_s_absdiff_C3_D4 (__global   int *src1, int src1_step, int s
        int tmp_data_1 = convert_int_sat(abs_diff(src1_data_1, src2_data_1));
        int tmp_data_2 = convert_int_sat(abs_diff(src1_data_2, src2_data_2));
-       *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
    }
 }
 __kernel void arithm_s_absdiff_C3_D5 (__global   float *src1, int src1_step, int src1_offset,
@@ -769,9 +803,9 @@ __kernel void arithm_s_absdiff_C3_D5 (__global   float *src1, int src1_step, int
        float tmp_data_1 = fabs(src1_data_1 - src2_data_1);
        float tmp_data_2 = fabs(src1_data_2 - src2_data_2);
-       *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
    }
 }
@@ -805,9 +839,9 @@ __kernel void arithm_s_absdiff_C3_D6 (__global   double *src1, int src1_step, in
        double tmp_data_1 = fabs(src1_data_1 - src2_data_1);
        double tmp_data_2 = fabs(src1_data_2 - src2_data_2);
-       *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
    }
 }
 #endif

--- a/modules/ocl/src/opencl/arithm_add.cl
+++ b/modules/ocl/src/opencl/arithm_add.cl
@@ -45,7 +45,11 @@
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -64,7 +68,10 @@ __kernel void arithm_add_D0 (__global uchar *src1, int src1_step, int src1_offse
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -112,7 +119,10 @@ __kernel void arithm_add_D2 (__global ushort *src1, int src1_step, int src1_offs
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -147,7 +157,10 @@ __kernel void arithm_add_D3 (__global short *src1, int src1_step, int src1_offse
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -252,7 +265,10 @@ __kernel void arithm_add_with_mask_C1_D0 (__global uchar *src1, int src1_step, i
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -311,7 +327,10 @@ __kernel void arithm_add_with_mask_C1_D2 (__global ushort *src1, int src1_step,
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -348,7 +367,10 @@ __kernel void arithm_add_with_mask_C1_D3 (__global short *src1, int src1_step, i
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -477,7 +499,10 @@ __kernel void arithm_add_with_mask_C2_D0 (__global uchar *src1, int src1_step, i
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -664,7 +689,10 @@ __kernel void arithm_add_with_mask_C3_D0 (__global uchar *src1, int src1_step, i
    {
        x = x << 2;
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int src2_index = mad24(y, src2_step, (x * 3) + src2_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -724,7 +752,10 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
    {
        x = x << 1;
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -754,16 +785,16 @@ __kernel void arithm_add_with_mask_C3_D2 (__global ushort *src1, int src1_step,
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, int src1_offset,
@@ -780,7 +811,10 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
    {
        x = x << 1;
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int src2_index = mad24(y, src2_step, (x * 6) + src2_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -810,16 +844,16 @@ __kernel void arithm_add_with_mask_C3_D3 (__global short *src1, int src1_step, i
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_add_with_mask_C3_D4 (__global int   *src1, int src1_step, int src1_offset,
@@ -861,9 +895,9 @@ __kernel void arithm_add_with_mask_C3_D4 (__global int   *src1, int src1_step, i
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;
-       *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, int src1_offset,
@@ -905,9 +939,9 @@ __kernel void arithm_add_with_mask_C3_D5 (__global float *src1, int src1_step, i
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;
-       *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
@@ -951,9 +985,9 @@ __kernel void arithm_add_with_mask_C3_D6 (__global double *src1, int src1_step,
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;
-       *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
+        *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
-       *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
+        *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
-       *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
+        *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
    }
 }
 #endif

--- a/modules/ocl/src/opencl/arithm_addWeighted.cl
+++ b/modules/ocl/src/opencl/arithm_addWeighted.cl
@@ -42,8 +42,12 @@
 // the use of this software, even if advised of the possibility of such damage.
 //
 //M*/
-#if defined DOUBLE_SUPPORT
+#if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 typedef double F;
 #else
 typedef float F;
@@ -52,10 +56,10 @@ typedef float F;
 /////////////////////////////////////////////addWeighted//////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset,
-                           __global uchar *src2, int src2_step,int src2_offset,
+                              __global uchar *src2, int src2_step,int src2_offset,
-                           F alpha,F beta,F gama,
+                              F alpha,F beta,F gama,
-                           __global uchar *dst,  int dst_step,int dst_offset,
+                              __global uchar *dst,  int dst_step,int dst_offset,
-                           int rows,  int cols,int dst_step1)
+                              int rows,  int cols,int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -65,7 +69,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -87,7 +94,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
 //        short4 tmp      = convert_short4_sat(src1_data) * alpha + convert_short4_sat(src2_data) * beta + gama;
-         short4 tmp;
+        short4 tmp;
        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
@@ -100,7 +107,7 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
        *((__global uchar4 *)(dst + dst_index)) = dst_data;
-       // dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
+        // dst[x + y * dst_step] = src1[x + y * src1_step] * alpha + src2[x + y * src2_step] * beta + gama;
    }
 }
@@ -108,10 +115,10 @@ __kernel void addWeighted_D0 (__global uchar *src1,int src1_step,int src1_offset
 __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offset,
-                           __global ushort *src2, int src2_step,int src2_offset,
+                              __global ushort *src2, int src2_step,int src2_offset,
-                           F alpha,F beta,F gama,
+                              F alpha,F beta,F gama,
-                           __global ushort *dst,  int dst_step,int dst_offset,
+                              __global ushort *dst,  int dst_step,int dst_offset,
-                           int rows,  int cols,int dst_step1)
+                              int rows,  int cols,int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -121,35 +128,38 @@ __kernel void addWeighted_D2 (__global ushort *src1, int src1_step,int src1_offs
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset +( x<< 1) & (int)0xfffffff8);
-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
-    if(src1_index < 0)
+        if(src1_index < 0)
-    {
+        {
-        ushort4 tmp;
+            ushort4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
+        }
-    if(src2_index < 0)
+        if(src2_index < 0)
-    {
+        {
-        ushort4 tmp;
+            ushort4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-    }
+        }
        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
-       // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
+        // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
-         int4 tmp;
+        int4 tmp;
        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
@@ -181,8 +191,11 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset -( dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset -( dst_align << 1));
@@ -190,26 +203,26 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset +( x<< 1) - (dst_align << 1 ));
-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
-    if(src1_index < 0)
+        if(src1_index < 0)
-    {
+        {
-        short4 tmp;
+            short4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
+        }
-    if(src2_index < 0)
+        if(src2_index < 0)
-    {
+        {
-        short4 tmp;
+            short4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-    }
+        }
        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-       // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
+        // int4 tmp      = convert_int4_sat(src1_data) * alpha + convert_int4_sat(src2_data) * beta + gama;
-         int4 tmp;
+        int4 tmp;
        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
@@ -228,7 +241,7 @@ __kernel void addWeighted_D3 (__global short *src1, int src1_step,int src1_offse
 __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
                              __global int *src2, int src2_step,int src2_offset,
-                             F alpha,F beta, F gama,
+                              F alpha,F beta, F gama,
                              __global int *dst,  int dst_step,int dst_offset,
                              int rows,  int cols,int dst_step1)
 {
@@ -241,9 +254,12 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
        x = x << 2;
-        #define bitOfInt  (sizeof(int)== 4 ? 2: 3)
+#define bitOfInt  (sizeof(int)== 4 ? 2: 3)
-        #define dst_align ((dst_offset >> bitOfInt) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> bitOfInt) & 3)
        int src1_index = mad24(y, src1_step, (x << bitOfInt) + src1_offset - (dst_align << bitOfInt));
        int src2_index = mad24(y, src2_step, (x << bitOfInt) + src2_offset - (dst_align << bitOfInt));
@@ -252,26 +268,26 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << bitOfInt) -(dst_align << bitOfInt));
-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        int4 src1_data = vload4(0, (__global int *)((__global char *)src1 + src1_index_fix));
        int4 src2_data = vload4(0, (__global int *)((__global char *)src2 + src2_index_fix));
-    if(src1_index < 0)
+        if(src1_index < 0)
-    {
+        {
-        int4 tmp;
+            int4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
+        }
-    if(src2_index < 0)
+        if(src2_index < 0)
-    {
+        {
-        int4 tmp;
+            int4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-    }
+        }
        int4 dst_data = *((__global int4 *)((__global char *)dst + dst_index));
-       // double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
+        // double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
-         float4 tmp;
+        float4 tmp;
        tmp.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp.z = src1_data.z * alpha + src2_data.z * beta + gama;
@@ -291,7 +307,7 @@ __kernel void addWeighted_D4 (__global int *src1, int src1_step,int src1_offset,
 __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset,
                              __global float *src2, int src2_step,int src2_offset,
-                             F alpha,F beta, F gama,
+                              F alpha,F beta, F gama,
                              __global float *dst,  int dst_step,int dst_offset,
                              int rows,  int cols,int dst_step1)
 {
@@ -303,8 +319,11 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 2) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 2) & 3)
        int src1_index = mad24(y, src1_step, (x << 2) + src1_offset - (dst_align << 2));
        int src2_index = mad24(y, src2_step, (x << 2) + src2_offset - (dst_align << 2));
@@ -313,32 +332,32 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 2) -(dst_align << 2));
-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        float4 src1_data = vload4(0, (__global float  *)((__global char *)src1 + src1_index_fix));
        float4 src2_data = vload4(0, (__global float *)((__global char *)src2 + src2_index_fix));
        float4 dst_data = *((__global float4 *)((__global char *)dst + dst_index));
-    if(src1_index < 0)
+        if(src1_index < 0)
-    {
+        {
-        float4 tmp;
+            float4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
+        }
-    if(src2_index < 0)
+        if(src2_index < 0)
-    {
+        {
-        float4 tmp;
+            float4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-    }
+        }
-    //    double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
+        //    double4   tmp = convert_double4(src1_data) * alpha + convert_double4(src2_data) * beta + gama ;
-       // float4   tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
+        // float4   tmp_data =(src1_data) * alpha + (src2_data) * beta + gama ;
-         float4 tmp_data;
+        float4 tmp_data;
        tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;
        tmp_data.w = src1_data.w * alpha + src2_data.w * beta + gama;
-       // float4 tmp_data = convert_float4(tmp);
+        // float4 tmp_data = convert_float4(tmp);
        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 4 >= dst_start) && (dst_index + 4 < dst_end)) ? tmp_data.y : dst_data.y;
@@ -353,7 +372,7 @@ __kernel void addWeighted_D5 (__global float *src1,int src1_step,int src1_offset
 #if defined (DOUBLE_SUPPORT)
 __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offset,
                              __global double *src2, int src2_step,int src2_offset,
-                             F alpha,F beta, F gama,
+                              F alpha,F beta, F gama,
                              __global double *dst,  int dst_step,int dst_offset,
                              int rows,  int cols,int dst_step1)
 {
@@ -365,8 +384,11 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 3) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 3) & 3)
        int src1_index = mad24(y, src1_step, (x << 3) + src1_offset - (dst_align << 3));
        int src2_index = mad24(y, src2_step, (x << 3) + src2_offset - (dst_align << 3));
@@ -375,25 +397,25 @@ __kernel void addWeighted_D6 (__global double *src1, int src1_step,int src1_offs
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 3) -(dst_align << 3));
-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-    int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        double4 src1_data = vload4(0, (__global double  *)((__global char *)src1 + src1_index_fix));
        double4 src2_data = vload4(0, (__global double  *)((__global char *)src2 + src2_index_fix));
        double4 dst_data = *((__global double4 *)((__global char *)dst + dst_index));
-    if(src1_index < 0)
+        if(src1_index < 0)
-    {
+        {
-        double4 tmp;
+            double4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
+        }
-    if(src2_index < 0)
+        if(src2_index < 0)
-    {
+        {
-        double4 tmp;
+            double4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-    }
+        }
-      //  double4   tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
+        //  double4   tmp_data = (src1_data) * alpha + (src2_data) * beta + gama ;
-         double4 tmp_data;
+        double4 tmp_data;
        tmp_data.x = src1_data.x * alpha + src2_data.x * beta + gama;
        tmp_data.y = src1_data.y * alpha + src2_data.y * beta + gama;
        tmp_data.z = src1_data.z * alpha + src2_data.z * beta + gama;

--- a/modules/ocl/src/opencl/arithm_add_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar.cl
@@ -44,9 +44,13 @@
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
 #endif
+#endif
 /**************************************add with scalar without mask**************************************/
 __kernel void arithm_s_add_C1_D0 (__global   uchar *src1, int src1_step, int src1_offset,
                                  __global   uchar *dst,  int dst_step,  int dst_offset,
@@ -58,8 +62,11 @@ __kernel void arithm_s_add_C1_D0 (__global   uchar *src1, int src1_step, int src
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -98,8 +105,11 @@ __kernel void arithm_s_add_C1_D2 (__global   ushort *src1, int src1_step, int sr
    if (x < cols && y < rows)
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -130,8 +140,11 @@ __kernel void arithm_s_add_C1_D3 (__global   short *src1, int src1_step, int src
    if (x < cols && y < rows)
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -232,8 +245,11 @@ __kernel void arithm_s_add_C2_D0 (__global   uchar *src1, int src1_step, int src
    if (x < cols && y < rows)
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -377,8 +393,11 @@ __kernel void arithm_s_add_C3_D0 (__global   uchar *src1, int src1_step, int src
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -431,8 +450,11 @@ __kernel void arithm_s_add_C3_D2 (__global   ushort *src1, int src1_step, int sr
    if (x < cols && y < rows)
    {
        x = x << 1;
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -458,16 +480,16 @@ __kernel void arithm_s_add_C3_D2 (__global   ushort *src1, int src1_step, int sr
        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_s_add_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
@@ -481,8 +503,11 @@ __kernel void arithm_s_add_C3_D3 (__global   short *src1, int src1_step, int src
    if (x < cols && y < rows)
    {
        x = x << 1;
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -508,16 +533,16 @@ __kernel void arithm_s_add_C3_D3 (__global   short *src1, int src1_step, int src
        data_0.xy = ((dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
        data_1.x  = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
        data_2.xy = ((dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_s_add_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
@@ -549,9 +574,9 @@ __kernel void arithm_s_add_C3_D4 (__global   int *src1, int src1_step, int src1_
        int tmp_data_1 = convert_int_sat((long)src1_data_1 + (long)src2_data_1);
        int tmp_data_2 = convert_int_sat((long)src1_data_2 + (long)src2_data_2);
-       *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global int *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global int *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global int *)((__global char *)dst + dst_index + 8))= tmp_data_2;
    }
 }
 __kernel void arithm_s_add_C3_D5 (__global   float *src1, int src1_step, int src1_offset,
@@ -583,9 +608,9 @@ __kernel void arithm_s_add_C3_D5 (__global   float *src1, int src1_step, int src
        float tmp_data_1 = src1_data_1 + src2_data_1;
        float tmp_data_2 = src1_data_2 + src2_data_2;
-       *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
+        *((__global float *)((__global char *)dst + dst_index + 0))= tmp_data_0;
-       *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
+        *((__global float *)((__global char *)dst + dst_index + 4))= tmp_data_1;
-       *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
+        *((__global float *)((__global char *)dst + dst_index + 8))= tmp_data_2;
    }
 }
@@ -619,9 +644,9 @@ __kernel void arithm_s_add_C3_D6 (__global   double *src1, int src1_step, int sr
        double tmp_data_1 = src1_data_1 + src2_data_1;
        double tmp_data_2 = src1_data_2 + src2_data_2;
-       *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
+        *((__global double *)((__global char *)dst + dst_index + 0 ))= tmp_data_0;
-       *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
+        *((__global double *)((__global char *)dst + dst_index + 8 ))= tmp_data_1;
-       *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
+        *((__global double *)((__global char *)dst + dst_index + 16))= tmp_data_2;
    }
 }
 #endif

--- a/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_add_scalar_mask.cl
@@ -44,7 +44,11 @@
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 /**************************************add with scalar with mask**************************************/
@@ -60,8 +64,11 @@ __kernel void arithm_s_add_with_mask_C1_D0 (__global   uchar *src1, int src1_ste
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -110,8 +117,11 @@ __kernel void arithm_s_add_with_mask_C1_D2 (__global   ushort *src1, int src1_st
    if (x < cols && y < rows)
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -145,8 +155,11 @@ __kernel void arithm_s_add_with_mask_C1_D3 (__global   short *src1, int src1_ste
    if (x < cols && y < rows)
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -266,8 +279,11 @@ __kernel void arithm_s_add_with_mask_C2_D0 (__global   uchar *src1, int src1_ste
    if (x < cols && y < rows)
    {
        x = x << 1;
-        #define dst_align ((dst_offset >> 1) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 1)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -442,8 +458,11 @@ __kernel void arithm_s_add_with_mask_C3_D0 (__global   uchar *src1, int src1_ste
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align (((dst_offset % dst_step) / 3 ) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 3 ) & 3)
        int src1_index = mad24(y, src1_step, (x * 3) + src1_offset - (dst_align * 3));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -500,8 +519,11 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global   ushort *src1, int src1_st
    if (x < cols && y < rows)
    {
        x = x << 1;
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -530,16 +552,16 @@ __kernel void arithm_s_add_with_mask_C3_D2 (__global   ushort *src1, int src1_st
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global ushort2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_s_add_with_mask_C3_D3 (__global   short *src1, int src1_step, int src1_offset,
@@ -554,8 +576,11 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global   short *src1, int src1_ste
    if (x < cols && y < rows)
    {
        x = x << 1;
-        #define dst_align (((dst_offset % dst_step) / 6 ) & 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset % dst_step) / 6 ) & 1)
        int src1_index = mad24(y, src1_step, (x * 6) + src1_offset - (dst_align * 6));
        int mask_index = mad24(y, mask_step, x + mask_offset - dst_align);
@@ -584,16 +609,16 @@ __kernel void arithm_s_add_with_mask_C3_D3 (__global   short *src1, int src1_ste
        data_0.xy = ((mask_data.x) && (dst_index + 0 >= dst_start)) ? tmp_data_0.xy : data_0.xy;
        data_1.x  = ((mask_data.x) && (dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end))
-                     ? tmp_data_1.x : data_1.x;
+                    ? tmp_data_1.x : data_1.x;
        data_1.y  = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_1.y : data_1.y;
+                    ? tmp_data_1.y : data_1.y;
        data_2.xy = ((mask_data.y) && (dst_index + 6 >= dst_start) && (dst_index + 6 < dst_end))
-                     ? tmp_data_2.xy : data_2.xy;
+                    ? tmp_data_2.xy : data_2.xy;
-       *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global short2 *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global short2 *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global short2 *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_s_add_with_mask_C3_D4 (__global   int *src1, int src1_step, int src1_offset,
@@ -633,9 +658,9 @@ __kernel void arithm_s_add_with_mask_C3_D4 (__global   int *src1, int src1_step,
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;
-       *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global int *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global int *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global int *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
 __kernel void arithm_s_add_with_mask_C3_D5 (__global   float *src1, int src1_step, int src1_offset,
@@ -675,9 +700,9 @@ __kernel void arithm_s_add_with_mask_C3_D5 (__global   float *src1, int src1_ste
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;
-       *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
+        *((__global float *)((__global char *)dst + dst_index + 0))= data_0;
-       *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
+        *((__global float *)((__global char *)dst + dst_index + 4))= data_1;
-       *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
+        *((__global float *)((__global char *)dst + dst_index + 8))= data_2;
    }
 }
@@ -719,9 +744,9 @@ __kernel void arithm_s_add_with_mask_C3_D6 (__global   double *src1, int src1_st
        data_1 = mask_data ? tmp_data_1 : data_1;
        data_2 = mask_data ? tmp_data_2 : data_2;
-       *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
+        *((__global double *)((__global char *)dst + dst_index + 0 ))= data_0;
-       *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
+        *((__global double *)((__global char *)dst + dst_index + 8 ))= data_1;
-       *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
+        *((__global double *)((__global char *)dst + dst_index + 16))= data_2;
    }
 }
 #endif

--- a/modules/ocl/src/opencl/arithm_bitwise_and.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and.cl
@@ -43,7 +43,11 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -51,9 +55,9 @@
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_and without mask**************************************/
 __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
+                                     __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
+                                     __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -61,31 +65,34 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-     uchar4 src1_data = vload4(0, src1 + src1_index_fix);
+        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
-     uchar4 src2_data = vload4(0, src2 + src2_index_fix);
+        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-     if(src1_index < 0)
+        if(src1_index < 0)
-     {
+        {
-        uchar4 tmp;
+            uchar4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
+        }
-     if(src2_index < 0)
+        if(src2_index < 0)
-     {
+        {
-        uchar4 tmp;
+            uchar4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        }
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = src1_data & src2_data;
@@ -101,9 +108,9 @@ __kernel void arithm_bitwise_and_D0 (__global uchar *src1, int src1_step, int sr
 __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
+                                     __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -111,8 +118,11 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -120,23 +130,23 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
-     char4 src1_data = vload4(0, src1 + src1_index_fix);
+        char4 src1_data = vload4(0, src1 + src1_index_fix);
-     char4 src2_data = vload4(0, src2 + src2_index_fix);
+        char4 src2_data = vload4(0, src2 + src2_index_fix);
-     if(src1_index < 0)
+        if(src1_index < 0)
-     {
+        {
-        char4 tmp;
+            char4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
+        }
-     if(src2_index < 0)
+        if(src2_index < 0)
-     {
+        {
-        char4 tmp;
+            char4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        }
        char4 dst_data = *((__global char4 *)(dst + dst_index));
        char4 tmp_data = src1_data & src2_data;
@@ -151,9 +161,9 @@ __kernel void arithm_bitwise_and_D1 (__global char *src1, int src1_step, int src
 __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
+                                     __global ushort *src2, int src2_step, int src2_offset,
-                             __global ushort *dst,  int dst_step,  int dst_offset,
+                                     __global ushort *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
@@ -162,8 +172,11 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -171,23 +184,23 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
-     if(src1_index < 0)
+        if(src1_index < 0)
-     {
+        {
-        ushort4 tmp;
+            ushort4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
+        }
-     if(src2_index < 0)
+        if(src2_index < 0)
-     {
+        {
-        ushort4 tmp;
+            ushort4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        }
        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
        ushort4 tmp_data = src1_data & src2_data;
@@ -203,9 +216,9 @@ __kernel void arithm_bitwise_and_D2 (__global ushort *src1, int src1_step, int s
 __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
+                                     __global short *src2, int src2_step, int src2_offset,
-                             __global short *dst,  int dst_step,  int dst_offset,
+                                     __global short *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
@@ -214,8 +227,11 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -223,23 +239,23 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
-     if(src1_index < 0)
+        if(src1_index < 0)
-     {
+        {
-        short4 tmp;
+            short4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
+        }
-     if(src2_index < 0)
+        if(src2_index < 0)
-     {
+        {
-        short4 tmp;
+            short4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        }
        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
        short4 tmp_data = src1_data & src2_data;
@@ -255,9 +271,9 @@ __kernel void arithm_bitwise_and_D3 (__global short *src1, int src1_step, int sr
 __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
+                                     __global int *src2, int src2_step, int src2_offset,
-                             __global int *dst,  int dst_step,  int dst_offset,
+                                     __global int *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -277,9 +293,9 @@ __kernel void arithm_bitwise_and_D4 (__global int *src1, int src1_step, int src1
 }
 __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
+                                     __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -300,9 +316,9 @@ __kernel void arithm_bitwise_and_D5 (__global char *src1, int src1_step, int src
 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_bitwise_and_D6 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
+                                     __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);

--- a/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and_mask.cl
--- a/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and_scalar.cl
--- a/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_and_scalar_mask.cl
--- a/modules/ocl/src/opencl/arithm_bitwise_not.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_not.cl
@@ -43,9 +43,12 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_NOT////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -60,26 +63,29 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-    int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = ~ src1_data;
-  /*  if(src1_index < 0)
+        /*  if(src1_index < 0)
-    {
+          {
-      uchar4 tmp;
+            uchar4 tmp;
-      tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-      src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-    }
+          }
-  */
+        */
        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
@@ -91,8 +97,8 @@ __kernel void arithm_bitwise_not_D0 (__global uchar *src1, int src1_step, int sr
 __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -100,8 +106,11 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -124,8 +133,8 @@ __kernel void arithm_bitwise_not_D1 (__global char *src1, int src1_step, int src
 __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *dst,  int dst_step,  int dst_offset,
+                                     __global ushort *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
@@ -134,8 +143,11 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -159,8 +171,8 @@ __kernel void arithm_bitwise_not_D2 (__global ushort *src1, int src1_step, int s
 __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *dst,  int dst_step,  int dst_offset,
+                                     __global short *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
@@ -169,8 +181,11 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -194,8 +209,8 @@ __kernel void arithm_bitwise_not_D3 (__global short *src1, int src1_step, int sr
 __kernel void arithm_bitwise_not_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *dst,  int dst_step,  int dst_offset,
+                                     __global int *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);

--- a/modules/ocl/src/opencl/arithm_bitwise_or.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or.cl
@@ -43,7 +43,11 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -51,9 +55,9 @@
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_or without mask**************************************/
 __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
+                                    __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
+                                    __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -61,30 +65,33 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
        int dst_start  = mad24(y, dst_step, dst_offset);
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-      int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-      int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-      if(src1_index < 0)
+        if(src1_index < 0)
-      {
+        {
-        uchar4 tmp;
+            uchar4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-      }
+        }
-      if(src2_index < 0)
+        if(src2_index < 0)
-      {
+        {
-        uchar4 tmp;
+            uchar4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-      }
+        }
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = src1_data | src2_data;
@@ -99,9 +106,9 @@ __kernel void arithm_bitwise_or_D0 (__global uchar *src1, int src1_step, int src
 __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
+                                    __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
+                                    __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -109,8 +116,11 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -135,9 +145,9 @@ __kernel void arithm_bitwise_or_D1 (__global char *src1, int src1_step, int src1
 __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
+                                    __global ushort *src2, int src2_step, int src2_offset,
-                             __global ushort *dst,  int dst_step,  int dst_offset,
+                                    __global ushort *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
@@ -146,8 +156,11 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -173,9 +186,9 @@ __kernel void arithm_bitwise_or_D2 (__global ushort *src1, int src1_step, int sr
 __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
+                                    __global short *src2, int src2_step, int src2_offset,
-                             __global short *dst,  int dst_step,  int dst_offset,
+                                    __global short *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
@@ -184,8 +197,11 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -211,9 +227,9 @@ __kernel void arithm_bitwise_or_D3 (__global short *src1, int src1_step, int src
 __kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
+                                    __global int *src2, int src2_step, int src2_offset,
-                             __global int *dst,  int dst_step,  int dst_offset,
+                                    __global int *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -233,9 +249,9 @@ __kernel void arithm_bitwise_or_D4 (__global int *src1, int src1_step, int src1_
 }
 __kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
+                                    __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
+                                    __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -256,9 +272,9 @@ __kernel void arithm_bitwise_or_D5 (__global char *src1, int src1_step, int src1
 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_bitwise_or_D6 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
+                                    __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
+                                    __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                    int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);

--- a/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or_mask.cl
--- a/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar.cl
--- a/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_or_scalar_mask.cl
--- a/modules/ocl/src/opencl/arithm_bitwise_xor.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor.cl
@@ -43,17 +43,20 @@
 //
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////BITWISE_XOR////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////////////////////////////////////////////////////
 /**************************************bitwise_xor without mask**************************************/
 __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int src1_offset,
-                             __global uchar *src2, int src2_step, int src2_offset,
+                                     __global uchar *src2, int src2_step, int src2_offset,
-                             __global uchar *dst,  int dst_step,  int dst_offset,
+                                     __global uchar *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -61,8 +64,11 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -70,23 +76,23 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        uchar4 src1_data = vload4(0, src1 + src1_index_fix);
        uchar4 src2_data = vload4(0, src2 + src2_index_fix);
-     if(src1_index < 0)
+        if(src1_index < 0)
-     {
+        {
-        uchar4 tmp;
+            uchar4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
+        }
-     if(src2_index < 0)
+        if(src2_index < 0)
-     {
+        {
-        uchar4 tmp;
+            uchar4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        }
        uchar4 dst_data = *((__global uchar4 *)(dst + dst_index));
        uchar4 tmp_data = src1_data ^ src2_data;
@@ -101,9 +107,9 @@ __kernel void arithm_bitwise_xor_D0 (__global uchar *src1, int src1_step, int sr
 __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
+                                     __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -111,8 +117,11 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
@@ -120,23 +129,23 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        char4 src1_data = vload4(0, src1 + src1_index_fix);
        char4 src2_data = vload4(0, src2 + src2_index_fix);
-     if(src1_index < 0)
+        if(src1_index < 0)
-     {
+        {
-        char4 tmp;
+            char4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
+        }
-     if(src2_index < 0)
+        if(src2_index < 0)
-     {
+        {
-        char4 tmp;
+            char4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        }
        char4 dst_data = *((__global char4 *)(dst + dst_index));
        char4 tmp_data = src1_data ^ src2_data;
@@ -151,9 +160,9 @@ __kernel void arithm_bitwise_xor_D1 (__global char *src1, int src1_step, int src
 __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int src1_offset,
-                             __global ushort *src2, int src2_step, int src2_offset,
+                                     __global ushort *src2, int src2_step, int src2_offset,
-                             __global ushort *dst,  int dst_step,  int dst_offset,
+                                     __global ushort *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
@@ -162,8 +171,11 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -171,23 +183,23 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        ushort4 src1_data = vload4(0, (__global ushort *)((__global char *)src1 + src1_index_fix));
        ushort4 src2_data = vload4(0, (__global ushort *)((__global char *)src2 + src2_index_fix));
-     if(src1_index < 0)
+        if(src1_index < 0)
-     {
+        {
-        ushort4 tmp;
+            ushort4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
+        }
-     if(src2_index < 0)
+        if(src2_index < 0)
-     {
+        {
-        ushort4 tmp;
+            ushort4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        }
        ushort4 dst_data = *((__global ushort4 *)((__global char *)dst + dst_index));
        ushort4 tmp_data = src1_data ^ src2_data;
@@ -203,9 +215,9 @@ __kernel void arithm_bitwise_xor_D2 (__global ushort *src1, int src1_step, int s
 __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int src1_offset,
-                             __global short *src2, int src2_step, int src2_offset,
+                                     __global short *src2, int src2_step, int src2_offset,
-                             __global short *dst,  int dst_step,  int dst_offset,
+                                     __global short *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
@@ -214,8 +226,11 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -223,25 +238,25 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
        int dst_index  = mad24(y, dst_step, dst_offset + (x << 1) & (int)0xfffffff8);
-     int src1_index_fix = src1_index < 0 ? 0 : src1_index;
+        int src1_index_fix = src1_index < 0 ? 0 : src1_index;
-     int src2_index_fix = src2_index < 0 ? 0 : src2_index;
+        int src2_index_fix = src2_index < 0 ? 0 : src2_index;
        short4 src1_data = vload4(0, (__global short *)((__global char *)src1 + src1_index_fix));
        short4 src2_data = vload4(0, (__global short *)((__global char *)src2 + src2_index_fix));
        short4 dst_data = *((__global short4 *)((__global char *)dst + dst_index));
-     if(src1_index < 0)
+        if(src1_index < 0)
-     {
+        {
-        short4 tmp;
+            short4 tmp;
-        tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
+            tmp.xyzw = (src1_index == -2) ? src1_data.zwxy:src1_data.yzwx;
-        src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
+            src1_data.xyzw = (src1_index == -1) ? src1_data.wxyz:tmp.xyzw;
-     }
+        }
-     if(src2_index < 0)
+        if(src2_index < 0)
-     {
+        {
-        short4 tmp;
+            short4 tmp;
-        tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
+            tmp.xyzw = (src2_index == -2) ? src2_data.zwxy:src2_data.yzwx;
-        src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
+            src2_data.xyzw = (src2_index == -1) ? src2_data.wxyz:tmp.xyzw;
-     }
+        }
@@ -259,9 +274,9 @@ __kernel void arithm_bitwise_xor_D3 (__global short *src1, int src1_step, int sr
 __kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1_offset,
-                             __global int *src2, int src2_step, int src2_offset,
+                                     __global int *src2, int src2_step, int src2_offset,
-                             __global int *dst,  int dst_step,  int dst_offset,
+                                     __global int *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -281,9 +296,9 @@ __kernel void arithm_bitwise_xor_D4 (__global int *src1, int src1_step, int src1
 }
 __kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
+                                     __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);
@@ -301,12 +316,11 @@ __kernel void arithm_bitwise_xor_D5 (__global char *src1, int src1_step, int src
        *((__global char4 *)((__global char *)dst + dst_index)) = tmp;
    }
 }
 #if defined (DOUBLE_SUPPORT)
 __kernel void arithm_bitwise_xor_D6 (__global char *src1, int src1_step, int src1_offset,
-                             __global char *src2, int src2_step, int src2_offset,
+                                     __global char *src2, int src2_step, int src2_offset,
-                             __global char *dst,  int dst_step,  int dst_offset,
+                                     __global char *dst,  int dst_step,  int dst_offset,
-                             int rows, int cols, int dst_step1)
+                                     int rows, int cols, int dst_step1)
 {
    int x = get_global_id(0);
    int y = get_global_id(1);

--- a/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor_mask.cl
--- a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar.cl
--- a/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl
+++ b/modules/ocl/src/opencl/arithm_bitwise_xor_scalar_mask.cl
--- a/modules/ocl/src/opencl/arithm_compare_eq.cl
+++ b/modules/ocl/src/opencl/arithm_compare_eq.cl
--- a/modules/ocl/src/opencl/arithm_compare_ne.cl
+++ b/modules/ocl/src/opencl/arithm_compare_ne.cl
--- a/modules/ocl/src/opencl/arithm_div.cl
+++ b/modules/ocl/src/opencl/arithm_div.cl
@@ -44,7 +44,11 @@
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 typedef double F ;
 typedef double4 F4;
 #define convert_F4 convert_double4
@@ -56,34 +60,24 @@ typedef float4 F4;
 #define convert_F  float
 #endif
-uchar round2_uchar(F v){
+inline uchar round2_uchar(F v)
+{
-    uchar v1 = convert_uchar_sat(round(v));
+    return convert_uchar_sat(round(v));
-    //uchar v2 = convert_uchar_sat(v+(v>=0 ? 0.5 : -0.5));
-    return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
 }
-ushort round2_ushort(F v){
+inline ushort round2_ushort(F v)
+{
-    ushort v1 = convert_ushort_sat(round(v));
+    return convert_ushort_sat(round(v));
-    //ushort v2 = convert_ushort_sat(v+(v>=0 ? 0.5 : -0.5));
-    return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
 }
-short round2_short(F v){
-    short v1 = convert_short_sat(round(v));
-    //short v2 = convert_short_sat(v+(v>=0 ? 0.5 : -0.5));
-    return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
+inline short round2_short(F v)
+{
+    return convert_short_sat(round(v));
 }
-int round2_int(F v){
-    int v1 = convert_int_sat(round(v));
-    //int v2 = convert_int_sat(v+(v>=0 ? 0.5 : -0.5));
-    return v1;//(((v-v1)==0.5) && (v1%2==0)) ? v1 : v2;
+inline int round2_int(F v)
+{
+    return convert_int_sat(round(v));
 }
 ///////////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////divide///////////////////////////////////////////////////
@@ -94,39 +88,41 @@ __kernel void arithm_div_D0 (__global uchar *src1, int src1_step, int src1_offse
                             __global uchar *dst,  int dst_step,  int dst_offset,
                             int rows, int cols, int dst_step1, F scalar)
 {
-    int x = get_global_id(0);
+    int2 coor = (int2)(get_global_id(0), get_global_id(1));
-    int y = get_global_id(1);
-    if (x < cols && y < rows)
+    if (coor.x < cols && coor.y < rows)
    {
-        x = x << 2;
+        coor.x = coor.x << 2;
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
+        int2 src_index = (int2)(mad24(coor.y, src1_step, coor.x + src1_offset - dst_align),
+                                mad24(coor.y, src2_step, coor.x + src2_offset - dst_align));
-        #define dst_align (dst_offset & 3)
+        int4 dst_args  = (int4)(mad24(coor.y, dst_step, dst_offset),
-        int src1_index = mad24(y, src1_step, x + src1_offset - dst_align);
+                                mad24(coor.y, dst_step, dst_offset + dst_step1),
-        int src2_index = mad24(y, src2_step, x + src2_offset - dst_align);
+                                mad24(coor.y, dst_step, dst_offset + coor.x & (int)0xfffffffc),
+                                0);
-        int dst_start  = mad24(y, dst_step, dst_offset);
+        uchar4 src1_data = vload4(0, src1 + src_index.x);
-        int dst_end    = mad24(y, dst_step, dst_offset + dst_step1);
+        uchar4 src2_data = vload4(0, src2 + src_index.y);
-        int dst_index  = mad24(y, dst_step, dst_offset + x & (int)0xfffffffc);
+        uchar4 dst_data  = *((__global uchar4 *)(dst + dst_args.z));
-        uchar4 src1_data = vload4(0, src1 + src1_index);
-        uchar4 src2_data = vload4(0, src2 + src2_index);
-        uchar4 dst_data  = *((__global uchar4 *)(dst + dst_index));
        F4 tmp      = convert_F4(src1_data) * scalar;
        uchar4 tmp_data;
-        tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / (F)src2_data.x);
+        tmp_data.x = ((tmp.x == 0) || (src2_data.x == 0)) ? 0 : round2_uchar(tmp.x / src2_data.x);
-        tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / (F)src2_data.y);
+        tmp_data.y = ((tmp.y == 0) || (src2_data.y == 0)) ? 0 : round2_uchar(tmp.y / src2_data.y);
-        tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / (F)src2_data.z);
+        tmp_data.z = ((tmp.z == 0) || (src2_data.z == 0)) ? 0 : round2_uchar(tmp.z / src2_data.z);
-        tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / (F)src2_data.w);
+        tmp_data.w = ((tmp.w == 0) || (src2_data.w == 0)) ? 0 : round2_uchar(tmp.w / src2_data.w);
-        dst_data.x = ((dst_index + 0 >= dst_start) && (dst_index + 0 < dst_end)) ? tmp_data.x : dst_data.x;
+        dst_data.x = ((dst_args.z + 0 >= dst_args.x) && (dst_args.z + 0 < dst_args.y)) ? tmp_data.x : dst_data.x;
-        dst_data.y = ((dst_index + 1 >= dst_start) && (dst_index + 1 < dst_end)) ? tmp_data.y : dst_data.y;
+        dst_data.y = ((dst_args.z + 1 >= dst_args.x) && (dst_args.z + 1 < dst_args.y)) ? tmp_data.y : dst_data.y;
-        dst_data.z = ((dst_index + 2 >= dst_start) && (dst_index + 2 < dst_end)) ? tmp_data.z : dst_data.z;
+        dst_data.z = ((dst_args.z + 2 >= dst_args.x) && (dst_args.z + 2 < dst_args.y)) ? tmp_data.z : dst_data.z;
-        dst_data.w = ((dst_index + 3 >= dst_start) && (dst_index + 3 < dst_end)) ? tmp_data.w : dst_data.w;
+        dst_data.w = ((dst_args.z + 3 >= dst_args.x) && (dst_args.z + 3 < dst_args.y)) ? tmp_data.w : dst_data.w;
-        *((__global uchar4 *)(dst + dst_index)) = dst_data;
+        *((__global uchar4 *)(dst + dst_args.z)) = dst_data;
    }
 }
@@ -141,8 +137,11 @@ __kernel void arithm_div_D2 (__global ushort *src1, int src1_step, int src1_offs
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -181,8 +180,11 @@ __kernel void arithm_div_D3 (__global short *src1, int src1_step, int src1_offse
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src1_index = mad24(y, src1_step, (x << 1) + src1_offset - (dst_align << 1));
        int src2_index = mad24(y, src2_step, (x << 1) + src2_offset - (dst_align << 1));
@@ -296,8 +298,11 @@ __kernel void arithm_s_div_D0 (__global uchar *src, int src_step, int src_offset
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src_index = mad24(y, src_step, x + src_offset - dst_align);
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -332,8 +337,11 @@ __kernel void arithm_s_div_D2 (__global ushort *src, int src_step, int src_offse
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -367,8 +375,11 @@ __kernel void arithm_s_div_D3 (__global short *src, int src_step, int src_offset
    if (x < cols && y < rows)
    {
        x = x << 2;
-        #define dst_align ((dst_offset >> 1) & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align ((dst_offset >> 1) & 3)
        int src_index = mad24(y, src_step, (x << 1) + src_offset - (dst_align << 1));
        int dst_start  = mad24(y, dst_step, dst_offset);
@@ -455,3 +466,5 @@ __kernel void arithm_s_div_D6 (__global double *src, int src_step, int src_offse
    }
 }
 #endif
--- a/modules/ocl/src/opencl/arithm_flip.cl
+++ b/modules/ocl/src/opencl/arithm_flip.cl
@@ -44,7 +44,11 @@
 //M*/
 #if defined (DOUBLE_SUPPORT)
+#ifdef cl_khr_fp64
 #pragma OPENCL EXTENSION cl_khr_fp64:enable
+#elif defined (cl_amd_fp64)
+#pragma OPENCL EXTENSION cl_amd_fp64:enable
+#endif
 #endif
 //////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -60,8 +64,11 @@ __kernel void arithm_flip_rows_D0 (__global uchar *src, int src_step, int src_of
    if (x < cols && y < thread_rows)
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src_index_0 = mad24(y,            src_step, x + src_offset - dst_align);
        int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
@@ -115,8 +122,11 @@ __kernel void arithm_flip_rows_D1 (__global char *src, int src_step, int src_off
    if (x < cols && y < thread_rows)
    {
        x = x << 2;
-        #define dst_align (dst_offset & 3)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (dst_offset & 3)
        int src_index_0 = mad24(y,            src_step, x + src_offset - dst_align);
        int src_index_1 = mad24(rows - y - 1, src_step, x + src_offset - dst_align);
@@ -157,8 +167,11 @@ __kernel void arithm_flip_rows_D2 (__global ushort *src, int src_step, int src_o
    if (x < cols && y < thread_rows)
    {
        x = x << 2;
-        #define dst_align (((dst_offset >> 1) & 3) << 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset >> 1) & 3) << 1)
        int src_index_0 = mad24(y,            src_step, (x << 1) + src_offset - dst_align);
        int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);
@@ -199,8 +212,11 @@ __kernel void arithm_flip_rows_D3 (__global short *src, int src_step, int src_of
    if (x < cols && y < thread_rows)
    {
        x = x << 2;
-        #define dst_align (((dst_offset >> 1) & 3) << 1)
+#ifdef dst_align
+#undef dst_align
+#endif
+#define dst_align (((dst_offset >> 1) & 3) << 1)
        int src_index_0 = mad24(y,            src_step, (x << 1) + src_offset - dst_align);
        int src_index_1 = mad24(rows - y - 1, src_step, (x << 1) + src_offset - dst_align);

--- a/modules/ocl/src/opencl/arithm_mul.cl
+++ b/modules/ocl/src/opencl/arithm_mul.cl