diff --git a/modules/core/perf/opencl/perf_arithm.cpp b/modules/core/perf/opencl/perf_arithm.cpp
index 98f65047303659420e90f38e60734152d70e26bd..f4680aacbd6c274327bc7a230829c5b0040f0c04 100644
--- a/modules/core/perf/opencl/perf_arithm.cpp
+++ b/modules/core/perf/opencl/perf_arithm.cpp
@@ -344,7 +344,7 @@ OCL_PERF_TEST_P(FlipFixture, Flip,
 typedef Size_MatType MinMaxLocFixture;
 
 OCL_PERF_TEST_P(MinMaxLocFixture, MinMaxLoc,
-            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134))
 {
     const Size_MatType_t params = GetParam();
     const Size srcSize = get<0>(params);
@@ -380,7 +380,7 @@ typedef Size_MatType SumFixture;
 
 OCL_PERF_TEST_P(SumFixture, Sum,
             ::testing::Combine(OCL_TEST_SIZES,
-                               OCL_TEST_TYPES))
+                               OCL_TEST_TYPES_134))
 {
     const Size_MatType_t params = GetParam();
     const Size srcSize = get<0>(params);
@@ -447,7 +447,7 @@ OCL_PERF_TEST_P(PhaseFixture, Phase, ::testing::Combine(
     SANITY_CHECK(dst, 1e-2);
 }
 
-///////////// bitwise_and////////////////////////
+///////////// bitwise_and ////////////////////////
 
 typedef Size_MatType BitwiseAndFixture;
 
@@ -531,7 +531,7 @@ OCL_PERF_TEST_P(BitwiseNotFixture, Bitwise_not,
     SANITY_CHECK(dst);
 }
 
-///////////// compare////////////////////////
+///////////// compare ////////////////////////
 
 CV_ENUM(CmpCode, CMP_LT, CMP_LE, CMP_EQ, CMP_NE, CMP_GE, CMP_GT)
 
@@ -652,7 +652,8 @@ OCL_PERF_TEST_P(SetIdentityFixture, SetIdentity,
 typedef Size_MatType MeanStdDevFixture;
 
 OCL_PERF_TEST_P(MeanStdDevFixture, MeanStdDev,
-                ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES))
+                ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
+                                   OCL_TEST_TYPES_134))
 {
     const Size_MatType_t params = GetParam();
     const Size srcSize = get<0>(params);
@@ -688,7 +689,8 @@ typedef std::tr1::tuple<Size, MatType, NormType> NormParams;
 typedef TestBaseWithParam<NormParams> NormFixture;
 
 OCL_PERF_TEST_P(NormFixture, Norm,
-                ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES, NormType::all()))
+                ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
+                                   OCL_TEST_TYPES_134, NormType::all()))
 {
     const NormParams params = GetParam();
     const Size srcSize = get<0>(params);
@@ -711,7 +713,8 @@ OCL_PERF_TEST_P(NormFixture, Norm,
 typedef Size_MatType UMatDotFixture;
 
 OCL_PERF_TEST_P(UMatDotFixture, UMatDot,
-            ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES))
+            ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
+                               OCL_TEST_TYPES_134))
 {
     const Size_MatType_t params = GetParam();
     const Size srcSize = get<0>(params);
@@ -820,7 +823,8 @@ typedef tuple<Size, MatType, NormalizeModes> NormalizeParams;
 typedef TestBaseWithParam<NormalizeParams> NormalizeFixture;
 
 OCL_PERF_TEST_P(NormalizeFixture, Normalize,
-                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, NormalizeModes::all()))
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134,
+                                   NormalizeModes::all()))
 {
     const NormalizeParams params = GetParam();
     const Size srcSize = get<0>(params);
diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl
index 0a0538ed6afbecbb52cce57b86a9d155076a9310..ed935881da8a05174265eea945ae4a81629ae90f 100644
--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -52,6 +52,18 @@
 
 #define noconvert
 
+#if cn != 3 // any channel count but 3: pixels are accessed directly through srcT/dstT typed pointers
+#define loadpix(addr) (*(__global const srcT *)(addr))
+#define storepix(val, addr) *(__global dstT *)(addr) = (val)
+#define srcTSIZE ((int)sizeof(srcT))
+#define dstTSIZE ((int)sizeof(dstT))
+#else // cn == 3: use vload3/vstore3 on the element type and a 3-element stride (OpenCL 3-component vectors have the size of 4-component ones)
+#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
+#define srcTSIZE ((int)sizeof(srcT1)*3)
+#define dstTSIZE ((int)sizeof(dstT1)*3)
+#endif // cn != 3
+
 #ifdef HAVE_MASK
 #define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset
 #else
@@ -88,19 +100,20 @@
 
 #ifdef HAVE_MASK
 #define REDUCE_GLOBAL \
-    dstT temp = convertToDT(src[0]); \
     int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols)); \
     if (mask[mask_index]) \
-        FUNC(accumulator, temp)
+    { /* the pixel is loaded and converted only when its mask byte is non-zero */ \
+        dstT temp = convertToDT(loadpix(srcptr + src_index)); \
+        FUNC(accumulator, temp); \
+    }
 #elif defined OP_DOT
 #define REDUCE_GLOBAL \
-    int src2_index = mad24(id / cols, src2_step, mad24(id % cols, (int)sizeof(srcT), src2_offset)); \
-    __global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index); \
-    dstT temp = convertToDT(src[0]), temp2 = convertToDT(src2[0]); \
+    int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset)); \
+    dstT temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
     FUNC(accumulator, temp, temp2)
 #else
 #define REDUCE_GLOBAL \
-    dstT temp = convertToDT(src[0]); \
+    dstT temp = convertToDT(loadpix(srcptr + src_index)); \
     FUNC(accumulator, temp)
 #endif
 
@@ -111,8 +124,7 @@
 #define REDUCE_LOCAL_2 \
     localmem[lid] += localmem[lid2]
 #define CALC_RESULT \
-    __global dstT * dst = (__global dstT *)(dstptr + (int)sizeof(dstT) * gid); \
-    dst[0] = localmem[0]
+    storepix(localmem[0], dstptr + dstTSIZE * gid)
 
 // countNonZero stuff
 #elif defined OP_COUNT_NON_ZERO
@@ -123,7 +135,7 @@
     dstT accumulator = (dstT)(0); \
     srcT zero = (srcT)(0), one = (srcT)(1)
 #define REDUCE_GLOBAL \
-    accumulator += src[0] == zero ? zero : one
+    accumulator += loadpix(srcptr + src_index) == zero ? zero : one
 #define SET_LOCAL_1 \
     localmem[lid] = accumulator
 #define REDUCE_LOCAL_1 \
@@ -131,8 +143,7 @@
 #define REDUCE_LOCAL_2 \
     localmem[lid] += localmem[lid2]
 #define CALC_RESULT \
-    __global dstT * dst = (__global dstT *)(dstptr + (int)sizeof(dstT) * gid); \
-    dst[0] = localmem[0]
+    storepix(localmem[0], dstptr + dstTSIZE * gid)
 
 // minMaxLoc stuff
 #elif defined OP_MIN_MAX_LOC || defined OP_MIN_MAX_LOC_MASK
@@ -167,6 +178,8 @@
 #define MAX_VAL DBL_MAX
 #endif
 
+#define dstT srcT // min/max values are stored as srcT; the cn != 3 storepix()/dstTSIZE macros expand to dstT
+
 #define DECLARE_LOCAL_MEM \
     __local srcT localmem_min[WGS2_ALIGNED]; \
     __local srcT localmem_max[WGS2_ALIGNED]; \
@@ -181,7 +194,7 @@
     srcT temp; \
     int temploc
 #define REDUCE_GLOBAL \
-    temp = src[0]; \
+    temp = loadpix(srcptr + src_index); \
     temploc = id; \
     srcT temp_minval = minval, temp_maxval = maxval; \
     minval = min(minval, temp); \
@@ -217,10 +230,8 @@
         localmem_maxloc[lid] : (max1 == max2) ? (max1 == oldmax) ? min(localmem_maxloc[lid2],localmem_maxloc[lid]) : \
         localmem_maxloc[lid2] : localmem_maxloc[lid]
 #define CALC_RESULT \
-    __global srcT * dstminval = (__global srcT *)(dstptr + (int)sizeof(srcT) * gid); \
-    __global srcT * dstmaxval = (__global srcT *)(dstptr2 + (int)sizeof(srcT) * gid); \
-    dstminval[0] = localmem_min[0]; \
-    dstmaxval[0] = localmem_max[0]; \
+    storepix(localmem_min[0], dstptr + dstTSIZE * gid); \
+    storepix(localmem_max[0], dstptr2 + dstTSIZE * gid); \
     dstlocptr[gid] = localmem_minloc[0]; \
     dstlocptr2[gid] = localmem_maxloc[0]
 
@@ -236,7 +247,7 @@
     int temploc
 #undef REDUCE_GLOBAL
 #define REDUCE_GLOBAL \
-    temp = src[0]; \
+    temp = loadpix(srcptr + src_index); \
     temploc = id; \
     int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols) * (int)sizeof(uchar)); \
     __global const uchar * mask = (__global const uchar *)(maskptr + mask_index); \
@@ -278,8 +289,7 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset
 
     for (int grain = groupnum * WGS; id < total; id += grain)
     {
-        int src_index = mad24(id / cols, src_step, mad24(id % cols, (int)sizeof(srcT), src_offset));
-        __global const srcT * src = (__global const srcT *)(srcptr + src_index);
+        int src_index = mad24(id / cols, src_step, mad24(id % cols, srcTSIZE, src_offset));
         REDUCE_GLOBAL;
     }
 
diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp
index 3cecc9ab0d624ed47460e09ff686d9bd35928777..a4605d1605b2366ff4f55b42d79cd05f04bfb65f 100644
--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -475,7 +475,7 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
 
-    if ( (!doubleSupport && depth == CV_64F) || cn > 4 || cn == 3 )
+    if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
         return false;
 
     int dbsize = ocl::Device::getDefault().maxComputeUnits();
@@ -494,8 +494,11 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
     static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
     char cvt[40];
     ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
-                  format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
-                         ocl::typeToStr(type), ocl::typeToStr(dtype), ddepth, ocl::convertTypeStr(depth, ddepth, cn, cvt),
+                  format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
+                         " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
+                         ocl::typeToStr(type), ocl::typeToStr(depth),
+                         ocl::typeToStr(dtype), ocl::typeToStr(ddepth), ddepth, cn,
+                         ocl::convertTypeStr(depth, ddepth, cn, cvt),
                          opMap[sum_op], (int)wgs, wgs2_aligned,
                          doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                          haveMask ? " -D HAVE_MASK" : ""));