Merge pull request #2470 from ilya-lavrenov:tapi_reduction

c5bf1c10 · Andrey Pavlenko · OpenCV Buildbot · 42335396 · 9932cf41 · c5bf1c10
3 changed file
--- a/modules/core/perf/opencl/perf_arithm.cpp
+++ b/modules/core/perf/opencl/perf_arithm.cpp
@@ -344,7 +344,7 @@ OCL_PERF_TEST_P(FlipFixture, Flip,
 typedef Size_MatType MinMaxLocFixture;

 OCL_PERF_TEST_P(MinMaxLocFixture, MinMaxLoc,
-            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES))
+            ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134))
 {
    const Size_MatType_t params = GetParam();
    const Size srcSize = get<0>(params);
@@ -380,7 +380,7 @@ typedef Size_MatType SumFixture;

 OCL_PERF_TEST_P(SumFixture, Sum,
            ::testing::Combine(OCL_TEST_SIZES,
-                               OCL_TEST_TYPES))
+                               OCL_TEST_TYPES_134))
 {
    const Size_MatType_t params = GetParam();
    const Size srcSize = get<0>(params);
@@ -447,7 +447,7 @@ OCL_PERF_TEST_P(PhaseFixture, Phase, ::testing::Combine(
    SANITY_CHECK(dst, 1e-2);
 }

-///////////// bitwise_and////////////////////////
+///////////// bitwise_and ////////////////////////

 typedef Size_MatType BitwiseAndFixture;

@@ -531,7 +531,7 @@ OCL_PERF_TEST_P(BitwiseNotFixture, Bitwise_not,
    SANITY_CHECK(dst);
 }

-///////////// compare////////////////////////
+///////////// compare ////////////////////////

 CV_ENUM(CmpCode, CMP_LT, CMP_LE, CMP_EQ, CMP_NE, CMP_GE, CMP_GT)

@@ -652,7 +652,8 @@ OCL_PERF_TEST_P(SetIdentityFixture, SetIdentity,
 typedef Size_MatType MeanStdDevFixture;

 OCL_PERF_TEST_P(MeanStdDevFixture, MeanStdDev,
-                ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES))
+                ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
+                                   OCL_TEST_TYPES_134))
 {
    const Size_MatType_t params = GetParam();
    const Size srcSize = get<0>(params);
@@ -688,7 +689,8 @@ typedef std::tr1::tuple<Size, MatType, NormType> NormParams;
 typedef TestBaseWithParam<NormParams> NormFixture;

 OCL_PERF_TEST_P(NormFixture, Norm,
-                ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES, NormType::all()))
+                ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
+                                   OCL_TEST_TYPES_134, NormType::all()))
 {
    const NormParams params = GetParam();
    const Size srcSize = get<0>(params);
@@ -711,7 +713,8 @@ OCL_PERF_TEST_P(NormFixture, Norm,
 typedef Size_MatType UMatDotFixture;

 OCL_PERF_TEST_P(UMatDotFixture, UMatDot,
-            ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES))
+            ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3),
+                               OCL_TEST_TYPES_134))
 {
    const Size_MatType_t params = GetParam();
    const Size srcSize = get<0>(params);
@@ -820,7 +823,8 @@ typedef tuple<Size, MatType, NormalizeModes> NormalizeParams;
 typedef TestBaseWithParam<NormalizeParams> NormalizeFixture;

 OCL_PERF_TEST_P(NormalizeFixture, Normalize,
-                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, NormalizeModes::all()))
+                ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134,
+                                   NormalizeModes::all()))
 {
    const NormalizeParams params = GetParam();
    const Size srcSize = get<0>(params);

--- a/modules/core/src/opencl/reduce.cl
+++ b/modules/core/src/opencl/reduce.cl
@@ -52,6 +52,18 @@

 #define noconvert

+#if cn != 3
+#define loadpix(addr) *(__global const srcT *)(addr)
+#define storepix(val, addr)  *(__global dstT *)(addr) = val
+#define srcTSIZE (int)sizeof(srcT)
+#define dstTSIZE (int)sizeof(dstT)
+#else
+#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr))
+#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
+#define srcTSIZE ((int)sizeof(srcT1)*3)
+#define dstTSIZE ((int)sizeof(dstT1)*3)
+#endif
+
 #ifdef HAVE_MASK
 #define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset
 #else
@@ -88,19 +100,20 @@

 #ifdef HAVE_MASK
 #define REDUCE_GLOBAL \
-    dstT temp = convertToDT(src[0]); \
    int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols)); \
    if (mask[mask_index]) \
-        FUNC(accumulator, temp)
+    { \
+        dstT temp = convertToDT(loadpix(srcptr + src_index)); \
+        FUNC(accumulator, temp); \
+    }
 #elif defined OP_DOT
 #define REDUCE_GLOBAL \
-    int src2_index = mad24(id / cols, src2_step, mad24(id % cols, (int)sizeof(srcT), src2_offset)); \
-    __global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index); \
-    dstT temp = convertToDT(src[0]), temp2 = convertToDT(src2[0]); \
+    int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset)); \
+    dstT temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \
    FUNC(accumulator, temp, temp2)
 #else
 #define REDUCE_GLOBAL \
-    dstT temp = convertToDT(src[0]); \
+    dstT temp = convertToDT(loadpix(srcptr + src_index)); \
    FUNC(accumulator, temp)
 #endif

@@ -111,8 +124,7 @@
 #define REDUCE_LOCAL_2 \
    localmem[lid] += localmem[lid2]
 #define CALC_RESULT \
-    __global dstT * dst = (__global dstT *)(dstptr + (int)sizeof(dstT) * gid); \
-    dst[0] = localmem[0]
+    storepix(localmem[0], dstptr + dstTSIZE * gid)

 // countNonZero stuff
 #elif defined OP_COUNT_NON_ZERO
@@ -123,7 +135,7 @@
    dstT accumulator = (dstT)(0); \
    srcT zero = (srcT)(0), one = (srcT)(1)
 #define REDUCE_GLOBAL \
-    accumulator += src[0] == zero ? zero : one
+    accumulator += loadpix(srcptr + src_index) == zero ? zero : one
 #define SET_LOCAL_1 \
    localmem[lid] = accumulator
 #define REDUCE_LOCAL_1 \
@@ -131,8 +143,7 @@
 #define REDUCE_LOCAL_2 \
    localmem[lid] += localmem[lid2]
 #define CALC_RESULT \
-    __global dstT * dst = (__global dstT *)(dstptr + (int)sizeof(dstT) * gid); \
-    dst[0] = localmem[0]
+    storepix(localmem[0], dstptr + dstTSIZE * gid)

 // minMaxLoc stuff
 #elif defined OP_MIN_MAX_LOC || defined OP_MIN_MAX_LOC_MASK
@@ -167,6 +178,8 @@
 #define MAX_VAL DBL_MAX
 #endif

+#define dstT srcT
+
 #define DECLARE_LOCAL_MEM \
    __local srcT localmem_min[WGS2_ALIGNED]; \
    __local srcT localmem_max[WGS2_ALIGNED]; \
@@ -181,7 +194,7 @@
    srcT temp; \
    int temploc
 #define REDUCE_GLOBAL \
-    temp = src[0]; \
+    temp = loadpix(srcptr + src_index); \
    temploc = id; \
    srcT temp_minval = minval, temp_maxval = maxval; \
    minval = min(minval, temp); \
@@ -217,10 +230,8 @@
        localmem_maxloc[lid] : (max1 == max2) ? (max1 == oldmax) ? min(localmem_maxloc[lid2],localmem_maxloc[lid]) : \
        localmem_maxloc[lid2] : localmem_maxloc[lid]
 #define CALC_RESULT \
-    __global srcT * dstminval = (__global srcT *)(dstptr + (int)sizeof(srcT) * gid); \
-    __global srcT * dstmaxval = (__global srcT *)(dstptr2 + (int)sizeof(srcT) * gid); \
-    dstminval[0] = localmem_min[0]; \
-    dstmaxval[0] = localmem_max[0]; \
+    storepix(localmem_min[0], dstptr + dstTSIZE * gid); \
+    storepix(localmem_max[0], dstptr2 + dstTSIZE * gid); \
    dstlocptr[gid] = localmem_minloc[0]; \
    dstlocptr2[gid] = localmem_maxloc[0]

@@ -236,7 +247,7 @@
    int temploc
 #undef REDUCE_GLOBAL
 #define REDUCE_GLOBAL \
-    temp = src[0]; \
+    temp = loadpix(srcptr + src_index); \
    temploc = id; \
    int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols) * (int)sizeof(uchar)); \
    __global const uchar * mask = (__global const uchar *)(maskptr + mask_index); \
@@ -278,8 +289,7 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset

    for (int grain = groupnum * WGS; id < total; id += grain)
    {
-        int src_index = mad24(id / cols, src_step, mad24(id % cols, (int)sizeof(srcT), src_offset));
-        __global const srcT * src = (__global const srcT *)(srcptr + src_index);
+        int src_index = mad24(id / cols, src_step, mad24(id % cols, srcTSIZE, src_offset));
        REDUCE_GLOBAL;
    }


--- a/modules/core/src/stat.cpp
+++ b/modules/core/src/stat.cpp
@@ -475,7 +475,7 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
    int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;

-    if ( (!doubleSupport && depth == CV_64F) || cn > 4 || cn == 3 )
+    if ( (!doubleSupport && depth == CV_64F) || cn > 4 )
        return false;

    int dbsize = ocl::Device::getDefault().maxComputeUnits();
@@ -494,8 +494,11 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask
    static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" };
    char cvt[40];
    ocl::Kernel k("reduce", ocl::core::reduce_oclsrc,
-                  format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
-                         ocl::typeToStr(type), ocl::typeToStr(dtype), ddepth, ocl::convertTypeStr(depth, ddepth, cn, cvt),
+                  format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D ddepth=%d -D cn=%d"
+                         " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s",
+                         ocl::typeToStr(type), ocl::typeToStr(depth),
+                         ocl::typeToStr(dtype), ocl::typeToStr(ddepth), ddepth, cn,
+                         ocl::convertTypeStr(depth, ddepth, cn, cvt),
                         opMap[sum_op], (int)wgs, wgs2_aligned,
                         doubleSupport ? " -D DOUBLE_SUPPORT" : "",
                         haveMask ? " -D HAVE_MASK" : ""));