diff --git a/modules/core/perf/opencl/perf_arithm.cpp b/modules/core/perf/opencl/perf_arithm.cpp index 98f65047303659420e90f38e60734152d70e26bd..f4680aacbd6c274327bc7a230829c5b0040f0c04 100644 --- a/modules/core/perf/opencl/perf_arithm.cpp +++ b/modules/core/perf/opencl/perf_arithm.cpp @@ -344,7 +344,7 @@ OCL_PERF_TEST_P(FlipFixture, Flip, typedef Size_MatType MinMaxLocFixture; OCL_PERF_TEST_P(MinMaxLocFixture, MinMaxLoc, - ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES)) + ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134)) { const Size_MatType_t params = GetParam(); const Size srcSize = get<0>(params); @@ -380,7 +380,7 @@ typedef Size_MatType SumFixture; OCL_PERF_TEST_P(SumFixture, Sum, ::testing::Combine(OCL_TEST_SIZES, - OCL_TEST_TYPES)) + OCL_TEST_TYPES_134)) { const Size_MatType_t params = GetParam(); const Size srcSize = get<0>(params); @@ -447,7 +447,7 @@ OCL_PERF_TEST_P(PhaseFixture, Phase, ::testing::Combine( SANITY_CHECK(dst, 1e-2); } -///////////// bitwise_and//////////////////////// +///////////// bitwise_and //////////////////////// typedef Size_MatType BitwiseAndFixture; @@ -531,7 +531,7 @@ OCL_PERF_TEST_P(BitwiseNotFixture, Bitwise_not, SANITY_CHECK(dst); } -///////////// compare//////////////////////// +///////////// compare //////////////////////// CV_ENUM(CmpCode, CMP_LT, CMP_LE, CMP_EQ, CMP_NE, CMP_GE, CMP_GT) @@ -652,7 +652,8 @@ OCL_PERF_TEST_P(SetIdentityFixture, SetIdentity, typedef Size_MatType MeanStdDevFixture; OCL_PERF_TEST_P(MeanStdDevFixture, MeanStdDev, - ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES)) + ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), + OCL_TEST_TYPES_134)) { const Size_MatType_t params = GetParam(); const Size srcSize = get<0>(params); @@ -688,7 +689,8 @@ typedef std::tr1::tuple NormParams; typedef TestBaseWithParam NormFixture; OCL_PERF_TEST_P(NormFixture, Norm, - ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES, NormType::all())) + ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), + OCL_TEST_TYPES_134, NormType::all())) { const NormParams params = GetParam(); const Size srcSize = get<0>(params); @@ -711,7 +713,8 @@ OCL_PERF_TEST_P(NormFixture, Norm, typedef Size_MatType UMatDotFixture; OCL_PERF_TEST_P(UMatDotFixture, UMatDot, - ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), OCL_TEST_TYPES)) + ::testing::Combine(OCL_PERF_ENUM(OCL_SIZE_1, OCL_SIZE_2, OCL_SIZE_3), + OCL_TEST_TYPES_134)) { const Size_MatType_t params = GetParam(); const Size srcSize = get<0>(params); @@ -820,7 +823,8 @@ typedef tuple NormalizeParams; typedef TestBaseWithParam NormalizeFixture; OCL_PERF_TEST_P(NormalizeFixture, Normalize, - ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES, NormalizeModes::all())) + ::testing::Combine(OCL_TEST_SIZES, OCL_TEST_TYPES_134, + NormalizeModes::all())) { const NormalizeParams params = GetParam(); const Size srcSize = get<0>(params); diff --git a/modules/core/src/opencl/reduce.cl b/modules/core/src/opencl/reduce.cl index 0a0538ed6afbecbb52cce57b86a9d155076a9310..ed935881da8a05174265eea945ae4a81629ae90f 100644 --- a/modules/core/src/opencl/reduce.cl +++ b/modules/core/src/opencl/reduce.cl @@ -52,6 +52,18 @@ #define noconvert +#if cn != 3 +#define loadpix(addr) *(__global const srcT *)(addr) +#define storepix(val, addr) *(__global dstT *)(addr) = val +#define srcTSIZE (int)sizeof(srcT) +#define dstTSIZE (int)sizeof(dstT) +#else +#define loadpix(addr) vload3(0, (__global const srcT1 *)(addr)) +#define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr)) +#define srcTSIZE ((int)sizeof(srcT1)*3) +#define dstTSIZE ((int)sizeof(dstT1)*3) +#endif + #ifdef HAVE_MASK #define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset #else @@ -88,19 +100,20 @@ #ifdef HAVE_MASK #define REDUCE_GLOBAL \ - dstT temp = convertToDT(src[0]); \ int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols)); \ if (mask[mask_index]) \ - FUNC(accumulator, temp) + { \ + dstT temp = convertToDT(loadpix(srcptr + src_index)); \ + FUNC(accumulator, temp); \ + } #elif defined OP_DOT #define REDUCE_GLOBAL \ - int src2_index = mad24(id / cols, src2_step, mad24(id % cols, (int)sizeof(srcT), src2_offset)); \ - __global const srcT * src2 = (__global const srcT *)(src2ptr + src2_index); \ - dstT temp = convertToDT(src[0]), temp2 = convertToDT(src2[0]); \ + int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset)); \ + dstT temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ FUNC(accumulator, temp, temp2) #else #define REDUCE_GLOBAL \ - dstT temp = convertToDT(src[0]); \ + dstT temp = convertToDT(loadpix(srcptr + src_index)); \ FUNC(accumulator, temp) #endif @@ -111,8 +124,7 @@ #define REDUCE_LOCAL_2 \ localmem[lid] += localmem[lid2] #define CALC_RESULT \ - __global dstT * dst = (__global dstT *)(dstptr + (int)sizeof(dstT) * gid); \ - dst[0] = localmem[0] + storepix(localmem[0], dstptr + dstTSIZE * gid) // countNonZero stuff #elif defined OP_COUNT_NON_ZERO @@ -123,7 +135,7 @@ dstT accumulator = (dstT)(0); \ srcT zero = (srcT)(0), one = (srcT)(1) #define REDUCE_GLOBAL \ - accumulator += src[0] == zero ? zero : one + accumulator += loadpix(srcptr + src_index) == zero ? zero : one #define SET_LOCAL_1 \ localmem[lid] = accumulator #define REDUCE_LOCAL_1 \ @@ -131,8 +143,7 @@ #define REDUCE_LOCAL_2 \ localmem[lid] += localmem[lid2] #define CALC_RESULT \ - __global dstT * dst = (__global dstT *)(dstptr + (int)sizeof(dstT) * gid); \ - dst[0] = localmem[0] + storepix(localmem[0], dstptr + dstTSIZE * gid) // minMaxLoc stuff #elif defined OP_MIN_MAX_LOC || defined OP_MIN_MAX_LOC_MASK @@ -167,6 +178,8 @@ #define MAX_VAL DBL_MAX #endif +#define dstT srcT + #define DECLARE_LOCAL_MEM \ __local srcT localmem_min[WGS2_ALIGNED]; \ __local srcT localmem_max[WGS2_ALIGNED]; \ @@ -181,7 +194,7 @@ srcT temp; \ int temploc #define REDUCE_GLOBAL \ - temp = src[0]; \ + temp = loadpix(srcptr + src_index); \ temploc = id; \ srcT temp_minval = minval, temp_maxval = maxval; \ minval = min(minval, temp); \ @@ -217,10 +230,8 @@ localmem_maxloc[lid] : (max1 == max2) ? (max1 == oldmax) ? min(localmem_maxloc[lid2],localmem_maxloc[lid]) : \ localmem_maxloc[lid2] : localmem_maxloc[lid] #define CALC_RESULT \ - __global srcT * dstminval = (__global srcT *)(dstptr + (int)sizeof(srcT) * gid); \ - __global srcT * dstmaxval = (__global srcT *)(dstptr2 + (int)sizeof(srcT) * gid); \ - dstminval[0] = localmem_min[0]; \ - dstmaxval[0] = localmem_max[0]; \ + storepix(localmem_min[0], dstptr + dstTSIZE * gid); \ + storepix(localmem_max[0], dstptr2 + dstTSIZE * gid); \ dstlocptr[gid] = localmem_minloc[0]; \ dstlocptr2[gid] = localmem_maxloc[0] @@ -236,7 +247,7 @@ int temploc #undef REDUCE_GLOBAL #define REDUCE_GLOBAL \ - temp = src[0]; \ + temp = loadpix(srcptr + src_index); \ temploc = id; \ int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols) * (int)sizeof(uchar)); \ __global const uchar * mask = (__global const uchar *)(maskptr + mask_index); \ @@ -278,8 +289,7 @@ __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset for (int grain = groupnum * WGS; id < total; id += grain) { - int src_index = mad24(id / cols, src_step, mad24(id % cols, (int)sizeof(srcT), src_offset)); - __global const srcT * src = (__global const srcT *)(srcptr + src_index); + int src_index = mad24(id / cols, src_step, mad24(id % cols, srcTSIZE, src_offset)); REDUCE_GLOBAL; } diff --git a/modules/core/src/stat.cpp b/modules/core/src/stat.cpp index 3cecc9ab0d624ed47460e09ff686d9bd35928777..a4605d1605b2366ff4f55b42d79cd05f04bfb65f 100644 --- a/modules/core/src/stat.cpp +++ b/modules/core/src/stat.cpp @@ -475,7 +475,7 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; - if ( (!doubleSupport && depth == CV_64F) || cn > 4 || cn == 3 ) + if ( (!doubleSupport && depth == CV_64F) || cn > 4 ) return false; int dbsize = ocl::Device::getDefault().maxComputeUnits(); @@ -494,8 +494,11 @@ static bool ocl_sum( InputArray _src, Scalar & res, int sum_op, InputArray _mask static const char * const opMap[3] = { "OP_SUM", "OP_SUM_ABS", "OP_SUM_SQR" }; char cvt[40]; ocl::Kernel k("reduce", ocl::core::reduce_oclsrc, - format("-D srcT=%s -D dstT=%s -D ddepth=%d -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s", - ocl::typeToStr(type), ocl::typeToStr(dtype), ddepth, ocl::convertTypeStr(depth, ddepth, cn, cvt), + format("-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D ddepth=%d -D cn=%d" + " -D convertToDT=%s -D %s -D WGS=%d -D WGS2_ALIGNED=%d%s%s", + ocl::typeToStr(type), ocl::typeToStr(depth), + ocl::typeToStr(dtype), ocl::typeToStr(ddepth), ddepth, cn, + ocl::convertTypeStr(depth, ddepth, cn, cvt), opMap[sum_op], (int)wgs, wgs2_aligned, doubleSupport ? " -D DOUBLE_SUPPORT" : "", haveMask ? " -D HAVE_MASK" : ""));