Commit 3cbe60cc authored by Tomoaki Teshima, committed by Alexander Alekhin

Merge pull request #9753 from tomoaki0705:universalMatmul

* add accuracy tests and performance checks for matmul
  * add performance tests for transform and dotProduct
  * add test Core_TransformLargeTest for the 8u version of transform

* remove raw SSE2/NEON implementation from matmul.cpp
  * use universal intrinsics instead of raw intrinsics
  * remove unused templated function
  * add v_matmuladd, which multiplies by a 3x3 matrix and adds a 3x1 vector (a usage sketch follows below)
  * add v_rotate_left/right to the universal intrinsics
  * suppress intrinsics for some functions and platforms
  * add a pure software implementation of the new universal intrinsics
  * add tests for the new universal intrinsics

* core: prevent memory access past the end of the buffer

* fix perf tests
Parent 2674c6b5
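Editorial usage sketch (not part of this commit): the kind of inner loop the new v_matmuladd intrinsic enables for a 3x3 matrix transform with translation, as exercised by cv::transform. Names and data are illustrative; transform3x3 is a hypothetical helper, not the actual matmul.cpp code.

// Editorial sketch, assuming OpenCV's universal-intrinsics header is available.
#include "opencv2/core/hal/intrin.hpp"

using namespace cv;

static void transform3x3(const float* src, float* dst, int npoints,
                         const float m[12])   // 3x4 row-major matrix: [R | t]
{
    // Load the matrix columns once: lane i of mC holds row i, column C.
    v_float32x4 m0(m[0], m[4], m[8],  0.f);
    v_float32x4 m1(m[1], m[5], m[9],  0.f);
    v_float32x4 m2(m[2], m[6], m[10], 0.f);
    v_float32x4 m3(m[3], m[7], m[11], 0.f);   // translation column

    for (int i = 0; i < npoints; i++, src += 3, dst += 3)
    {
        v_float32x4 p(src[0], src[1], src[2], 0.f);
        v_float32x4 r = v_matmuladd(p, m0, m1, m2, m3);  // r[i] = row_i . p + t[i]
        float buf[4];
        v_store(buf, r);
        dst[0] = buf[0]; dst[1] = buf[1]; dst[2] = buf[2];
    }
}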
......@@ -885,12 +885,59 @@ template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg
/** @brief Bitwise shift left
For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP(<<)
OPENCV_HAL_IMPL_SHIFT_OP(<< )
/** @brief Bitwise shift right
For 16-, 32- and 64-bit integer values. */
OPENCV_HAL_IMPL_SHIFT_OP(>>)
OPENCV_HAL_IMPL_SHIFT_OP(>> )
/** @brief Element shift left within the vector
For all types. */
#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
{ \
    v_reg<_Tp, n> b; \
    for (int i = 0; i < n; i++) \
    { \
        int sIndex = i opA imm; \
        if (0 <= sIndex && sIndex < n) \
        { \
            b.s[i] = a.s[sIndex]; \
        } \
        else \
        { \
            b.s[i] = 0; \
        } \
    } \
    return b; \
} \
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
    v_reg<_Tp, n> c; \
    for (int i = 0; i < n; i++) \
    { \
        int aIndex = i opA imm; \
        int bIndex = i opA imm opB n; \
        if (0 <= bIndex && bIndex < n) \
        { \
            c.s[i] = b.s[bIndex]; \
        } \
        else if (0 <= aIndex && aIndex < n) \
        { \
            c.s[i] = a.s[aIndex]; \
        } \
        else \
        { \
            c.s[i] = 0; \
        } \
    } \
    return c; \
}
OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(left, -, +)
OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(right, +, -)
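A minimal editorial illustration (not part of the diff) of the lane movement the fallback above implements for 4-lane registers:

// Editorial sketch: expected lane movement of the fallback rotate.
// Vacated lanes are zero-filled, or taken from 'b' in the two-argument form.
static inline void v_rotate_example()
{
    v_uint32x4 a(1, 2, 3, 4), b(10, 20, 30, 40);
    unsigned buf[4];
    v_store(buf, v_rotate_right<1>(a));     // { 2,  3, 4,  0 }
    v_store(buf, v_rotate_left<1>(a));      // { 0,  1, 2,  3 }
    v_store(buf, v_rotate_right<1>(a, b));  // { 2,  3, 4, 10 }
    v_store(buf, v_rotate_left<1>(a, b));   // { 40, 1, 2,  3 }
}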
/** @brief Sum packed values
......@@ -1860,6 +1907,30 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
}
/** @brief Matrix multiplication and add
Scheme:
@code
{A0 A1 A2 } |V0| |D0|
{B0 B1 B2 } |V1| |D1|
{C0 C1 C2 } x |V2| + |D2|
====================
{R0 R1 R2 R3}, where:
R0 = A0V0 + A1V1 + A2V2 + D0,
R1 = B0V0 + B1V1 + B2V2 + D1
...
@endcode
*/
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& m3)
{
    return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + m3.s[0],
                       v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + m3.s[1],
                       v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + m3.s[2],
                       v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + m3.s[3]);
}
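A short editorial example (not part of the commit) checking the scheme above on concrete numbers:

// Editorial sketch: v_matmuladd with identity columns and a translation.
// v = {1, 2, 3, *}, m0..m2 = identity columns, m3 = {10, 20, 30, 0};
// result lane i = v0*m0[i] + v1*m1[i] + v2*m2[i] + m3[i]  ->  {11, 22, 33, 0}
static inline v_float32x4 v_matmuladd_example()
{
    v_float32x4 v (1.f,  2.f,  3.f,  0.f);
    v_float32x4 m0(1.f,  0.f,  0.f,  0.f);
    v_float32x4 m1(0.f,  1.f,  0.f,  0.f);
    v_float32x4 m2(0.f,  0.f,  1.f,  0.f);
    v_float32x4 m3(10.f, 20.f, 30.f, 0.f);
    return v_matmuladd(v, m0, m1, m2, m3);   // {11, 22, 33, 0}
}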
//! @}
//! @name Check SIMD support
......
......@@ -407,6 +407,18 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
return v_float32x4(res);
}
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    float32x2_t vl = vget_low_f32(v.val), vh = vget_high_f32(v.val);
    float32x4_t res = vmulq_lane_f32(m0.val, vl, 0);
    res = vmlaq_lane_f32(res, m1.val, vl, 1);
    res = vmlaq_lane_f32(res, m2.val, vh, 0);
    res = vaddq_f32(res, a.val);
    return v_float32x4(res);
}
#define OPENCV_HAL_IMPL_NEON_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
{ \
......@@ -747,7 +759,15 @@ template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return _Tpvec(vshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); }
{ return _Tpvec(vrshrq_n_##suffix(a.val, n)); } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(a.val, vdupq_n_##suffix(0), n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a) \
{ return _Tpvec(vextq_##suffix(vdupq_n_##suffix(0), a.val, _Tpvec::nlanes - n)); } \
template<int n> inline _Tpvec v_rotate_right(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(a.val, b.val, n)); } \
template<int n> inline _Tpvec v_rotate_left(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(vextq_##suffix(b.val, a.val, _Tpvec::nlanes - n)); }
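The NEON variants map the rotate onto vext, which extracts a window of lanes from the concatenation of two registers; an editorial sketch of the equivalence for 32-bit lanes (illustration only, not part of the commit):

#include <arm_neon.h>

// vextq_u32(a, b, 1) picks lanes starting at index 1 of the pair
// {a0 a1 a2 a3 b0 b1 b2 b3}, i.e. { a1, a2, a3, b0 } -- v_rotate_right<1>(a, b).
static inline uint32x4_t rotate_right_1_u32(uint32x4_t a, uint32x4_t b)
{
    return vextq_u32(a, b, 1);
}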
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_uint8x16, u8, schar, s8)
OPENCV_HAL_IMPL_NEON_SHIFT_OP(v_int8x16, s8, schar, s8)
......
......@@ -602,6 +602,16 @@ inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
}
inline v_float32x4 v_matmuladd(const v_float32x4& v, const v_float32x4& m0,
                               const v_float32x4& m1, const v_float32x4& m2,
                               const v_float32x4& a)
{
    __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
    __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
    __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
    return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, a.val)));
}
#define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
......@@ -1011,6 +1021,29 @@ OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a)
{
    return _Tpvec(_mm_srli_si128(a.val, imm*(sizeof(typename _Tpvec::lane_type))));
}
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a)
{
    return _Tpvec(_mm_slli_si128(a.val, imm*(sizeof(typename _Tpvec::lane_type))));
}
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_right(const _Tpvec &a, const _Tpvec &b)
{
    const int cWidth = sizeof(typename _Tpvec::lane_type);
    return _Tpvec(_mm_or_si128(_mm_srli_si128(a.val, imm*cWidth), _mm_slli_si128(b.val, (16 - imm*cWidth))));
}
template<int imm, typename _Tpvec>
inline _Tpvec v_rotate_left(const _Tpvec &a, const _Tpvec &b)
{
    const int cWidth = sizeof(typename _Tpvec::lane_type);
    return _Tpvec(_mm_or_si128(_mm_slli_si128(a.val, imm*cWidth), _mm_srli_si128(b.val, (16 - imm*cWidth))));
}
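On SSE the rotate is expressed as whole-register byte shifts; an editorial sketch of the equivalence, spelled out for 32-bit lanes (cWidth = 4) and imm = 1 (illustration only, not part of the commit):

#include <emmintrin.h>  // SSE2

static inline __m128i rotate_right_1_epi32(__m128i a, __m128i b)
{
    __m128i lo = _mm_srli_si128(a, 4);    // { a1, a2, a3,  0 }
    __m128i hi = _mm_slli_si128(b, 12);   // {  0,  0,  0, b0 }
    return _mm_or_si128(lo, hi);          // { a1, a2, a3, b0 }
}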
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
inline _Tpvec v_load(const _Tp* ptr) \
{ return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
......
......@@ -1062,6 +1062,34 @@ OCL_PERF_TEST_P(ScaleAddFixture, ScaleAdd,
SANITY_CHECK(dst, 1e-6);
}
///////////// Transform ////////////////////////

typedef Size_MatType TransformFixture;

OCL_PERF_TEST_P(TransformFixture, Transform,
                ::testing::Combine(OCL_TEST_SIZES,
                ::testing::Values(CV_8UC3, CV_8SC3, CV_16UC3, CV_16SC3, CV_32SC3, CV_32FC3, CV_64FC3)))
{
    const Size_MatType_t params = GetParam();
    const Size srcSize = get<0>(params);
    const int type = get<1>(params);

    checkDeviceMaxMemoryAllocSize(srcSize, type);

    const float transform[] = { 0.5f,           0.f, 0.86602540378f, 128,
                                0.f,            1.f, 0.f,            -64,
                                0.86602540378f, 0.f, 0.5f,            32, };
    Mat mtx(Size(4, 3), CV_32FC1, (void*)transform);

    UMat src(srcSize, type), dst(srcSize, type);
    randu(src, 0, 30);
    declare.in(src).out(dst);

    OCL_TEST_CYCLE() cv::transform(src, dst, mtx);

    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
}
///////////// PSNR ////////////////////////
typedef Size_MatType PSNRFixture;
......
......@@ -96,3 +96,31 @@ PERF_TEST_P(Size_MatType, Mat_Clone_Roi,
SANITY_CHECK(destination, 1);
}
///////////// Transform ////////////////////////

PERF_TEST_P(Size_MatType, Mat_Transform,
            testing::Combine(testing::Values(TYPICAL_MAT_SIZES),
                             testing::Values(CV_8UC3, CV_8SC3, CV_16UC3, CV_16SC3, CV_32SC3, CV_32FC3, CV_64FC3))
            )
{
    const Size_MatType_t params = GetParam();
    const Size srcSize0 = get<0>(params);
    const Size srcSize = Size(1, srcSize0.width*srcSize0.height);
    const int type = get<1>(params);

    const float transform[] = { 0.5f,           0.f, 0.86602540378f, 128,
                                0.f,            1.f, 0.f,            -64,
                                0.86602540378f, 0.f, 0.5f,            32, };
    Mat mtx(Size(4, 3), CV_32FC1, (void*)transform);

    Mat src(srcSize, type), dst(srcSize, type);
    randu(src, 0, 30);
    declare.in(src).out(dst);

    TEST_CYCLE()
    {
        cv::transform(src, dst, mtx);
    }

    SANITY_CHECK(dst, 1e-6, ERROR_RELATIVE);
}
(The diff for this file is collapsed.)
......@@ -33,6 +33,7 @@ TEST(hal_intrin, uint8x16) {
.test_pack_u<1>().test_pack_u<2>().test_pack_u<3>().test_pack_u<8>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
.test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
;
}
......@@ -54,6 +55,7 @@ TEST(hal_intrin, int8x16) {
.test_pack<1>().test_pack<2>().test_pack<3>().test_pack<8>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<8>().test_extract<15>()
.test_rotate<0>().test_rotate<1>().test_rotate<8>().test_rotate<15>()
;
}
......@@ -81,6 +83,7 @@ TEST(hal_intrin, uint16x8) {
.test_pack_u<1>().test_pack_u<2>().test_pack_u<7>().test_pack_u<16>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
.test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
;
}
......@@ -107,6 +110,7 @@ TEST(hal_intrin, int16x8) {
.test_pack<1>().test_pack<2>().test_pack<7>().test_pack<16>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<4>().test_extract<7>()
.test_rotate<0>().test_rotate<1>().test_rotate<4>().test_rotate<7>()
;
}
......@@ -132,6 +136,7 @@ TEST(hal_intrin, uint32x4) {
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
.test_transpose()
;
}
......@@ -155,6 +160,7 @@ TEST(hal_intrin, int32x4) {
.test_pack<1>().test_pack<2>().test_pack<15>().test_pack<32>()
.test_unpack()
.test_extract<0>().test_extract<1>().test_extract<2>().test_extract<3>()
.test_rotate<0>().test_rotate<1>().test_rotate<2>().test_rotate<3>()
.test_float_cvt32()
.test_float_cvt64()
.test_transpose()
......@@ -170,6 +176,7 @@ TEST(hal_intrin, uint64x2) {
.test_shift<1>().test_shift<8>()
.test_logic()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
;
}
......@@ -180,6 +187,7 @@ TEST(hal_intrin, int64x2) {
.test_shift<1>().test_shift<8>()
.test_logic()
.test_extract<0>().test_extract<1>()
.test_rotate<0>().test_rotate<1>()
;
}
......
......@@ -793,6 +793,30 @@ template<typename R> struct TheTest
return *this;
}
    template<int s>
    TheTest & test_rotate()
    {
        Data<R> dataA, dataB;
        dataB *= 10;
        R a = dataA, b = dataB;

        Data<R> resC = v_rotate_right<s>(a);
        Data<R> resD = v_rotate_right<s>(a, b);

        for (int i = 0; i < R::nlanes; ++i)
        {
            if (i + s >= R::nlanes)
            {
                EXPECT_EQ((LaneType)0, resC[i]);
                EXPECT_EQ(dataB[i - R::nlanes + s], resD[i]);
            }
            else
                EXPECT_EQ(dataA[i + s], resC[i]);
        }
        return *this;
    }
TheTest & test_float_math()
{
typedef typename V_RegTrait128<LaneType>::int_reg Ri;
......@@ -882,6 +906,16 @@ template<typename R> struct TheTest
+ dataV[3] * dataD[i];
EXPECT_DOUBLE_EQ(val, res[i]);
}
        Data<R> resAdd = v_matmuladd(v, a, b, c, d);
        for (int i = 0; i < R::nlanes; ++i)
        {
            LaneType val = dataV[0] * dataA[i]
                         + dataV[1] * dataB[i]
                         + dataV[2] * dataC[i]
                         + dataD[i];
            EXPECT_DOUBLE_EQ(val, resAdd[i]);
        }
return *this;
}
......
......@@ -904,6 +904,60 @@ void Core_TransformTest::prepare_to_validation( int )
cvtest::transform( test_mat[INPUT][0], test_mat[REF_OUTPUT][0], transmat, shift );
}
class Core_TransformLargeTest : public Core_TransformTest
{
public:
    typedef Core_MatrixTest Base;
protected:
    void get_test_array_types_and_sizes(int test_case_idx, vector<vector<Size> >& sizes, vector<vector<int> >& types);
};

void Core_TransformLargeTest::get_test_array_types_and_sizes(int test_case_idx, vector<vector<Size> >& sizes, vector<vector<int> >& types)
{
    RNG& rng = ts->get_rng();
    int bits = cvtest::randInt(rng);
    int depth, dst_cn, mat_cols, mattype;
    Base::get_test_array_types_and_sizes(test_case_idx, sizes, types);
    for (unsigned int j = 0; j < sizes.size(); j++)
    {
        for (unsigned int i = 0; i < sizes[j].size(); i++)
        {
            sizes[j][i].width *= 4;
        }
    }

    mat_cols = CV_MAT_CN(types[INPUT][0]);
    depth = CV_MAT_DEPTH(types[INPUT][0]);
    dst_cn = cvtest::randInt(rng) % 4 + 1;
    types[OUTPUT][0] = types[REF_OUTPUT][0] = CV_MAKETYPE(depth, dst_cn);

    mattype = depth < CV_32S ? CV_32F : depth == CV_64F ? CV_64F : bits & 1 ? CV_32F : CV_64F;
    types[INPUT][1] = mattype;
    types[INPUT][2] = CV_MAKETYPE(mattype, dst_cn);

    scale = 1. / ((cvtest::randInt(rng) % 4) * 50 + 1);

    if (bits & 2)
    {
        sizes[INPUT][2] = Size(0, 0);
        mat_cols += (bits & 4) != 0;
    }
    else if (bits & 4)
        sizes[INPUT][2] = Size(1, 1);
    else
    {
        if (bits & 8)
            sizes[INPUT][2] = Size(dst_cn, 1);
        else
            sizes[INPUT][2] = Size(1, dst_cn);
        types[INPUT][2] &= ~CV_MAT_CN_MASK;
    }
    diagMtx = (bits & 16) != 0;

    sizes[INPUT][1] = Size(mat_cols, dst_cn);
}
///////////////// PerspectiveTransform /////////////////////
......@@ -2691,6 +2745,7 @@ TEST(Core_Invert, accuracy) { Core_InvertTest test; test.safe_run(); }
TEST(Core_Mahalanobis, accuracy) { Core_MahalanobisTest test; test.safe_run(); }
TEST(Core_MulTransposed, accuracy) { Core_MulTransposedTest test; test.safe_run(); }
TEST(Core_Transform, accuracy) { Core_TransformTest test; test.safe_run(); }
TEST(Core_TransformLarge, accuracy) { Core_TransformLargeTest test; test.safe_run(); }
TEST(Core_PerspectiveTransform, accuracy) { Core_PerspectiveTransformTest test; test.safe_run(); }
TEST(Core_Pow, accuracy) { Core_PowTest test; test.safe_run(); }
TEST(Core_SolveLinearSystem, accuracy) { Core_SolveTest test; test.safe_run(); }
......