build: fix v_reduce_sum4 (requires SSE3)

e23b59da · Alexander Alekhin · f49f056d · e23b59da · e23b59da
隐藏空白更改
内联并排

Showing with 24 additions and 0 deletions

modules/core/include/opencv2/core/hal/intrin_sse.hpp modules/core/include/opencv2/core/hal/intrin_sse.hpp +6 -0

modules/core/test/test_intrin.cpp modules/core/test/test_intrin.cpp +18 -0

未找到文件。
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1129,9 +1129,15 @@ OPENCV_HAL_IMPL_SSE_REDUCE_OP_4_SUM(v_float32x4, float, __m128, ps, _mm_castps_s
 inline v_float32x4 v_reduce_sum4(const v_float32x4& a, const v_float32x4& b,
                                 const v_float32x4& c, const v_float32x4& d)
 {
+#if CV_SSE3  // _mm_hadd_ps needs SSE3; result lanes are (sum(a), sum(b), sum(c), sum(d))
    __m128 ab = _mm_hadd_ps(a.val, b.val);
    __m128 cd = _mm_hadd_ps(c.val, d.val);
    return v_float32x4(_mm_hadd_ps(ab, cd));
+#else  // fallback without horizontal add: interleave lanes with unpack, then add vertically
+    __m128 ac = _mm_add_ps(_mm_unpacklo_ps(a.val, c.val), _mm_unpackhi_ps(a.val, c.val));  // (a0+a2, c0+c2, a1+a3, c1+c3)
+    __m128 bd = _mm_add_ps(_mm_unpacklo_ps(b.val, d.val), _mm_unpackhi_ps(b.val, d.val));  // (b0+b2, d0+d2, b1+b3, d1+d3)
+    return v_float32x4(_mm_add_ps(_mm_unpacklo_ps(ac, bd), _mm_unpackhi_ps(ac, bd)));  // lo = even-lane partial sums of a,b,c,d; hi = odd-lane partial sums; lo+hi = full sums
+#endif  // note: the two paths add lanes in a different order, so float rounding may differ slightly
 }

 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)

--- a/modules/core/test/test_intrin.cpp
+++ b/modules/core/test/test_intrin.cpp
@@ -741,6 +741,23 @@ template<typename R> struct TheTest
        return *this;
    }

+    TheTest & test_reduce_sum4()  // checks that v_reduce_sum4 returns one horizontal sum per input vector, in argument order
+    {
+        R a(0.1f, 0.02f, 0.003f, 0.0004f);  // lanes sum to 0.1234
+        R b(1, 20, 300, 4000);              // lanes sum to 4321
+        R c(10, 2, 0.3f, 0.04f);            // lanes sum to 12.34
+        R d(1, 2, 3, 4);                    // lanes sum to 10
+
+        R sum = v_reduce_sum4(a, b, c, d);
+
+        Data<R> res = sum;
+        EXPECT_FLOAT_EQ(0.1234f, res[0]);  // fractional lanes: rounding depends on the order in which the implementation adds them, so compare within ULP tolerance
+        EXPECT_EQ(4321.0f, res[1]);        // small integers add exactly in float regardless of order
+        EXPECT_FLOAT_EQ(12.34f, res[2]);   // fractional lanes again: tolerance-based comparison
+        EXPECT_EQ(10.0f, res[3]);          // exact for the same reason as res[1]
+        return *this;
+    }
+
    TheTest & test_loadstore_fp16()
    {
 #if CV_FP16 && CV_SIMD128
@@ -986,6 +1003,7 @@ TEST(hal_intrin, float32x4) {
        .test_float_cvt64()
        .test_matmul()
        .test_transpose()
+        .test_reduce_sum4()
        ;
 }