Implementing AVX512 Support for 2 and 4 channel mats for CV_64F format

507f8add · Brad Kelly · 9d3d5e9d · 507f8add · 507f8add · 507f8add
4 changed file
--- a/modules/imgproc/perf/perf_integral.cpp
+++ b/modules/imgproc/perf/perf_integral.cpp
@@ -11,7 +11,7 @@ typedef perf::TestBaseWithParam<Size_MatType_OutMatDepth_t> Size_MatType_OutMatD
 PERF_TEST_P(Size_MatType_OutMatDepth, integral,
            testing::Combine(
                testing::Values(TYPICAL_MAT_SIZES),
-                testing::Values(CV_8UC1, CV_8UC3, CV_8UC4),
+                testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4),
                testing::Values(CV_32S, CV_32F, CV_64F)
                )
            )
@@ -32,9 +32,9 @@ PERF_TEST_P(Size_MatType_OutMatDepth, integral,

 PERF_TEST_P(Size_MatType_OutMatDepth, integral_sqsum,
            testing::Combine(
-                testing::Values(::perf::szVGA, ::perf::sz1080p),
-                testing::Values(CV_8UC1, CV_8UC4),
-                testing::Values(CV_32S, CV_32F)
+                testing::Values(TYPICAL_MAT_SIZES),
+                testing::Values(CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4),
+                testing::Values(CV_32S, CV_32F, CV_64F)
                )
            )
 {
@@ -57,9 +57,9 @@ PERF_TEST_P(Size_MatType_OutMatDepth, integral_sqsum,

 PERF_TEST_P( Size_MatType_OutMatDepth, integral_sqsum_tilted,
             testing::Combine(
-                 testing::Values( ::perf::szVGA, ::perf::szODD , ::perf::sz1080p ),
-                 testing::Values( CV_8UC1, CV_8UC4 ),
-                 testing::Values( CV_32S, CV_32F )
+                 testing::Values(TYPICAL_MAT_SIZES),
+                 testing::Values( CV_8UC1, CV_8UC2, CV_8UC3, CV_8UC4 ),
+                 testing::Values( CV_32S, CV_32F, CV_64F )
                 )
             )
 {

--- a/modules/imgproc/src/sumpixels.avx512_skx.cpp
+++ b/modules/imgproc/src/sumpixels.avx512_skx.cpp
@@ -13,33 +13,34 @@ namespace { // Anonymous namespace to avoid exposing the implementation classes
 // NOTE: Look at the bottom of the file for the entry-point function for external callers
 //

-// At the moment only 3 channel support untilted is supported
-// More channel support coming soon.
-// TODO: Add support for sqsum and 1,2, and 4 channels
-class IntegralCalculator_3Channel {
+// TODO: Add support for 1 channel input (WIP: currently hitting hardware glassjaw)
+template<size_t num_channels> class IntegralCalculator;
+
+template<size_t num_channels>
+class IntegralCalculator  {
 public:
-    IntegralCalculator_3Channel() {};
+    IntegralCalculator() {};


    void calculate_integral_avx512(const uchar *src, size_t _srcstep,
                                   double *sum,      size_t _sumstep,
                                   double *sqsum,    size_t _sqsumstep,
-                                   int width, int height, int cn)
+                                   int width, int height)
    {
        const int srcstep = (int)(_srcstep/sizeof(uchar));
        const int sumstep = (int)(_sumstep/sizeof(double));
        const int sqsumstep = (int)(_sqsumstep/sizeof(double));
-        const int ops_per_line = width * cn;
+        const int ops_per_line = width * num_channels;

        // Clear the first line of the sum as per spec (see integral documentation)
        // Also adjust the index of sum and sqsum to be at the real 0th element
        // and not point to the border pixel so it stays in sync with the src pointer
-        memset( sum, 0, (ops_per_line+cn)*sizeof(double));
-        sum += cn;
+        memset( sum, 0, (ops_per_line+num_channels)*sizeof(double));
+        sum += num_channels;

        if (sqsum) {
-            memset( sqsum, 0, (ops_per_line+cn)*sizeof(double));
-            sqsum += cn;
+            memset( sqsum, 0, (ops_per_line+num_channels)*sizeof(double));
+            sqsum += num_channels;
        }

        // Now calculate the integral over the whole image one line at a time
@@ -50,22 +51,22 @@ public:
            double      * sqsum_above = (sqsum) ? &sqsum[y*sqsumstep]     : NULL;
            double      * sqsum_line  = (sqsum) ? &sqsum_above[sqsumstep] : NULL;

-            integral_line_3channel_avx512(src_line, sum_line, sum_above, sqsum_line, sqsum_above, ops_per_line);
+            calculate_integral_for_line(src_line, sum_line, sum_above, sqsum_line, sqsum_above, ops_per_line);

        }
    }

-    static inline
-    void integral_line_3channel_avx512(const uchar *srcs,
-                                       double *sums,   double *sums_above,
-                                       double *sqsums, double *sqsums_above,
-                                       int num_ops_in_line)
+    static CV_ALWAYS_INLINE
+    void calculate_integral_for_line(const uchar *srcs,
+                                     double *sums, double *sums_above,
+                                     double *sqsums, double *sqsums_above,
+                                     int num_ops_in_line)
    {
        __m512i sum_accumulator   = _mm512_setzero_si512();  // holds rolling sums for the line
        __m512i sqsum_accumulator = _mm512_setzero_si512();  // holds rolling sqsums for the line

        // The first element on each line must be zeroes as per spec (see integral documentation)
-        set_border_pixel_value(sums, sqsums);
+        zero_out_border_pixel(sums, sqsums);

        // Do all 64 byte chunk operations then do the last bits that don't fit in a 64 byte chunk
        aligned_integral(     srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line);
@@ -74,20 +75,18 @@ public:
    }


-    static inline
-    void set_border_pixel_value(double *sums, double *sqsums)
+    static CV_ALWAYS_INLINE
+    void zero_out_border_pixel(double *sums, double *sqsums)
    {
-        // Sets the border pixel value to 0s.
-        // Note the hard coded -3 and the 0x7 mask is because we only support 3 channel right now
-        __m512i zeroes = _mm512_setzero_si512();
-
-        _mm512_mask_storeu_epi64(&sums[-3], 0x7, zeroes);
+        // Note the negative index is because the sums/sqsums pointers point to the first real pixel
+        // after the border pixel so we have to look backwards
+        _mm512_mask_storeu_epi64(&sums[-num_channels], (1<<num_channels)-1, _mm512_setzero_si512());
        if (sqsums)
-            _mm512_mask_storeu_epi64(&sqsums[-3], 0x7, zeroes);
+            _mm512_mask_storeu_epi64(&sqsums[-num_channels], (1<<num_channels)-1, _mm512_setzero_si512());
    }


-    static inline
+    static CV_ALWAYS_INLINE
    void aligned_integral(const uchar *&srcs,
                          double *&sums,  double *&sums_above,
                          double *&sqsum, double *&sqsum_above,
@@ -110,7 +109,7 @@ public:
    }


-    static inline
+    static CV_ALWAYS_INLINE
    void post_aligned_integral(const uchar *srcs,
                               const double *sums,   const double *sums_above,
                               const double *sqsum,  const double *sqsum_above,
@@ -132,50 +131,54 @@ public:
    }


-    static inline
+    static CV_ALWAYS_INLINE
    void integral_64_operations_avx512(const __m512i *srcs,
                                       __m512i *sums,       const __m512i *sums_above,
                                       __m512i *sqsums,     const __m512i *sqsums_above,
                                       __mmask64 data_mask,
                                       __m512i &sum_accumulator, __m512i &sqsum_accumulator)
    {
-        __m512i src_64byte_chunk = read_64_bytes(srcs, data_mask);
-
-        for(int num_16byte_chunks=0; num_16byte_chunks<4; num_16byte_chunks++) {
-            __m128i src_16bytes = _mm512_extracti64x2_epi64(src_64byte_chunk, 0x0); // Get lower 16 bytes of data
+       __m512i src_64byte_chunk = read_64_bytes(srcs, data_mask);

-            for (int num_8byte_chunks = 0; num_8byte_chunks < 2; num_8byte_chunks++) {
+        while (data_mask) {
+            __m128i src_16bytes = extract_lower_16bytes(src_64byte_chunk);

-                __m512i src_longs = convert_lower_8bytes_to_longs(src_16bytes);
+            __m512i src_longs_lo = convert_lower_8bytes_to_longs(src_16bytes);
+            __m512i src_longs_hi = convert_lower_8bytes_to_longs(shift_right_8_bytes(src_16bytes));

-                // Calculate integral for the sum on the 8 entries
-                integral_8_operations(src_longs, sums_above, data_mask, sums, sum_accumulator);
-                sums++; sums_above++;
+            // Calculate integral for the sum on the 8 lanes at a time
+            integral_8_operations(src_longs_lo, sums_above, data_mask, sums, sum_accumulator);
+            integral_8_operations(src_longs_hi, sums_above+1, data_mask>>8, sums+1, sum_accumulator);

-                if (sqsums){ // Calculate integral for the sum on the 8 entries
-                    __m512i squared_source = _mm512_mullo_epi64(src_longs, src_longs);
-
-                    integral_8_operations(squared_source, sqsums_above, data_mask, sqsums, sqsum_accumulator);
-                    sqsums++; sqsums_above++;
-                }
-
-                // Prepare for next iteration of inner loop
-                // shift source to align next 8 bytes to lane 0 and shift the mask
-                src_16bytes = shift_right_8_bytes(src_16bytes);
-                data_mask = data_mask >> 8;
+            if (sqsums) {
+                __m512i squared_source_lo = square_m512(src_longs_lo);
+                __m512i squared_source_hi = square_m512(src_longs_hi);

+                integral_8_operations(squared_source_lo, sqsums_above, data_mask, sqsums, sqsum_accumulator);
+                integral_8_operations(squared_source_hi, sqsums_above+1, data_mask>>8, sqsums+1, sqsum_accumulator);
+                sqsums += 2;
+                sqsums_above+=2;
            }

-            // Prepare for next iteration of outer loop
+            // Prepare for next iteration of loop
+            // shift source to align next 16 bytes to lane 0, shift the mask, and advance the pointers
+            sums += 2;
+            sums_above += 2;
+            data_mask = data_mask >> 16;
            src_64byte_chunk = shift_right_16_bytes(src_64byte_chunk);
+
        }
+
    }


-    static inline
+    static CV_ALWAYS_INLINE
    void integral_8_operations(const __m512i src_longs, const __m512i *above_values_ptr, __mmask64 data_mask,
                               __m512i *results_ptr, __m512i &accumulator)
     {
+        // NOTE that the calculate_integral function referenced here must be implemented in the templated
+        // derivatives because the algorithm depends heavily on the number of channels in the image
+        //
        _mm512_mask_storeu_pd(
                results_ptr,   // Store the result here
                data_mask,     // Using the data mask to avoid overrunning the line
@@ -188,59 +191,178 @@ public:
    }


-    static inline
-    __m512d calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator)
-    {
-        __m512i carryover_idxs = _mm512_set_epi64(6, 5, 7, 6, 5, 7, 6, 5);
-
-        // Align data to prepare for the adds:
-        //    shifts data left by 3 and 6 qwords(lanes) and gets rolling sum in all lanes
-        //   Vertical LANES:     76543210
-        //   src_longs       :   HGFEDCBA
-        //   shited3lanes    : + EDCBA
-        //   shifted6lanes   : + BA
-        //   carry_over_idxs : + 65765765  (index position of result from previous iteration)
-        //                     = integral
-        __m512i shifted3lanes = _mm512_maskz_expand_epi64(0xF8, src_longs);
-        __m512i shifted6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs);
-        __m512i carry_over    = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator);
-
-        // Do the adds in tree form (shift3 + shift 6) + (current_source_values + accumulator)
-        __m512i sum_shift3and6 = _mm512_add_epi64(shifted3lanes, shifted6lanes);
-        __m512i sum_src_carry  = _mm512_add_epi64(src_longs, carry_over);
-        accumulator            = _mm512_add_epi64(sum_shift3and6, sum_src_carry);
-
-        // Convert to packed double and add to the line above to get the true integral value
-        __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator);
-        __m512d integral_pd    = _mm512_add_pd(accumulator_pd, above_values);
-        return integral_pd;
-    }
+    // The calculate_integral function referenced here must be implemented in the templated derivatives
+    // because the algorithm depends heavily on the number of channels in the image
+    // This is the incomplete definition (just the prototype) here.
+    //
+    static CV_ALWAYS_INLINE
+    __m512d calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator);


-    static inline
+    static CV_ALWAYS_INLINE
    __m512i read_64_bytes(const __m512i *srcs, __mmask64 data_mask)  {
        return _mm512_maskz_loadu_epi8(data_mask, srcs);
    }


-    static inline
+    static CV_ALWAYS_INLINE
+    __m128i extract_lower_16bytes(__m512i src_64byte_chunk) {
+        return _mm512_extracti64x2_epi64(src_64byte_chunk, 0x0);
+    }
+
+
+    static CV_ALWAYS_INLINE
    __m512i convert_lower_8bytes_to_longs(__m128i src_16bytes)  {
        return _mm512_cvtepu8_epi64(src_16bytes);
    }


-    static inline
+    static CV_ALWAYS_INLINE
+    __m512i square_m512(__m512i src_longs) {
+        return _mm512_mullo_epi64(src_longs, src_longs);
+    }
+
+
+    static CV_ALWAYS_INLINE
    __m128i shift_right_8_bytes(__m128i src_16bytes)  {
        return _mm_maskz_compress_epi64(2, src_16bytes);
    }


-    static inline
+
+    static CV_ALWAYS_INLINE
    __m512i shift_right_16_bytes(__m512i src_64byte_chunk)  {
        return _mm512_maskz_compress_epi64(0xFC, src_64byte_chunk);
    }

+
 };
+
+
+//============================================================================================================
+// This the only section that needs to change with respect to algorithm based on the number of channels
+// It is responsible for returning the calculation of 8 lanes worth of the integral and returning in the
+// accumulated sums in the accumulator parameter (NOTE: accumulator is an input and output parameter)
+//
+// The function prototype that needs to be implemented is:
+//
+//     __m512d calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator){ ... }
+//
+// Description of parameters:
+//   INPUTS:
+//      src_longs   : 8 lanes worth of the source bytes converted to 64 bit integers
+//      above_values: 8 lanes worth of the result values from the line above (See the integral spec)
+//      accumulator : 8 lanes worth of sums from the previous iteration
+//                    IMPORTANT NOTE: This parameter is both an INPUT AND OUTPUT parameter to this function
+//
+//   OUTPUTS:
+//      return value: The actual integral value for all 8 lanes which is defined by the spec as
+//                    the sum of all channel values to the left of a given pixel plus the result
+//                    written to the line directly above the current line.
+//      accumulator:  This is an input and and output.  This parameter should be left with the accumulated
+//                    sums for the current 8 lanes keeping all entries in the proper lane (do not shuffle it)
+//
+// Below here is the channel specific implementation
+//
+
+//========================================
+//   2 Channel Integral Implementation
+//========================================
+template<>
+CV_ALWAYS_INLINE
+__m512d IntegralCalculator < 2 > ::calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator)
+{
+    __m512i carryover_idxs = _mm512_set_epi64(7, 6, 7, 6, 7, 6, 7, 6);
+
+    // Align data to prepare for the adds:
+    //    shifts data left by 3 and 6 qwords(lanes) and gets rolling sum in all lanes
+    //   Vertical LANES  :   76543210
+    //   src_longs       :   HGFEDCBA
+    //   shift2lanes     : + FEDCBA
+    //   shift4lanes     : + DCBA
+    //   shift6lanes     : + BA
+    //   carry_over_idxs : + 76767676  (index position of result from previous iteration)
+    //                     = integral
+    __m512i shift2lanes = _mm512_maskz_expand_epi64(0xFC, src_longs);
+    __m512i shift4lanes = _mm512_maskz_expand_epi64(0xF0, src_longs);
+    __m512i shift6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs);
+    __m512i carry_over  = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator);
+
+    // Add all values in tree form for perf ((0+2) + (4+6))
+    __m512i sum_shift_02  = _mm512_add_epi64(src_longs,    shift2lanes);
+    __m512i sum_shift_46  = _mm512_add_epi64(shift4lanes,  shift6lanes);
+    __m512i sum_all       = _mm512_add_epi64(sum_shift_02, sum_shift_46);
+    accumulator           = _mm512_add_epi64(sum_all,      carry_over);
+
+    // Convert to packed double and add to the line above to get the true integral value
+    __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator);
+    __m512d integral_pd    = _mm512_add_pd(accumulator_pd, above_values);
+    return integral_pd;
+}
+
+//========================================
+//   3 Channel Integral Implementation
+//========================================
+template<>
+CV_ALWAYS_INLINE
+__m512d IntegralCalculator < 3 > ::calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator)
+{
+    __m512i carryover_idxs = _mm512_set_epi64(6, 5, 7, 6, 5, 7, 6, 5);
+
+    // Align data to prepare for the adds:
+    //    shifts data left by 3 and 6 qwords(lanes) and gets rolling sum in all lanes
+    //   Vertical LANES:     76543210
+    //   src_longs       :   HGFEDCBA
+    //   shit3lanes      : + EDCBA
+    //   shift6lanes     : + BA
+    //   carry_over_idxs : + 65765765  (index position of result from previous iteration)
+    //                     = integral
+    __m512i shift3lanes = _mm512_maskz_expand_epi64(0xF8, src_longs);
+    __m512i shift6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs);
+    __m512i carry_over    = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator);
+
+    // Do the adds in tree form
+    __m512i sum_shift_03 = _mm512_add_epi64(src_longs,    shift3lanes);
+    __m512i sum_rest     = _mm512_add_epi64(shift6lanes,  carry_over);
+    accumulator          = _mm512_add_epi64(sum_shift_03, sum_rest);
+
+    // Convert to packed double and add to the line above to get the true integral value
+    __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator);
+    __m512d integral_pd    = _mm512_add_pd(accumulator_pd, above_values);
+    return integral_pd;
+}
+
+
+//========================================
+//   4 Channel Integral Implementation
+//========================================
+template<>
+CV_ALWAYS_INLINE
+__m512d IntegralCalculator < 4 > ::calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator)
+{
+    __m512i carryover_idxs = _mm512_set_epi64(7, 6, 5, 4, 7, 6, 5, 4);
+
+    // Align data to prepare for the adds:
+    //    shifts data left by 3 and 6 qwords(lanes) and gets rolling sum in all lanes
+    //   Vertical LANES:     76543210
+    //   src_longs       :   HGFEDCBA
+    //   shit4lanes      : + DCBA
+    //   carry_over_idxs : + 76547654  (index position of result from previous iteration)
+    //                     = integral
+    __m512i shifted4lanes = _mm512_maskz_expand_epi64(0xF0, src_longs);
+    __m512i carry_over    = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator);
+
+    // Add data pixels and carry over from last iteration
+    __m512i sum_shift_04 = _mm512_add_epi64(src_longs,    shifted4lanes);
+    accumulator          = _mm512_add_epi64(sum_shift_04, carry_over);
+
+    // Convert to packed double and add to the line above to get the true integral value
+    __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator);
+    __m512d integral_pd    = _mm512_add_pd(accumulator_pd, above_values);
+    return integral_pd;
+}
+
+
 } // end of anonymous namespace

 namespace opt_AVX512_SKX {
@@ -253,8 +375,22 @@ void calculate_integral_avx512(const uchar *src,   size_t _srcstep,
                               double      *sqsum, size_t _sqsumstep,
                               int width, int height, int cn)
 {
-    IntegralCalculator_3Channel  calculator;
-    calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height, cn);
+    switch(cn){
+        case 2: {
+            IntegralCalculator<2> calculator;
+            calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height);
+            break;
+        }
+        case 3: {
+            IntegralCalculator<3> calculator;
+            calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height);
+            break;
+        }
+        case 4: {
+            IntegralCalculator<4> calculator;
+            calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height);
+        }
+    }
 }



--- a/modules/imgproc/src/sumpixels.cpp
+++ b/modules/imgproc/src/sumpixels.cpp
@@ -46,7 +46,6 @@
 #include "opencv2/core/hal/intrin.hpp"
 #include "sumpixels.hpp"

-
 namespace cv
 {

@@ -77,8 +76,8 @@ struct Integral_SIMD<uchar, double, double> {
    {
 #if CV_TRY_AVX512_SKX
        CV_UNUSED(_tiltedstep);
-        // TODO: Add support for 1,2, and 4 channels
-        if (CV_CPU_HAS_SUPPORT_AVX512_SKX && !tilted && cn == 3){
+        // TODO:  Add support for 1 channel input (WIP)
+        if (CV_CPU_HAS_SUPPORT_AVX512_SKX && !tilted && ((cn >= 2) && (cn <= 4))){
            opt_AVX512_SKX::calculate_integral_avx512(src, _srcstep, sum, _sumstep,
                                                      sqsum, _sqsumstep, width, height, cn);
            return true;

--- a/modules/imgproc/test/test_filter.cpp
+++ b/modules/imgproc/test/test_filter.cpp
@@ -1667,12 +1667,11 @@ void CV_IntegralTest::get_test_array_types_and_sizes( int test_case_idx,
 {
    RNG& rng = ts->get_rng();
    int depth = cvtest::randInt(rng) % 2, sum_depth;
-    int cn = cvtest::randInt(rng) % 3 + 1;
+    int cn = cvtest::randInt(rng) % 4 + 1;
    cvtest::ArrayTest::get_test_array_types_and_sizes( test_case_idx, sizes, types );
    Size sum_size;

    depth = depth == 0 ? CV_8U : CV_32F;
-    cn += cn == 2;
    int b = (cvtest::randInt(rng) & 1) != 0;
    sum_depth = depth == CV_8U && b ? CV_32S : b ? CV_32F : CV_64F;