Implementing AVX512 support for 3 channel cv::integral for CV_64F

0165ffa9 · Brad Kelly · 36dfb71c · 0165ffa9 · 0165ffa9 · 0165ffa9
4 changed file
--- a/modules/imgproc/perf/perf_integral.cpp
+++ b/modules/imgproc/perf/perf_integral.cpp
@@ -11,7 +11,7 @@ typedef perf::TestBaseWithParam<Size_MatType_OutMatDepth_t> Size_MatType_OutMatD
 PERF_TEST_P(Size_MatType_OutMatDepth, integral,
            testing::Combine(
                testing::Values(TYPICAL_MAT_SIZES),
-                testing::Values(CV_8UC1, CV_8UC4),
+                testing::Values(CV_8UC1, CV_8UC3, CV_8UC4),
                testing::Values(CV_32S, CV_32F, CV_64F)
                )
            )

--- a/modules/imgproc/src/sumpixels.avx512_skx.cpp
+++ b/modules/imgproc/src/sumpixels.avx512_skx.cpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019, Intel Corporation, all rights reserved.
+#include "precomp.hpp"
+#include "sumpixels.hpp"
+
+namespace cv {
+namespace { // Anonymous namespace to avoid exposing the implementation classes
+
+//
+// NOTE: Look at the bottom of the file for the entry-point function for external callers
+//
+
+// At the moment only 3 channel support untilted is supported
+// More channel support coming soon.
+// TODO: Add support for sqsum and 1,2, and 4 channels
+class IntegralCalculator_3Channel {
+public:
+    IntegralCalculator_3Channel() {};
+
+
+    void calculate_integral_avx512(const uchar *src, size_t _srcstep,
+                                   double *sum,      size_t _sumstep,
+                                   double *sqsum,    size_t _sqsumstep,
+                                   int width, int height, int cn)
+    {
+        const int srcstep = (int)(_srcstep/sizeof(uchar));
+        const int sumstep = (int)(_sumstep/sizeof(double));
+        const int sqsumstep = (int)(_sqsumstep/sizeof(double));
+        const int ops_per_line = width * cn;
+
+        // Clear the first line of the sum as per spec (see integral documentation)
+        // Also adjust the index of sum and sqsum to be at the real 0th element
+        // and not point to the border pixel so it stays in sync with the src pointer
+        memset( sum, 0, (ops_per_line+cn)*sizeof(double));
+        sum += cn;
+
+        if (sqsum) {
+            memset( sqsum, 0, (ops_per_line+cn)*sizeof(double));
+            sqsum += cn;
+        }
+
+        // Now calculate the integral over the whole image one line at a time
+        for(int y = 0; y < height; y++) {
+            const uchar * src_line    = &src[y*srcstep];
+            double      * sum_above   = &sum[y*sumstep];
+            double      * sum_line    = &sum_above[sumstep];
+            double      * sqsum_above = (sqsum) ? &sqsum[y*sqsumstep]     : NULL;
+            double      * sqsum_line  = (sqsum) ? &sqsum_above[sqsumstep] : NULL;
+
+            integral_line_3channel_avx512(src_line, sum_line, sum_above, sqsum_line, sqsum_above, ops_per_line);
+
+        }
+    }
+
+    static inline
+    void integral_line_3channel_avx512(const uchar *srcs,
+                                       double *sums,   double *sums_above,
+                                       double *sqsums, double *sqsums_above,
+                                       int num_ops_in_line)
+    {
+        __m512i sum_accumulator   = _mm512_setzero_si512();  // holds rolling sums for the line
+        __m512i sqsum_accumulator = _mm512_setzero_si512();  // holds rolling sqsums for the line
+
+        // The first element on each line must be zeroes as per spec (see integral documentation)
+        set_border_pixel_value(sums, sqsums);
+
+        // Do all 64 byte chunk operations then do the last bits that don't fit in a 64 byte chunk
+        aligned_integral(     srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line);
+        post_aligned_integral(srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line);
+
+    }
+
+
+    static inline
+    void set_border_pixel_value(double *sums, double *sqsums)
+    {
+        // Sets the border pixel value to 0s.
+        // Note the hard coded -3 and the 0x7 mask is because we only support 3 channel right now
+        __m512i zeroes = _mm512_setzero_si512();
+
+        _mm512_mask_storeu_epi64(&sums[-3], 0x7, zeroes);
+        if (sqsums)
+            _mm512_mask_storeu_epi64(&sqsums[-3], 0x7, zeroes);
+    }
+
+
+    static inline
+    void aligned_integral(const uchar *&srcs,
+                          double *&sums,  double *&sums_above,
+                          double *&sqsum, double *&sqsum_above,
+                          __m512i &sum_accumulator, __m512i &sqsum_accumulator,
+                          int num_ops_in_line)
+    {
+        // This function handles full 64 byte chunks of the source data at a time until it gets to the part of
+        // the line that no longer contains a full 64 byte chunk.  Other code will handle the last part.
+
+        const int num_chunks = num_ops_in_line >> 6;  // quick int divide by 64
+
+        for (int index_64byte_chunk = 0; index_64byte_chunk < num_chunks; index_64byte_chunk++){
+            integral_64_operations_avx512((__m512i *) srcs,
+                                          (__m512i *) sums,  (__m512i *) sums_above,
+                                          (__m512i *) sqsum, (__m512i *) sqsum_above,
+                                          0xFFFFFFFFFFFFFFFF, sum_accumulator, sqsum_accumulator);
+            srcs+=64; sums+=64; sums_above+=64;
+            if (sqsum){ sqsum+= 64; sqsum_above+=64; }
+        }
+    }
+
+
+    static inline
+    void post_aligned_integral(const uchar *srcs,
+                               const double *sums,   const double *sums_above,
+                               const double *sqsum,  const double *sqsum_above,
+                               __m512i &sum_accumulator, __m512i &sqsum_accumulator,
+                               int num_ops_in_line)
+    {
+        // This function handles the last few straggling operations that are not a full chunk of 64 operations
+        // We use the same algorithm, but we calculate a different operation mask using (num_ops % 64).
+
+        const unsigned int num_operations = (unsigned int) num_ops_in_line & 0x3F;  // Quick int modulo 64
+
+        if (num_operations > 0) {
+            __mmask64 operation_mask = (1ULL << num_operations) - 1ULL;
+
+            integral_64_operations_avx512((__m512i *) srcs, (__m512i *) sums, (__m512i *) sums_above,
+                                          (__m512i *) sqsum, (__m512i *) sqsum_above,
+                                          operation_mask, sum_accumulator, sqsum_accumulator);
+        }
+    }
+
+
+    static inline
+    void integral_64_operations_avx512(const __m512i *srcs,
+                                       __m512i *sums,       const __m512i *sums_above,
+                                       __m512i *sqsums,     const __m512i *sqsums_above,
+                                       __mmask64 data_mask,
+                                       __m512i &sum_accumulator, __m512i &sqsum_accumulator)
+    {
+        __m512i src_64byte_chunk = read_64_bytes(srcs, data_mask);
+
+        for(int num_16byte_chunks=0; num_16byte_chunks<4; num_16byte_chunks++) {
+            __m128i src_16bytes = _mm512_extracti64x2_epi64(src_64byte_chunk, 0x0); // Get lower 16 bytes of data
+
+            for (int num_8byte_chunks = 0; num_8byte_chunks < 2; num_8byte_chunks++) {
+
+                __m512i src_longs = convert_lower_8bytes_to_longs(src_16bytes);
+
+                // Calculate integral for the sum on the 8 entries
+                integral_8_operations(src_longs, sums_above, data_mask, sums, sum_accumulator);
+                sums++; sums_above++;
+
+                if (sqsums){ // Calculate integral for the sum on the 8 entries
+                    __m512i squared_source = _mm512_mullo_epi64(src_longs, src_longs);
+
+                    integral_8_operations(squared_source, sqsums_above, data_mask, sqsums, sqsum_accumulator);
+                    sqsums++; sqsums_above++;
+                }
+
+                // Prepare for next iteration of inner loop
+                // shift source to align next 8 bytes to lane 0 and shift the mask
+                src_16bytes = shift_right_8_bytes(src_16bytes);
+                data_mask = data_mask >> 8;
+
+            }
+
+            // Prepare for next iteration of outer loop
+            src_64byte_chunk = shift_right_16_bytes(src_64byte_chunk);
+        }
+    }
+
+
+    static inline
+    void integral_8_operations(const __m512i src_longs, const __m512i *above_values_ptr, __mmask64 data_mask,
+                               __m512i *results_ptr, __m512i &accumulator)
+     {
+        _mm512_mask_storeu_pd(
+                results_ptr,   // Store the result here
+                data_mask,     // Using the data mask to avoid overrunning the line
+                calculate_integral( // Writing the value of the integral derived from:
+                        src_longs,                                           // input data
+                        _mm512_maskz_loadu_pd(data_mask, above_values_ptr),  // and the results from line above
+                        accumulator                                          // keeping track of the accumulator
+                )
+        );
+    }
+
+
+    static inline
+    __m512d calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator)
+    {
+        __m512i carryover_idxs = _mm512_set_epi64(6, 5, 7, 6, 5, 7, 6, 5);
+
+        // Align data to prepare for the adds:
+        //    shifts data left by 3 and 6 qwords(lanes) and gets rolling sum in all lanes
+        //   Vertical LANES:     76543210
+        //   src_longs       :   HGFEDCBA
+        //   shited3lanes    : + EDCBA
+        //   shifted6lanes   : + BA
+        //   carry_over_idxs : + 65765765  (index position of result from previous iteration)
+        //                     = integral
+        __m512i shifted3lanes = _mm512_maskz_expand_epi64(0xF8, src_longs);
+        __m512i shifted6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs);
+        __m512i carry_over    = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator);
+
+        // Do the adds in tree form (shift3 + shift 6) + (current_source_values + accumulator)
+        __m512i sum_shift3and6 = _mm512_add_epi64(shifted3lanes, shifted6lanes);
+        __m512i sum_src_carry  = _mm512_add_epi64(src_longs, carry_over);
+        accumulator            = _mm512_add_epi64(sum_shift3and6, sum_src_carry);
+
+        // Convert to packed double and add to the line above to get the true integral value
+        __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator);
+        __m512d integral_pd    = _mm512_add_pd(accumulator_pd, above_values);
+        return integral_pd;
+    }
+
+
+    static inline
+    __m512i read_64_bytes(const __m512i *srcs, __mmask64 data_mask)  {
+        return _mm512_maskz_loadu_epi8(data_mask, srcs);
+    }
+
+
+    static inline
+    __m512i convert_lower_8bytes_to_longs(__m128i src_16bytes)  {
+        return _mm512_cvtepu8_epi64(src_16bytes);
+    }
+
+
+    static inline
+    __m128i shift_right_8_bytes(__m128i src_16bytes)  {
+        return _mm_maskz_compress_epi64(2, src_16bytes);
+    }
+
+
+    static inline
+    __m512i shift_right_16_bytes(__m512i src_64byte_chunk)  {
+        return _mm512_maskz_compress_epi64(0xFC, src_64byte_chunk);
+    }
+
+};
+} // end of anonymous namespace
+
+namespace opt_AVX512_SKX {
+
+// This is the implementation for the external callers interface entry point.
+// It should be the only function called into this file from outside
+// Any new implementations should be directed from here
+void calculate_integral_avx512(const uchar *src,   size_t _srcstep,
+                               double      *sum,   size_t _sumstep,
+                               double      *sqsum, size_t _sqsumstep,
+                               int width, int height, int cn)
+{
+    IntegralCalculator_3Channel  calculator;
+    calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height, cn);
+}
+
+
+} // end namespace opt_AVX512_SXK
+} // end namespace cv
--- a/modules/imgproc/src/sumpixels.cpp
+++ b/modules/imgproc/src/sumpixels.cpp
@@ -10,7 +10,7 @@
 //                           License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2000-2008,2019 Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
@@ -44,6 +44,7 @@
 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
 #include "opencv2/core/hal/intrin.hpp"
+#include "sumpixels.hpp"


 namespace cv
@@ -62,6 +63,37 @@ struct Integral_SIMD
    }
 };

+
+template <>
+struct Integral_SIMD<uchar, double, double> {
+    Integral_SIMD() {};
+
+
+    bool operator()(const uchar *src, size_t _srcstep,
+                    double *sum,      size_t _sumstep,
+                    double *sqsum,    size_t _sqsumstep,
+                    double *tilted,   size_t _tiltedstep,
+                    int width, int height, int cn) const
+    {
+#if CV_TRY_AVX512_SKX
+        CV_UNUSED(_tiltedstep);
+        // TODO: Add support for 1,2, and 4 channels
+        if (CV_CPU_HAS_SUPPORT_AVX512_SKX && !tilted && cn == 3){
+            opt_AVX512_SKX::calculate_integral_avx512(src, _srcstep, sum, _sumstep,
+                                                      sqsum, _sqsumstep, width, height, cn);
+            return true;
+        }
+#else
+        // Avoid warnings in some builds
+        CV_UNUSED(src); CV_UNUSED(_srcstep); CV_UNUSED(sum); CV_UNUSED(_sumstep);
+        CV_UNUSED(sqsum); CV_UNUSED(_sqsumstep); CV_UNUSED(tilted); CV_UNUSED(_tiltedstep);
+        CV_UNUSED(width); CV_UNUSED(height); CV_UNUSED(cn);
+#endif
+        return false;
+    }
+
+};
+
 #if CV_SIMD && CV_SIMD_WIDTH <= 64

 template <>

--- a/modules/imgproc/src/sumpixels.hpp
+++ b/modules/imgproc/src/sumpixels.hpp
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019, Intel Corporation, all rights reserved.
+#ifndef OPENCV_IMGPROC_SUM_PIXELS_HPP
+#define OPENCV_IMGPROC_SUM_PIXELS_HPP
+
+namespace cv
+{
+
+namespace opt_AVX512_SKX
+{
+#if CV_TRY_AVX512_SKX
+    void calculate_integral_avx512(
+            const uchar *src, size_t _srcstep,
+            double *sum,      size_t _sumstep,
+            double *sqsum,    size_t _sqsumstep,
+            int width, int height, int cn);
+
+#endif
+} // end namespace opt_AVX512_SKX
+} // end namespace cv
+
+#endif