diff --git a/modules/imgproc/perf/perf_integral.cpp b/modules/imgproc/perf/perf_integral.cpp
index 4b2ba971483b0047a97461462bbd075b0f122b3c..d64c49e0a921385d6d99109fbed038c2a986c249 100644
--- a/modules/imgproc/perf/perf_integral.cpp
+++ b/modules/imgproc/perf/perf_integral.cpp
@@ -11,7 +11,7 @@ typedef perf::TestBaseWithParam Size_MatType_OutMatD
 PERF_TEST_P(Size_MatType_OutMatDepth, integral,
             testing::Combine(
                 testing::Values(TYPICAL_MAT_SIZES),
-                testing::Values(CV_8UC1, CV_8UC4),
+                testing::Values(CV_8UC1, CV_8UC3, CV_8UC4),
                 testing::Values(CV_32S, CV_32F, CV_64F)
                 )
             )
diff --git a/modules/imgproc/src/sumpixels.avx512_skx.cpp b/modules/imgproc/src/sumpixels.avx512_skx.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7e5cbdcf88b545e71d1b699a2d7a6e3d3db9faaf
--- /dev/null
+++ b/modules/imgproc/src/sumpixels.avx512_skx.cpp
@@ -0,0 +1,262 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019, Intel Corporation, all rights reserved.
+#include "precomp.hpp"
+#include "sumpixels.hpp"
+
+namespace cv {
+namespace { // Anonymous namespace to avoid exposing the implementation classes
+
+//
+// NOTE: Look at the bottom of the file for the entry-point function for external callers
+//
+
+// At the moment only the 3-channel, untilted case is supported
+// More channel support coming soon.
+// TODO: Add support for sqsum and for 1, 2, and 4 channels
+class IntegralCalculator_3Channel {
+public:
+    IntegralCalculator_3Channel() {};
+
+
+    void calculate_integral_avx512(const uchar *src, size_t _srcstep,
+                                   double *sum,      size_t _sumstep,
+                                   double *sqsum,    size_t _sqsumstep,
+                                   int width, int height, int cn)
+    {
+        const int srcstep = (int)(_srcstep/sizeof(uchar));
+        const int sumstep = (int)(_sumstep/sizeof(double));
+        const int sqsumstep = (int)(_sqsumstep/sizeof(double));
+        const int ops_per_line = width * cn;
+
+        // Clear the first line of the sum as per spec (see integral documentation)
+        // Also adjust the index of sum and sqsum to be at the real 0th element
+        // and not point to the border pixel so it stays in sync with the src pointer
+        memset( sum, 0, (ops_per_line+cn)*sizeof(double));
+        sum += cn;
+
+        if (sqsum) {
+            memset( sqsum, 0, (ops_per_line+cn)*sizeof(double));
+            sqsum += cn;
+        }
+
+        // Now calculate the integral over the whole image one line at a time
+        for(int y = 0; y < height; y++) {
+            const uchar * src_line    = &src[y*srcstep];
+            double      * sum_above   = &sum[y*sumstep];
+            double      * sum_line    = &sum_above[sumstep];
+            double      * sqsum_above = (sqsum) ? &sqsum[y*sqsumstep]     : NULL;
+            double      * sqsum_line  = (sqsum) ? &sqsum_above[sqsumstep] : NULL;
+
+            integral_line_3channel_avx512(src_line, sum_line, sum_above, sqsum_line, sqsum_above, ops_per_line);
+
+        }
+    }
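For readers following the patch, a minimal scalar sketch of the recurrence that `calculate_integral_avx512` vectorizes (the helper name `integral_line_scalar` and the `row_accum` array are illustrative, not part of the patch): each output element is the per-channel running sum along the current source row plus the integral value directly above it; the zeroed border row and column are handled by the memset and the `+= cn` offsets in the function above.

    // Scalar reference (illustrative only): one line of an interleaved 3-channel integral,
    // assuming sum_line and sum_above already point past the zeroed border column.
    static void integral_line_scalar(const uchar *src_line, double *sum_line,
                                     const double *sum_above, int width, int cn)
    {
        double row_accum[4] = {0, 0, 0, 0};                     // running sum per channel for this row
        for (int x = 0; x < width * cn; x++)
        {
            row_accum[x % cn] += src_line[x];                   // rolling sum along the row, per channel
            sum_line[x] = row_accum[x % cn] + sum_above[x];     // add the integral value directly above
        }
    }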
+
+    static inline
+    void integral_line_3channel_avx512(const uchar *srcs,
+                                       double *sums, double *sums_above,
+                                       double *sqsums, double *sqsums_above,
+                                       int num_ops_in_line)
+    {
+        __m512i sum_accumulator   = _mm512_setzero_si512();  // holds rolling sums for the line
+        __m512i sqsum_accumulator = _mm512_setzero_si512();  // holds rolling sqsums for the line
+
+        // The first element on each line must be zero as per spec (see integral documentation)
+        set_border_pixel_value(sums, sqsums);
+
+        // Do all 64-byte chunk operations, then do the last bits that don't fit in a 64-byte chunk
+        aligned_integral(     srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line);
+        post_aligned_integral(srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line);
+
+    }
+
+
+    static inline
+    void set_border_pixel_value(double *sums, double *sqsums)
+    {
+        // Sets the border pixel value to 0s.
+        // Note: the hard-coded -3 and the 0x7 mask are used because only 3-channel data is supported right now
+        __m512i zeroes = _mm512_setzero_si512();
+
+        _mm512_mask_storeu_epi64(&sums[-3], 0x7, zeroes);
+        if (sqsums)
+            _mm512_mask_storeu_epi64(&sqsums[-3], 0x7, zeroes);
+    }
+
+
+    static inline
+    void aligned_integral(const uchar *&srcs,
+                          double *&sums, double *&sums_above,
+                          double *&sqsum, double *&sqsum_above,
+                          __m512i &sum_accumulator, __m512i &sqsum_accumulator,
+                          int num_ops_in_line)
+    {
+        // This function handles full 64-byte chunks of the source data at a time until it gets to the part of
+        // the line that no longer contains a full 64-byte chunk. Other code will handle the last part.
+
+        const int num_chunks = num_ops_in_line >> 6;  // quick int divide by 64
+
+        for (int index_64byte_chunk = 0; index_64byte_chunk < num_chunks; index_64byte_chunk++){
+            integral_64_operations_avx512((__m512i *) srcs,
+                                          (__m512i *) sums,  (__m512i *) sums_above,
+                                          (__m512i *) sqsum, (__m512i *) sqsum_above,
+                                          0xFFFFFFFFFFFFFFFF, sum_accumulator, sqsum_accumulator);
+            srcs+=64; sums+=64; sums_above+=64;
+            if (sqsum){ sqsum+= 64; sqsum_above+=64; }
+        }
+    }
+
+
+    static inline
+    void post_aligned_integral(const uchar *srcs,
+                               const double *sums,  const double *sums_above,
+                               const double *sqsum, const double *sqsum_above,
+                               __m512i &sum_accumulator, __m512i &sqsum_accumulator,
+                               int num_ops_in_line)
+    {
+        // This function handles the last few straggling operations that do not form a full chunk of 64 operations.
+        // We use the same algorithm, but we calculate a different operation mask using (num_ops % 64).
+
+        const unsigned int num_operations = (unsigned int) num_ops_in_line & 0x3F;  // Quick int modulo 64
+
+        if (num_operations > 0) {
+            __mmask64 operation_mask = (1ULL << num_operations) - 1ULL;
+
+            integral_64_operations_avx512((__m512i *) srcs, (__m512i *) sums, (__m512i *) sums_above,
+                                          (__m512i *) sqsum, (__m512i *) sqsum_above,
+                                          operation_mask, sum_accumulator, sqsum_accumulator);
+        }
+    }
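A worked example of the chunking and tail mask used by `aligned_integral` and `post_aligned_integral` (the numbers are illustrative, not from the patch): a 100-pixel-wide, 3-channel row gives 300 operations, i.e. four full 64-element chunks plus a 44-element tail handled with a 44-bit mask.

    const int ops_per_line = 100 * 3;                                // width * cn = 300 operations
    const int num_chunks = ops_per_line >> 6;                        // 4 full 64-element chunks
    const unsigned int tail = (unsigned int)ops_per_line & 0x3F;     // 44 leftover operations
    const unsigned long long tail_mask = (1ULL << tail) - 1ULL;      // 0x00000FFFFFFFFFFF, low 44 bits set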
+
+
+    static inline
+    void integral_64_operations_avx512(const __m512i *srcs,
+                                       __m512i *sums, const __m512i *sums_above,
+                                       __m512i *sqsums, const __m512i *sqsums_above,
+                                       __mmask64 data_mask,
+                                       __m512i &sum_accumulator, __m512i &sqsum_accumulator)
+    {
+        __m512i src_64byte_chunk = read_64_bytes(srcs, data_mask);
+
+        for(int num_16byte_chunks=0; num_16byte_chunks<4; num_16byte_chunks++) {
+            __m128i src_16bytes = _mm512_extracti64x2_epi64(src_64byte_chunk, 0x0);  // Get lower 16 bytes of data
+
+            for (int num_8byte_chunks = 0; num_8byte_chunks < 2; num_8byte_chunks++) {
+
+                __m512i src_longs = convert_lower_8bytes_to_longs(src_16bytes);
+
+                // Calculate integral for the sum on the 8 entries
+                integral_8_operations(src_longs, sums_above, data_mask, sums, sum_accumulator);
+                sums++; sums_above++;
+
+                if (sqsums){   // Calculate integral for the sqsum on the 8 entries
+                    __m512i squared_source = _mm512_mullo_epi64(src_longs, src_longs);
+
+                    integral_8_operations(squared_source, sqsums_above, data_mask, sqsums, sqsum_accumulator);
+                    sqsums++; sqsums_above++;
+                }
+
+                // Prepare for next iteration of inner loop
+                // shift source to align next 8 bytes to lane 0 and shift the mask
+                src_16bytes = shift_right_8_bytes(src_16bytes);
+                data_mask = data_mask >> 8;
+
+            }
+
+            // Prepare for next iteration of outer loop
+            src_64byte_chunk = shift_right_16_bytes(src_64byte_chunk);
+        }
+    }
+
+
+    static inline
+    void integral_8_operations(const __m512i src_longs, const __m512i *above_values_ptr, __mmask64 data_mask,
+                               __m512i *results_ptr, __m512i &accumulator)
+    {
+        _mm512_mask_storeu_pd(
+                results_ptr,          // Store the result here
+                data_mask,            // Using the data mask to avoid overrunning the line
+                calculate_integral(   // Writing the value of the integral derived from:
+                        src_longs,                                            // input data
+                        _mm512_maskz_loadu_pd(data_mask, above_values_ptr),   // and the results from the line above
+                        accumulator                                           // keeping track of the accumulator
+                )
+        );
+    }
+
+
+    static inline
+    __m512d calculate_integral(__m512i src_longs, const __m512d above_values, __m512i &accumulator)
+    {
+        __m512i carryover_idxs = _mm512_set_epi64(6, 5, 7, 6, 5, 7, 6, 5);
+
+        // Align data to prepare for the adds:
+        //   shifts data left by 3 and 6 qwords (lanes) and gets rolling sum in all lanes
+        //   Vertical LANES  :   76543210
+        //   src_longs       :   HGFEDCBA
+        //   shifted3lanes   : + EDCBA
+        //   shifted6lanes   : + BA
+        //   carry_over_idxs : + 65765765  (index position of result from previous iteration)
+        //                     = integral
+        __m512i shifted3lanes = _mm512_maskz_expand_epi64(0xF8, src_longs);
+        __m512i shifted6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs);
+        __m512i carry_over    = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator);
+
+        // Do the adds in tree form (shift3 + shift6) + (current_source_values + accumulator)
+        __m512i sum_shift3and6 = _mm512_add_epi64(shifted3lanes, shifted6lanes);
+        __m512i sum_src_carry  = _mm512_add_epi64(src_longs, carry_over);
+        accumulator            = _mm512_add_epi64(sum_shift3and6, sum_src_carry);
+
+        // Convert to packed double and add to the line above to get the true integral value
+        __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator);
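To make the lane diagram above concrete, here is one 8-lane step with illustrative numbers (not taken from the patch). Assume the previous iteration left the per-channel running totals 10, 20, 30 in accumulator lanes 5..7, and the next eight interleaved samples are 1..8:

    // Illustrative values only:
    //   lane            :   0   1   2   3   4   5   6   7
    //   src_longs       :   1   2   3   4   5   6   7   8
    //   + shifted3lanes :   -   -   -   1   2   3   4   5
    //   + shifted6lanes :   -   -   -   -   -   -   1   2
    //   + carry_over    :  10  20  30  10  20  30  10  20
    //   = accumulator   :  11  22  33  15  27  39  22  35

Each lane of the new accumulator holds the running sum of its own channel, e.g. lane 6 = 10 + 1 + 4 + 7 = 22; adding the row above (`above_values`) then yields the final integral values for these eight positions.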
+        __m512d integral_pd    = _mm512_add_pd(accumulator_pd, above_values);
+        return integral_pd;
+    }
+
+
+    static inline
+    __m512i read_64_bytes(const __m512i *srcs, __mmask64 data_mask) {
+        return _mm512_maskz_loadu_epi8(data_mask, srcs);
+    }
+
+
+    static inline
+    __m512i convert_lower_8bytes_to_longs(__m128i src_16bytes) {
+        return _mm512_cvtepu8_epi64(src_16bytes);
+    }
+
+
+    static inline
+    __m128i shift_right_8_bytes(__m128i src_16bytes) {
+        return _mm_maskz_compress_epi64(2, src_16bytes);
+    }
+
+
+    static inline
+    __m512i shift_right_16_bytes(__m512i src_64byte_chunk) {
+        return _mm512_maskz_compress_epi64(0xFC, src_64byte_chunk);
+    }
+
+};
+} // end of anonymous namespace
+
+namespace opt_AVX512_SKX {
+
+// This is the entry point for external callers.
+// It should be the only function in this file called from outside.
+// Any new implementations should be dispatched from here.
+void calculate_integral_avx512(const uchar *src, size_t _srcstep,
+                               double *sum,      size_t _sumstep,
+                               double *sqsum,    size_t _sqsumstep,
+                               int width, int height, int cn)
+{
+    IntegralCalculator_3Channel calculator;
+    calculator.calculate_integral_avx512(src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height, cn);
+}
+
+
+} // end namespace opt_AVX512_SKX
+} // end namespace cv
diff --git a/modules/imgproc/src/sumpixels.cpp b/modules/imgproc/src/sumpixels.cpp
index 3c49aaf773f399ade8188cd5e13ed6af004331c2..ae7647b8bd5456f6c4e32607fa014088b3f254dc 100755
--- a/modules/imgproc/src/sumpixels.cpp
+++ b/modules/imgproc/src/sumpixels.cpp
@@ -10,7 +10,7 @@
 //                          License Agreement
 //                For Open Source Computer Vision Library
 //
-// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
+// Copyright (C) 2000-2008,2019 Intel Corporation, all rights reserved.
 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 // Copyright (C) 2014, Itseez Inc., all rights reserved.
 // Third party copyrights are property of their respective owners.
@@ -44,6 +44,7 @@
 #include "precomp.hpp"
 #include "opencl_kernels_imgproc.hpp"
 #include "opencv2/core/hal/intrin.hpp"
+#include "sumpixels.hpp"
 
 
 namespace cv
@@ -62,6 +63,37 @@ struct Integral_SIMD
     }
 };
+
+template <>
+struct Integral_SIMD<uchar, double, double> {
+    Integral_SIMD() {};
+
+
+    bool operator()(const uchar *src, size_t _srcstep,
+                    double *sum, size_t _sumstep,
+                    double *sqsum, size_t _sqsumstep,
+                    double *tilted, size_t _tiltedstep,
+                    int width, int height, int cn) const
+    {
+#if CV_TRY_AVX512_SKX
+        CV_UNUSED(_tiltedstep);
+        // TODO: Add support for 1, 2, and 4 channels
+        if (CV_CPU_HAS_SUPPORT_AVX512_SKX && !tilted && cn == 3){
+            opt_AVX512_SKX::calculate_integral_avx512(src, _srcstep, sum, _sumstep,
+                                                      sqsum, _sqsumstep, width, height, cn);
+            return true;
+        }
+#else
+        // Avoid warnings in some builds
+        CV_UNUSED(src); CV_UNUSED(_srcstep); CV_UNUSED(sum); CV_UNUSED(_sumstep);
+        CV_UNUSED(sqsum); CV_UNUSED(_sqsumstep); CV_UNUSED(tilted); CV_UNUSED(_tiltedstep);
+        CV_UNUSED(width); CV_UNUSED(height); CV_UNUSED(cn);
+#endif
+        return false;
+    }
+
+};
+
 
 #if CV_SIMD && CV_SIMD_WIDTH <= 64
 
 template <>
diff --git a/modules/imgproc/src/sumpixels.hpp b/modules/imgproc/src/sumpixels.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..8d5ab0a851be887501914b0dcbf172c176013991
--- /dev/null
+++ b/modules/imgproc/src/sumpixels.hpp
@@ -0,0 +1,25 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+//
+// Copyright (C) 2019, Intel Corporation, all rights reserved.
+#ifndef OPENCV_IMGPROC_SUM_PIXELS_HPP
+#define OPENCV_IMGPROC_SUM_PIXELS_HPP
+
+namespace cv
+{
+
+namespace opt_AVX512_SKX
+{
+#if CV_TRY_AVX512_SKX
+    void calculate_integral_avx512(
+            const uchar *src, size_t _srcstep,
+            double *sum, size_t _sumstep,
+            double *sqsum, size_t _sqsumstep,
+            int width, int height, int cn);
+
+#endif
+} // end namespace opt_AVX512_SKX
+} // end namespace cv
+
+#endif
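For context on how this path is reached from the public API, a minimal usage sketch (the image size and fill values are arbitrary): the new kernel is taken only when cv::integral is asked for CV_64F sum and sqsum on an 8-bit, 3-channel, untilted input; every other combination falls back to the existing implementations.

    #include <opencv2/imgproc.hpp>

    int main()
    {
        cv::Mat src(1080, 1920, CV_8UC3, cv::Scalar(1, 2, 3));  // arbitrary test image
        cv::Mat sum, sqsum;

        // 8UC3 input with CV_64F sum/sqsum and no tilted output is the case the
        // Integral_SIMD<uchar, double, double> specialization above accelerates.
        cv::integral(src, sum, sqsum, CV_64F, CV_64F);

        // sum and sqsum are (rows+1) x (cols+1), 3-channel CV_64F integral images.
        return 0;
    }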