Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
0165ffa9
O
Opencv
项目概览
Greenplum
/
Opencv
大约 1 年 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
0165ffa9
编写于
11月 13, 2018
作者:
B
Brad Kelly
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Implementing AVX512 support for 3 channel cv::integral for CV_64F
上级
36dfb71c
变更
4
隐藏空白更改
内联
并排
Showing
4 changed files
with
321 additions
and
2 deletions
+321
-2
modules/imgproc/perf/perf_integral.cpp
modules/imgproc/perf/perf_integral.cpp
+1
-1
modules/imgproc/src/sumpixels.avx512_skx.cpp
modules/imgproc/src/sumpixels.avx512_skx.cpp
+262
-0
modules/imgproc/src/sumpixels.cpp
modules/imgproc/src/sumpixels.cpp
+33
-1
modules/imgproc/src/sumpixels.hpp
modules/imgproc/src/sumpixels.hpp
+25
-0
未找到文件。
modules/imgproc/perf/perf_integral.cpp
浏览文件 @
0165ffa9
...
...
@@ -11,7 +11,7 @@ typedef perf::TestBaseWithParam<Size_MatType_OutMatDepth_t> Size_MatType_OutMatD
PERF_TEST_P
(
Size_MatType_OutMatDepth
,
integral
,
testing
::
Combine
(
testing
::
Values
(
TYPICAL_MAT_SIZES
),
testing
::
Values
(
CV_8UC1
,
CV_8UC4
),
testing
::
Values
(
CV_8UC1
,
CV_8UC
3
,
CV_8UC
4
),
testing
::
Values
(
CV_32S
,
CV_32F
,
CV_64F
)
)
)
...
...
modules/imgproc/src/sumpixels.avx512_skx.cpp
0 → 100644
浏览文件 @
0165ffa9
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2019, Intel Corporation, all rights reserved.
#include "precomp.hpp"
#include "sumpixels.hpp"
namespace
cv
{
namespace
{
// Anonymous namespace to avoid exposing the implementation classes
//
// NOTE: Look at the bottom of the file for the entry-point function for external callers
//
// At the moment only the 3-channel, untilted case is supported.
// More channel support coming soon.
// TODO: Add support for sqsum and 1,2, and 4 channels
// Computes the integral image (cv::integral) of a 3-channel 8-bit image into
// CV_64F sum / sqsum buffers using AVX512 instructions.
//
// calculate_integral_avx512() is the entry point; the static helpers below it
// break each image line into 64-byte chunks, each chunk into 16-byte pieces,
// and each piece into two groups of 8 pixels processed as one __m512i vector.
class IntegralCalculator_3Channel {
public:
    IntegralCalculator_3Channel() {};

    // Computes sum (and sqsum, when non-NULL) over the whole image, one line at a time.
    //   src              : source pixels (8-bit, channels interleaved)
    //   _srcstep         : source row stride in bytes
    //   sum              : integral output buffer (double), includes the zero border row/column
    //   _sumstep         : sum row stride in bytes
    //   sqsum            : optional squared-integral output buffer; NULL skips all sqsum work
    //   _sqsumstep       : sqsum row stride in bytes
    //   width/height/cn  : image geometry; cn is presumably always 3 here (class only
    //                      supports 3 channels — see class comment) — TODO confirm callers
    void calculate_integral_avx512(const uchar * src, size_t _srcstep,
                                   double * sum,      size_t _sumstep,
                                   double * sqsum,    size_t _sqsumstep,
                                   int width, int height, int cn)
    {
        // Convert the byte strides into element strides.
        const int srcstep = (int)(_srcstep / sizeof(uchar));
        const int sumstep = (int)(_sumstep / sizeof(double));
        const int sqsumstep = (int)(_sqsumstep / sizeof(double));
        const int ops_per_line = width * cn;

        // Clear the first line of the sum as per spec (see integral documentation)
        // Also adjust the index of sum and sqsum to be at the real 0th element
        // and not point to the border pixel so it stays in sync with the src pointer
        memset( sum, 0, (ops_per_line + cn) * sizeof(double));
        sum += cn;

        if (sqsum) {
            memset( sqsum, 0, (ops_per_line + cn) * sizeof(double));
            sqsum += cn;
        }

        // Now calculate the integral over the whole image one line at a time
        for (int y = 0; y < height; y++) {
            const uchar * src_line    = &src[y * srcstep];
            double      * sum_above   = &sum[y * sumstep];          // previous output row (row y)
            double      * sum_line    = &sum_above[sumstep];        // row being written (row y+1)
            double      * sqsum_above = (sqsum) ? &sqsum[y * sqsumstep]   : NULL;
            double      * sqsum_line  = (sqsum) ? &sqsum_above[sqsumstep] : NULL;

            integral_line_3channel_avx512(src_line, sum_line, sum_above, sqsum_line, sqsum_above, ops_per_line);
        }
    }

    // Computes one full output line: zeroes the border element, then processes the
    // line in full 64-byte chunks followed by the masked remainder.
    static inline
    void integral_line_3channel_avx512(const uchar * srcs,
                                       double * sums,   double * sums_above,
                                       double * sqsums, double * sqsums_above,
                                       int num_ops_in_line)
    {
        __m512i sum_accumulator   = _mm512_setzero_si512();  // holds rolling sums for the line
        __m512i sqsum_accumulator = _mm512_setzero_si512();  // holds rolling sqsums for the line

        // The first element on each line must be zeroes as per spec (see integral documentation)
        set_border_pixel_value(sums, sqsums);

        // Do all 64 byte chunk operations then do the last bits that don't fit in a 64 byte chunk
        aligned_integral(     srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line);
        post_aligned_integral(srcs, sums, sums_above, sqsums, sqsums_above, sum_accumulator, sqsum_accumulator, num_ops_in_line);
    }

    // Zeroes the border pixel (the element just before the first real output element)
    // in sums and, when present, sqsums.
    static inline
    void set_border_pixel_value(double * sums, double * sqsums)
    {
        // Sets the border pixel value to 0s.
        // Note the hard coded -3 and the 0x7 mask is because we only support 3 channel right now
        __m512i zeroes = _mm512_setzero_si512();

        _mm512_mask_storeu_epi64(&sums[-3], 0x7, zeroes);
        if (sqsums)
            _mm512_mask_storeu_epi64(&sqsums[-3], 0x7, zeroes);
    }

    // Processes as many complete 64-byte source chunks as fit in the line.
    // All pointer parameters are taken by reference and advanced past the consumed
    // data so that post_aligned_integral() can pick up where this left off.
    static inline
    void aligned_integral(const uchar *& srcs,
                          double *& sums,  double *& sums_above,
                          double *& sqsum, double *& sqsum_above,
                          __m512i & sum_accumulator, __m512i & sqsum_accumulator,
                          int num_ops_in_line)
    {
        // This function handles full 64 byte chunks of the source data at a time until it gets to the part of
        // the line that no longer contains a full 64 byte chunk.  Other code will handle the last part.
        const int num_chunks = num_ops_in_line >> 6;  // quick int divide by 64

        for (int index_64byte_chunk = 0; index_64byte_chunk < num_chunks; index_64byte_chunk++){
            // Full chunk: all 64 lanes are valid, so use an all-ones mask.
            integral_64_operations_avx512((__m512i *)srcs,
                                          (__m512i *)sums,  (__m512i *)sums_above,
                                          (__m512i *)sqsum, (__m512i *)sqsum_above,
                                          0xFFFFFFFFFFFFFFFF, sum_accumulator, sqsum_accumulator);
            srcs += 64;
            sums += 64;
            sums_above += 64;
            if (sqsum){
                sqsum += 64;
                sqsum_above += 64;
            }
        }
    }

    // Processes the final partial chunk of the line (num_ops_in_line % 64 elements)
    // using a partial operation mask so stores never run past the end of the line.
    static inline
    void post_aligned_integral(const uchar * srcs,
                               const double * sums,  const double * sums_above,
                               const double * sqsum, const double * sqsum_above,
                               __m512i & sum_accumulator, __m512i & sqsum_accumulator,
                               int num_ops_in_line)
    {
        // This function handles the last few straggling operations that are not a full chunk of 64 operations
        // We use the same algorithm, but we calculate a different operation mask using (num_ops % 64).
        const unsigned int num_operations = (unsigned int)num_ops_in_line & 0x3F;  // Quick int modulo 64

        if (num_operations > 0) {
            // Mask with the low num_operations bits set — one bit per remaining element.
            __mmask64 operation_mask = (1ULL << num_operations) - 1ULL;

            integral_64_operations_avx512((__m512i *)srcs,
                                          (__m512i *)sums,  (__m512i *)sums_above,
                                          (__m512i *)sqsum, (__m512i *)sqsum_above,
                                          operation_mask, sum_accumulator, sqsum_accumulator);
        }
    }

    // Runs the integral over one (possibly masked) 64-byte source chunk:
    // 4 x 16-byte extracts, each split into 2 x 8-byte groups widened to 64-bit lanes.
    // data_mask is shifted right by 8 after every 8-element group so the valid-lane
    // bits always line up with the group currently in lane 0.
    static inline
    void integral_64_operations_avx512(const __m512i * srcs,
                                       __m512i * sums,   const __m512i * sums_above,
                                       __m512i * sqsums, const __m512i * sqsums_above,
                                       __mmask64 data_mask,
                                       __m512i & sum_accumulator, __m512i & sqsum_accumulator)
    {
        __m512i src_64byte_chunk = read_64_bytes(srcs, data_mask);

        for (int num_16byte_chunks = 0; num_16byte_chunks < 4; num_16byte_chunks++) {
            __m128i src_16bytes = _mm512_extracti64x2_epi64(src_64byte_chunk, 0x0);  // Get lower 16 bytes of data

            for (int num_8byte_chunks = 0; num_8byte_chunks < 2; num_8byte_chunks++) {
                __m512i src_longs = convert_lower_8bytes_to_longs(src_16bytes);

                // Calculate integral for the sum on the 8 entries
                integral_8_operations(src_longs, sums_above, data_mask, sums, sum_accumulator);
                sums++;
                sums_above++;

                if (sqsums){
                    // Calculate integral for the sqsum on the squared 8 entries
                    __m512i squared_source = _mm512_mullo_epi64(src_longs, src_longs);

                    integral_8_operations(squared_source, sqsums_above, data_mask, sqsums, sqsum_accumulator);
                    sqsums++;
                    sqsums_above++;
                }

                // Prepare for next iteration of inner loop
                // shift source to align next 8 bytes to lane 0 and shift the mask
                src_16bytes = shift_right_8_bytes(src_16bytes);
                data_mask = data_mask >> 8;
            }
            // Prepare for next iteration of outer loop
            src_64byte_chunk = shift_right_16_bytes(src_64byte_chunk);
        }
    }

    // Computes and stores 8 integral values: result = running line sum + value from
    // the line above. Only lanes enabled in data_mask are loaded/stored, which keeps
    // partial end-of-line chunks from touching memory past the buffers.
    static inline
    void integral_8_operations(const __m512i src_longs, const __m512i * above_values_ptr, __mmask64 data_mask,
                               __m512i * results_ptr, __m512i & accumulator)
    {
        _mm512_mask_storeu_pd(
                results_ptr,    // Store the result here
                data_mask,      // Using the data mask to avoid overrunning the line
                calculate_integral(  // Writing the value of the integral derived from:
                        src_longs,                                           // input data
                        _mm512_maskz_loadu_pd(data_mask, above_values_ptr),  // and the results from line above
                        accumulator                                          // keeping track of the accumulator
                )
        );
    }

    // Core of the algorithm: produces the per-lane rolling (prefix) sum for 8 new
    // source values and adds the row above to get the final integral values.
    // The accumulator is updated in place to carry the totals into the next group.
    static inline
    __m512d calculate_integral(__m512i src_longs, const __m512d above_values, __m512i & accumulator)
    {
        // Indexes 5,6,7 of the previous accumulator hold the carry-over totals for the
        // 3 channels; this index pattern replicates them across all 8 lanes.
        __m512i carryover_idxs = _mm512_set_epi64(6, 5, 7, 6, 5, 7, 6, 5);

        // Align data to prepare for the adds:
        //   shifts data left by 3 and 6 qwords(lanes) and gets rolling sum in all lanes
        //   Vertical LANES   :    76543210
        //   src_longs        :    HGFEDCBA
        //   shifted3lanes    :  + EDCBA
        //   shifted6lanes    :  + BA
        //   carry_over_idxs  :  + 65765765  (index position of result from previous iteration)
        //   = integral
        __m512i shifted3lanes = _mm512_maskz_expand_epi64(0xF8, src_longs);
        __m512i shifted6lanes = _mm512_maskz_expand_epi64(0xC0, src_longs);
        __m512i carry_over    = _mm512_permutex2var_epi64(accumulator, carryover_idxs, accumulator);

        // Do the adds in tree form (shift3 + shift 6) + (current_source_values + accumulator)
        __m512i sum_shift3and6 = _mm512_add_epi64(shifted3lanes, shifted6lanes);
        __m512i sum_src_carry  = _mm512_add_epi64(src_longs, carry_over);
        accumulator            = _mm512_add_epi64(sum_shift3and6, sum_src_carry);

        // Convert to packed double and add to the line above to get the true integral value
        __m512d accumulator_pd = _mm512_cvtepu64_pd(accumulator);
        __m512d integral_pd    = _mm512_add_pd(accumulator_pd, above_values);

        return integral_pd;
    }

    // Masked 64-byte load; lanes disabled in data_mask read as zero.
    static inline
    __m512i read_64_bytes(const __m512i * srcs, __mmask64 data_mask)
    {
        return _mm512_maskz_loadu_epi8(data_mask, srcs);
    }

    // Zero-extends the lower 8 bytes of the vector into eight 64-bit lanes.
    static inline
    __m512i convert_lower_8bytes_to_longs(__m128i src_16bytes)
    {
        return _mm512_cvtepu8_epi64(src_16bytes);
    }

    // Logical right shift by 8 bytes: compress with mask 0b10 moves qword lane 1
    // down into lane 0 and zeroes the upper lane.
    static inline
    __m128i shift_right_8_bytes(__m128i src_16bytes)
    {
        return _mm_maskz_compress_epi64(2, src_16bytes);
    }

    // Logical right shift by 16 bytes: compress with mask 0xFC moves qword lanes 2..7
    // down by two lanes and zeroes the vacated top lanes.
    static inline
    __m512i shift_right_16_bytes(__m512i src_64byte_chunk)
    {
        return _mm512_maskz_compress_epi64(0xFC, src_64byte_chunk);
    }

};
}
// end of anonymous namespace
namespace
opt_AVX512_SKX
{
// This is the implementation for the external callers interface entry point.
// It should be the only function called into this file from outside
// Any new implementations should be directed from here
void
calculate_integral_avx512
(
const
uchar
*
src
,
size_t
_srcstep
,
double
*
sum
,
size_t
_sumstep
,
double
*
sqsum
,
size_t
_sqsumstep
,
int
width
,
int
height
,
int
cn
)
{
IntegralCalculator_3Channel
calculator
;
calculator
.
calculate_integral_avx512
(
src
,
_srcstep
,
sum
,
_sumstep
,
sqsum
,
_sqsumstep
,
width
,
height
,
cn
);
}
}
// end namespace opt_AVX512_SXK
}
// end namespace cv
modules/imgproc/src/sumpixels.cpp
浏览文件 @
0165ffa9
...
...
@@ -10,7 +10,7 @@
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2000-2008,
2019
Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
...
...
@@ -44,6 +44,7 @@
#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"
#include "opencv2/core/hal/intrin.hpp"
#include "sumpixels.hpp"
namespace
cv
...
...
@@ -62,6 +63,37 @@ struct Integral_SIMD
}
};
// Specialization that routes the uchar -> CV_64F integral computation to the
// AVX512 implementation when it was compiled in (CV_TRY_AVX512_SKX) and the
// running CPU supports it. Returns true when the work was done here, false to
// let the caller fall back to the generic path.
template <>
struct Integral_SIMD<uchar, double, double>
{
    Integral_SIMD() {};

    bool operator()(const uchar * src, size_t _srcstep,
                    double * sum, size_t _sumstep,
                    double * sqsum, size_t _sqsumstep,
                    double * tilted, size_t _tiltedstep,
                    int width, int height, int cn) const
    {
#if CV_TRY_AVX512_SKX
        CV_UNUSED(_tiltedstep);
        // TODO: Add support for 1,2, and 4 channels
        // The AVX512 path only handles the untilted 3-channel case for now.
        const bool dispatch_to_avx512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && !tilted && cn == 3;
        if (dispatch_to_avx512)
        {
            opt_AVX512_SKX::calculate_integral_avx512(
                    src, _srcstep, sum, _sumstep, sqsum, _sqsumstep, width, height, cn);
            return true;
        }
#else
        // Avoid warnings in some builds
        CV_UNUSED(src);     CV_UNUSED(_srcstep);
        CV_UNUSED(sum);     CV_UNUSED(_sumstep);
        CV_UNUSED(sqsum);   CV_UNUSED(_sqsumstep);
        CV_UNUSED(tilted);  CV_UNUSED(_tiltedstep);
        CV_UNUSED(width);   CV_UNUSED(height);
        CV_UNUSED(cn);
#endif
        return false;
    }
};
#if CV_SIMD && CV_SIMD_WIDTH <= 64
template
<
>
...
...
modules/imgproc/src/sumpixels.hpp
0 → 100644
浏览文件 @
0165ffa9
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
//
// Copyright (C) 2019, Intel Corporation, all rights reserved.
#ifndef OPENCV_IMGPROC_SUM_PIXELS_HPP
#define OPENCV_IMGPROC_SUM_PIXELS_HPP
namespace
cv
{
namespace
opt_AVX512_SKX
{
#if CV_TRY_AVX512_SKX
void
calculate_integral_avx512
(
const
uchar
*
src
,
size_t
_srcstep
,
double
*
sum
,
size_t
_sumstep
,
double
*
sqsum
,
size_t
_sqsumstep
,
int
width
,
int
height
,
int
cn
);
#endif
}
// end namespace opt_AVX512_SKX
}
// end namespace cv
#endif
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录