Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
b3269b08
O
Opencv
项目概览
Greenplum
/
Opencv
大约 1 年 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
b3269b08
编写于
7月 20, 2022
作者:
T
Tomoaki Teshima
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
neon: add dotprod dispatch implementation
* read vector at runtime * add enum
上级
2a82467a
变更
8
隐藏空白更改
内联
并排
Showing
8 changed file
with
106 addition
and
38 deletion
+106
-38
cmake/OpenCVCompilerOptimizations.cmake
cmake/OpenCVCompilerOptimizations.cmake
+5
-2
cmake/checks/cpu_dotprod.cpp
cmake/checks/cpu_dotprod.cpp
+24
-0
modules/core/CMakeLists.txt
modules/core/CMakeLists.txt
+1
-1
modules/core/include/opencv2/core/cv_cpu_dispatch.h
modules/core/include/opencv2/core/cv_cpu_dispatch.h
+4
-0
modules/core/include/opencv2/core/cv_cpu_helper.h
modules/core/include/opencv2/core/cv_cpu_helper.h
+21
-0
modules/core/include/opencv2/core/cvdef.h
modules/core/include/opencv2/core/cvdef.h
+2
-0
modules/core/include/opencv2/core/hal/intrin_neon.hpp
modules/core/include/opencv2/core/hal/intrin_neon.hpp
+30
-35
modules/core/src/system.cpp
modules/core/src/system.cpp
+19
-0
未找到文件。
cmake/OpenCVCompilerOptimizations.cmake
浏览文件 @
b3269b08
...
...
@@ -46,7 +46,7 @@
set
(
CPU_ALL_OPTIMIZATIONS
"SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;POPCNT;AVX;FP16;AVX2;FMA3;AVX_512F"
)
list
(
APPEND CPU_ALL_OPTIMIZATIONS
"AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SKX;AVX512_CNL;AVX512_CLX;AVX512_ICL"
)
list
(
APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16
)
list
(
APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16
NEON_DOTPROD
)
list
(
APPEND CPU_ALL_OPTIMIZATIONS MSA
)
list
(
APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3
)
list
(
REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS
)
...
...
@@ -326,6 +326,7 @@ if(X86 OR X86_64)
elseif
(
ARM OR AARCH64
)
ocv_update
(
CPU_NEON_TEST_FILE
"
${
OpenCV_SOURCE_DIR
}
/cmake/checks/cpu_neon.cpp"
)
ocv_update
(
CPU_FP16_TEST_FILE
"
${
OpenCV_SOURCE_DIR
}
/cmake/checks/cpu_fp16.cpp"
)
ocv_update
(
CPU_NEON_DOTPROD_TEST_FILE
"
${
OpenCV_SOURCE_DIR
}
/cmake/checks/cpu_dotprod.cpp"
)
if
(
NOT AARCH64
)
ocv_update
(
CPU_KNOWN_OPTIMIZATIONS
"VFPV3;NEON;FP16"
)
if
(
NOT MSVC
)
...
...
@@ -337,9 +338,11 @@ elseif(ARM OR AARCH64)
endif
()
ocv_update
(
CPU_FP16_IMPLIES
"NEON"
)
else
()
ocv_update
(
CPU_KNOWN_OPTIMIZATIONS
"NEON;FP16"
)
ocv_update
(
CPU_KNOWN_OPTIMIZATIONS
"NEON;FP16
;NEON_DOTPROD
"
)
ocv_update
(
CPU_NEON_FLAGS_ON
""
)
ocv_update
(
CPU_FP16_IMPLIES
"NEON"
)
ocv_update
(
CPU_NEON_DOTPROD_FLAGS_ON
"-march=armv8.2-a+dotprod"
)
ocv_update
(
CPU_NEON_DOTPROD_IMPLIES
"NEON"
)
set
(
CPU_BASELINE
"NEON;FP16"
CACHE STRING
"
${
HELP_CPU_BASELINE
}
"
)
endif
()
elseif
(
MIPS
)
...
...
cmake/checks/cpu_dotprod.cpp
0 → 100644
浏览文件 @
b3269b08
#include <stdio.h>
#if defined __GNUC__ && (defined __arm__ || defined __aarch64__)
#include "arm_neon.h"
int
test
()
{
const
unsigned
int
src
[]
=
{
0
,
0
,
0
,
0
};
unsigned
int
dst
[
4
];
uint32x4_t
v_src
=
*
(
uint32x4_t
*
)
src
;
uint8x16_t
v_m0
=
*
(
uint8x16_t
*
)
src
;
uint8x16_t
v_m1
=
*
(
uint8x16_t
*
)
src
;
uint32x4_t
v_dst
=
vdotq_u32
(
v_src
,
v_m0
,
v_m1
);
*
(
uint32x4_t
*
)
dst
=
v_dst
;
return
(
int
)
dst
[
0
];
}
#else
#error "DOTPROD is not supported"
#endif
int
main
()
{
printf
(
"%d
\n
"
,
test
());
return
0
;
}
modules/core/CMakeLists.txt
浏览文件 @
b3269b08
...
...
@@ -6,7 +6,7 @@ ocv_add_dispatched_file(arithm SSE2 SSE4_1 AVX2 VSX3)
ocv_add_dispatched_file
(
convert SSE2 AVX2 VSX3
)
ocv_add_dispatched_file
(
convert_scale SSE2 AVX2
)
ocv_add_dispatched_file
(
count_non_zero SSE2 AVX2
)
ocv_add_dispatched_file
(
matmul SSE2 SSE4_1 AVX2 AVX512_SKX
)
ocv_add_dispatched_file
(
matmul SSE2 SSE4_1 AVX2 AVX512_SKX
NEON_DOTPROD
)
ocv_add_dispatched_file
(
mean SSE2 AVX2
)
ocv_add_dispatched_file
(
merge SSE2 AVX2
)
ocv_add_dispatched_file
(
split SSE2 AVX2
)
...
...
modules/core/include/opencv2/core/cv_cpu_dispatch.h
浏览文件 @
b3269b08
...
...
@@ -79,6 +79,10 @@
# endif
# define CV_FP16 1
#endif
#ifdef CV_CPU_COMPILE_NEON_DOTPROD
# include <arm_neon.h>
# define CV_NEON_DOT 1
#endif
#ifdef CV_CPU_COMPILE_AVX2
# include <immintrin.h>
# define CV_AVX2 1
...
...
modules/core/include/opencv2/core/cv_cpu_helper.h
浏览文件 @
b3269b08
...
...
@@ -420,6 +420,27 @@
#endif
#define __CV_CPU_DISPATCH_CHAIN_NEON(fn, args, mode, ...) CV_CPU_CALL_NEON(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_NEON_DOTPROD
# define CV_TRY_NEON_DOTPROD 1
# define CV_CPU_FORCE_NEON_DOTPROD 1
# define CV_CPU_HAS_SUPPORT_NEON_DOTPROD 1
# define CV_CPU_CALL_NEON_DOTPROD(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_NEON_DOTPROD_(fn, args) return (opt_NEON_DOTPROD::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_NEON_DOTPROD
# define CV_TRY_NEON_DOTPROD 1
# define CV_CPU_FORCE_NEON_DOTPROD 0
# define CV_CPU_HAS_SUPPORT_NEON_DOTPROD (cv::checkHardwareSupport(CV_CPU_NEON_DOTPROD))
# define CV_CPU_CALL_NEON_DOTPROD(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_DOTPROD) return (opt_NEON_DOTPROD::fn args)
# define CV_CPU_CALL_NEON_DOTPROD_(fn, args) if (CV_CPU_HAS_SUPPORT_NEON_DOTPROD) return (opt_NEON_DOTPROD::fn args)
#else
# define CV_TRY_NEON_DOTPROD 0
# define CV_CPU_FORCE_NEON_DOTPROD 0
# define CV_CPU_HAS_SUPPORT_NEON_DOTPROD 0
# define CV_CPU_CALL_NEON_DOTPROD(fn, args)
# define CV_CPU_CALL_NEON_DOTPROD_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_NEON_DOTPROD(fn, args, mode, ...) CV_CPU_CALL_NEON_DOTPROD(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_MSA
# define CV_TRY_MSA 1
# define CV_CPU_FORCE_MSA 1
...
...
modules/core/include/opencv2/core/cvdef.h
浏览文件 @
b3269b08
...
...
@@ -282,6 +282,7 @@ namespace cv {
#define CV_CPU_AVX_5124FMAPS 27
#define CV_CPU_NEON 100
#define CV_CPU_NEON_DOTPROD 101
#define CV_CPU_MSA 150
...
...
@@ -334,6 +335,7 @@ enum CpuFeatures {
CPU_AVX_5124FMAPS
=
27
,
CPU_NEON
=
100
,
CPU_NEON_DOTPROD
=
101
,
CPU_MSA
=
150
,
...
...
modules/core/include/opencv2/core/hal/intrin_neon.hpp
浏览文件 @
b3269b08
...
...
@@ -78,8 +78,6 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#define CV_NEON_AARCH64 0
#endif
// TODO
#define CV_NEON_DOT 0
//////////// Utils ////////////
...
...
@@ -667,11 +665,22 @@ inline v_int64x2 v_dotprod(const v_int32x4& a, const v_int32x4& b, const v_int64
}
// 8 >> 32
#ifdef CV_NEON_DOT
#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP(_Tpvec1, _Tpvec2, suffix) \
inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b) \
{ \
return _Tpvec1(vdotq_##suffix(vdupq_n_##suffix(0), a.val, b.val));\
} \
inline _Tpvec1 v_dotprod_expand(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
{ \
return _Tpvec1(vdotq_##suffix(c.val, a.val, b.val)); \
}
OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP
(
v_uint32x4
,
v_uint8x16
,
u32
)
OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_OP
(
v_int32x4
,
v_int8x16
,
s32
)
#else
inline
v_uint32x4
v_dotprod_expand
(
const
v_uint8x16
&
a
,
const
v_uint8x16
&
b
)
{
#if CV_NEON_DOT
return
v_uint32x4
(
vdotq_u32
(
vdupq_n_u32
(
0
),
a
.
val
,
b
.
val
));
#else
const
uint8x16_t
zero
=
vreinterpretq_u8_u32
(
vdupq_n_u32
(
0
));
const
uint8x16_t
mask
=
vreinterpretq_u8_u32
(
vdupq_n_u32
(
0x00FF00FF
));
const
uint16x8_t
zero32
=
vreinterpretq_u16_u32
(
vdupq_n_u32
(
0
));
...
...
@@ -687,23 +696,15 @@ inline v_uint32x4 v_dotprod_expand(const v_uint8x16& a, const v_uint8x16& b)
uint32x4_t
s1
=
vaddq_u32
(
vshrq_n_u32
(
vreinterpretq_u32_u16
(
even
),
16
),
vshrq_n_u32
(
vreinterpretq_u32_u16
(
odd
),
16
));
return
v_uint32x4
(
vaddq_u32
(
s0
,
s1
));
#endif
}
inline
v_uint32x4
v_dotprod_expand
(
const
v_uint8x16
&
a
,
const
v_uint8x16
&
b
,
const
v_uint32x4
&
c
)
{
#if CV_NEON_DOT
return
v_uint32x4
(
vdotq_u32
(
c
.
val
,
a
.
val
,
b
.
val
));
#else
return
v_dotprod_expand
(
a
,
b
)
+
c
;
#endif
}
inline
v_int32x4
v_dotprod_expand
(
const
v_int8x16
&
a
,
const
v_int8x16
&
b
)
{
#if CV_NEON_DOT
return
v_int32x4
(
vdotq_s32
(
vdupq_n_s32
(
0
),
a
.
val
,
b
.
val
));
#else
int16x8_t
p0
=
vmull_s8
(
vget_low_s8
(
a
.
val
),
vget_low_s8
(
b
.
val
));
int16x8_t
p1
=
vmull_s8
(
vget_high_s8
(
a
.
val
),
vget_high_s8
(
b
.
val
));
int16x8_t
uzp1
,
uzp2
;
...
...
@@ -712,18 +713,13 @@ inline v_int32x4 v_dotprod_expand(const v_int8x16& a, const v_int8x16& b)
int16x4_t
uzpl1
,
uzpl2
;
_v128_unzip
(
vget_low_s16
(
sum
),
vget_high_s16
(
sum
),
uzpl1
,
uzpl2
);
return
v_int32x4
(
vaddl_s16
(
uzpl1
,
uzpl2
));
#endif
}
inline
v_int32x4
v_dotprod_expand
(
const
v_int8x16
&
a
,
const
v_int8x16
&
b
,
const
v_int32x4
&
c
)
{
#if CV_NEON_DOT
return
v_int32x4
(
vdotq_s32
(
c
.
val
,
a
.
val
,
b
.
val
));
#else
return
v_dotprod_expand
(
a
,
b
)
+
c
;
#endif
}
#endif
// 16 >> 64
inline
v_uint64x2
v_dotprod_expand
(
const
v_uint16x8
&
a
,
const
v_uint16x8
&
b
)
{
...
...
@@ -832,45 +828,44 @@ inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_
}
// 8 >> 32
#ifdef CV_NEON_DOT
#define OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP(_Tpvec1, _Tpvec2, suffix) \
inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b) \
{ \
return v_dotprod_expand(a, b); \
} \
inline _Tpvec1 v_dotprod_expand_fast(const _Tpvec2& a, const _Tpvec2& b, const _Tpvec1& c) \
{ \
return v_dotprod_expand(a, b, c); \
}
OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP
(
v_uint32x4
,
v_uint8x16
,
u32
)
OPENCV_HAL_IMPL_NEON_DOT_PRODUCT_FAST_OP
(
v_int32x4
,
v_int8x16
,
s32
)
#else
inline
v_uint32x4
v_dotprod_expand_fast
(
const
v_uint8x16
&
a
,
const
v_uint8x16
&
b
)
{
#if CV_NEON_DOT
return
v_uint32x4
(
vdotq_u32
(
vdupq_n_u32
(
0
),
a
.
val
,
b
.
val
));
#else
uint16x8_t
p0
=
vmull_u8
(
vget_low_u8
(
a
.
val
),
vget_low_u8
(
b
.
val
));
uint16x8_t
p1
=
vmull_u8
(
vget_high_u8
(
a
.
val
),
vget_high_u8
(
b
.
val
));
uint32x4_t
s0
=
vaddl_u16
(
vget_low_u16
(
p0
),
vget_low_u16
(
p1
));
uint32x4_t
s1
=
vaddl_u16
(
vget_high_u16
(
p0
),
vget_high_u16
(
p1
));
return
v_uint32x4
(
vaddq_u32
(
s0
,
s1
));
#endif
}
inline
v_uint32x4
v_dotprod_expand_fast
(
const
v_uint8x16
&
a
,
const
v_uint8x16
&
b
,
const
v_uint32x4
&
c
)
{
#if CV_NEON_DOT
return
v_uint32x4
(
vdotq_u32
(
c
.
val
,
a
.
val
,
b
.
val
));
#else
return
v_dotprod_expand_fast
(
a
,
b
)
+
c
;
#endif
}
inline
v_int32x4
v_dotprod_expand_fast
(
const
v_int8x16
&
a
,
const
v_int8x16
&
b
)
{
#if CV_NEON_DOT
return
v_int32x4
(
vdotq_s32
(
vdupq_n_s32
(
0
),
a
.
val
,
b
.
val
));
#else
int16x8_t
prod
=
vmull_s8
(
vget_low_s8
(
a
.
val
),
vget_low_s8
(
b
.
val
));
prod
=
vmlal_s8
(
prod
,
vget_high_s8
(
a
.
val
),
vget_high_s8
(
b
.
val
));
return
v_int32x4
(
vaddl_s16
(
vget_low_s16
(
prod
),
vget_high_s16
(
prod
)));
#endif
}
inline
v_int32x4
v_dotprod_expand_fast
(
const
v_int8x16
&
a
,
const
v_int8x16
&
b
,
const
v_int32x4
&
c
)
{
#if CV_NEON_DOT
return
v_int32x4
(
vdotq_s32
(
c
.
val
,
a
.
val
,
b
.
val
));
#else
return
v_dotprod_expand_fast
(
a
,
b
)
+
c
;
#endif
}
#endif
// 16 >> 64
inline
v_uint64x2
v_dotprod_expand_fast
(
const
v_uint16x8
&
a
,
const
v_uint16x8
&
b
)
...
...
modules/core/src/system.cpp
浏览文件 @
b3269b08
...
...
@@ -411,6 +411,7 @@ struct HWFeatures
g_hwFeatureNames
[
CPU_AVX_5124FMAPS
]
=
"AVX5124FMAPS"
;
g_hwFeatureNames
[
CPU_NEON
]
=
"NEON"
;
g_hwFeatureNames
[
CPU_NEON_DOTPROD
]
=
"NEON_DOTPROD"
;
g_hwFeatureNames
[
CPU_VSX
]
=
"VSX"
;
g_hwFeatureNames
[
CPU_VSX3
]
=
"VSX3"
;
...
...
@@ -555,6 +556,24 @@ struct HWFeatures
#ifdef __aarch64__
have
[
CV_CPU_NEON
]
=
true
;
have
[
CV_CPU_FP16
]
=
true
;
int
cpufile
=
open
(
"/proc/self/auxv"
,
O_RDONLY
);
if
(
cpufile
>=
0
)
{
Elf64_auxv_t
auxv
;
const
size_t
size_auxv_t
=
sizeof
(
auxv
);
while
((
size_t
)
read
(
cpufile
,
&
auxv
,
size_auxv_t
)
==
size_auxv_t
)
{
if
(
auxv
.
a_type
==
AT_HWCAP
)
{
have
[
CV_CPU_NEON_DOTPROD
]
=
(
auxv
.
a_un
.
a_val
&
(
1
<<
20
))
!=
0
;
break
;
}
}
close
(
cpufile
);
}
#elif defined __arm__ && defined __ANDROID__
#if defined HAVE_CPUFEATURES
CV_LOG_INFO
(
NULL
,
"calling android_getCpuFeatures() ..."
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录