Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
422f802e
O
Opencv
项目概览
Greenplum
/
Opencv
大约 1 年 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
422f802e
编写于
8月 04, 2020
作者:
A
Alexander Alekhin
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #17922 from joy2myself:build_riscv_with_c++_intrin
上级
11eff8ba
ff4c3873
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
1782 addition
and
0 deletion
+1782
-0
cmake/OpenCVCompilerOptimizations.cmake
cmake/OpenCVCompilerOptimizations.cmake
+11
-0
cmake/checks/cpu_rvv.cpp
cmake/checks/cpu_rvv.cpp
+23
-0
modules/core/include/opencv2/core/cv_cpu_dispatch.h
modules/core/include/opencv2/core/cv_cpu_dispatch.h
+8
-0
modules/core/include/opencv2/core/cv_cpu_helper.h
modules/core/include/opencv2/core/cv_cpu_helper.h
+21
-0
modules/core/include/opencv2/core/cvdef.h
modules/core/include/opencv2/core/cvdef.h
+4
-0
modules/core/include/opencv2/core/hal/intrin.hpp
modules/core/include/opencv2/core/hal/intrin.hpp
+4
-0
modules/core/include/opencv2/core/hal/intrin_rvv.hpp
modules/core/include/opencv2/core/hal/intrin_rvv.hpp
+1678
-0
modules/core/src/system.cpp
modules/core/src/system.cpp
+6
-0
platforms/linux/riscv64-clang.toolchain.cmake
platforms/linux/riscv64-clang.toolchain.cmake
+27
-0
未找到文件。
cmake/OpenCVCompilerOptimizations.cmake
浏览文件 @
422f802e
...
...
@@ -49,6 +49,7 @@ list(APPEND CPU_ALL_OPTIMIZATIONS "AVX512_COMMON;AVX512_KNL;AVX512_KNM;AVX512_SK
list
(
APPEND CPU_ALL_OPTIMIZATIONS NEON VFPV3 FP16
)
list
(
APPEND CPU_ALL_OPTIMIZATIONS MSA
)
list
(
APPEND CPU_ALL_OPTIMIZATIONS VSX VSX3
)
list
(
APPEND CPU_ALL_OPTIMIZATIONS RVV
)
list
(
REMOVE_DUPLICATES CPU_ALL_OPTIMIZATIONS
)
ocv_update
(
CPU_VFPV3_FEATURE_ALIAS
""
)
...
...
@@ -102,6 +103,8 @@ ocv_optimization_process_obsolete_option(ENABLE_NEON NEON OFF)
ocv_optimization_process_obsolete_option
(
ENABLE_VSX VSX ON
)
ocv_optimization_process_obsolete_option
(
ENABLE_RVV RVV OFF
)
macro
(
ocv_is_optimization_in_list resultvar check_opt
)
set
(
__checked
""
)
set
(
__queue
${
ARGN
}
)
...
...
@@ -366,6 +369,14 @@ elseif(PPC64LE)
set
(
CPU_DISPATCH
"VSX3"
CACHE STRING
"
${
HELP_CPU_DISPATCH
}
"
)
set
(
CPU_BASELINE
"VSX"
CACHE STRING
"
${
HELP_CPU_BASELINE
}
"
)
elseif
(
RISCV
)
ocv_update
(
CPU_RVV_TEST_FILE
"
${
OpenCV_SOURCE_DIR
}
/cmake/checks/cpu_rvv.cpp"
)
ocv_update
(
CPU_KNOWN_OPTIMIZATIONS
"RVV"
)
ocv_update
(
CPU_RVV_FLAGS_ON
""
)
set
(
CPU_DISPATCH
"RVV"
CACHE STRING
"
${
HELP_CPU_DISPATCH
}
"
)
set
(
CPU_BASELINE
"RVV"
CACHE STRING
"
${
HELP_CPU_BASELINE
}
"
)
endif
()
# Helper values for cmake-gui
...
...
cmake/checks/cpu_rvv.cpp
0 → 100644
浏览文件 @
422f802e
#include <stdio.h>
#if defined(__riscv)
# include <riscv_vector.h>
# define CV_RVV 1
#endif
#if defined CV_RVV
int
test
()
{
const
float
src
[]
=
{
0.0
f
,
0.0
f
,
0.0
f
,
0.0
f
};
vfloat32m1_t
val
=
vle32_v_f32m1
((
const
float
*
)(
src
));
return
(
int
)
vfmv_f_s_f32m1_f32
(
val
);
}
#else
#error "RISC-V vector extension(RVV) is not supported"
#endif
int
main
()
{
printf
(
"%d
\n
"
,
test
());
return
0
;
}
modules/core/include/opencv2/core/cv_cpu_dispatch.h
浏览文件 @
422f802e
...
...
@@ -168,6 +168,10 @@
# include <wasm_simd128.h>
#endif
#if defined CV_CPU_COMPILE_RVV
# define CV_RVV 1
#endif
#endif // CV_ENABLE_INTRINSICS && !CV_DISABLE_OPTIMIZATION && !__CUDACC__
#if defined CV_CPU_COMPILE_AVX && !defined CV_CPU_BASELINE_COMPILE_AVX
...
...
@@ -343,3 +347,7 @@ struct VZeroUpperGuard {
#ifndef CV_WASM_SIMD
# define CV_WASM_SIMD 0
#endif
#ifndef CV_RVV
# define CV_RVV 0
#endif
modules/core/include/opencv2/core/cv_cpu_helper.h
浏览文件 @
422f802e
...
...
@@ -483,5 +483,26 @@
#endif
#define __CV_CPU_DISPATCH_CHAIN_VSX3(fn, args, mode, ...) CV_CPU_CALL_VSX3(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#if !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_COMPILE_RVV
# define CV_TRY_RVV 1
# define CV_CPU_FORCE_RVV 1
# define CV_CPU_HAS_SUPPORT_RVV 1
# define CV_CPU_CALL_RVV(fn, args) return (cpu_baseline::fn args)
# define CV_CPU_CALL_RVV_(fn, args) return (opt_RVV::fn args)
#elif !defined CV_DISABLE_OPTIMIZATION && defined CV_ENABLE_INTRINSICS && defined CV_CPU_DISPATCH_COMPILE_RVV
# define CV_TRY_RVV 1
# define CV_CPU_FORCE_RVV 0
# define CV_CPU_HAS_SUPPORT_RVV (cv::checkHardwareSupport(CV_CPU_RVV))
# define CV_CPU_CALL_RVV(fn, args) if (CV_CPU_HAS_SUPPORT_RVV) return (opt_RVV::fn args)
# define CV_CPU_CALL_RVV_(fn, args) if (CV_CPU_HAS_SUPPORT_RVV) return (opt_RVV::fn args)
#else
# define CV_TRY_RVV 0
# define CV_CPU_FORCE_RVV 0
# define CV_CPU_HAS_SUPPORT_RVV 0
# define CV_CPU_CALL_RVV(fn, args)
# define CV_CPU_CALL_RVV_(fn, args)
#endif
#define __CV_CPU_DISPATCH_CHAIN_RVV(fn, args, mode, ...) CV_CPU_CALL_RVV(fn, args); __CV_EXPAND(__CV_CPU_DISPATCH_CHAIN_ ## mode(fn, args, __VA_ARGS__))
#define CV_CPU_CALL_BASELINE(fn, args) return (cpu_baseline::fn args)
#define __CV_CPU_DISPATCH_CHAIN_BASELINE(fn, args, mode, ...) CV_CPU_CALL_BASELINE(fn, args)
/* last in sequence */
modules/core/include/opencv2/core/cvdef.h
浏览文件 @
422f802e
...
...
@@ -272,6 +272,8 @@ namespace cv {
#define CV_CPU_VSX 200
#define CV_CPU_VSX3 201
#define CV_CPU_RVV 210
// CPU features groups
#define CV_CPU_AVX512_SKX 256
#define CV_CPU_AVX512_COMMON 257
...
...
@@ -324,6 +326,8 @@ enum CpuFeatures {
CPU_VSX
=
200
,
CPU_VSX3
=
201
,
CPU_RVV
=
210
,
CPU_AVX512_SKX
=
256
,
//!< Skylake-X with AVX-512F/CD/BW/DQ/VL
CPU_AVX512_COMMON
=
257
,
//!< Common instructions AVX-512F/CD for all CPUs that support AVX-512
CPU_AVX512_KNL
=
258
,
//!< Knights Landing with AVX-512F/CD/ER/PF
...
...
modules/core/include/opencv2/core/hal/intrin.hpp
浏览文件 @
422f802e
...
...
@@ -199,6 +199,7 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
# undef CV_VSX
# undef CV_FP16
# undef CV_MSA
# undef CV_RVV
#endif
#if (CV_SSE2 || CV_NEON || CV_VSX || CV_MSA || CV_WASM_SIMD) && !defined(CV_FORCE_SIMD128_CPP)
...
...
@@ -226,6 +227,9 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
#elif CV_WASM_SIMD && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_wasm.hpp"
#elif CV_RVV && !defined(CV_FORCE_SIMD128_CPP)
#include "opencv2/core/hal/intrin_rvv.hpp"
#else
#include "opencv2/core/hal/intrin_cpp.hpp"
...
...
modules/core/include/opencv2/core/hal/intrin_rvv.hpp
0 → 100644
浏览文件 @
422f802e
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.
#ifndef OPENCV_HAL_INTRIN_RVV_HPP
#define OPENCV_HAL_INTRIN_RVV_HPP
#include <limits>
#include <cstring>
#include <algorithm>
#include "opencv2/core/saturate.hpp"
#define CV_SIMD128_CPP 1
#if defined(CV_FORCE_SIMD128_CPP) || defined(CV_DOXYGEN)
#define CV_SIMD128 1
#define CV_SIMD128_64F 1
#endif
namespace
cv
{
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
#endif
template
<
typename
_Tp
,
int
n
>
struct
v_reg
{
typedef
_Tp
lane_type
;
enum
{
nlanes
=
n
};
explicit
v_reg
(
const
_Tp
*
ptr
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
s
[
i
]
=
ptr
[
i
];
}
v_reg
(
_Tp
s0
,
_Tp
s1
)
{
s
[
0
]
=
s0
;
s
[
1
]
=
s1
;
}
v_reg
(
_Tp
s0
,
_Tp
s1
,
_Tp
s2
,
_Tp
s3
)
{
s
[
0
]
=
s0
;
s
[
1
]
=
s1
;
s
[
2
]
=
s2
;
s
[
3
]
=
s3
;
}
v_reg
(
_Tp
s0
,
_Tp
s1
,
_Tp
s2
,
_Tp
s3
,
_Tp
s4
,
_Tp
s5
,
_Tp
s6
,
_Tp
s7
)
{
s
[
0
]
=
s0
;
s
[
1
]
=
s1
;
s
[
2
]
=
s2
;
s
[
3
]
=
s3
;
s
[
4
]
=
s4
;
s
[
5
]
=
s5
;
s
[
6
]
=
s6
;
s
[
7
]
=
s7
;
}
v_reg
(
_Tp
s0
,
_Tp
s1
,
_Tp
s2
,
_Tp
s3
,
_Tp
s4
,
_Tp
s5
,
_Tp
s6
,
_Tp
s7
,
_Tp
s8
,
_Tp
s9
,
_Tp
s10
,
_Tp
s11
,
_Tp
s12
,
_Tp
s13
,
_Tp
s14
,
_Tp
s15
)
{
s
[
0
]
=
s0
;
s
[
1
]
=
s1
;
s
[
2
]
=
s2
;
s
[
3
]
=
s3
;
s
[
4
]
=
s4
;
s
[
5
]
=
s5
;
s
[
6
]
=
s6
;
s
[
7
]
=
s7
;
s
[
8
]
=
s8
;
s
[
9
]
=
s9
;
s
[
10
]
=
s10
;
s
[
11
]
=
s11
;
s
[
12
]
=
s12
;
s
[
13
]
=
s13
;
s
[
14
]
=
s14
;
s
[
15
]
=
s15
;
}
v_reg
()
{}
v_reg
(
const
v_reg
<
_Tp
,
n
>
&
r
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
s
[
i
]
=
r
.
s
[
i
];
}
_Tp
get0
()
const
{
return
s
[
0
];
}
_Tp
get
(
const
int
i
)
const
{
return
s
[
i
];
}
v_reg
<
_Tp
,
n
>
high
()
const
{
v_reg
<
_Tp
,
n
>
c
;
int
i
;
for
(
i
=
0
;
i
<
n
/
2
;
i
++
)
{
c
.
s
[
i
]
=
s
[
i
+
(
n
/
2
)];
c
.
s
[
i
+
(
n
/
2
)]
=
0
;
}
return
c
;
}
static
v_reg
<
_Tp
,
n
>
zero
()
{
v_reg
<
_Tp
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
(
_Tp
)
0
;
return
c
;
}
static
v_reg
<
_Tp
,
n
>
all
(
_Tp
s
)
{
v_reg
<
_Tp
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
s
;
return
c
;
}
template
<
typename
_Tp2
,
int
n2
>
v_reg
<
_Tp2
,
n2
>
reinterpret_as
()
const
{
size_t
bytes
=
std
::
min
(
sizeof
(
_Tp2
)
*
n2
,
sizeof
(
_Tp
)
*
n
);
v_reg
<
_Tp2
,
n2
>
c
;
std
::
memcpy
(
&
c
.
s
[
0
],
&
s
[
0
],
bytes
);
return
c
;
}
v_reg
&
operator
=
(
const
v_reg
<
_Tp
,
n
>
&
r
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
s
[
i
]
=
r
.
s
[
i
];
return
*
this
;
}
_Tp
s
[
n
];
};
typedef
v_reg
<
uchar
,
16
>
v_uint8x16
;
typedef
v_reg
<
schar
,
16
>
v_int8x16
;
typedef
v_reg
<
ushort
,
8
>
v_uint16x8
;
typedef
v_reg
<
short
,
8
>
v_int16x8
;
typedef
v_reg
<
unsigned
,
4
>
v_uint32x4
;
typedef
v_reg
<
int
,
4
>
v_int32x4
;
typedef
v_reg
<
float
,
4
>
v_float32x4
;
typedef
v_reg
<
double
,
2
>
v_float64x2
;
typedef
v_reg
<
uint64
,
2
>
v_uint64x2
;
typedef
v_reg
<
int64
,
2
>
v_int64x2
;
template
<
typename
_Tp
,
int
n
>
CV_INLINE
v_reg
<
_Tp
,
n
>
operator
+
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
);
template
<
typename
_Tp
,
int
n
>
CV_INLINE
v_reg
<
_Tp
,
n
>&
operator
+=
(
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
);
template
<
typename
_Tp
,
int
n
>
CV_INLINE
v_reg
<
_Tp
,
n
>
operator
-
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
);
template
<
typename
_Tp
,
int
n
>
CV_INLINE
v_reg
<
_Tp
,
n
>&
operator
-=
(
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
);
template
<
typename
_Tp
,
int
n
>
CV_INLINE
v_reg
<
_Tp
,
n
>
operator
*
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
);
template
<
typename
_Tp
,
int
n
>
CV_INLINE
v_reg
<
_Tp
,
n
>&
operator
*=
(
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
);
template
<
typename
_Tp
,
int
n
>
CV_INLINE
v_reg
<
_Tp
,
n
>
operator
/
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
);
template
<
typename
_Tp
,
int
n
>
CV_INLINE
v_reg
<
_Tp
,
n
>&
operator
/=
(
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
);
template
<
typename
_Tp
,
int
n
>
CV_INLINE
v_reg
<
_Tp
,
n
>
operator
&
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
);
template
<
typename
_Tp
,
int
n
>
CV_INLINE
v_reg
<
_Tp
,
n
>&
operator
&=
(
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
);
template
<
typename
_Tp
,
int
n
>
CV_INLINE
v_reg
<
_Tp
,
n
>
operator
|
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
);
template
<
typename
_Tp
,
int
n
>
CV_INLINE
v_reg
<
_Tp
,
n
>&
operator
|=
(
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
);
template
<
typename
_Tp
,
int
n
>
CV_INLINE
v_reg
<
_Tp
,
n
>
operator
^
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
);
template
<
typename
_Tp
,
int
n
>
CV_INLINE
v_reg
<
_Tp
,
n
>&
operator
^=
(
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
);
template
<
typename
_Tp
,
int
n
>
CV_INLINE
v_reg
<
_Tp
,
n
>
operator
~
(
const
v_reg
<
_Tp
,
n
>&
a
);
#ifndef CV_DOXYGEN
#define CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, ...) \
__CV_EXPAND(macro_name(uchar, __VA_ARGS__)) \
__CV_EXPAND(macro_name(schar, __VA_ARGS__)) \
__CV_EXPAND(macro_name(ushort, __VA_ARGS__)) \
__CV_EXPAND(macro_name(short, __VA_ARGS__)) \
__CV_EXPAND(macro_name(unsigned, __VA_ARGS__)) \
__CV_EXPAND(macro_name(int, __VA_ARGS__)) \
__CV_EXPAND(macro_name(uint64, __VA_ARGS__)) \
__CV_EXPAND(macro_name(int64, __VA_ARGS__)) \
#define CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, ...) \
__CV_EXPAND(macro_name(float, __VA_ARGS__)) \
__CV_EXPAND(macro_name(double, __VA_ARGS__)) \
#define CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(macro_name, ...) \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(macro_name, __VA_ARGS__) \
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(macro_name, __VA_ARGS__) \
#define CV__HAL_INTRIN_IMPL_BIN_OP_(_Tp, bin_op) \
template<int n> inline \
v_reg<_Tp, n> operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
return c; \
} \
template<int n> inline \
v_reg<_Tp, n>& operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
for( int i = 0; i < n; i++ ) \
a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
return a; \
}
#define CV__HAL_INTRIN_IMPL_BIN_OP(bin_op) CV__HAL_INTRIN_EXPAND_WITH_ALL_TYPES(CV__HAL_INTRIN_IMPL_BIN_OP_, bin_op)
CV__HAL_INTRIN_IMPL_BIN_OP
(
+
)
CV__HAL_INTRIN_IMPL_BIN_OP
(
-
)
CV__HAL_INTRIN_IMPL_BIN_OP
(
*
)
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES
(
CV__HAL_INTRIN_IMPL_BIN_OP_
,
/
)
#define CV__HAL_INTRIN_IMPL_BIT_OP_(_Tp, bit_op) \
template<int n> CV_INLINE \
v_reg<_Tp, n> operator bit_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
v_reg<_Tp, n> c; \
typedef typename V_TypeTraits<_Tp>::int_type itype; \
for( int i = 0; i < n; i++ ) \
c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
return c; \
} \
template<int n> CV_INLINE \
v_reg<_Tp, n>& operator bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
typedef typename V_TypeTraits<_Tp>::int_type itype; \
for( int i = 0; i < n; i++ ) \
a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
return a; \
}
#define CV__HAL_INTRIN_IMPL_BIT_OP(bit_op) \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op) \
CV__HAL_INTRIN_EXPAND_WITH_FP_TYPES(CV__HAL_INTRIN_IMPL_BIT_OP_, bit_op)
/* TODO: FIXIT remove this after masks refactoring */
CV__HAL_INTRIN_IMPL_BIT_OP
(
&
)
CV__HAL_INTRIN_IMPL_BIT_OP
(
|
)
CV__HAL_INTRIN_IMPL_BIT_OP
(
^
)
#define CV__HAL_INTRIN_IMPL_BITWISE_NOT_(_Tp, dummy) \
template<int n> CV_INLINE \
v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i])); \
return c; \
} \
CV__HAL_INTRIN_EXPAND_WITH_INTEGER_TYPES
(
CV__HAL_INTRIN_IMPL_BITWISE_NOT_
,
~
)
#endif
#define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
{ \
v_reg<_Tp2, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = cfunc(a.s[i]); \
return c; \
}
#define OPENCV_HAL_IMPL_MATH_FUNC_FLOAT(func, cfunc) \
inline v_reg<int, 4> func(const v_reg<float, 4>& a) \
{ \
v_reg<int, 4> c; \
for( int i = 0; i < 4; i++ ) \
c.s[i] = cfunc(a.s[i]); \
return c; \
} \
inline v_reg<int, 4> func(const v_reg<double, 2>& a) \
{ \
v_reg<int, 4> c; \
for( int i = 0; i < 2; i++ ) \
{ \
c.s[i] = cfunc(a.s[i]); \
c.s[i + 2] = 0; \
} \
return c; \
}
OPENCV_HAL_IMPL_MATH_FUNC
(
v_sqrt
,
std
::
sqrt
,
_Tp
)
OPENCV_HAL_IMPL_MATH_FUNC
(
v_sin
,
std
::
sin
,
_Tp
)
OPENCV_HAL_IMPL_MATH_FUNC
(
v_cos
,
std
::
cos
,
_Tp
)
OPENCV_HAL_IMPL_MATH_FUNC
(
v_exp
,
std
::
exp
,
_Tp
)
OPENCV_HAL_IMPL_MATH_FUNC
(
v_log
,
std
::
log
,
_Tp
)
OPENCV_HAL_IMPL_MATH_FUNC
(
v_abs
,
(
typename
V_TypeTraits
<
_Tp
>::
abs_type
)
std
::
abs
,
typename
V_TypeTraits
<
_Tp
>::
abs_type
)
OPENCV_HAL_IMPL_MATH_FUNC_FLOAT
(
v_round
,
cvRound
)
OPENCV_HAL_IMPL_MATH_FUNC_FLOAT
(
v_floor
,
cvFloor
)
OPENCV_HAL_IMPL_MATH_FUNC_FLOAT
(
v_ceil
,
cvCeil
)
OPENCV_HAL_IMPL_MATH_FUNC_FLOAT
(
v_trunc
,
int
)
#define OPENCV_HAL_IMPL_MINMAX_FUNC(func, cfunc) \
template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = cfunc(a.s[i], b.s[i]); \
return c; \
}
#define OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC(func, cfunc) \
template<typename _Tp, int n> inline _Tp func(const v_reg<_Tp, n>& a) \
{ \
_Tp c = a.s[0]; \
for( int i = 1; i < n; i++ ) \
c = cfunc(c, a.s[i]); \
return c; \
}
OPENCV_HAL_IMPL_MINMAX_FUNC
(
v_min
,
std
::
min
)
OPENCV_HAL_IMPL_MINMAX_FUNC
(
v_max
,
std
::
max
)
OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC
(
v_reduce_min
,
std
::
min
)
OPENCV_HAL_IMPL_REDUCE_MINMAX_FUNC
(
v_reduce_max
,
std
::
max
)
static
const
unsigned
char
popCountTable
[]
=
{
0
,
1
,
1
,
2
,
1
,
2
,
2
,
3
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
4
,
5
,
5
,
6
,
5
,
6
,
6
,
7
,
1
,
2
,
2
,
3
,
2
,
3
,
3
,
4
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
4
,
5
,
5
,
6
,
5
,
6
,
6
,
7
,
2
,
3
,
3
,
4
,
3
,
4
,
4
,
5
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
4
,
5
,
5
,
6
,
5
,
6
,
6
,
7
,
3
,
4
,
4
,
5
,
4
,
5
,
5
,
6
,
4
,
5
,
5
,
6
,
5
,
6
,
6
,
7
,
4
,
5
,
5
,
6
,
5
,
6
,
6
,
7
,
5
,
6
,
6
,
7
,
6
,
7
,
7
,
8
,
};
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
abs_type
,
n
>
v_popcount
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
abs_type
,
n
>
b
=
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
abs_type
,
n
>::
zero
();
for
(
int
i
=
0
;
i
<
n
*
(
int
)
sizeof
(
_Tp
);
i
++
)
b
.
s
[
i
/
sizeof
(
_Tp
)]
+=
popCountTable
[
v_reinterpret_as_u8
(
a
).
s
[
i
]];
return
b
;
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_minmax
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
v_reg
<
_Tp
,
n
>&
minval
,
v_reg
<
_Tp
,
n
>&
maxval
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
minval
.
s
[
i
]
=
std
::
min
(
a
.
s
[
i
],
b
.
s
[
i
]);
maxval
.
s
[
i
]
=
std
::
max
(
a
.
s
[
i
],
b
.
s
[
i
]);
}
}
#define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
template<typename _Tp, int n> \
inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
typedef typename V_TypeTraits<_Tp>::int_type itype; \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
return c; \
}
OPENCV_HAL_IMPL_CMP_OP
(
<
)
OPENCV_HAL_IMPL_CMP_OP
(
>
)
OPENCV_HAL_IMPL_CMP_OP
(
<=
)
OPENCV_HAL_IMPL_CMP_OP
(
>=
)
OPENCV_HAL_IMPL_CMP_OP
(
==
)
OPENCV_HAL_IMPL_CMP_OP
(
!=
)
template
<
int
n
>
inline
v_reg
<
float
,
n
>
v_not_nan
(
const
v_reg
<
float
,
n
>&
a
)
{
typedef
typename
V_TypeTraits
<
float
>::
int_type
itype
;
v_reg
<
float
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
V_TypeTraits
<
float
>::
reinterpret_from_int
((
itype
)
-
(
int
)(
a
.
s
[
i
]
==
a
.
s
[
i
]));
return
c
;
}
template
<
int
n
>
inline
v_reg
<
double
,
n
>
v_not_nan
(
const
v_reg
<
double
,
n
>&
a
)
{
typedef
typename
V_TypeTraits
<
double
>::
int_type
itype
;
v_reg
<
double
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
V_TypeTraits
<
double
>::
reinterpret_from_int
((
itype
)
-
(
int
)(
a
.
s
[
i
]
==
a
.
s
[
i
]));
return
c
;
}
#define OPENCV_HAL_IMPL_ARITHM_OP(func, bin_op, cast_op, _Tp2) \
template<typename _Tp, int n> \
inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
typedef _Tp2 rtype; \
v_reg<rtype, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
return c; \
}
OPENCV_HAL_IMPL_ARITHM_OP
(
v_add_wrap
,
+
,
(
_Tp
),
_Tp
)
OPENCV_HAL_IMPL_ARITHM_OP
(
v_sub_wrap
,
-
,
(
_Tp
),
_Tp
)
OPENCV_HAL_IMPL_ARITHM_OP
(
v_mul_wrap
,
*
,
(
_Tp
),
_Tp
)
template
<
typename
T
>
inline
T
_absdiff
(
T
a
,
T
b
)
{
return
a
>
b
?
a
-
b
:
b
-
a
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
abs_type
,
n
>
v_absdiff
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>
&
b
)
{
typedef
typename
V_TypeTraits
<
_Tp
>::
abs_type
rtype
;
v_reg
<
rtype
,
n
>
c
;
const
rtype
mask
=
(
rtype
)(
std
::
numeric_limits
<
_Tp
>::
is_signed
?
(
1
<<
(
sizeof
(
rtype
)
*
8
-
1
))
:
0
);
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
rtype
ua
=
a
.
s
[
i
]
^
mask
;
rtype
ub
=
b
.
s
[
i
]
^
mask
;
c
.
s
[
i
]
=
_absdiff
(
ua
,
ub
);
}
return
c
;
}
inline
v_float32x4
v_absdiff
(
const
v_float32x4
&
a
,
const
v_float32x4
&
b
)
{
v_float32x4
c
;
for
(
int
i
=
0
;
i
<
c
.
nlanes
;
i
++
)
c
.
s
[
i
]
=
_absdiff
(
a
.
s
[
i
],
b
.
s
[
i
]);
return
c
;
}
inline
v_float64x2
v_absdiff
(
const
v_float64x2
&
a
,
const
v_float64x2
&
b
)
{
v_float64x2
c
;
for
(
int
i
=
0
;
i
<
c
.
nlanes
;
i
++
)
c
.
s
[
i
]
=
_absdiff
(
a
.
s
[
i
],
b
.
s
[
i
]);
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_absdiffs
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
v_reg
<
_Tp
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
saturate_cast
<
_Tp
>
(
std
::
abs
(
a
.
s
[
i
]
-
b
.
s
[
i
]));
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_invsqrt
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
v_reg
<
_Tp
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
1.
f
/
std
::
sqrt
(
a
.
s
[
i
]);
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_magnitude
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
v_reg
<
_Tp
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
std
::
sqrt
(
a
.
s
[
i
]
*
a
.
s
[
i
]
+
b
.
s
[
i
]
*
b
.
s
[
i
]);
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_sqr_magnitude
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
v_reg
<
_Tp
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
a
.
s
[
i
]
*
a
.
s
[
i
]
+
b
.
s
[
i
]
*
b
.
s
[
i
];
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_fma
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
const
v_reg
<
_Tp
,
n
>&
c
)
{
v_reg
<
_Tp
,
n
>
d
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
d
.
s
[
i
]
=
a
.
s
[
i
]
*
b
.
s
[
i
]
+
c
.
s
[
i
];
return
d
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_muladd
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
const
v_reg
<
_Tp
,
n
>&
c
)
{
return
v_fma
(
a
,
b
,
c
);
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>
v_dotprod
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
typedef
typename
V_TypeTraits
<
_Tp
>::
w_type
w_type
;
v_reg
<
w_type
,
n
/
2
>
c
;
for
(
int
i
=
0
;
i
<
(
n
/
2
);
i
++
)
c
.
s
[
i
]
=
(
w_type
)
a
.
s
[
i
*
2
]
*
b
.
s
[
i
*
2
]
+
(
w_type
)
a
.
s
[
i
*
2
+
1
]
*
b
.
s
[
i
*
2
+
1
];
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>
v_dotprod
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
const
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>&
c
)
{
typedef
typename
V_TypeTraits
<
_Tp
>::
w_type
w_type
;
v_reg
<
w_type
,
n
/
2
>
s
;
for
(
int
i
=
0
;
i
<
(
n
/
2
);
i
++
)
s
.
s
[
i
]
=
(
w_type
)
a
.
s
[
i
*
2
]
*
b
.
s
[
i
*
2
]
+
(
w_type
)
a
.
s
[
i
*
2
+
1
]
*
b
.
s
[
i
*
2
+
1
]
+
c
.
s
[
i
];
return
s
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>
v_dotprod_fast
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
return
v_dotprod
(
a
,
b
);
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>
v_dotprod_fast
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
const
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>&
c
)
{
return
v_dotprod
(
a
,
b
,
c
);
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
q_type
,
n
/
4
>
v_dotprod_expand
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
typedef
typename
V_TypeTraits
<
_Tp
>::
q_type
q_type
;
v_reg
<
q_type
,
n
/
4
>
s
;
for
(
int
i
=
0
;
i
<
(
n
/
4
);
i
++
)
s
.
s
[
i
]
=
(
q_type
)
a
.
s
[
i
*
4
]
*
b
.
s
[
i
*
4
]
+
(
q_type
)
a
.
s
[
i
*
4
+
1
]
*
b
.
s
[
i
*
4
+
1
]
+
(
q_type
)
a
.
s
[
i
*
4
+
2
]
*
b
.
s
[
i
*
4
+
2
]
+
(
q_type
)
a
.
s
[
i
*
4
+
3
]
*
b
.
s
[
i
*
4
+
3
];
return
s
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
q_type
,
n
/
4
>
v_dotprod_expand
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
const
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
q_type
,
n
/
4
>&
c
)
{
typedef
typename
V_TypeTraits
<
_Tp
>::
q_type
q_type
;
v_reg
<
q_type
,
n
/
4
>
s
;
for
(
int
i
=
0
;
i
<
(
n
/
4
);
i
++
)
s
.
s
[
i
]
=
(
q_type
)
a
.
s
[
i
*
4
]
*
b
.
s
[
i
*
4
]
+
(
q_type
)
a
.
s
[
i
*
4
+
1
]
*
b
.
s
[
i
*
4
+
1
]
+
(
q_type
)
a
.
s
[
i
*
4
+
2
]
*
b
.
s
[
i
*
4
+
2
]
+
(
q_type
)
a
.
s
[
i
*
4
+
3
]
*
b
.
s
[
i
*
4
+
3
]
+
c
.
s
[
i
];
return
s
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
q_type
,
n
/
4
>
v_dotprod_expand_fast
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
return
v_dotprod_expand
(
a
,
b
);
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
q_type
,
n
/
4
>
v_dotprod_expand_fast
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
const
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
q_type
,
n
/
4
>&
c
)
{
return
v_dotprod_expand
(
a
,
b
,
c
);
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_mul_expand
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>&
c
,
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>&
d
)
{
typedef
typename
V_TypeTraits
<
_Tp
>::
w_type
w_type
;
for
(
int
i
=
0
;
i
<
(
n
/
2
);
i
++
)
{
c
.
s
[
i
]
=
(
w_type
)
a
.
s
[
i
]
*
b
.
s
[
i
];
d
.
s
[
i
]
=
(
w_type
)
a
.
s
[
i
+
(
n
/
2
)]
*
b
.
s
[
i
+
(
n
/
2
)];
}
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_mul_hi
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
typedef
typename
V_TypeTraits
<
_Tp
>::
w_type
w_type
;
v_reg
<
_Tp
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
(
_Tp
)(((
w_type
)
a
.
s
[
i
]
*
b
.
s
[
i
])
>>
sizeof
(
_Tp
)
*
8
);
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_hsum
(
const
v_reg
<
_Tp
,
n
>&
a
,
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>&
c
)
{
typedef
typename
V_TypeTraits
<
_Tp
>::
w_type
w_type
;
for
(
int
i
=
0
;
i
<
(
n
/
2
);
i
++
)
{
c
.
s
[
i
]
=
(
w_type
)
a
.
s
[
i
*
2
]
+
a
.
s
[
i
*
2
+
1
];
}
}
#define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
{ \
v_reg<_Tp, n> c; \
for( int i = 0; i < n; i++ ) \
c.s[i] = (_Tp)(a.s[i] shift_op imm); \
return c; \
}
OPENCV_HAL_IMPL_SHIFT_OP
(
<<
)
OPENCV_HAL_IMPL_SHIFT_OP
(
>>
)
#define OPENCV_HAL_IMPL_ROTATE_SHIFT_OP(suffix,opA,opB) \
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a) \
{ \
v_reg<_Tp, n> b; \
for (int i = 0; i < n; i++) \
{ \
int sIndex = i opA imm; \
if (0 <= sIndex && sIndex < n) \
{ \
b.s[i] = a.s[sIndex]; \
} \
else \
{ \
b.s[i] = 0; \
} \
} \
return b; \
} \
template<int imm, typename _Tp, int n> inline v_reg<_Tp, n> v_rotate_##suffix(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
v_reg<_Tp, n> c; \
for (int i = 0; i < n; i++) \
{ \
int aIndex = i opA imm; \
int bIndex = i opA imm opB n; \
if (0 <= bIndex && bIndex < n) \
{ \
c.s[i] = b.s[bIndex]; \
} \
else if (0 <= aIndex && aIndex < n) \
{ \
c.s[i] = a.s[aIndex]; \
} \
else \
{ \
c.s[i] = 0; \
} \
} \
return c; \
}
OPENCV_HAL_IMPL_ROTATE_SHIFT_OP
(
left
,
-
,
+
)
OPENCV_HAL_IMPL_ROTATE_SHIFT_OP
(
right
,
+
,
-
)
template
<
typename
_Tp
,
int
n
>
inline
typename
V_TypeTraits
<
_Tp
>::
sum_type
v_reduce_sum
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
typename
V_TypeTraits
<
_Tp
>::
sum_type
c
=
a
.
s
[
0
];
for
(
int
i
=
1
;
i
<
n
;
i
++
)
c
+=
a
.
s
[
i
];
return
c
;
}
inline
v_float32x4
v_reduce_sum4
(
const
v_float32x4
&
a
,
const
v_float32x4
&
b
,
const
v_float32x4
&
c
,
const
v_float32x4
&
d
)
{
v_float32x4
r
;
r
.
s
[
0
]
=
a
.
s
[
0
]
+
a
.
s
[
1
]
+
a
.
s
[
2
]
+
a
.
s
[
3
];
r
.
s
[
1
]
=
b
.
s
[
0
]
+
b
.
s
[
1
]
+
b
.
s
[
2
]
+
b
.
s
[
3
];
r
.
s
[
2
]
=
c
.
s
[
0
]
+
c
.
s
[
1
]
+
c
.
s
[
2
]
+
c
.
s
[
3
];
r
.
s
[
3
]
=
d
.
s
[
0
]
+
d
.
s
[
1
]
+
d
.
s
[
2
]
+
d
.
s
[
3
];
return
r
;
}
template
<
typename
_Tp
,
int
n
>
inline
typename
V_TypeTraits
<
typename
V_TypeTraits
<
_Tp
>::
abs_type
>::
sum_type
v_reduce_sad
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
typename
V_TypeTraits
<
typename
V_TypeTraits
<
_Tp
>::
abs_type
>::
sum_type
c
=
_absdiff
(
a
.
s
[
0
],
b
.
s
[
0
]);
for
(
int
i
=
1
;
i
<
n
;
i
++
)
c
+=
_absdiff
(
a
.
s
[
i
],
b
.
s
[
i
]);
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
int
v_signmask
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
int
mask
=
0
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
mask
|=
(
V_TypeTraits
<
_Tp
>::
reinterpret_int
(
a
.
s
[
i
])
<
0
)
<<
i
;
return
mask
;
}
template
<
typename
_Tp
,
int
n
>
inline
int
v_scan_forward
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
if
(
V_TypeTraits
<
_Tp
>::
reinterpret_int
(
a
.
s
[
i
])
<
0
)
return
i
;
return
0
;
}
template
<
typename
_Tp
,
int
n
>
inline
bool
v_check_all
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
if
(
V_TypeTraits
<
_Tp
>::
reinterpret_int
(
a
.
s
[
i
])
>=
0
)
return
false
;
return
true
;
}
template
<
typename
_Tp
,
int
n
>
inline
bool
v_check_any
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
if
(
V_TypeTraits
<
_Tp
>::
reinterpret_int
(
a
.
s
[
i
])
<
0
)
return
true
;
return
false
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_select
(
const
v_reg
<
_Tp
,
n
>&
mask
,
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
typedef
V_TypeTraits
<
_Tp
>
Traits
;
typedef
typename
Traits
::
int_type
int_type
;
v_reg
<
_Tp
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
int_type
m
=
Traits
::
reinterpret_int
(
mask
.
s
[
i
]);
CV_DbgAssert
(
m
==
0
||
m
==
(
~
(
int_type
)
0
));
// restrict mask values: 0 or 0xff/0xffff/etc
c
.
s
[
i
]
=
m
?
a
.
s
[
i
]
:
b
.
s
[
i
];
}
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_expand
(
const
v_reg
<
_Tp
,
n
>&
a
,
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>&
b0
,
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>&
b1
)
{
for
(
int
i
=
0
;
i
<
(
n
/
2
);
i
++
)
{
b0
.
s
[
i
]
=
a
.
s
[
i
];
b1
.
s
[
i
]
=
a
.
s
[
i
+
(
n
/
2
)];
}
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>
v_expand_low
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>
b
;
for
(
int
i
=
0
;
i
<
(
n
/
2
);
i
++
)
b
.
s
[
i
]
=
a
.
s
[
i
];
return
b
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>
v_expand_high
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>
b
;
for
(
int
i
=
0
;
i
<
(
n
/
2
);
i
++
)
b
.
s
[
i
]
=
a
.
s
[
i
+
(
n
/
2
)];
return
b
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
int_type
,
n
>
v_reinterpret_as_int
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
int_type
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
V_TypeTraits
<
_Tp
>::
reinterpret_int
(
a
.
s
[
i
]);
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
uint_type
,
n
>
v_reinterpret_as_uint
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
uint_type
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
V_TypeTraits
<
_Tp
>::
reinterpret_uint
(
a
.
s
[
i
]);
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_zip
(
const
v_reg
<
_Tp
,
n
>&
a0
,
const
v_reg
<
_Tp
,
n
>&
a1
,
v_reg
<
_Tp
,
n
>&
b0
,
v_reg
<
_Tp
,
n
>&
b1
)
{
int
i
;
for
(
i
=
0
;
i
<
n
/
2
;
i
++
)
{
b0
.
s
[
i
*
2
]
=
a0
.
s
[
i
];
b0
.
s
[
i
*
2
+
1
]
=
a1
.
s
[
i
];
}
for
(
;
i
<
n
;
i
++
)
{
b1
.
s
[
i
*
2
-
n
]
=
a0
.
s
[
i
];
b1
.
s
[
i
*
2
-
n
+
1
]
=
a1
.
s
[
i
];
}
}
template
<
typename
_Tp
>
inline
v_reg
<
_Tp
,
V_TypeTraits
<
_Tp
>::
nlanes128
>
v_load
(
const
_Tp
*
ptr
)
{
#if CV_STRONG_ALIGNMENT
CV_Assert
(
isAligned
<
sizeof
(
_Tp
)
>
(
ptr
));
#endif
return
v_reg
<
_Tp
,
V_TypeTraits
<
_Tp
>::
nlanes128
>
(
ptr
);
}
template
<
typename
_Tp
>
inline
v_reg
<
_Tp
,
V_TypeTraits
<
_Tp
>::
nlanes128
>
v_load_aligned
(
const
_Tp
*
ptr
)
{
CV_Assert
(
isAligned
<
sizeof
(
v_reg
<
_Tp
,
V_TypeTraits
<
_Tp
>::
nlanes128
>
)
>
(
ptr
));
return
v_reg
<
_Tp
,
V_TypeTraits
<
_Tp
>::
nlanes128
>
(
ptr
);
}
template
<
typename
_Tp
>
inline
v_reg
<
_Tp
,
V_TypeTraits
<
_Tp
>::
nlanes128
>
v_load_low
(
const
_Tp
*
ptr
)
{
#if CV_STRONG_ALIGNMENT
CV_Assert
(
isAligned
<
sizeof
(
_Tp
)
>
(
ptr
));
#endif
v_reg
<
_Tp
,
V_TypeTraits
<
_Tp
>::
nlanes128
>
c
;
for
(
int
i
=
0
;
i
<
c
.
nlanes
/
2
;
i
++
)
{
c
.
s
[
i
]
=
ptr
[
i
];
}
return
c
;
}
template
<
typename
_Tp
>
inline
v_reg
<
_Tp
,
V_TypeTraits
<
_Tp
>::
nlanes128
>
v_load_halves
(
const
_Tp
*
loptr
,
const
_Tp
*
hiptr
)
{
#if CV_STRONG_ALIGNMENT
CV_Assert
(
isAligned
<
sizeof
(
_Tp
)
>
(
loptr
));
CV_Assert
(
isAligned
<
sizeof
(
_Tp
)
>
(
hiptr
));
#endif
v_reg
<
_Tp
,
V_TypeTraits
<
_Tp
>::
nlanes128
>
c
;
for
(
int
i
=
0
;
i
<
c
.
nlanes
/
2
;
i
++
)
{
c
.
s
[
i
]
=
loptr
[
i
];
c
.
s
[
i
+
c
.
nlanes
/
2
]
=
hiptr
[
i
];
}
return
c
;
}
template
<
typename
_Tp
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
V_TypeTraits
<
_Tp
>::
nlanes128
/
2
>
v_load_expand
(
const
_Tp
*
ptr
)
{
#if CV_STRONG_ALIGNMENT
CV_Assert
(
isAligned
<
sizeof
(
_Tp
)
>
(
ptr
));
#endif
typedef
typename
V_TypeTraits
<
_Tp
>::
w_type
w_type
;
v_reg
<
w_type
,
V_TypeTraits
<
w_type
>::
nlanes128
>
c
;
for
(
int
i
=
0
;
i
<
c
.
nlanes
;
i
++
)
{
c
.
s
[
i
]
=
ptr
[
i
];
}
return
c
;
}
template
<
typename
_Tp
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
q_type
,
V_TypeTraits
<
_Tp
>::
nlanes128
/
4
>
v_load_expand_q
(
const
_Tp
*
ptr
)
{
#if CV_STRONG_ALIGNMENT
CV_Assert
(
isAligned
<
sizeof
(
_Tp
)
>
(
ptr
));
#endif
typedef
typename
V_TypeTraits
<
_Tp
>::
q_type
q_type
;
v_reg
<
q_type
,
V_TypeTraits
<
q_type
>::
nlanes128
>
c
;
for
(
int
i
=
0
;
i
<
c
.
nlanes
;
i
++
)
{
c
.
s
[
i
]
=
ptr
[
i
];
}
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_load_deinterleave
(
const
_Tp
*
ptr
,
v_reg
<
_Tp
,
n
>&
a
,
v_reg
<
_Tp
,
n
>&
b
)
{
#if CV_STRONG_ALIGNMENT
CV_Assert
(
isAligned
<
sizeof
(
_Tp
)
>
(
ptr
));
#endif
int
i
,
i2
;
for
(
i
=
i2
=
0
;
i
<
n
;
i
++
,
i2
+=
2
)
{
a
.
s
[
i
]
=
ptr
[
i2
];
b
.
s
[
i
]
=
ptr
[
i2
+
1
];
}
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_load_deinterleave
(
const
_Tp
*
ptr
,
v_reg
<
_Tp
,
n
>&
a
,
v_reg
<
_Tp
,
n
>&
b
,
v_reg
<
_Tp
,
n
>&
c
)
{
#if CV_STRONG_ALIGNMENT
CV_Assert
(
isAligned
<
sizeof
(
_Tp
)
>
(
ptr
));
#endif
int
i
,
i3
;
for
(
i
=
i3
=
0
;
i
<
n
;
i
++
,
i3
+=
3
)
{
a
.
s
[
i
]
=
ptr
[
i3
];
b
.
s
[
i
]
=
ptr
[
i3
+
1
];
c
.
s
[
i
]
=
ptr
[
i3
+
2
];
}
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_load_deinterleave
(
const
_Tp
*
ptr
,
v_reg
<
_Tp
,
n
>&
a
,
v_reg
<
_Tp
,
n
>&
b
,
v_reg
<
_Tp
,
n
>&
c
,
v_reg
<
_Tp
,
n
>&
d
)
{
#if CV_STRONG_ALIGNMENT
CV_Assert
(
isAligned
<
sizeof
(
_Tp
)
>
(
ptr
));
#endif
int
i
,
i4
;
for
(
i
=
i4
=
0
;
i
<
n
;
i
++
,
i4
+=
4
)
{
a
.
s
[
i
]
=
ptr
[
i4
];
b
.
s
[
i
]
=
ptr
[
i4
+
1
];
c
.
s
[
i
]
=
ptr
[
i4
+
2
];
d
.
s
[
i
]
=
ptr
[
i4
+
3
];
}
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_store_interleave
(
_Tp
*
ptr
,
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
hal
::
StoreMode
/*mode*/
=
hal
::
STORE_UNALIGNED
)
{
#if CV_STRONG_ALIGNMENT
CV_Assert
(
isAligned
<
sizeof
(
_Tp
)
>
(
ptr
));
#endif
int
i
,
i2
;
for
(
i
=
i2
=
0
;
i
<
n
;
i
++
,
i2
+=
2
)
{
ptr
[
i2
]
=
a
.
s
[
i
];
ptr
[
i2
+
1
]
=
b
.
s
[
i
];
}
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_store_interleave
(
_Tp
*
ptr
,
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
const
v_reg
<
_Tp
,
n
>&
c
,
hal
::
StoreMode
/*mode*/
=
hal
::
STORE_UNALIGNED
)
{
#if CV_STRONG_ALIGNMENT
CV_Assert
(
isAligned
<
sizeof
(
_Tp
)
>
(
ptr
));
#endif
int
i
,
i3
;
for
(
i
=
i3
=
0
;
i
<
n
;
i
++
,
i3
+=
3
)
{
ptr
[
i3
]
=
a
.
s
[
i
];
ptr
[
i3
+
1
]
=
b
.
s
[
i
];
ptr
[
i3
+
2
]
=
c
.
s
[
i
];
}
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_store_interleave
(
_Tp
*
ptr
,
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
const
v_reg
<
_Tp
,
n
>&
c
,
const
v_reg
<
_Tp
,
n
>&
d
,
hal
::
StoreMode
/*mode*/
=
hal
::
STORE_UNALIGNED
)
{
#if CV_STRONG_ALIGNMENT
CV_Assert
(
isAligned
<
sizeof
(
_Tp
)
>
(
ptr
));
#endif
int
i
,
i4
;
for
(
i
=
i4
=
0
;
i
<
n
;
i
++
,
i4
+=
4
)
{
ptr
[
i4
]
=
a
.
s
[
i
];
ptr
[
i4
+
1
]
=
b
.
s
[
i
];
ptr
[
i4
+
2
]
=
c
.
s
[
i
];
ptr
[
i4
+
3
]
=
d
.
s
[
i
];
}
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_store
(
_Tp
*
ptr
,
const
v_reg
<
_Tp
,
n
>&
a
)
{
#if CV_STRONG_ALIGNMENT
CV_Assert
(
isAligned
<
sizeof
(
_Tp
)
>
(
ptr
));
#endif
for
(
int
i
=
0
;
i
<
n
;
i
++
)
ptr
[
i
]
=
a
.
s
[
i
];
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_store
(
_Tp
*
ptr
,
const
v_reg
<
_Tp
,
n
>&
a
,
hal
::
StoreMode
/*mode*/
)
{
#if CV_STRONG_ALIGNMENT
CV_Assert
(
isAligned
<
sizeof
(
_Tp
)
>
(
ptr
));
#endif
v_store
(
ptr
,
a
);
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_store_low
(
_Tp
*
ptr
,
const
v_reg
<
_Tp
,
n
>&
a
)
{
#if CV_STRONG_ALIGNMENT
CV_Assert
(
isAligned
<
sizeof
(
_Tp
)
>
(
ptr
));
#endif
for
(
int
i
=
0
;
i
<
(
n
/
2
);
i
++
)
ptr
[
i
]
=
a
.
s
[
i
];
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_store_high
(
_Tp
*
ptr
,
const
v_reg
<
_Tp
,
n
>&
a
)
{
#if CV_STRONG_ALIGNMENT
CV_Assert
(
isAligned
<
sizeof
(
_Tp
)
>
(
ptr
));
#endif
for
(
int
i
=
0
;
i
<
(
n
/
2
);
i
++
)
ptr
[
i
]
=
a
.
s
[
i
+
(
n
/
2
)];
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_store_aligned
(
_Tp
*
ptr
,
const
v_reg
<
_Tp
,
n
>&
a
)
{
CV_Assert
(
isAligned
<
sizeof
(
v_reg
<
_Tp
,
n
>
)
>
(
ptr
));
v_store
(
ptr
,
a
);
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_store_aligned_nocache
(
_Tp
*
ptr
,
const
v_reg
<
_Tp
,
n
>&
a
)
{
CV_Assert
(
isAligned
<
sizeof
(
v_reg
<
_Tp
,
n
>
)
>
(
ptr
));
v_store
(
ptr
,
a
);
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_store_aligned
(
_Tp
*
ptr
,
const
v_reg
<
_Tp
,
n
>&
a
,
hal
::
StoreMode
/*mode*/
)
{
CV_Assert
(
isAligned
<
sizeof
(
v_reg
<
_Tp
,
n
>
)
>
(
ptr
));
v_store
(
ptr
,
a
);
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_combine_low
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
v_reg
<
_Tp
,
n
>
c
;
for
(
int
i
=
0
;
i
<
(
n
/
2
);
i
++
)
{
c
.
s
[
i
]
=
a
.
s
[
i
];
c
.
s
[
i
+
(
n
/
2
)]
=
b
.
s
[
i
];
}
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_combine_high
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
v_reg
<
_Tp
,
n
>
c
;
for
(
int
i
=
0
;
i
<
(
n
/
2
);
i
++
)
{
c
.
s
[
i
]
=
a
.
s
[
i
+
(
n
/
2
)];
c
.
s
[
i
+
(
n
/
2
)]
=
b
.
s
[
i
+
(
n
/
2
)];
}
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
void
v_recombine
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
,
v_reg
<
_Tp
,
n
>&
low
,
v_reg
<
_Tp
,
n
>&
high
)
{
for
(
int
i
=
0
;
i
<
(
n
/
2
);
i
++
)
{
low
.
s
[
i
]
=
a
.
s
[
i
];
low
.
s
[
i
+
(
n
/
2
)]
=
b
.
s
[
i
];
high
.
s
[
i
]
=
a
.
s
[
i
+
(
n
/
2
)];
high
.
s
[
i
+
(
n
/
2
)]
=
b
.
s
[
i
+
(
n
/
2
)];
}
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_reverse
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
v_reg
<
_Tp
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
a
.
s
[
n
-
i
-
1
];
return
c
;
}
template
<
int
s
,
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_extract
(
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
v_reg
<
_Tp
,
n
>
r
;
const
int
shift
=
n
-
s
;
int
i
=
0
;
for
(;
i
<
shift
;
++
i
)
r
.
s
[
i
]
=
a
.
s
[
i
+
s
];
for
(;
i
<
n
;
++
i
)
r
.
s
[
i
]
=
b
.
s
[
i
-
shift
];
return
r
;
}
template
<
int
s
,
typename
_Tp
,
int
n
>
inline
_Tp
v_extract_n
(
const
v_reg
<
_Tp
,
n
>&
v
)
{
CV_DbgAssert
(
s
>=
0
&&
s
<
n
);
return
v
.
s
[
s
];
}
template
<
int
i
,
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_broadcast_element
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
CV_DbgAssert
(
i
>=
0
&&
i
<
n
);
return
v_reg
<
_Tp
,
n
>::
all
(
a
.
s
[
i
]);
}
template
<
int
n
>
inline
v_reg
<
int
,
n
>
v_round
(
const
v_reg
<
float
,
n
>&
a
)
{
v_reg
<
int
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
cvRound
(
a
.
s
[
i
]);
return
c
;
}
template
<
int
n
>
inline
v_reg
<
int
,
n
*
2
>
v_round
(
const
v_reg
<
double
,
n
>&
a
,
const
v_reg
<
double
,
n
>&
b
)
{
v_reg
<
int
,
n
*
2
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
c
.
s
[
i
]
=
cvRound
(
a
.
s
[
i
]);
c
.
s
[
i
+
n
]
=
cvRound
(
b
.
s
[
i
]);
}
return
c
;
}
template
<
int
n
>
inline
v_reg
<
int
,
n
>
v_floor
(
const
v_reg
<
float
,
n
>&
a
)
{
v_reg
<
int
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
cvFloor
(
a
.
s
[
i
]);
return
c
;
}
template
<
int
n
>
inline
v_reg
<
int
,
n
>
v_ceil
(
const
v_reg
<
float
,
n
>&
a
)
{
v_reg
<
int
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
cvCeil
(
a
.
s
[
i
]);
return
c
;
}
template
<
int
n
>
inline
v_reg
<
int
,
n
>
v_trunc
(
const
v_reg
<
float
,
n
>&
a
)
{
v_reg
<
int
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
(
int
)(
a
.
s
[
i
]);
return
c
;
}
template
<
int
n
>
inline
v_reg
<
int
,
n
*
2
>
v_round
(
const
v_reg
<
double
,
n
>&
a
)
{
v_reg
<
int
,
n
*
2
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
c
.
s
[
i
]
=
cvRound
(
a
.
s
[
i
]);
c
.
s
[
i
+
n
]
=
0
;
}
return
c
;
}
template
<
int
n
>
inline
v_reg
<
int
,
n
*
2
>
v_floor
(
const
v_reg
<
double
,
n
>&
a
)
{
v_reg
<
int
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
c
.
s
[
i
]
=
cvFloor
(
a
.
s
[
i
]);
c
.
s
[
i
+
n
]
=
0
;
}
return
c
;
}
template
<
int
n
>
inline
v_reg
<
int
,
n
*
2
>
v_ceil
(
const
v_reg
<
double
,
n
>&
a
)
{
v_reg
<
int
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
c
.
s
[
i
]
=
cvCeil
(
a
.
s
[
i
]);
c
.
s
[
i
+
n
]
=
0
;
}
return
c
;
}
template
<
int
n
>
inline
v_reg
<
int
,
n
*
2
>
v_trunc
(
const
v_reg
<
double
,
n
>&
a
)
{
v_reg
<
int
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
c
.
s
[
i
]
=
cvCeil
(
a
.
s
[
i
]);
c
.
s
[
i
+
n
]
=
0
;
}
return
c
;
}
template
<
int
n
>
inline
v_reg
<
float
,
n
>
v_cvt_f32
(
const
v_reg
<
int
,
n
>&
a
)
{
v_reg
<
float
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
(
float
)
a
.
s
[
i
];
return
c
;
}
template
<
int
n
>
inline
v_reg
<
float
,
n
*
2
>
v_cvt_f32
(
const
v_reg
<
double
,
n
>&
a
)
{
v_reg
<
float
,
n
*
2
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
c
.
s
[
i
]
=
(
float
)
a
.
s
[
i
];
c
.
s
[
i
+
n
]
=
0
;
}
return
c
;
}
template
<
int
n
>
inline
v_reg
<
float
,
n
*
2
>
v_cvt_f32
(
const
v_reg
<
double
,
n
>&
a
,
const
v_reg
<
double
,
n
>&
b
)
{
v_reg
<
float
,
n
*
2
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
c
.
s
[
i
]
=
(
float
)
a
.
s
[
i
];
c
.
s
[
i
+
n
]
=
(
float
)
b
.
s
[
i
];
}
return
c
;
}
CV_INLINE
v_reg
<
double
,
2
>
v_cvt_f64
(
const
v_reg
<
int
,
4
>&
a
)
{
enum
{
n
=
2
};
v_reg
<
double
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
(
double
)
a
.
s
[
i
];
return
c
;
}
CV_INLINE
v_reg
<
double
,
2
>
v_cvt_f64_high
(
const
v_reg
<
int
,
4
>&
a
)
{
enum
{
n
=
2
};
v_reg
<
double
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
(
double
)
a
.
s
[
i
+
2
];
return
c
;
}
CV_INLINE
v_reg
<
double
,
2
>
v_cvt_f64
(
const
v_reg
<
float
,
4
>&
a
)
{
enum
{
n
=
2
};
v_reg
<
double
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
(
double
)
a
.
s
[
i
];
return
c
;
}
CV_INLINE
v_reg
<
double
,
2
>
v_cvt_f64_high
(
const
v_reg
<
float
,
4
>&
a
)
{
enum
{
n
=
2
};
v_reg
<
double
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
(
double
)
a
.
s
[
i
+
2
];
return
c
;
}
CV_INLINE
v_reg
<
double
,
2
>
v_cvt_f64
(
const
v_reg
<
int64
,
2
>&
a
)
{
enum
{
n
=
2
};
v_reg
<
double
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
(
double
)
a
.
s
[
i
];
return
c
;
}
CV_INLINE
v_reg
<
double
,
2
>
v_cvt_f64_high
(
const
v_reg
<
int64
,
2
>&
a
)
{
enum
{
n
=
2
};
v_reg
<
double
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
(
double
)
a
.
s
[
i
];
return
c
;
}
template
<
typename
_Tp
>
inline
v_reg
<
_Tp
,
V_TypeTraits
<
_Tp
>::
nlanes128
>
v_lut
(
const
_Tp
*
tab
,
const
int
*
idx
)
{
v_reg
<
_Tp
,
V_TypeTraits
<
_Tp
>::
nlanes128
>
c
;
for
(
int
i
=
0
;
i
<
V_TypeTraits
<
_Tp
>::
nlanes128
;
i
++
)
c
.
s
[
i
]
=
tab
[
idx
[
i
]];
return
c
;
}
template
<
typename
_Tp
>
inline
v_reg
<
_Tp
,
V_TypeTraits
<
_Tp
>::
nlanes128
>
v_lut_pairs
(
const
_Tp
*
tab
,
const
int
*
idx
)
{
v_reg
<
_Tp
,
V_TypeTraits
<
_Tp
>::
nlanes128
>
c
;
for
(
int
i
=
0
;
i
<
V_TypeTraits
<
_Tp
>::
nlanes128
;
i
++
)
c
.
s
[
i
]
=
tab
[
idx
[
i
/
2
]
+
i
%
2
];
return
c
;
}
template
<
typename
_Tp
>
inline
v_reg
<
_Tp
,
V_TypeTraits
<
_Tp
>::
nlanes128
>
v_lut_quads
(
const
_Tp
*
tab
,
const
int
*
idx
)
{
v_reg
<
_Tp
,
V_TypeTraits
<
_Tp
>::
nlanes128
>
c
;
for
(
int
i
=
0
;
i
<
V_TypeTraits
<
_Tp
>::
nlanes128
;
i
++
)
c
.
s
[
i
]
=
tab
[
idx
[
i
/
4
]
+
i
%
4
];
return
c
;
}
template
<
int
n
>
inline
v_reg
<
int
,
n
>
v_lut
(
const
int
*
tab
,
const
v_reg
<
int
,
n
>&
idx
)
{
v_reg
<
int
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
tab
[
idx
.
s
[
i
]];
return
c
;
}
template
<
int
n
>
inline
v_reg
<
unsigned
,
n
>
v_lut
(
const
unsigned
*
tab
,
const
v_reg
<
int
,
n
>&
idx
)
{
v_reg
<
int
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
tab
[
idx
.
s
[
i
]];
return
c
;
}
template
<
int
n
>
inline
v_reg
<
float
,
n
>
v_lut
(
const
float
*
tab
,
const
v_reg
<
int
,
n
>&
idx
)
{
v_reg
<
float
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
tab
[
idx
.
s
[
i
]];
return
c
;
}
template
<
int
n
>
inline
v_reg
<
double
,
n
>
v_lut
(
const
double
*
tab
,
const
v_reg
<
int
,
n
*
2
>&
idx
)
{
v_reg
<
double
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
c
.
s
[
i
]
=
tab
[
idx
.
s
[
i
]];
return
c
;
}
inline
v_int32x4
v_lut
(
const
int
*
tab
,
const
v_int32x4
&
idxvec
)
{
return
v_lut
(
tab
,
idxvec
.
s
);
}
inline
v_uint32x4
v_lut
(
const
unsigned
*
tab
,
const
v_int32x4
&
idxvec
)
{
return
v_lut
(
tab
,
idxvec
.
s
);
}
inline
v_float32x4
v_lut
(
const
float
*
tab
,
const
v_int32x4
&
idxvec
)
{
return
v_lut
(
tab
,
idxvec
.
s
);
}
inline
v_float64x2
v_lut
(
const
double
*
tab
,
const
v_int32x4
&
idxvec
)
{
return
v_lut
(
tab
,
idxvec
.
s
);
}
template
<
int
n
>
inline
void
v_lut_deinterleave
(
const
float
*
tab
,
const
v_reg
<
int
,
n
>&
idx
,
v_reg
<
float
,
n
>&
x
,
v_reg
<
float
,
n
>&
y
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
int
j
=
idx
.
s
[
i
];
x
.
s
[
i
]
=
tab
[
j
];
y
.
s
[
i
]
=
tab
[
j
+
1
];
}
}
template
<
int
n
>
inline
void
v_lut_deinterleave
(
const
double
*
tab
,
const
v_reg
<
int
,
n
*
2
>&
idx
,
v_reg
<
double
,
n
>&
x
,
v_reg
<
double
,
n
>&
y
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
int
j
=
idx
.
s
[
i
];
x
.
s
[
i
]
=
tab
[
j
];
y
.
s
[
i
]
=
tab
[
j
+
1
];
}
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_interleave_pairs
(
const
v_reg
<
_Tp
,
n
>&
vec
)
{
v_reg
<
_Tp
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
/
4
;
i
++
)
{
c
.
s
[
4
*
i
]
=
vec
.
s
[
4
*
i
];
c
.
s
[
4
*
i
+
1
]
=
vec
.
s
[
4
*
i
+
2
];
c
.
s
[
4
*
i
+
2
]
=
vec
.
s
[
4
*
i
+
1
];
c
.
s
[
4
*
i
+
3
]
=
vec
.
s
[
4
*
i
+
3
];
}
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_interleave_quads
(
const
v_reg
<
_Tp
,
n
>&
vec
)
{
v_reg
<
_Tp
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
/
8
;
i
++
)
{
c
.
s
[
8
*
i
]
=
vec
.
s
[
8
*
i
];
c
.
s
[
8
*
i
+
1
]
=
vec
.
s
[
8
*
i
+
4
];
c
.
s
[
8
*
i
+
2
]
=
vec
.
s
[
8
*
i
+
1
];
c
.
s
[
8
*
i
+
3
]
=
vec
.
s
[
8
*
i
+
5
];
c
.
s
[
8
*
i
+
4
]
=
vec
.
s
[
8
*
i
+
2
];
c
.
s
[
8
*
i
+
5
]
=
vec
.
s
[
8
*
i
+
6
];
c
.
s
[
8
*
i
+
6
]
=
vec
.
s
[
8
*
i
+
3
];
c
.
s
[
8
*
i
+
7
]
=
vec
.
s
[
8
*
i
+
7
];
}
return
c
;
}
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
_Tp
,
n
>
v_pack_triplets
(
const
v_reg
<
_Tp
,
n
>&
vec
)
{
v_reg
<
_Tp
,
n
>
c
;
for
(
int
i
=
0
;
i
<
n
/
4
;
i
++
)
{
c
.
s
[
3
*
i
]
=
vec
.
s
[
4
*
i
];
c
.
s
[
3
*
i
+
1
]
=
vec
.
s
[
4
*
i
+
1
];
c
.
s
[
3
*
i
+
2
]
=
vec
.
s
[
4
*
i
+
2
];
}
return
c
;
}
template
<
typename
_Tp
>
inline
void
v_transpose4x4
(
v_reg
<
_Tp
,
4
>&
a0
,
const
v_reg
<
_Tp
,
4
>&
a1
,
const
v_reg
<
_Tp
,
4
>&
a2
,
const
v_reg
<
_Tp
,
4
>&
a3
,
v_reg
<
_Tp
,
4
>&
b0
,
v_reg
<
_Tp
,
4
>&
b1
,
v_reg
<
_Tp
,
4
>&
b2
,
v_reg
<
_Tp
,
4
>&
b3
)
{
b0
=
v_reg
<
_Tp
,
4
>
(
a0
.
s
[
0
],
a1
.
s
[
0
],
a2
.
s
[
0
],
a3
.
s
[
0
]);
b1
=
v_reg
<
_Tp
,
4
>
(
a0
.
s
[
1
],
a1
.
s
[
1
],
a2
.
s
[
1
],
a3
.
s
[
1
]);
b2
=
v_reg
<
_Tp
,
4
>
(
a0
.
s
[
2
],
a1
.
s
[
2
],
a2
.
s
[
2
],
a3
.
s
[
2
]);
b3
=
v_reg
<
_Tp
,
4
>
(
a0
.
s
[
3
],
a1
.
s
[
3
],
a2
.
s
[
3
],
a3
.
s
[
3
]);
}
#define OPENCV_HAL_IMPL_C_INIT_ZERO(_Tpvec, _Tp, suffix) \
inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); }
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_uint8x16
,
uchar
,
u8
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_int8x16
,
schar
,
s8
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_uint16x8
,
ushort
,
u16
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_int16x8
,
short
,
s16
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_uint32x4
,
unsigned
,
u32
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_int32x4
,
int
,
s32
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_float32x4
,
float
,
f32
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_float64x2
,
double
,
f64
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_uint64x2
,
uint64
,
u64
)
OPENCV_HAL_IMPL_C_INIT_ZERO
(
v_int64x2
,
int64
,
s64
)
#define OPENCV_HAL_IMPL_C_INIT_VAL(_Tpvec, _Tp, suffix) \
inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); }
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_uint8x16
,
uchar
,
u8
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_int8x16
,
schar
,
s8
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_uint16x8
,
ushort
,
u16
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_int16x8
,
short
,
s16
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_uint32x4
,
unsigned
,
u32
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_int32x4
,
int
,
s32
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_float32x4
,
float
,
f32
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_float64x2
,
double
,
f64
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_uint64x2
,
uint64
,
u64
)
OPENCV_HAL_IMPL_C_INIT_VAL
(
v_int64x2
,
int64
,
s64
)
#define OPENCV_HAL_IMPL_C_REINTERPRET(_Tpvec, _Tp, suffix) \
template<typename _Tp0, int n0> inline _Tpvec \
v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
{ return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(); }
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_uint8x16
,
uchar
,
u8
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_int8x16
,
schar
,
s8
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_uint16x8
,
ushort
,
u16
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_int16x8
,
short
,
s16
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_uint32x4
,
unsigned
,
u32
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_int32x4
,
int
,
s32
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_float32x4
,
float
,
f32
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_float64x2
,
double
,
f64
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_uint64x2
,
uint64
,
u64
)
OPENCV_HAL_IMPL_C_REINTERPRET
(
v_int64x2
,
int64
,
s64
)
#define OPENCV_HAL_IMPL_C_SHIFTL(_Tpvec, _Tp) \
template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
{ return a << n; }
OPENCV_HAL_IMPL_C_SHIFTL
(
v_uint16x8
,
ushort
)
OPENCV_HAL_IMPL_C_SHIFTL
(
v_int16x8
,
short
)
OPENCV_HAL_IMPL_C_SHIFTL
(
v_uint32x4
,
unsigned
)
OPENCV_HAL_IMPL_C_SHIFTL
(
v_int32x4
,
int
)
OPENCV_HAL_IMPL_C_SHIFTL
(
v_uint64x2
,
uint64
)
OPENCV_HAL_IMPL_C_SHIFTL
(
v_int64x2
,
int64
)
#define OPENCV_HAL_IMPL_C_SHIFTR(_Tpvec, _Tp) \
template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
{ return a >> n; }
OPENCV_HAL_IMPL_C_SHIFTR
(
v_uint16x8
,
ushort
)
OPENCV_HAL_IMPL_C_SHIFTR
(
v_int16x8
,
short
)
OPENCV_HAL_IMPL_C_SHIFTR
(
v_uint32x4
,
unsigned
)
OPENCV_HAL_IMPL_C_SHIFTR
(
v_int32x4
,
int
)
OPENCV_HAL_IMPL_C_SHIFTR
(
v_uint64x2
,
uint64
)
OPENCV_HAL_IMPL_C_SHIFTR
(
v_int64x2
,
int64
)
#define OPENCV_HAL_IMPL_C_RSHIFTR(_Tpvec, _Tp) \
template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
{ \
_Tpvec c; \
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
return c; \
}
OPENCV_HAL_IMPL_C_RSHIFTR
(
v_uint16x8
,
ushort
)
OPENCV_HAL_IMPL_C_RSHIFTR
(
v_int16x8
,
short
)
OPENCV_HAL_IMPL_C_RSHIFTR
(
v_uint32x4
,
unsigned
)
OPENCV_HAL_IMPL_C_RSHIFTR
(
v_int32x4
,
int
)
OPENCV_HAL_IMPL_C_RSHIFTR
(
v_uint64x2
,
uint64
)
OPENCV_HAL_IMPL_C_RSHIFTR
(
v_int64x2
,
int64
)
#define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tpnvec, _Tpn, pack_suffix, cast) \
inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpnvec c; \
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
{ \
c.s[i] = cast<_Tpn>(a.s[i]); \
c.s[i+_Tpvec::nlanes] = cast<_Tpn>(b.s[i]); \
} \
return c; \
}
OPENCV_HAL_IMPL_C_PACK
(
v_uint16x8
,
v_uint8x16
,
uchar
,
pack
,
saturate_cast
)
OPENCV_HAL_IMPL_C_PACK
(
v_int16x8
,
v_int8x16
,
schar
,
pack
,
saturate_cast
)
OPENCV_HAL_IMPL_C_PACK
(
v_uint32x4
,
v_uint16x8
,
ushort
,
pack
,
saturate_cast
)
OPENCV_HAL_IMPL_C_PACK
(
v_int32x4
,
v_int16x8
,
short
,
pack
,
saturate_cast
)
OPENCV_HAL_IMPL_C_PACK
(
v_uint64x2
,
v_uint32x4
,
unsigned
,
pack
,
static_cast
)
OPENCV_HAL_IMPL_C_PACK
(
v_int64x2
,
v_int32x4
,
int
,
pack
,
static_cast
)
OPENCV_HAL_IMPL_C_PACK
(
v_int16x8
,
v_uint8x16
,
uchar
,
pack_u
,
saturate_cast
)
OPENCV_HAL_IMPL_C_PACK
(
v_int32x4
,
v_uint16x8
,
ushort
,
pack_u
,
saturate_cast
)
#define OPENCV_HAL_IMPL_C_RSHR_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpnvec c; \
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
{ \
c.s[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
c.s[i+_Tpvec::nlanes] = cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
} \
return c; \
}
OPENCV_HAL_IMPL_C_RSHR_PACK
(
v_uint16x8
,
ushort
,
v_uint8x16
,
uchar
,
pack
,
saturate_cast
)
OPENCV_HAL_IMPL_C_RSHR_PACK
(
v_int16x8
,
short
,
v_int8x16
,
schar
,
pack
,
saturate_cast
)
OPENCV_HAL_IMPL_C_RSHR_PACK
(
v_uint32x4
,
unsigned
,
v_uint16x8
,
ushort
,
pack
,
saturate_cast
)
OPENCV_HAL_IMPL_C_RSHR_PACK
(
v_int32x4
,
int
,
v_int16x8
,
short
,
pack
,
saturate_cast
)
OPENCV_HAL_IMPL_C_RSHR_PACK
(
v_uint64x2
,
uint64
,
v_uint32x4
,
unsigned
,
pack
,
static_cast
)
OPENCV_HAL_IMPL_C_RSHR_PACK
(
v_int64x2
,
int64
,
v_int32x4
,
int
,
pack
,
static_cast
)
OPENCV_HAL_IMPL_C_RSHR_PACK
(
v_int16x8
,
short
,
v_uint8x16
,
uchar
,
pack_u
,
saturate_cast
)
OPENCV_HAL_IMPL_C_RSHR_PACK
(
v_int32x4
,
int
,
v_uint16x8
,
ushort
,
pack_u
,
saturate_cast
)
#define OPENCV_HAL_IMPL_C_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
{ \
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
ptr[i] = cast<_Tpn>(a.s[i]); \
}
OPENCV_HAL_IMPL_C_PACK_STORE
(
v_uint16x8
,
ushort
,
v_uint8x16
,
uchar
,
pack
,
saturate_cast
)
OPENCV_HAL_IMPL_C_PACK_STORE
(
v_int16x8
,
short
,
v_int8x16
,
schar
,
pack
,
saturate_cast
)
OPENCV_HAL_IMPL_C_PACK_STORE
(
v_uint32x4
,
unsigned
,
v_uint16x8
,
ushort
,
pack
,
saturate_cast
)
OPENCV_HAL_IMPL_C_PACK_STORE
(
v_int32x4
,
int
,
v_int16x8
,
short
,
pack
,
saturate_cast
)
OPENCV_HAL_IMPL_C_PACK_STORE
(
v_uint64x2
,
uint64
,
v_uint32x4
,
unsigned
,
pack
,
static_cast
)
OPENCV_HAL_IMPL_C_PACK_STORE
(
v_int64x2
,
int64
,
v_int32x4
,
int
,
pack
,
static_cast
)
OPENCV_HAL_IMPL_C_PACK_STORE
(
v_int16x8
,
short
,
v_uint8x16
,
uchar
,
pack_u
,
saturate_cast
)
OPENCV_HAL_IMPL_C_PACK_STORE
(
v_int32x4
,
int
,
v_uint16x8
,
ushort
,
pack_u
,
saturate_cast
)
#define OPENCV_HAL_IMPL_C_RSHR_PACK_STORE(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix, cast) \
template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
{ \
for( int i = 0; i < _Tpvec::nlanes; i++ ) \
ptr[i] = cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
}
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE
(
v_uint16x8
,
ushort
,
v_uint8x16
,
uchar
,
pack
,
saturate_cast
)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE
(
v_int16x8
,
short
,
v_int8x16
,
schar
,
pack
,
saturate_cast
)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE
(
v_uint32x4
,
unsigned
,
v_uint16x8
,
ushort
,
pack
,
saturate_cast
)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE
(
v_int32x4
,
int
,
v_int16x8
,
short
,
pack
,
saturate_cast
)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE
(
v_uint64x2
,
uint64
,
v_uint32x4
,
unsigned
,
pack
,
static_cast
)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE
(
v_int64x2
,
int64
,
v_int32x4
,
int
,
pack
,
static_cast
)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE
(
v_int16x8
,
short
,
v_uint8x16
,
uchar
,
pack_u
,
saturate_cast
)
OPENCV_HAL_IMPL_C_RSHR_PACK_STORE
(
v_int32x4
,
int
,
v_uint16x8
,
ushort
,
pack_u
,
saturate_cast
)
template
<
typename
_Tpm
,
typename
_Tp
,
int
n
>
inline
void
_pack_b
(
_Tpm
*
mptr
,
const
v_reg
<
_Tp
,
n
>&
a
,
const
v_reg
<
_Tp
,
n
>&
b
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
mptr
[
i
]
=
(
_Tpm
)
a
.
s
[
i
];
mptr
[
i
+
n
]
=
(
_Tpm
)
b
.
s
[
i
];
}
}
inline
v_uint8x16
v_pack_b
(
const
v_uint16x8
&
a
,
const
v_uint16x8
&
b
)
{
v_uint8x16
mask
;
_pack_b
(
mask
.
s
,
a
,
b
);
return
mask
;
}
inline
v_uint8x16
v_pack_b
(
const
v_uint32x4
&
a
,
const
v_uint32x4
&
b
,
const
v_uint32x4
&
c
,
const
v_uint32x4
&
d
)
{
v_uint8x16
mask
;
_pack_b
(
mask
.
s
,
a
,
b
);
_pack_b
(
mask
.
s
+
8
,
c
,
d
);
return
mask
;
}
inline
v_uint8x16
v_pack_b
(
const
v_uint64x2
&
a
,
const
v_uint64x2
&
b
,
const
v_uint64x2
&
c
,
const
v_uint64x2
&
d
,
const
v_uint64x2
&
e
,
const
v_uint64x2
&
f
,
const
v_uint64x2
&
g
,
const
v_uint64x2
&
h
)
{
v_uint8x16
mask
;
_pack_b
(
mask
.
s
,
a
,
b
);
_pack_b
(
mask
.
s
+
4
,
c
,
d
);
_pack_b
(
mask
.
s
+
8
,
e
,
f
);
_pack_b
(
mask
.
s
+
12
,
g
,
h
);
return
mask
;
}
inline
v_float32x4
v_matmul
(
const
v_float32x4
&
v
,
const
v_float32x4
&
m0
,
const
v_float32x4
&
m1
,
const
v_float32x4
&
m2
,
const
v_float32x4
&
m3
)
{
return
v_float32x4
(
v
.
s
[
0
]
*
m0
.
s
[
0
]
+
v
.
s
[
1
]
*
m1
.
s
[
0
]
+
v
.
s
[
2
]
*
m2
.
s
[
0
]
+
v
.
s
[
3
]
*
m3
.
s
[
0
],
v
.
s
[
0
]
*
m0
.
s
[
1
]
+
v
.
s
[
1
]
*
m1
.
s
[
1
]
+
v
.
s
[
2
]
*
m2
.
s
[
1
]
+
v
.
s
[
3
]
*
m3
.
s
[
1
],
v
.
s
[
0
]
*
m0
.
s
[
2
]
+
v
.
s
[
1
]
*
m1
.
s
[
2
]
+
v
.
s
[
2
]
*
m2
.
s
[
2
]
+
v
.
s
[
3
]
*
m3
.
s
[
2
],
v
.
s
[
0
]
*
m0
.
s
[
3
]
+
v
.
s
[
1
]
*
m1
.
s
[
3
]
+
v
.
s
[
2
]
*
m2
.
s
[
3
]
+
v
.
s
[
3
]
*
m3
.
s
[
3
]);
}
inline
v_float32x4
v_matmuladd
(
const
v_float32x4
&
v
,
const
v_float32x4
&
m0
,
const
v_float32x4
&
m1
,
const
v_float32x4
&
m2
,
const
v_float32x4
&
m3
)
{
return
v_float32x4
(
v
.
s
[
0
]
*
m0
.
s
[
0
]
+
v
.
s
[
1
]
*
m1
.
s
[
0
]
+
v
.
s
[
2
]
*
m2
.
s
[
0
]
+
m3
.
s
[
0
],
v
.
s
[
0
]
*
m0
.
s
[
1
]
+
v
.
s
[
1
]
*
m1
.
s
[
1
]
+
v
.
s
[
2
]
*
m2
.
s
[
1
]
+
m3
.
s
[
1
],
v
.
s
[
0
]
*
m0
.
s
[
2
]
+
v
.
s
[
1
]
*
m1
.
s
[
2
]
+
v
.
s
[
2
]
*
m2
.
s
[
2
]
+
m3
.
s
[
2
],
v
.
s
[
0
]
*
m0
.
s
[
3
]
+
v
.
s
[
1
]
*
m1
.
s
[
3
]
+
v
.
s
[
2
]
*
m2
.
s
[
3
]
+
m3
.
s
[
3
]);
}
inline
v_float64x2
v_dotprod_expand
(
const
v_int32x4
&
a
,
const
v_int32x4
&
b
)
{
return
v_fma
(
v_cvt_f64
(
a
),
v_cvt_f64
(
b
),
v_cvt_f64_high
(
a
)
*
v_cvt_f64_high
(
b
));
}
inline
v_float64x2
v_dotprod_expand
(
const
v_int32x4
&
a
,
const
v_int32x4
&
b
,
const
v_float64x2
&
c
)
{
return
v_fma
(
v_cvt_f64
(
a
),
v_cvt_f64
(
b
),
v_fma
(
v_cvt_f64_high
(
a
),
v_cvt_f64_high
(
b
),
c
));
}
inline
v_float64x2
v_dotprod_expand_fast
(
const
v_int32x4
&
a
,
const
v_int32x4
&
b
)
{
return
v_dotprod_expand
(
a
,
b
);
}
inline
v_float64x2
v_dotprod_expand_fast
(
const
v_int32x4
&
a
,
const
v_int32x4
&
b
,
const
v_float64x2
&
c
)
{
return
v_dotprod_expand
(
a
,
b
,
c
);
}
////// FP16 support ///////
inline
v_reg
<
float
,
V_TypeTraits
<
float
>::
nlanes128
>
v_load_expand
(
const
float16_t
*
ptr
)
{
v_reg
<
float
,
V_TypeTraits
<
float
>::
nlanes128
>
v
;
for
(
int
i
=
0
;
i
<
v
.
nlanes
;
i
++
)
{
v
.
s
[
i
]
=
ptr
[
i
];
}
return
v
;
}
inline
void
v_pack_store
(
float16_t
*
ptr
,
const
v_reg
<
float
,
V_TypeTraits
<
float
>::
nlanes128
>&
v
)
{
for
(
int
i
=
0
;
i
<
v
.
nlanes
;
i
++
)
{
ptr
[
i
]
=
float16_t
(
v
.
s
[
i
]);
}
}
inline
void
v_cleanup
()
{}
#ifndef CV_DOXYGEN
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
#endif
}
#endif
modules/core/src/system.cpp
浏览文件 @
422f802e
...
...
@@ -399,6 +399,8 @@ struct HWFeatures
g_hwFeatureNames
[
CPU_AVX512_CNL
]
=
"AVX512-CNL"
;
g_hwFeatureNames
[
CPU_AVX512_CLX
]
=
"AVX512-CLX"
;
g_hwFeatureNames
[
CPU_AVX512_ICL
]
=
"AVX512-ICL"
;
g_hwFeatureNames
[
CPU_RVV
]
=
"RVV"
;
}
void
initialize
(
void
)
...
...
@@ -597,6 +599,10 @@ struct HWFeatures
have
[
CV_CPU_VSX3
]
=
(
CV_VSX3
);
#endif
#if defined __riscv && defined __riscv_vector
have
[
CV_CPU_RVV
]
=
true
;
#endif
bool
skip_baseline_check
=
false
;
#ifndef NO_GETENV
if
(
getenv
(
"OPENCV_SKIP_CPU_BASELINE_CHECK"
))
...
...
platforms/linux/riscv64-clang.toolchain.cmake
0 → 100644
浏览文件 @
422f802e
set
(
CMAKE_SYSTEM_NAME Linux
)
set
(
CMAKE_SYSTEM_PROCESSOR riscv64
)
set
(
RISCV_CLANG_BUILD_ROOT /opt/rvv-llvm CACHE PATH
"Path to CLANG for RISC-V cross compiler build directory"
)
set
(
RISCV_GCC_INSTALL_ROOT /opt/RISCV CACHE PATH
"Path to GCC for RISC-V cross compiler installation directory"
)
set
(
CMAKE_SYSROOT
${
RISCV_GCC_INSTALL_ROOT
}
/sysroot CACHE PATH
"RISC-V sysroot"
)
set
(
CLANG_TARGET_TRIPLE riscv64-unknown-linux-gnu
)
set
(
CMAKE_C_COMPILER
${
RISCV_CLANG_BUILD_ROOT
}
/bin/clang
)
set
(
CMAKE_C_COMPILER_TARGET
${
CLANG_TARGET_TRIPLE
}
)
set
(
CMAKE_CXX_COMPILER
${
RISCV_CLANG_BUILD_ROOT
}
/bin/clang++
)
set
(
CMAKE_CXX_COMPILER_TARGET
${
CLANG_TARGET_TRIPLE
}
)
set
(
CMAKE_ASM_COMPILER
${
RISCV_CLANG_BUILD_ROOT
}
/bin/clang
)
set
(
CMAKE_ASM_COMPILER_TARGET
${
CLANG_TARGET_TRIPLE
}
)
# Don't run the linker on compiler check
set
(
CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY
)
set
(
CMAKE_C_FLAGS
"-march=rv64gcv --gcc-toolchain=
${
RISCV_GCC_INSTALL_ROOT
}
-w
${
CMAKE_C_FLAGS
}
"
)
set
(
CMAKE_CXX_FLAGS
"-march=rv64gcv --gcc-toolchain=
${
RISCV_GCC_INSTALL_ROOT
}
-w
${
CXX_FLAGS
}
"
)
set
(
CMAKE_FIND_ROOT_PATH
${
CMAKE_SYSROOT
}
)
set
(
CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER
)
set
(
CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY
)
set
(
CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY
)
set
(
CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY
)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录