Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Greenplum
Opencv
提交
1cc3f7ab
O
Opencv
项目概览
Greenplum
/
Opencv
11 个月 前同步成功
通知
7
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
O
Opencv
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
1cc3f7ab
编写于
10月 15, 2018
作者:
A
Alexander Alekhin
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #12516 from seiko2plus:changeUnvMultiply16
上级
803ff64b
8965f3ae
变更
15
隐藏空白更改
内联
并排
Showing
15 changed file
with
2172 addition
and
1828 deletion
+2172
-1828
modules/core/include/opencv2/core/hal/intrin.hpp
modules/core/include/opencv2/core/hal/intrin.hpp
+8
-0
modules/core/include/opencv2/core/hal/intrin_avx.hpp
modules/core/include/opencv2/core/hal/intrin_avx.hpp
+118
-22
modules/core/include/opencv2/core/hal/intrin_cpp.hpp
modules/core/include/opencv2/core/hal/intrin_cpp.hpp
+52
-6
modules/core/include/opencv2/core/hal/intrin_forward.hpp
modules/core/include/opencv2/core/hal/intrin_forward.hpp
+158
-0
modules/core/include/opencv2/core/hal/intrin_neon.hpp
modules/core/include/opencv2/core/hal/intrin_neon.hpp
+43
-2
modules/core/include/opencv2/core/hal/intrin_sse.hpp
modules/core/include/opencv2/core/hal/intrin_sse.hpp
+92
-97
modules/core/include/opencv2/core/hal/intrin_sse_em.hpp
modules/core/include/opencv2/core/hal/intrin_sse_em.hpp
+167
-0
modules/core/include/opencv2/core/hal/intrin_vsx.hpp
modules/core/include/opencv2/core/hal/intrin_vsx.hpp
+35
-18
modules/core/include/opencv2/core/vsx_utils.hpp
modules/core/include/opencv2/core/vsx_utils.hpp
+12
-10
modules/core/test/test_intrin_utils.hpp
modules/core/test/test_intrin_utils.hpp
+21
-9
modules/imgproc/CMakeLists.txt
modules/imgproc/CMakeLists.txt
+1
-1
modules/imgproc/perf/perf_accumulate.cpp
modules/imgproc/perf/perf_accumulate.cpp
+97
-89
modules/imgproc/src/accum.simd.hpp
modules/imgproc/src/accum.simd.hpp
+1347
-1562
modules/imgproc/src/smooth.cpp
modules/imgproc/src/smooth.cpp
+19
-10
modules/video/src/lkpyramid.cpp
modules/video/src/lkpyramid.cpp
+2
-2
未找到文件。
modules/core/include/opencv2/core/hal/intrin.hpp
浏览文件 @
1cc3f7ab
...
...
@@ -139,8 +139,14 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
# undef CV_FP16
#endif
#if CV_SSE2 || CV_NEON || CV_VSX
#define CV__SIMD_FORWARD 128
#include "opencv2/core/hal/intrin_forward.hpp"
#endif
#if CV_SSE2
#include "opencv2/core/hal/intrin_sse_em.hpp"
#include "opencv2/core/hal/intrin_sse.hpp"
#elif CV_NEON
...
...
@@ -168,6 +174,8 @@ using namespace CV_CPU_OPTIMIZATION_HAL_NAMESPACE;
// (and will be mapped to v256_ counterparts) (e.g. vx_load() => v256_load())
#if CV_AVX2
#define CV__SIMD_FORWARD 256
#include "opencv2/core/hal/intrin_forward.hpp"
#include "opencv2/core/hal/intrin_avx.hpp"
#endif
...
...
modules/core/include/opencv2/core/hal/intrin_avx.hpp
浏览文件 @
1cc3f7ab
...
...
@@ -82,6 +82,14 @@ inline __m128 _v256_extract_low(const __m256& v)
inline
__m128d
_v256_extract_low
(
const
__m256d
&
v
)
{
return
_mm256_castpd256_pd128
(
v
);
}
inline
__m256i
_v256_packs_epu32
(
const
__m256i
&
a
,
const
__m256i
&
b
)
{
const
__m256i
m
=
_mm256_set1_epi32
(
65535
);
__m256i
am
=
_mm256_min_epu32
(
a
,
m
);
__m256i
bm
=
_mm256_min_epu32
(
b
,
m
);
return
_mm256_packus_epi32
(
am
,
bm
);
}
///////// Types ////////////
struct
v_uint8x32
...
...
@@ -626,10 +634,8 @@ OPENCV_HAL_IMPL_AVX_BIN_OP(+, v_int8x32, _mm256_adds_epi8)
OPENCV_HAL_IMPL_AVX_BIN_OP
(
-
,
v_int8x32
,
_mm256_subs_epi8
)
OPENCV_HAL_IMPL_AVX_BIN_OP
(
+
,
v_uint16x16
,
_mm256_adds_epu16
)
OPENCV_HAL_IMPL_AVX_BIN_OP
(
-
,
v_uint16x16
,
_mm256_subs_epu16
)
OPENCV_HAL_IMPL_AVX_BIN_OP
(
*
,
v_uint16x16
,
_mm256_mullo_epi16
)
OPENCV_HAL_IMPL_AVX_BIN_OP
(
+
,
v_int16x16
,
_mm256_adds_epi16
)
OPENCV_HAL_IMPL_AVX_BIN_OP
(
-
,
v_int16x16
,
_mm256_subs_epi16
)
OPENCV_HAL_IMPL_AVX_BIN_OP
(
*
,
v_int16x16
,
_mm256_mullo_epi16
)
OPENCV_HAL_IMPL_AVX_BIN_OP
(
+
,
v_uint32x8
,
_mm256_add_epi32
)
OPENCV_HAL_IMPL_AVX_BIN_OP
(
-
,
v_uint32x8
,
_mm256_sub_epi32
)
OPENCV_HAL_IMPL_AVX_BIN_OP
(
*
,
v_uint32x8
,
_mm256_mullo_epi32
)
...
...
@@ -650,13 +656,103 @@ OPENCV_HAL_IMPL_AVX_BIN_OP(-, v_float64x4, _mm256_sub_pd)
OPENCV_HAL_IMPL_AVX_BIN_OP
(
*
,
v_float64x4
,
_mm256_mul_pd
)
OPENCV_HAL_IMPL_AVX_BIN_OP
(
/
,
v_float64x4
,
_mm256_div_pd
)
// saturating multiply 8-bit, 16-bit
inline
v_uint8x32
operator
*
(
const
v_uint8x32
&
a
,
const
v_uint8x32
&
b
)
{
v_uint16x16
c
,
d
;
v_mul_expand
(
a
,
b
,
c
,
d
);
return
v_pack_u
(
v_reinterpret_as_s16
(
c
),
v_reinterpret_as_s16
(
d
));
}
inline
v_int8x32
operator
*
(
const
v_int8x32
&
a
,
const
v_int8x32
&
b
)
{
v_int16x16
c
,
d
;
v_mul_expand
(
a
,
b
,
c
,
d
);
return
v_pack
(
c
,
d
);
}
inline
v_uint16x16
operator
*
(
const
v_uint16x16
&
a
,
const
v_uint16x16
&
b
)
{
__m256i
pl
=
_mm256_mullo_epi16
(
a
.
val
,
b
.
val
);
__m256i
ph
=
_mm256_mulhi_epu16
(
a
.
val
,
b
.
val
);
__m256i
p0
=
_mm256_unpacklo_epi16
(
pl
,
ph
);
__m256i
p1
=
_mm256_unpackhi_epi16
(
pl
,
ph
);
return
v_uint16x16
(
_v256_packs_epu32
(
p0
,
p1
));
}
inline
v_int16x16
operator
*
(
const
v_int16x16
&
a
,
const
v_int16x16
&
b
)
{
__m256i
pl
=
_mm256_mullo_epi16
(
a
.
val
,
b
.
val
);
__m256i
ph
=
_mm256_mulhi_epi16
(
a
.
val
,
b
.
val
);
__m256i
p0
=
_mm256_unpacklo_epi16
(
pl
,
ph
);
__m256i
p1
=
_mm256_unpackhi_epi16
(
pl
,
ph
);
return
v_int16x16
(
_mm256_packs_epi32
(
p0
,
p1
));
}
inline
v_uint8x32
&
operator
*=
(
v_uint8x32
&
a
,
const
v_uint8x32
&
b
)
{
a
=
a
*
b
;
return
a
;
}
inline
v_int8x32
&
operator
*=
(
v_int8x32
&
a
,
const
v_int8x32
&
b
)
{
a
=
a
*
b
;
return
a
;
}
inline
v_uint16x16
&
operator
*=
(
v_uint16x16
&
a
,
const
v_uint16x16
&
b
)
{
a
=
a
*
b
;
return
a
;
}
inline
v_int16x16
&
operator
*=
(
v_int16x16
&
a
,
const
v_int16x16
&
b
)
{
a
=
a
*
b
;
return
a
;
}
/** Non-saturating arithmetics **/
#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_add_wrap
,
v_uint8x32
,
_mm256_add_epi8
)
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_add_wrap
,
v_int8x32
,
_mm256_add_epi8
)
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_add_wrap
,
v_uint16x16
,
_mm256_add_epi16
)
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_add_wrap
,
v_int16x16
,
_mm256_add_epi16
)
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_sub_wrap
,
v_uint8x32
,
_mm256_sub_epi8
)
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_sub_wrap
,
v_int8x32
,
_mm256_sub_epi8
)
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_sub_wrap
,
v_uint16x16
,
_mm256_sub_epi16
)
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_sub_wrap
,
v_int16x16
,
_mm256_sub_epi16
)
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_mul_wrap
,
v_uint16x16
,
_mm256_mullo_epi16
)
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_mul_wrap
,
v_int16x16
,
_mm256_mullo_epi16
)
inline
v_uint8x32
v_mul_wrap
(
const
v_uint8x32
&
a
,
const
v_uint8x32
&
b
)
{
__m256i
ad
=
_mm256_srai_epi16
(
a
.
val
,
8
);
__m256i
bd
=
_mm256_srai_epi16
(
b
.
val
,
8
);
__m256i
p0
=
_mm256_mullo_epi16
(
a
.
val
,
b
.
val
);
// even
__m256i
p1
=
_mm256_slli_epi16
(
_mm256_mullo_epi16
(
ad
,
bd
),
8
);
// odd
const
__m256i
b01
=
_mm256_set1_epi32
(
0xFF00FF00
);
return
v_uint8x32
(
_mm256_blendv_epi8
(
p0
,
p1
,
b01
));
}
inline
v_int8x32
v_mul_wrap
(
const
v_int8x32
&
a
,
const
v_int8x32
&
b
)
{
return
v_reinterpret_as_s8
(
v_mul_wrap
(
v_reinterpret_as_u8
(
a
),
v_reinterpret_as_u8
(
b
)));
}
// Multiply and expand
inline
void
v_mul_expand
(
const
v_uint8x32
&
a
,
const
v_uint8x32
&
b
,
v_uint16x16
&
c
,
v_uint16x16
&
d
)
{
v_uint16x16
a0
,
a1
,
b0
,
b1
;
v_expand
(
a
,
a0
,
a1
);
v_expand
(
b
,
b0
,
b1
);
c
=
v_mul_wrap
(
a0
,
b0
);
d
=
v_mul_wrap
(
a1
,
b1
);
}
inline
void
v_mul_expand
(
const
v_int8x32
&
a
,
const
v_int8x32
&
b
,
v_int16x16
&
c
,
v_int16x16
&
d
)
{
v_int16x16
a0
,
a1
,
b0
,
b1
;
v_expand
(
a
,
a0
,
a1
);
v_expand
(
b
,
b0
,
b1
);
c
=
v_mul_wrap
(
a0
,
b0
);
d
=
v_mul_wrap
(
a1
,
b1
);
}
inline
void
v_mul_expand
(
const
v_int16x16
&
a
,
const
v_int16x16
&
b
,
v_int32x8
&
c
,
v_int32x8
&
d
)
{
v_int16x16
vhi
=
v_int16x16
(
_mm256_mulhi_epi16
(
a
.
val
,
b
.
val
));
v_int16x16
v0
,
v1
;
v_zip
(
a
*
b
,
vhi
,
v0
,
v1
);
v_zip
(
v_mul_wrap
(
a
,
b
)
,
vhi
,
v0
,
v1
);
c
=
v_reinterpret_as_s32
(
v0
);
d
=
v_reinterpret_as_s32
(
v1
);
...
...
@@ -668,7 +764,7 @@ inline void v_mul_expand(const v_uint16x16& a, const v_uint16x16& b,
v_uint16x16
vhi
=
v_uint16x16
(
_mm256_mulhi_epu16
(
a
.
val
,
b
.
val
));
v_uint16x16
v0
,
v1
;
v_zip
(
a
*
b
,
vhi
,
v0
,
v1
);
v_zip
(
v_mul_wrap
(
a
,
b
)
,
vhi
,
v0
,
v1
);
c
=
v_reinterpret_as_u32
(
v0
);
d
=
v_reinterpret_as_u32
(
v1
);
...
...
@@ -685,20 +781,6 @@ inline void v_mul_expand(const v_uint32x8& a, const v_uint32x8& b,
inline
v_int16x16
v_mul_hi
(
const
v_int16x16
&
a
,
const
v_int16x16
&
b
)
{
return
v_int16x16
(
_mm256_mulhi_epi16
(
a
.
val
,
b
.
val
));
}
inline
v_uint16x16
v_mul_hi
(
const
v_uint16x16
&
a
,
const
v_uint16x16
&
b
)
{
return
v_uint16x16
(
_mm256_mulhi_epu16
(
a
.
val
,
b
.
val
));
}
/** Non-saturating arithmetics **/
#define OPENCV_HAL_IMPL_AVX_BIN_FUNC(func, _Tpvec, intrin) \
inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
{ return _Tpvec(intrin(a.val, b.val)); }
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_add_wrap
,
v_uint8x32
,
_mm256_add_epi8
)
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_add_wrap
,
v_int8x32
,
_mm256_add_epi8
)
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_add_wrap
,
v_uint16x16
,
_mm256_add_epi16
)
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_add_wrap
,
v_int16x16
,
_mm256_add_epi16
)
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_sub_wrap
,
v_uint8x32
,
_mm256_sub_epi8
)
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_sub_wrap
,
v_int8x32
,
_mm256_sub_epi8
)
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_sub_wrap
,
v_uint16x16
,
_mm256_sub_epi16
)
OPENCV_HAL_IMPL_AVX_BIN_FUNC
(
v_sub_wrap
,
v_int16x16
,
_mm256_sub_epi16
)
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_AVX_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
...
...
@@ -1385,6 +1467,10 @@ OPENCV_HAL_IMPL_AVX_TRANSPOSE4x4(v_float32x8, ps, _mm256_castps_si256, _mm256_ca
b0.val = intrin(_v256_extract_low(a.val)); \
b1.val = intrin(_v256_extract_high(a.val)); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ return _Tpwvec(intrin(_v256_extract_low(a.val))); } \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ return _Tpwvec(intrin(_v256_extract_high(a.val))); } \
inline _Tpwvec v256_load_expand(const _Tp* ptr) \
{ \
__m128i a = _mm_loadu_si128((const __m128i*)ptr); \
...
...
@@ -1430,7 +1516,12 @@ inline void v_pack_store(schar* ptr, const v_int16x16& a)
{
v_store_low
(
ptr
,
v_pack
(
a
,
a
));
}
inline
void
v_pack_store
(
uchar
*
ptr
,
const
v_uint16x16
&
a
)
{
v_store_low
(
ptr
,
v_pack
(
a
,
a
));
}
{
const
__m256i
m
=
_mm256_set1_epi16
(
255
);
__m256i
am
=
_mm256_min_epu16
(
a
.
val
,
m
);
am
=
_v256_shuffle_odd_64
(
_mm256_packus_epi16
(
am
,
am
));
v_store_low
(
ptr
,
v_uint8x32
(
am
));
}
inline
void
v_pack_u_store
(
uchar
*
ptr
,
const
v_int16x16
&
a
)
{
v_store_low
(
ptr
,
v_pack_u
(
a
,
a
));
}
...
...
@@ -1484,16 +1575,21 @@ inline v_int16x16 v_pack(const v_int32x8& a, const v_int32x8& b)
{
return
v_int16x16
(
_v256_shuffle_odd_64
(
_mm256_packs_epi32
(
a
.
val
,
b
.
val
)));
}
inline
v_uint16x16
v_pack
(
const
v_uint32x8
&
a
,
const
v_uint32x8
&
b
)
{
return
v_uint16x16
(
_v256_shuffle_odd_64
(
_
mm256_packus_epi
32
(
a
.
val
,
b
.
val
)));
}
{
return
v_uint16x16
(
_v256_shuffle_odd_64
(
_
v256_packs_epu
32
(
a
.
val
,
b
.
val
)));
}
inline
v_uint16x16
v_pack_u
(
const
v_int32x8
&
a
,
const
v_int32x8
&
b
)
{
return
v_
pack
(
v_reinterpret_as_u32
(
a
),
v_reinterpret_as_u32
(
b
));
}
{
return
v_
uint16x16
(
_v256_shuffle_odd_64
(
_mm256_packus_epi32
(
a
.
val
,
b
.
val
)
));
}
inline
void
v_pack_store
(
short
*
ptr
,
const
v_int32x8
&
a
)
{
v_store_low
(
ptr
,
v_pack
(
a
,
a
));
}
inline
void
v_pack_store
(
ushort
*
ptr
,
const
v_uint32x8
&
a
)
{
v_store_low
(
ptr
,
v_pack
(
a
,
a
));
}
{
const
__m256i
m
=
_mm256_set1_epi32
(
65535
);
__m256i
am
=
_mm256_min_epu32
(
a
.
val
,
m
);
am
=
_v256_shuffle_odd_64
(
_mm256_packus_epi32
(
am
,
am
));
v_store_low
(
ptr
,
v_uint16x16
(
am
));
}
inline
void
v_pack_u_store
(
ushort
*
ptr
,
const
v_int32x8
&
a
)
{
v_store_low
(
ptr
,
v_pack_u
(
a
,
a
));
}
...
...
modules/core/include/opencv2/core/hal/intrin_cpp.hpp
浏览文件 @
1cc3f7ab
...
...
@@ -108,7 +108,7 @@ block and to save contents of the register to memory block.
These operations allow to reorder or recombine elements in one or multiple vectors.
- Interleave, deinterleave (2, 3 and 4 channels): @ref v_load_deinterleave, @ref v_store_interleave
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
- Expand: @ref v_load_expand, @ref v_load_expand_q, @ref v_expand
, @ref v_expand_low, @ref v_expand_high
- Pack: @ref v_pack, @ref v_pack_u, @ref v_rshr_pack, @ref v_rshr_pack_u,
@ref v_pack_store, @ref v_pack_u_store, @ref v_rshr_pack_store, @ref v_rshr_pack_u_store
- Recombine: @ref v_zip, @ref v_recombine, @ref v_combine_low, @ref v_combine_high
...
...
@@ -185,11 +185,14 @@ Regular integers:
|load, store | x | x | x | x | x | x |
|interleave | x | x | x | x | x | x |
|expand | x | x | x | x | x | x |
|expand_low | x | x | x | x | x | x |
|expand_high | x | x | x | x | x | x |
|expand_q | x | x | | | | |
|add, sub | x | x | x | x | x | x |
|add_wrap, sub_wrap | x | x | x | x | | |
|mul | | | x | x | x | x |
|mul_expand | | | x | x | x | |
|mul_wrap | x | x | x | x | | |
|mul | x | x | x | x | x | x |
|mul_expand | x | x | x | x | x | |
|compare | x | x | x | x | x | x |
|shift | | | x | x | x | x |
|dotprod | | | | x | | |
...
...
@@ -680,7 +683,7 @@ OPENCV_HAL_IMPL_CMP_OP(!=)
//! @brief Helper macro
//! @ingroup core_hal_intrin_impl
#define OPENCV_HAL_IMPL_A
DD_SUB
_OP(func, bin_op, cast_op, _Tp2) \
#define OPENCV_HAL_IMPL_A
RITHM
_OP(func, bin_op, cast_op, _Tp2) \
template<typename _Tp, int n> \
inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
{ \
...
...
@@ -694,12 +697,17 @@ inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
/** @brief Add values without saturation
For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_A
DD_SUB
_OP
(
v_add_wrap
,
+
,
(
_Tp
),
_Tp
)
OPENCV_HAL_IMPL_A
RITHM
_OP
(
v_add_wrap
,
+
,
(
_Tp
),
_Tp
)
/** @brief Subtract values without saturation
For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ADD_SUB_OP
(
v_sub_wrap
,
-
,
(
_Tp
),
_Tp
)
OPENCV_HAL_IMPL_ARITHM_OP
(
v_sub_wrap
,
-
,
(
_Tp
),
_Tp
)
/** @brief Multiply values without saturation
For 8- and 16-bit integer values. */
OPENCV_HAL_IMPL_ARITHM_OP
(
v_mul_wrap
,
*
,
(
_Tp
),
_Tp
)
//! @cond IGNORED
template
<
typename
T
>
inline
T
_absdiff
(
T
a
,
T
b
)
...
...
@@ -1106,6 +1114,44 @@ template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
}
}
/** @brief Expand lower values to the wider pack type
Same as cv::v_expand, but return lower half of the vector.
Scheme:
@code
int32x4 int64x2
{A B C D} ==> {A B}
@endcode */
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>
v_expand_low
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>
b
;
for
(
int
i
=
0
;
i
<
(
n
/
2
);
i
++
)
b
.
s
[
i
]
=
a
.
s
[
i
];
return
b
;
}
/** @brief Expand higher values to the wider pack type
Same as cv::v_expand_low, but expand higher half of the vector instead.
Scheme:
@code
int32x4 int64x2
{A B C D} ==> {C D}
@endcode */
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>
v_expand_high
(
const
v_reg
<
_Tp
,
n
>&
a
)
{
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
w_type
,
n
/
2
>
b
;
for
(
int
i
=
0
;
i
<
(
n
/
2
);
i
++
)
b
.
s
[
i
]
=
a
.
s
[
i
+
(
n
/
2
)];
return
b
;
}
//! @cond IGNORED
template
<
typename
_Tp
,
int
n
>
inline
v_reg
<
typename
V_TypeTraits
<
_Tp
>::
int_type
,
n
>
v_reinterpret_as_int
(
const
v_reg
<
_Tp
,
n
>&
a
)
...
...
modules/core/include/opencv2/core/hal/intrin_forward.hpp
0 → 100644
浏览文件 @
1cc3f7ab
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#ifndef CV__SIMD_FORWARD
#error "Need to pre-define forward width"
#endif
namespace
cv
{
//! @cond IGNORED
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
/** Types **/
#if CV__SIMD_FORWARD == 512
// [todo] 512
#error "AVX512 Not implemented yet"
#elif CV__SIMD_FORWARD == 256
// 256
#define __CV_VX(fun) v256_##fun
#define __CV_V_UINT8 v_uint8x32
#define __CV_V_INT8 v_int8x32
#define __CV_V_UINT16 v_uint16x16
#define __CV_V_INT16 v_int16x16
#define __CV_V_UINT32 v_uint32x8
#define __CV_V_INT32 v_int32x8
#define __CV_V_UINT64 v_uint64x4
#define __CV_V_INT64 v_int64x4
#define __CV_V_FLOAT32 v_float32x8
#define __CV_V_FLOAT64 v_float64x4
struct
v_uint8x32
;
struct
v_int8x32
;
struct
v_uint16x16
;
struct
v_int16x16
;
struct
v_uint32x8
;
struct
v_int32x8
;
struct
v_uint64x4
;
struct
v_int64x4
;
struct
v_float32x8
;
struct
v_float64x4
;
#else
// 128
#define __CV_VX(fun) v_##fun
#define __CV_V_UINT8 v_uint8x16
#define __CV_V_INT8 v_int8x16
#define __CV_V_UINT16 v_uint16x8
#define __CV_V_INT16 v_int16x8
#define __CV_V_UINT32 v_uint32x4
#define __CV_V_INT32 v_int32x4
#define __CV_V_UINT64 v_uint64x2
#define __CV_V_INT64 v_int64x2
#define __CV_V_FLOAT32 v_float32x4
#define __CV_V_FLOAT64 v_float64x2
struct
v_uint8x16
;
struct
v_int8x16
;
struct
v_uint16x8
;
struct
v_int16x8
;
struct
v_uint32x4
;
struct
v_int32x4
;
struct
v_uint64x2
;
struct
v_int64x2
;
struct
v_float32x4
;
struct
v_float64x2
;
#endif
/** Value reordering **/
// Expansion
void
v_expand
(
const
__CV_V_UINT8
&
,
__CV_V_UINT16
&
,
__CV_V_UINT16
&
);
void
v_expand
(
const
__CV_V_INT8
&
,
__CV_V_INT16
&
,
__CV_V_INT16
&
);
void
v_expand
(
const
__CV_V_UINT16
&
,
__CV_V_UINT32
&
,
__CV_V_UINT32
&
);
void
v_expand
(
const
__CV_V_INT16
&
,
__CV_V_INT32
&
,
__CV_V_INT32
&
);
void
v_expand
(
const
__CV_V_UINT32
&
,
__CV_V_UINT64
&
,
__CV_V_UINT64
&
);
void
v_expand
(
const
__CV_V_INT32
&
,
__CV_V_INT64
&
,
__CV_V_INT64
&
);
// Low Expansion
__CV_V_UINT16
v_expand_low
(
const
__CV_V_UINT8
&
);
__CV_V_INT16
v_expand_low
(
const
__CV_V_INT8
&
);
__CV_V_UINT32
v_expand_low
(
const
__CV_V_UINT16
&
);
__CV_V_INT32
v_expand_low
(
const
__CV_V_INT16
&
);
__CV_V_UINT64
v_expand_low
(
const
__CV_V_UINT32
&
);
__CV_V_INT64
v_expand_low
(
const
__CV_V_INT32
&
);
// High Expansion
__CV_V_UINT16
v_expand_high
(
const
__CV_V_UINT8
&
);
__CV_V_INT16
v_expand_high
(
const
__CV_V_INT8
&
);
__CV_V_UINT32
v_expand_high
(
const
__CV_V_UINT16
&
);
__CV_V_INT32
v_expand_high
(
const
__CV_V_INT16
&
);
__CV_V_UINT64
v_expand_high
(
const
__CV_V_UINT32
&
);
__CV_V_INT64
v_expand_high
(
const
__CV_V_INT32
&
);
// Load & Low Expansion
__CV_V_UINT16
__CV_VX
(
load_expand
)(
const
uchar
*
);
__CV_V_INT16
__CV_VX
(
load_expand
)(
const
schar
*
);
__CV_V_UINT32
__CV_VX
(
load_expand
)(
const
ushort
*
);
__CV_V_INT32
__CV_VX
(
load_expand
)(
const
short
*
);
__CV_V_UINT64
__CV_VX
(
load_expand
)(
const
uint
*
);
__CV_V_INT64
__CV_VX
(
load_expand
)(
const
int
*
);
// Load lower 8-bit and expand into 32-bit
__CV_V_UINT32
__CV_VX
(
load_expand_q
)(
const
uchar
*
);
__CV_V_INT32
__CV_VX
(
load_expand_q
)(
const
schar
*
);
// Saturating Pack
__CV_V_UINT8
v_pack
(
const
__CV_V_UINT16
&
,
const
__CV_V_UINT16
&
);
__CV_V_INT8
v_pack
(
const
__CV_V_INT16
&
,
const
__CV_V_INT16
&
);
__CV_V_UINT16
v_pack
(
const
__CV_V_UINT32
&
,
const
__CV_V_UINT32
&
);
__CV_V_INT16
v_pack
(
const
__CV_V_INT32
&
,
const
__CV_V_INT32
&
);
// Non-saturating Pack
__CV_V_UINT32
v_pack
(
const
__CV_V_UINT64
&
,
const
__CV_V_UINT64
&
);
__CV_V_INT32
v_pack
(
const
__CV_V_INT64
&
,
const
__CV_V_INT64
&
);
// Pack signed integers with unsigned saturation
__CV_V_UINT8
v_pack_u
(
const
__CV_V_INT16
&
,
const
__CV_V_INT16
&
);
__CV_V_UINT16
v_pack_u
(
const
__CV_V_INT32
&
,
const
__CV_V_INT32
&
);
/** Arithmetic, bitwise and comparison operations **/
// Non-saturating multiply
#if CV_VSX
template
<
typename
Tvec
>
Tvec
v_mul_wrap
(
const
Tvec
&
a
,
const
Tvec
&
b
);
#else
__CV_V_UINT8
v_mul_wrap
(
const
__CV_V_UINT8
&
,
const
__CV_V_UINT8
&
);
__CV_V_INT8
v_mul_wrap
(
const
__CV_V_INT8
&
,
const
__CV_V_INT8
&
);
__CV_V_UINT16
v_mul_wrap
(
const
__CV_V_UINT16
&
,
const
__CV_V_UINT16
&
);
__CV_V_INT16
v_mul_wrap
(
const
__CV_V_INT16
&
,
const
__CV_V_INT16
&
);
#endif
// Multiply and expand
#if CV_VSX
template
<
typename
Tvec
,
typename
Twvec
>
void
v_mul_expand
(
const
Tvec
&
a
,
const
Tvec
&
b
,
Twvec
&
c
,
Twvec
&
d
);
#else
void
v_mul_expand
(
const
__CV_V_UINT8
&
,
const
__CV_V_UINT8
&
,
__CV_V_UINT16
&
,
__CV_V_UINT16
&
);
void
v_mul_expand
(
const
__CV_V_INT8
&
,
const
__CV_V_INT8
&
,
__CV_V_INT16
&
,
__CV_V_INT16
&
);
void
v_mul_expand
(
const
__CV_V_UINT16
&
,
const
__CV_V_UINT16
&
,
__CV_V_UINT32
&
,
__CV_V_UINT32
&
);
void
v_mul_expand
(
const
__CV_V_INT16
&
,
const
__CV_V_INT16
&
,
__CV_V_INT32
&
,
__CV_V_INT32
&
);
void
v_mul_expand
(
const
__CV_V_UINT32
&
,
const
__CV_V_UINT32
&
,
__CV_V_UINT64
&
,
__CV_V_UINT64
&
);
void
v_mul_expand
(
const
__CV_V_INT32
&
,
const
__CV_V_INT32
&
,
__CV_V_INT64
&
,
__CV_V_INT64
&
);
#endif
/** Cleanup **/
#undef CV__SIMD_FORWARD
#undef __CV_VX
#undef __CV_V_UINT8
#undef __CV_V_INT8
#undef __CV_V_UINT16
#undef __CV_V_INT16
#undef __CV_V_UINT32
#undef __CV_V_INT32
#undef __CV_V_UINT64
#undef __CV_V_INT64
#undef __CV_V_FLOAT32
#undef __CV_V_FLOAT64
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
}
// cv::
\ No newline at end of file
modules/core/include/opencv2/core/hal/intrin_neon.hpp
浏览文件 @
1cc3f7ab
...
...
@@ -435,10 +435,8 @@ OPENCV_HAL_IMPL_NEON_BIN_OP(+, v_int8x16, vqaddq_s8)
OPENCV_HAL_IMPL_NEON_BIN_OP
(
-
,
v_int8x16
,
vqsubq_s8
)
OPENCV_HAL_IMPL_NEON_BIN_OP
(
+
,
v_uint16x8
,
vqaddq_u16
)
OPENCV_HAL_IMPL_NEON_BIN_OP
(
-
,
v_uint16x8
,
vqsubq_u16
)
OPENCV_HAL_IMPL_NEON_BIN_OP
(
*
,
v_uint16x8
,
vmulq_u16
)
OPENCV_HAL_IMPL_NEON_BIN_OP
(
+
,
v_int16x8
,
vqaddq_s16
)
OPENCV_HAL_IMPL_NEON_BIN_OP
(
-
,
v_int16x8
,
vqsubq_s16
)
OPENCV_HAL_IMPL_NEON_BIN_OP
(
*
,
v_int16x8
,
vmulq_s16
)
OPENCV_HAL_IMPL_NEON_BIN_OP
(
+
,
v_int32x4
,
vaddq_s32
)
OPENCV_HAL_IMPL_NEON_BIN_OP
(
-
,
v_int32x4
,
vsubq_s32
)
OPENCV_HAL_IMPL_NEON_BIN_OP
(
*
,
v_int32x4
,
vmulq_s32
)
...
...
@@ -476,6 +474,37 @@ inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
}
#endif
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_NEON_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
OPENCV_HAL_IMPL_NEON_MUL_SAT
(
v_int8x16
,
v_int16x8
)
OPENCV_HAL_IMPL_NEON_MUL_SAT
(
v_uint8x16
,
v_uint16x8
)
OPENCV_HAL_IMPL_NEON_MUL_SAT
(
v_int16x8
,
v_int32x4
)
OPENCV_HAL_IMPL_NEON_MUL_SAT
(
v_uint16x8
,
v_uint32x4
)
// Multiply and expand
inline
void
v_mul_expand
(
const
v_int8x16
&
a
,
const
v_int8x16
&
b
,
v_int16x8
&
c
,
v_int16x8
&
d
)
{
c
.
val
=
vmull_s8
(
vget_low_s8
(
a
.
val
),
vget_low_s8
(
b
.
val
));
d
.
val
=
vmull_s8
(
vget_high_s8
(
a
.
val
),
vget_high_s8
(
b
.
val
));
}
inline
void
v_mul_expand
(
const
v_uint8x16
&
a
,
const
v_uint8x16
&
b
,
v_uint16x8
&
c
,
v_uint16x8
&
d
)
{
c
.
val
=
vmull_u8
(
vget_low_u8
(
a
.
val
),
vget_low_u8
(
b
.
val
));
d
.
val
=
vmull_u8
(
vget_high_u8
(
a
.
val
),
vget_high_u8
(
b
.
val
));
}
inline
void
v_mul_expand
(
const
v_int16x8
&
a
,
const
v_int16x8
&
b
,
v_int32x4
&
c
,
v_int32x4
&
d
)
{
...
...
@@ -714,6 +743,10 @@ OPENCV_HAL_IMPL_NEON_BIN_FUNC(v_uint8x16, v_sub_wrap, vsubq_u8)
OPENCV_HAL_IMPL_NEON_BIN_FUNC
(
v_int8x16
,
v_sub_wrap
,
vsubq_s8
)
OPENCV_HAL_IMPL_NEON_BIN_FUNC
(
v_uint16x8
,
v_sub_wrap
,
vsubq_u16
)
OPENCV_HAL_IMPL_NEON_BIN_FUNC
(
v_int16x8
,
v_sub_wrap
,
vsubq_s16
)
OPENCV_HAL_IMPL_NEON_BIN_FUNC
(
v_uint8x16
,
v_mul_wrap
,
vmulq_u8
)
OPENCV_HAL_IMPL_NEON_BIN_FUNC
(
v_int8x16
,
v_mul_wrap
,
vmulq_s8
)
OPENCV_HAL_IMPL_NEON_BIN_FUNC
(
v_uint16x8
,
v_mul_wrap
,
vmulq_u16
)
OPENCV_HAL_IMPL_NEON_BIN_FUNC
(
v_int16x8
,
v_mul_wrap
,
vmulq_s16
)
// TODO: absdiff for signed integers
OPENCV_HAL_IMPL_NEON_BIN_FUNC
(
v_uint8x16
,
v_absdiff
,
vabdq_u8
)
...
...
@@ -1056,6 +1089,14 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
b0.val = vmovl_##suffix(vget_low_##suffix(a.val)); \
b1.val = vmovl_##suffix(vget_high_##suffix(a.val)); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ \
return _Tpwvec(vmovl_##suffix(vget_low_##suffix(a.val))); \
} \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ \
return _Tpwvec(vmovl_##suffix(vget_high_##suffix(a.val))); \
} \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ \
return _Tpwvec(vmovl_##suffix(vld1_##suffix(ptr))); \
...
...
modules/core/include/opencv2/core/hal/intrin_sse.hpp
浏览文件 @
1cc3f7ab
...
...
@@ -59,6 +59,8 @@ namespace cv
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
///////// Types ////////////
struct
v_uint8x16
{
typedef
uchar
lane_type
;
...
...
@@ -436,13 +438,7 @@ inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
}
inline
v_uint16x8
v_pack
(
const
v_uint32x4
&
a
,
const
v_uint32x4
&
b
)
{
__m128i
z
=
_mm_setzero_si128
(),
maxval32
=
_mm_set1_epi32
(
65535
),
delta32
=
_mm_set1_epi32
(
32768
);
__m128i
a1
=
_mm_sub_epi32
(
v_select_si128
(
_mm_cmpgt_epi32
(
z
,
a
.
val
),
maxval32
,
a
.
val
),
delta32
);
__m128i
b1
=
_mm_sub_epi32
(
v_select_si128
(
_mm_cmpgt_epi32
(
z
,
b
.
val
),
maxval32
,
b
.
val
),
delta32
);
__m128i
r
=
_mm_packs_epi32
(
a1
,
b1
);
return
v_uint16x8
(
_mm_sub_epi16
(
r
,
_mm_set1_epi16
(
-
32768
)));
}
{
return
v_uint16x8
(
_v128_packs_epu32
(
a
.
val
,
b
.
val
));
}
inline
void
v_pack_store
(
ushort
*
ptr
,
const
v_uint32x4
&
a
)
{
...
...
@@ -678,14 +674,14 @@ OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
-
,
v_int8x16
,
_mm_subs_epi8
)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
+
,
v_uint16x8
,
_mm_adds_epu16
)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
-
,
v_uint16x8
,
_mm_subs_epu16
)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
*
,
v_uint16x8
,
_mm_mullo_epi16
)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
+
,
v_int16x8
,
_mm_adds_epi16
)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
-
,
v_int16x8
,
_mm_subs_epi16
)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
*
,
v_int16x8
,
_mm_mullo_epi16
)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
+
,
v_uint32x4
,
_mm_add_epi32
)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
-
,
v_uint32x4
,
_mm_sub_epi32
)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
*
,
v_uint32x4
,
_v128_mullo_epi32
)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
+
,
v_int32x4
,
_mm_add_epi32
)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
-
,
v_int32x4
,
_mm_sub_epi32
)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
*
,
v_int32x4
,
_v128_mullo_epi32
)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
+
,
v_float32x4
,
_mm_add_ps
)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
-
,
v_float32x4
,
_mm_sub_ps
)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
*
,
v_float32x4
,
_mm_mul_ps
)
...
...
@@ -699,35 +695,49 @@ OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
+
,
v_int64x2
,
_mm_add_epi64
)
OPENCV_HAL_IMPL_SSE_BIN_OP
(
-
,
v_int64x2
,
_mm_sub_epi64
)
inline
v_uint32x4
operator
*
(
const
v_uint32x4
&
a
,
const
v_uint32x4
&
b
)
{
__m128i
c0
=
_mm_mul_epu32
(
a
.
val
,
b
.
val
);
__m128i
c1
=
_mm_mul_epu32
(
_mm_srli_epi64
(
a
.
val
,
32
),
_mm_srli_epi64
(
b
.
val
,
32
));
__m128i
d0
=
_mm_unpacklo_epi32
(
c0
,
c1
);
__m128i
d1
=
_mm_unpackhi_epi32
(
c0
,
c1
);
return
v_uint32x4
(
_mm_unpacklo_epi64
(
d0
,
d1
));
}
inline
v_int32x4
operator
*
(
const
v_int32x4
&
a
,
const
v_int32x4
&
b
)
// saturating multiply 8-bit, 16-bit
#define OPENCV_HAL_IMPL_SSE_MUL_SAT(_Tpvec, _Tpwvec) \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{ \
_Tpwvec c, d; \
v_mul_expand(a, b, c, d); \
return v_pack(c, d); \
} \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b) \
{ a = a * b; return a; }
OPENCV_HAL_IMPL_SSE_MUL_SAT
(
v_int8x16
,
v_int16x8
)
OPENCV_HAL_IMPL_SSE_MUL_SAT
(
v_uint16x8
,
v_uint32x4
)
OPENCV_HAL_IMPL_SSE_MUL_SAT
(
v_int16x8
,
v_int32x4
)
inline
v_uint8x16
operator
*
(
const
v_uint8x16
&
a
,
const
v_uint8x16
&
b
)
{
#if CV_SSE4_1
return
v_int32x4
(
_mm_mullo_epi32
(
a
.
val
,
b
.
val
));
#else
__m128i
c0
=
_mm_mul_epu32
(
a
.
val
,
b
.
val
);
__m128i
c1
=
_mm_mul_epu32
(
_mm_srli_epi64
(
a
.
val
,
32
),
_mm_srli_epi64
(
b
.
val
,
32
));
__m128i
d0
=
_mm_unpacklo_epi32
(
c0
,
c1
);
__m128i
d1
=
_mm_unpackhi_epi32
(
c0
,
c1
);
return
v_int32x4
(
_mm_unpacklo_epi64
(
d0
,
d1
));
#endif
v_uint16x8
c
,
d
;
v_mul_expand
(
a
,
b
,
c
,
d
);
return
v_pack_u
(
v_reinterpret_as_s16
(
c
),
v_reinterpret_as_s16
(
d
));
}
inline
v_uint32x4
&
operator
*=
(
v_uint32x4
&
a
,
const
v_uint32x4
&
b
)
inline
v_uint8x16
&
operator
*=
(
v_uint8x16
&
a
,
const
v_uint8x16
&
b
)
{
a
=
a
*
b
;
return
a
;
}
// Multiply and expand
inline
void
v_mul_expand
(
const
v_uint8x16
&
a
,
const
v_uint8x16
&
b
,
v_uint16x8
&
c
,
v_uint16x8
&
d
)
{
a
=
a
*
b
;
return
a
;
v_uint16x8
a0
,
a1
,
b0
,
b1
;
v_expand
(
a
,
a0
,
a1
);
v_expand
(
b
,
b0
,
b1
);
c
=
v_mul_wrap
(
a0
,
b0
);
d
=
v_mul_wrap
(
a1
,
b1
);
}
inline
v_int32x4
&
operator
*=
(
v_int32x4
&
a
,
const
v_int32x4
&
b
)
inline
void
v_mul_expand
(
const
v_int8x16
&
a
,
const
v_int8x16
&
b
,
v_int16x8
&
c
,
v_int16x8
&
d
)
{
a
=
a
*
b
;
return
a
;
v_int16x8
a0
,
a1
,
b0
,
b1
;
v_expand
(
a
,
a0
,
a1
);
v_expand
(
b
,
b0
,
b1
);
c
=
v_mul_wrap
(
a0
,
b0
);
d
=
v_mul_wrap
(
a1
,
b1
);
}
inline
void
v_mul_expand
(
const
v_int16x8
&
a
,
const
v_int16x8
&
b
,
...
...
@@ -1018,6 +1028,22 @@ OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
OPENCV_HAL_IMPL_SSE_BIN_FUNC
(
v_int8x16
,
v_sub_wrap
,
_mm_sub_epi8
)
OPENCV_HAL_IMPL_SSE_BIN_FUNC
(
v_uint16x8
,
v_sub_wrap
,
_mm_sub_epi16
)
OPENCV_HAL_IMPL_SSE_BIN_FUNC
(
v_int16x8
,
v_sub_wrap
,
_mm_sub_epi16
)
OPENCV_HAL_IMPL_SSE_BIN_FUNC
(
v_uint16x8
,
v_mul_wrap
,
_mm_mullo_epi16
)
OPENCV_HAL_IMPL_SSE_BIN_FUNC
(
v_int16x8
,
v_mul_wrap
,
_mm_mullo_epi16
)
inline
v_uint8x16
v_mul_wrap
(
const
v_uint8x16
&
a
,
const
v_uint8x16
&
b
)
{
__m128i
ad
=
_mm_srai_epi16
(
a
.
val
,
8
);
__m128i
bd
=
_mm_srai_epi16
(
b
.
val
,
8
);
__m128i
p0
=
_mm_mullo_epi16
(
a
.
val
,
b
.
val
);
// even
__m128i
p1
=
_mm_slli_epi16
(
_mm_mullo_epi16
(
ad
,
bd
),
8
);
// odd
const
__m128i
b01
=
_mm_set1_epi32
(
0xFF00FF00
);
return
v_uint8x16
(
_v128_blendv_epi8
(
p0
,
p1
,
b01
));
}
inline
v_int8x16
v_mul_wrap
(
const
v_int8x16
&
a
,
const
v_int8x16
&
b
)
{
return
v_reinterpret_as_s8
(
v_mul_wrap
(
v_reinterpret_as_u8
(
a
),
v_reinterpret_as_u8
(
b
)));
}
#define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \
inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \
...
...
@@ -1502,70 +1528,39 @@ OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
OPENCV_HAL_IMPL_SSE_SELECT
(
v_float64x2
,
pd
)
#endif
#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \
inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \
{ \
__m128i z = _mm_setzero_si128(); \
b0.val = _mm_unpacklo_##suffix(a.val, z); \
b1.val = _mm_unpackhi_##suffix(a.val, z); \
} \
inline _Tpwuvec v_load_expand(const _Tpu* ptr) \
{ \
__m128i z = _mm_setzero_si128(); \
return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \
} \
inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \
{ \
b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \
b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \
} \
inline _Tpwsvec v_load_expand(const _Tps* ptr) \
{ \
__m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
}
OPENCV_HAL_IMPL_SSE_EXPAND
(
v_uint8x16
,
v_uint16x8
,
uchar
,
v_int8x16
,
v_int16x8
,
schar
,
epi8
,
epi16
,
8
)
OPENCV_HAL_IMPL_SSE_EXPAND
(
v_uint16x8
,
v_uint32x4
,
ushort
,
v_int16x8
,
v_int32x4
,
short
,
epi16
,
epi32
,
16
)
inline
void
v_expand
(
const
v_uint32x4
&
a
,
v_uint64x2
&
b0
,
v_uint64x2
&
b1
)
{
__m128i
z
=
_mm_setzero_si128
();
b0
.
val
=
_mm_unpacklo_epi32
(
a
.
val
,
z
);
b1
.
val
=
_mm_unpackhi_epi32
(
a
.
val
,
z
);
}
inline
v_uint64x2
v_load_expand
(
const
unsigned
*
ptr
)
{
__m128i
z
=
_mm_setzero_si128
();
return
v_uint64x2
(
_mm_unpacklo_epi32
(
_mm_loadl_epi64
((
const
__m128i
*
)
ptr
),
z
));
}
inline
void
v_expand
(
const
v_int32x4
&
a
,
v_int64x2
&
b0
,
v_int64x2
&
b1
)
{
__m128i
s
=
_mm_srai_epi32
(
a
.
val
,
31
);
b0
.
val
=
_mm_unpacklo_epi32
(
a
.
val
,
s
);
b1
.
val
=
_mm_unpackhi_epi32
(
a
.
val
,
s
);
}
inline
v_int64x2
v_load_expand
(
const
int
*
ptr
)
{
__m128i
a
=
_mm_loadl_epi64
((
const
__m128i
*
)
ptr
);
__m128i
s
=
_mm_srai_epi32
(
a
,
31
);
return
v_int64x2
(
_mm_unpacklo_epi32
(
a
,
s
));
}
inline
v_uint32x4
v_load_expand_q
(
const
uchar
*
ptr
)
{
__m128i
z
=
_mm_setzero_si128
();
__m128i
a
=
_mm_cvtsi32_si128
(
*
(
const
int
*
)
ptr
);
return
v_uint32x4
(
_mm_unpacklo_epi16
(
_mm_unpacklo_epi8
(
a
,
z
),
z
));
}
inline
v_int32x4
v_load_expand_q
(
const
schar
*
ptr
)
{
__m128i
a
=
_mm_cvtsi32_si128
(
*
(
const
int
*
)
ptr
);
a
=
_mm_unpacklo_epi8
(
a
,
a
);
a
=
_mm_unpacklo_epi8
(
a
,
a
);
return
v_int32x4
(
_mm_srai_epi32
(
a
,
24
));
}
/* Expand */
// v_expand splits a vector into two double-width halves, v_expand_low/high
// return a single widened half, and v_load_expand loads 64 bits and widens
// them. `intrin` converts the low half; `intrin##_high` the upper half.
#define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpvec, _Tpwvec, _Tp, intrin)    \
    inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
    {                                                               \
        b0.val = intrin(a.val);                                     \
        b1.val = __CV_CAT(intrin, _high)(a.val);                    \
    }                                                               \
    inline _Tpwvec v_expand_low(const _Tpvec& a)                    \
    { return _Tpwvec(intrin(a.val)); }                              \
    inline _Tpwvec v_expand_high(const _Tpvec& a)                   \
    { return _Tpwvec(__CV_CAT(intrin, _high)(a.val)); }             \
    inline _Tpwvec v_load_expand(const _Tp* ptr)                    \
    {                                                               \
        __m128i a = _mm_loadl_epi64((const __m128i*)ptr);           \
        return _Tpwvec(intrin(a));                                  \
    }

OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar,    _v128_cvtepu8_epi16)
OPENCV_HAL_IMPL_SSE_EXPAND(v_int8x16,  v_int16x8,  schar,    _v128_cvtepi8_epi16)
OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort,   _v128_cvtepu16_epi32)
OPENCV_HAL_IMPL_SSE_EXPAND(v_int16x8,  v_int32x4,  short,    _v128_cvtepi16_epi32)
OPENCV_HAL_IMPL_SSE_EXPAND(v_uint32x4, v_uint64x2, unsigned, _v128_cvtepu32_epi64)
OPENCV_HAL_IMPL_SSE_EXPAND(v_int32x4,  v_int64x2,  int,      _v128_cvtepi32_epi64)

// v_load_expand_q: load 32 bits (4 bytes) and widen each byte to a 32-bit lane.
#define OPENCV_HAL_IMPL_SSE_EXPAND_Q(_Tpvec, _Tp, intrin)  \
    inline _Tpvec v_load_expand_q(const _Tp* ptr)          \
    {                                                      \
        __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);   \
        return _Tpvec(intrin(a));                          \
    }

OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_uint32x4, uchar, _v128_cvtepu8_epi32)
OPENCV_HAL_IMPL_SSE_EXPAND_Q(v_int32x4,  schar, _v128_cvtepi8_epi32)
#define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
...
...
modules/core/include/opencv2/core/hal/intrin_sse_em.hpp
0 → 100644
浏览文件 @
1cc3f7ab
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html
#ifndef OPENCV_HAL_INTRIN_SSE_EM_HPP
#define OPENCV_HAL_INTRIN_SSE_EM_HPP
namespace
cv
{
//! @cond IGNORED
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
// Thin wrappers forwarding _v128_##fun to the native _mm_##fun intrinsic,
// for functions of 1, 2 or 3 same-typed arguments. Used below when the real
// instruction-set extension (XOP / SSE4.1) is available.
#define OPENCV_HAL_SSE_WRAP_1(fun, tp) \
inline tp _v128_##fun(const tp& a) \
{ return _mm_##fun(a); }

#define OPENCV_HAL_SSE_WRAP_2(fun, tp) \
inline tp _v128_##fun(const tp& a, const tp& b) \
{ return _mm_##fun(a, b); }

#define OPENCV_HAL_SSE_WRAP_3(fun, tp) \
inline tp _v128_##fun(const tp& a, const tp& b, const tp& c) \
{ return _mm_##fun(a, b, c); }
///////////////////////////// XOP /////////////////////////////
// [todo] define CV_XOP
#if 1 // CV_XOP
// Emulated XOP _mm_comgt_epu32: per-lane unsigned 32-bit "a > b", yielding
// all-ones in lanes where the comparison holds and zero elsewhere.
inline __m128i _v128_comgt_epu32(const __m128i& a, const __m128i& b)
{
    // Flipping the sign bit maps the unsigned ordering onto the signed one,
    // so a signed compare of the biased values gives the unsigned result.
    const __m128i sbit = _mm_set1_epi32((int)0x80000000);
    const __m128i ax = _mm_xor_si128(a, sbit);
    const __m128i bx = _mm_xor_si128(b, sbit);
    return _mm_cmpgt_epi32(ax, bx);
}
// wrapping XOP
#else
OPENCV_HAL_SSE_WRAP_2
(
_v128_comgt_epu32
,
__m128i
)
#endif // !CV_XOP
///////////////////////////// SSE4.1 /////////////////////////////
#if !CV_SSE4_1
/** Swizzle **/
// Emulated SSE4.1 _mm_blendv_epi8, bitwise form: every result bit is taken
// from b where the corresponding mask bit is set, otherwise from a.
// (Equivalent to the original a ^ ((b ^ a) & mask) for any mask; the real
// instruction only inspects the top bit of each byte, so callers pass
// 0x00/0xFF byte masks.)
inline __m128i _v128_blendv_epi8(const __m128i& a, const __m128i& b, const __m128i& mask)
{
    return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
}
/** Convert **/
// 8 >> 16
// Emulated SSE4.1 _mm_cvtepu8_epi16: zero-extend the low 8 unsigned bytes
// into eight 16-bit lanes by interleaving them with zero bytes.
inline __m128i _v128_cvtepu8_epi16(const __m128i& a)
{
    return _mm_unpacklo_epi8(a, _mm_setzero_si128());
}
// Emulated SSE4.1 _mm_cvtepi8_epi16: sign-extend the low 8 signed bytes
// into eight 16-bit lanes.
inline __m128i _v128_cvtepi8_epi16(const __m128i& a)
{
    const __m128i dup = _mm_unpacklo_epi8(a, a);  // each byte duplicated into a 16-bit lane
    return _mm_srai_epi16(dup, 8);                // arithmetic shift replicates the sign bit
}
// 8 >> 32
// Emulated SSE4.1 _mm_cvtepu8_epi32: zero-extend the low 4 unsigned bytes to
// 32-bit lanes, using two zero-interleave steps (8->16, then 16->32).
inline __m128i _v128_cvtepu8_epi32(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z);
}
// Emulated SSE4.1 _mm_cvtepi8_epi32: sign-extend the low 4 signed bytes to
// 32-bit lanes.
inline __m128i _v128_cvtepi8_epi32(const __m128i& a)
{
    __m128i r = _mm_unpacklo_epi8(a, a);  // each source byte now fills 16 bits
    r = _mm_unpacklo_epi8(r, r);          // ...and now fills a whole 32-bit lane
    return _mm_srai_epi32(r, 24);         // arithmetic shift leaves the sign-extended byte
}
// 16 >> 32
// Emulated SSE4.1 _mm_cvtepu16_epi32: zero-extend the low 4 unsigned 16-bit
// lanes to 32-bit lanes.
inline __m128i _v128_cvtepu16_epi32(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpacklo_epi16(a, z);
}
// Emulated SSE4.1 _mm_cvtepi16_epi32: sign-extend the low 4 signed 16-bit
// lanes to 32-bit lanes (duplicate each lane, then arithmetic-shift).
inline __m128i _v128_cvtepi16_epi32(const __m128i& a)
{
    return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
}
// 32 >> 64
// Emulated SSE4.1 _mm_cvtepu32_epi64: zero-extend the low 2 unsigned 32-bit
// lanes to 64-bit lanes.
inline __m128i _v128_cvtepu32_epi64(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpacklo_epi32(a, z);
}
// Emulated SSE4.1 _mm_cvtepi32_epi64: sign-extend the low 2 signed 32-bit
// lanes to 64-bit lanes by interleaving each lane with its own sign word.
inline __m128i _v128_cvtepi32_epi64(const __m128i& a)
{
    return _mm_unpacklo_epi32(a, _mm_srai_epi32(a, 31));
}
/** Arithmetic **/
// Emulated SSE4.1 _mm_mullo_epi32: low 32 bits of the per-lane 32x32 product.
inline __m128i _v128_mullo_epi32(const __m128i& a, const __m128i& b)
{
    // _mm_mul_epu32 multiplies only the even (0 and 2) lanes, producing 64-bit
    // results; shift copies right by one lane to obtain the odd-lane products.
    const __m128i even = _mm_mul_epu32(a, b);
    const __m128i odd  = _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32));
    // Interleave the low 32 bits of all four products back into lane order.
    const __m128i lo = _mm_unpacklo_epi32(even, odd);
    const __m128i hi = _mm_unpackhi_epi32(even, odd);
    return _mm_unpacklo_epi64(lo, hi);
}
/** Math **/
// Emulated SSE4.1 _mm_min_epu32: per-lane unsigned 32-bit minimum.
inline __m128i _v128_min_epu32(const __m128i& a, const __m128i& b)
{
    const __m128i a_gt_b = _v128_comgt_epu32(a, b);  // all-ones where a > b
    return _v128_blendv_epi8(a, b, a_gt_b);          // take b in those lanes, a elsewhere
}
// wrapping SSE4.1
#else
OPENCV_HAL_SSE_WRAP_1
(
cvtepu8_epi16
,
__m128i
)
OPENCV_HAL_SSE_WRAP_1
(
cvtepi8_epi16
,
__m128i
)
OPENCV_HAL_SSE_WRAP_1
(
cvtepu8_epi32
,
__m128i
)
OPENCV_HAL_SSE_WRAP_1
(
cvtepi8_epi32
,
__m128i
)
OPENCV_HAL_SSE_WRAP_1
(
cvtepu16_epi32
,
__m128i
)
OPENCV_HAL_SSE_WRAP_1
(
cvtepi16_epi32
,
__m128i
)
OPENCV_HAL_SSE_WRAP_1
(
cvtepu32_epi64
,
__m128i
)
OPENCV_HAL_SSE_WRAP_1
(
cvtepi32_epi64
,
__m128i
)
OPENCV_HAL_SSE_WRAP_2
(
min_epu32
,
__m128i
)
OPENCV_HAL_SSE_WRAP_2
(
mullo_epi32
,
__m128i
)
OPENCV_HAL_SSE_WRAP_3
(
blendv_epi8
,
__m128i
)
#endif // !CV_SSE4_1
///////////////////////////// Revolutionary /////////////////////////////
/** Convert **/
// 16 << 8
// Zero-extend the HIGH 8 unsigned bytes into eight 16-bit lanes (no SSE4.1
// counterpart exists; companion of _v128_cvtepu8_epi16 which handles the
// low half).
inline __m128i _v128_cvtepu8_epi16_high(const __m128i& a)
{
    return _mm_unpackhi_epi8(a, _mm_setzero_si128());
}
// Sign-extend the HIGH 8 signed bytes into eight 16-bit lanes.
inline __m128i _v128_cvtepi8_epi16_high(const __m128i& a)
{
    const __m128i dup = _mm_unpackhi_epi8(a, a);  // each high byte duplicated into a 16-bit lane
    return _mm_srai_epi16(dup, 8);                // arithmetic shift completes the sign extension
}
// 32 << 16
// Zero-extend the HIGH 4 unsigned 16-bit lanes to 32-bit lanes
// (companion of _v128_cvtepu16_epi32, which handles the low half).
inline __m128i _v128_cvtepu16_epi32_high(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpackhi_epi16(a, z);
}
// Sign-extend the HIGH 4 signed 16-bit lanes to 32-bit lanes
// (duplicate each lane, then arithmetic-shift to sign-extend).
inline __m128i _v128_cvtepi16_epi32_high(const __m128i& a)
{
    return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
}
// 64 << 32
// Zero-extend the HIGH 2 unsigned 32-bit lanes to 64-bit lanes.
inline __m128i _v128_cvtepu32_epi64_high(const __m128i& a)
{
    const __m128i z = _mm_setzero_si128();
    return _mm_unpackhi_epi32(a, z);
}
// Sign-extend the HIGH 2 signed 32-bit lanes to 64-bit lanes by interleaving
// each lane with its own sign word.
inline __m128i _v128_cvtepi32_epi64_high(const __m128i& a)
{
    return _mm_unpackhi_epi32(a, _mm_srai_epi32(a, 31));
}
/** Miscellaneous **/
// Saturating pack of unsigned 32-bit lanes into unsigned 16-bit lanes
// (the u32 -> u16 analogue of _mm_packus_epi32).
inline __m128i _v128_packs_epu32(const __m128i& a, const __m128i& b)
{
    const __m128i m = _mm_set1_epi32(65535);
    // Clamp to the representable range first so the pack cannot overflow.
    __m128i am = _v128_min_epu32(a, m);
    __m128i bm = _v128_min_epu32(b, m);
#if CV_SSE4_1
    return _mm_packus_epi32(am, bm);
#else
    // Bias trick: shift [0, 65535] into the signed range [-32768, 32767],
    // use the signed saturating pack, then undo the bias.
    const __m128i d = _mm_set1_epi32(32768), nd = _mm_set1_epi16(-32768);
    am = _mm_sub_epi32(am, d);
    bm = _mm_sub_epi32(bm, d);
    am = _mm_packs_epi32(am, bm);
    return _mm_sub_epi16(am, nd);
#endif
}
CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END
//! @endcond
}
// cv::
#endif // OPENCV_HAL_INTRIN_SSE_EM_HPP
\ No newline at end of file
modules/core/include/opencv2/core/hal/intrin_vsx.hpp
浏览文件 @
1cc3f7ab
...
...
@@ -315,6 +315,10 @@ inline void v_expand(const _Tpvec& a, _Tpwvec& b0, _Tpwvec& b1) \
b0.val = fh(a.val); \
b1.val = fl(a.val); \
} \
inline _Tpwvec v_expand_low(const _Tpvec& a) \
{ return _Tpwvec(fh(a.val)); } \
inline _Tpwvec v_expand_high(const _Tpvec& a) \
{ return _Tpwvec(fl(a.val)); } \
inline _Tpwvec v_load_expand(const _Tp* ptr) \
{ return _Tpwvec(fh(vec_ld_l8(ptr))); }
...
...
@@ -418,10 +422,8 @@ OPENCV_HAL_IMPL_VSX_BIN_OP(+, v_int8x16, vec_adds)
OPENCV_HAL_IMPL_VSX_BIN_OP
(
-
,
v_int8x16
,
vec_subs
)
OPENCV_HAL_IMPL_VSX_BIN_OP
(
+
,
v_uint16x8
,
vec_adds
)
OPENCV_HAL_IMPL_VSX_BIN_OP
(
-
,
v_uint16x8
,
vec_subs
)
OPENCV_HAL_IMPL_VSX_BIN_OP
(
*
,
v_uint16x8
,
vec_mul
)
OPENCV_HAL_IMPL_VSX_BIN_OP
(
+
,
v_int16x8
,
vec_adds
)
OPENCV_HAL_IMPL_VSX_BIN_OP
(
-
,
v_int16x8
,
vec_subs
)
OPENCV_HAL_IMPL_VSX_BIN_OP
(
*
,
v_int16x8
,
vec_mul
)
OPENCV_HAL_IMPL_VSX_BIN_OP
(
+
,
v_uint32x4
,
vec_add
)
OPENCV_HAL_IMPL_VSX_BIN_OP
(
-
,
v_uint32x4
,
vec_sub
)
OPENCV_HAL_IMPL_VSX_BIN_OP
(
*
,
v_uint32x4
,
vec_mul
)
...
...
@@ -441,16 +443,30 @@ OPENCV_HAL_IMPL_VSX_BIN_OP(-, v_uint64x2, vec_sub)
OPENCV_HAL_IMPL_VSX_BIN_OP
(
+
,
v_int64x2
,
vec_add
)
OPENCV_HAL_IMPL_VSX_BIN_OP
(
-
,
v_int64x2
,
vec_sub
)
inline
void
v_mul_expand
(
const
v_int16x8
&
a
,
const
v_int16x8
&
b
,
v_int32x4
&
c
,
v_int32x4
&
d
)
// saturating multiply
// Defines operator* and operator*= for 8/16-bit lane vectors: widen with
// v_mul_expand so the products are exact, then saturate back with v_pack.
#define OPENCV_HAL_IMPL_VSX_MUL_SAT(_Tpvec, _Tpwvec)        \
inline _Tpvec operator * (const _Tpvec& a, const _Tpvec& b) \
{                                                           \
    _Tpwvec c, d;                                           \
    v_mul_expand(a, b, c, d);                               \
    return v_pack(c, d);                                    \
}                                                           \
inline _Tpvec& operator *= (_Tpvec& a, const _Tpvec& b)     \
{ a = a * b; return a; }

OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int8x16,  v_int16x8)
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint8x16, v_uint16x8)
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_int16x8,  v_int32x4)
OPENCV_HAL_IMPL_VSX_MUL_SAT(v_uint16x8, v_uint32x4)
// Widening multiply for any integer vector pair: vec_mule/vec_mulo produce
// the even- and odd-lane products at double width, and v_zip interleaves
// them back so c holds the low-half products and d the high-half products
// in original lane order.
template<typename Tvec, typename Twvec>
inline void v_mul_expand(const Tvec& a, const Tvec& b, Twvec& c, Twvec& d)
{
    Twvec p0 = Twvec(vec_mule(a.val, b.val));
    Twvec p1 = Twvec(vec_mulo(a.val, b.val));
    v_zip(p0, p1, c, d);
}
inline
void
v_mul_expand
(
const
v_uint32x4
&
a
,
const
v_uint32x4
&
b
,
v_uint64x2
&
c
,
v_uint64x2
&
d
)
{
c
.
val
=
vec_mul
(
vec_unpackhu
(
a
.
val
),
vec_unpackhu
(
b
.
val
));
...
...
@@ -459,17 +475,17 @@ inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b, v_uint64x2& c
// High 16 bits of the signed 16x16 product. vec_mule/vec_mulo compute the
// full 32-bit products of even/odd lanes; the permute then gathers the high
// half-word of each product back into original lane order.
inline v_int16x8 v_mul_hi(const v_int16x8& a, const v_int16x8& b)
{
    vec_int4 p0 = vec_mule(a.val, b.val);  // 32-bit products of even lanes
    vec_int4 p1 = vec_mulo(a.val, b.val);  // 32-bit products of odd lanes
    static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
    return v_int16x8(vec_perm(vec_short8_c(p0), vec_short8_c(p1), perm));
}
// High 16 bits of the unsigned 16x16 product, via full 32-bit even/odd
// products and a permute that restores original lane order.
inline v_uint16x8 v_mul_hi(const v_uint16x8& a, const v_uint16x8& b)
{
    vec_uint4 p0 = vec_mule(a.val, b.val);  // 32-bit products of even lanes
    vec_uint4 p1 = vec_mulo(a.val, b.val);  // 32-bit products of odd lanes
    static const vec_uchar16 perm = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
    return v_uint16x8(vec_perm(vec_ushort8_c(p0), vec_ushort8_c(p1), perm));
}
/** Non-saturating arithmetics **/
...
...
@@ -480,6 +496,7 @@ inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
OPENCV_HAL_IMPL_VSX_BIN_FUNC
(
v_add_wrap
,
vec_add
)
OPENCV_HAL_IMPL_VSX_BIN_FUNC
(
v_sub_wrap
,
vec_sub
)
OPENCV_HAL_IMPL_VSX_BIN_FUNC
(
v_mul_wrap
,
vec_mul
)
/** Bitwise shifts **/
#define OPENCV_HAL_IMPL_VSX_SHIFT_OP(_Tpvec, shr, splfunc) \
...
...
modules/core/include/opencv2/core/vsx_utils.hpp
浏览文件 @
1cc3f7ab
...
...
@@ -130,19 +130,21 @@ VSX_FINLINE(rt) fnm(const rg& a, const rg& b) \
# undef vec_mul
# endif
/*
 * There is no direct instruction for 8-bit or 16-bit multiplication in ISA 2.07;
 * XLC implements it by using the "multiply even", "multiply odd" and "permute" instructions.
**/
// Emulated element-wise multiply for 8/16-bit lane vectors: combine the
// even-lane and odd-lane double-width products and keep the low half of each
// via a permute. `cperm` is the byte-permutation list selecting the low
// halves in original lane order.
# define VSX_IMPL_MULH(Tvec, cperm)                                       \
  VSX_FINLINE(Tvec) vec_mul(const Tvec& a, const Tvec& b)                 \
  {                                                                       \
      static const vec_uchar16 ev_od = {cperm};                           \
      return vec_perm((Tvec)vec_mule(a, b), (Tvec)vec_mulo(a, b), ev_od); \
  }
  #define VSX_IMPL_MULH_P16 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30
  VSX_IMPL_MULH(vec_char16,  VSX_IMPL_MULH_P16)
  VSX_IMPL_MULH(vec_uchar16, VSX_IMPL_MULH_P16)
  #define VSX_IMPL_MULH_P8 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29
  VSX_IMPL_MULH(vec_short8,  VSX_IMPL_MULH_P8)
  VSX_IMPL_MULH(vec_ushort8, VSX_IMPL_MULH_P8)
// vmuluwm can be used for unsigned or signed integers, that's what they said
VSX_IMPL_2VRG
(
vec_int4
,
vec_int4
,
vmuluwm
,
vec_mul
)
VSX_IMPL_2VRG
(
vec_uint4
,
vec_uint4
,
vmuluwm
,
vec_mul
)
...
...
modules/core/test/test_intrin_utils.hpp
浏览文件 @
1cc3f7ab
...
...
@@ -407,10 +407,13 @@ template<typename R> struct TheTest
Data
<
Rx2
>
resB
=
vx_load_expand
(
dataA
.
d
);
Rx2
c
,
d
;
Rx2
c
,
d
,
e
,
f
;
v_expand
(
a
,
c
,
d
);
Data
<
Rx2
>
resC
=
c
,
resD
=
d
;
e
=
v_expand_low
(
a
);
f
=
v_expand_high
(
a
);
Data
<
Rx2
>
resC
=
c
,
resD
=
d
,
resE
=
e
,
resF
=
f
;
const
int
n
=
Rx2
::
nlanes
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
...
...
@@ -418,6 +421,8 @@ template<typename R> struct TheTest
EXPECT_EQ
(
dataA
[
i
],
resB
[
i
]);
EXPECT_EQ
(
dataA
[
i
],
resC
[
i
]);
EXPECT_EQ
(
dataA
[
i
+
n
],
resD
[
i
]);
EXPECT_EQ
(
dataA
[
i
],
resE
[
i
]);
EXPECT_EQ
(
dataA
[
i
+
n
],
resF
[
i
]);
}
return
*
this
;
...
...
@@ -455,19 +460,21 @@ template<typename R> struct TheTest
return
*
this
;
}
TheTest
&
test_a
ddsub
_wrap
()
TheTest
&
test_a
rithm
_wrap
()
{
Data
<
R
>
dataA
,
dataB
;
dataB
.
reverse
();
R
a
=
dataA
,
b
=
dataB
;
Data
<
R
>
resC
=
v_add_wrap
(
a
,
b
),
resD
=
v_sub_wrap
(
a
,
b
);
resD
=
v_sub_wrap
(
a
,
b
),
resE
=
v_mul_wrap
(
a
,
b
);
for
(
int
i
=
0
;
i
<
R
::
nlanes
;
++
i
)
{
SCOPED_TRACE
(
cv
::
format
(
"i=%d"
,
i
));
EXPECT_EQ
((
LaneType
)(
dataA
[
i
]
+
dataB
[
i
]),
resC
[
i
]);
EXPECT_EQ
((
LaneType
)(
dataA
[
i
]
-
dataB
[
i
]),
resD
[
i
]);
EXPECT_EQ
((
LaneType
)(
dataA
[
i
]
*
dataB
[
i
]),
resE
[
i
]);
}
return
*
this
;
}
...
...
@@ -475,6 +482,7 @@ template<typename R> struct TheTest
TheTest
&
test_mul
()
{
Data
<
R
>
dataA
,
dataB
;
dataA
[
1
]
=
static_cast
<
LaneType
>
(
std
::
numeric_limits
<
LaneType
>::
max
());
dataB
.
reverse
();
R
a
=
dataA
,
b
=
dataB
;
...
...
@@ -482,7 +490,7 @@ template<typename R> struct TheTest
for
(
int
i
=
0
;
i
<
R
::
nlanes
;
++
i
)
{
SCOPED_TRACE
(
cv
::
format
(
"i=%d"
,
i
));
EXPECT_EQ
(
dataA
[
i
]
*
dataB
[
i
]
,
resC
[
i
]);
EXPECT_EQ
(
saturate_cast
<
LaneType
>
(
dataA
[
i
]
*
dataB
[
i
])
,
resC
[
i
]);
}
return
*
this
;
...
...
@@ -1209,7 +1217,9 @@ void test_hal_intrin_uint8()
.
test_expand
()
.
test_expand_q
()
.
test_addsub
()
.
test_addsub_wrap
()
.
test_arithm_wrap
()
.
test_mul
()
.
test_mul_expand
()
.
test_cmp
()
.
test_logic
()
.
test_min_max
()
...
...
@@ -1242,7 +1252,9 @@ void test_hal_intrin_int8()
.
test_expand
()
.
test_expand_q
()
.
test_addsub
()
.
test_addsub_wrap
()
.
test_arithm_wrap
()
.
test_mul
()
.
test_mul_expand
()
.
test_cmp
()
.
test_logic
()
.
test_min_max
()
...
...
@@ -1267,7 +1279,7 @@ void test_hal_intrin_uint16()
.
test_interleave
()
.
test_expand
()
.
test_addsub
()
.
test_a
ddsub
_wrap
()
.
test_a
rithm
_wrap
()
.
test_mul
()
.
test_mul_expand
()
.
test_cmp
()
...
...
@@ -1295,7 +1307,7 @@ void test_hal_intrin_int16()
.
test_interleave
()
.
test_expand
()
.
test_addsub
()
.
test_a
ddsub
_wrap
()
.
test_a
rithm
_wrap
()
.
test_mul
()
.
test_mul_expand
()
.
test_cmp
()
...
...
modules/imgproc/CMakeLists.txt
浏览文件 @
1cc3f7ab
set
(
the_description
"Image Processing"
)
ocv_add_dispatched_file
(
accum SSE
2 AVX NEON
)
ocv_add_dispatched_file
(
accum SSE
4_1 AVX AVX2
)
ocv_define_module
(
imgproc opencv_core WRAP java python js
)
modules/imgproc/perf/perf_accumulate.cpp
浏览文件 @
1cc3f7ab
...
...
@@ -5,94 +5,102 @@
namespace
opencv_test
{
#ifdef HAVE_OPENVX
PERF_TEST_P
(
Size_MatType
,
Accumulate
,
testing
::
Combine
(
testing
::
Values
(
::
perf
::
szODD
,
::
perf
::
szQVGA
,
::
perf
::
szVGA
,
::
perf
::
sz1080p
),
testing
::
Values
(
CV_16SC1
,
CV_32FC1
)
)
)
#else
PERF_TEST_P
(
Size_MatType
,
Accumulate
,
testing
::
Combine
(
testing
::
Values
(
::
perf
::
szODD
,
::
perf
::
szQVGA
,
::
perf
::
szVGA
,
::
perf
::
sz1080p
),
testing
::
Values
(
CV_32FC1
)
)
)
#endif
{
Size
sz
=
get
<
0
>
(
GetParam
());
int
dstType
=
get
<
1
>
(
GetParam
());
Mat
src
(
sz
,
CV_8UC1
);
Mat
dst
(
sz
,
dstType
);
declare
.
time
(
100
);
declare
.
in
(
src
,
WARMUP_RNG
).
out
(
dst
);
TEST_CYCLE
()
accumulate
(
src
,
dst
);
SANITY_CHECK_NOTHING
();
}
#ifdef HAVE_OPENVX
PERF_TEST_P
(
Size_MatType
,
AccumulateSquare
,
testing
::
Combine
(
testing
::
Values
(
::
perf
::
szODD
,
::
perf
::
szQVGA
,
::
perf
::
szVGA
,
::
perf
::
sz1080p
),
testing
::
Values
(
CV_16SC1
,
CV_32FC1
)
)
)
#else
PERF_TEST_P
(
Size_MatType
,
AccumulateSquare
,
testing
::
Combine
(
testing
::
Values
(
::
perf
::
szODD
,
::
perf
::
szQVGA
,
::
perf
::
szVGA
,
::
perf
::
sz1080p
),
testing
::
Values
(
CV_32FC1
)
)
)
#endif
{
Size
sz
=
get
<
0
>
(
GetParam
());
int
dstType
=
get
<
1
>
(
GetParam
());
Mat
src
(
sz
,
CV_8UC1
);
Mat
dst
(
sz
,
dstType
);
declare
.
time
(
100
);
declare
.
in
(
src
,
WARMUP_RNG
).
out
(
dst
);
TEST_CYCLE
()
accumulateSquare
(
src
,
dst
);
SANITY_CHECK_NOTHING
();
}
#ifdef HAVE_OPENVX
PERF_TEST_P
(
Size_MatType
,
AccumulateWeighted
,
testing
::
Combine
(
testing
::
Values
(
::
perf
::
szODD
,
::
perf
::
szQVGA
,
::
perf
::
szVGA
,
::
perf
::
sz1080p
),
testing
::
Values
(
CV_8UC1
,
CV_32FC1
)
)
)
#else
PERF_TEST_P
(
Size_MatType
,
AccumulateWeighted
,
testing
::
Combine
(
testing
::
Values
(
::
perf
::
szODD
,
::
perf
::
szQVGA
,
::
perf
::
szVGA
,
::
perf
::
sz1080p
),
testing
::
Values
(
CV_32FC1
)
)
)
#endif
{
Size
sz
=
get
<
0
>
(
GetParam
());
int
dstType
=
get
<
1
>
(
GetParam
());
Mat
src
(
sz
,
CV_8UC1
);
Mat
dst
(
sz
,
dstType
);
declare
.
time
(
100
);
declare
.
in
(
src
,
WARMUP_RNG
).
out
(
dst
);
TEST_CYCLE
()
accumulateWeighted
(
src
,
dst
,
0.314
);
SANITY_CHECK_NOTHING
();
}
// Fixture alias so all accumulate perf tests group under "Accumulate".
typedef Size_MatType Accumulate;

// Source-matrix type lists for the accumulate perf tests.
// (NOTE(review): "ACCUMLATE" spelling is kept as-is — it is referenced below.)
#define MAT_TYPES_ACCUMLATE CV_8UC1, CV_16UC1, CV_32FC1
#define MAT_TYPES_ACCUMLATE_C MAT_TYPES_ACCUMLATE, CV_8UC3, CV_16UC3, CV_32FC3
#define MAT_TYPES_ACCUMLATE_D MAT_TYPES_ACCUMLATE, CV_64FC1
#define MAT_TYPES_ACCUMLATE_D_C MAT_TYPES_ACCUMLATE_C, CV_64FC1, CV_64FC1

// Creates src1/dst for the parametrized size/type; the destination type is
// the float type built by _FLTC with the same channel count as the source.
#define PERF_ACCUMULATE_INIT(_FLTC) \
const Size srcSize = get<0>(GetParam()); \
const int srcType = get<1>(GetParam()); \
const int dstType = _FLTC(CV_MAT_CN(srcType)); \
Mat src1(srcSize, srcType), dst(srcSize, dstType); \
declare.in(src1, dst, WARMUP_RNG).out(dst);

// Same as PERF_ACCUMULATE_INIT plus a random 8-bit mask.
#define PERF_ACCUMULATE_MASK_INIT(_FLTC) \
PERF_ACCUMULATE_INIT(_FLTC) \
Mat mask(srcSize, CV_8UC1); \
declare.in(mask, WARMUP_RNG);

// Declares one perf test over the listed sizes/types, timing _FUN each cycle.
#define PERF_TEST_P_ACCUMULATE(_NAME, _TYPES, _INIT, _FUN) \
PERF_TEST_P(Accumulate, _NAME, \
    testing::Combine( \
        testing::Values(sz1080p, sz720p, szVGA, szQVGA, szODD), \
        testing::Values(_TYPES) \
    ) \
) \
{ \
    _INIT \
    TEST_CYCLE() _FUN; \
    SANITY_CHECK_NOTHING(); \
}
/////////////////////////////////// Accumulate ///////////////////////////////////
PERF_TEST_P_ACCUMULATE
(
Accumulate
,
MAT_TYPES_ACCUMLATE
,
PERF_ACCUMULATE_INIT
(
CV_32FC
),
accumulate
(
src1
,
dst
))
PERF_TEST_P_ACCUMULATE
(
AccumulateMask
,
MAT_TYPES_ACCUMLATE_C
,
PERF_ACCUMULATE_MASK_INIT
(
CV_32FC
),
accumulate
(
src1
,
dst
,
mask
))
PERF_TEST_P_ACCUMULATE
(
AccumulateDouble
,
MAT_TYPES_ACCUMLATE_D
,
PERF_ACCUMULATE_INIT
(
CV_64FC
),
accumulate
(
src1
,
dst
))
PERF_TEST_P_ACCUMULATE
(
AccumulateDoubleMask
,
MAT_TYPES_ACCUMLATE_D_C
,
PERF_ACCUMULATE_MASK_INIT
(
CV_64FC
),
accumulate
(
src1
,
dst
,
mask
))
///////////////////////////// AccumulateSquare ///////////////////////////////////
PERF_TEST_P_ACCUMULATE
(
Square
,
MAT_TYPES_ACCUMLATE
,
PERF_ACCUMULATE_INIT
(
CV_32FC
),
accumulateSquare
(
src1
,
dst
))
PERF_TEST_P_ACCUMULATE
(
SquareMask
,
MAT_TYPES_ACCUMLATE_C
,
PERF_ACCUMULATE_MASK_INIT
(
CV_32FC
),
accumulateSquare
(
src1
,
dst
,
mask
))
PERF_TEST_P_ACCUMULATE
(
SquareDouble
,
MAT_TYPES_ACCUMLATE_D
,
PERF_ACCUMULATE_INIT
(
CV_64FC
),
accumulateSquare
(
src1
,
dst
))
PERF_TEST_P_ACCUMULATE
(
SquareDoubleMask
,
MAT_TYPES_ACCUMLATE_D_C
,
PERF_ACCUMULATE_MASK_INIT
(
CV_64FC
),
accumulateSquare
(
src1
,
dst
,
mask
))
///////////////////////////// AccumulateProduct ///////////////////////////////////
#define PERF_ACCUMULATE_INIT_2(_FLTC) \
PERF_ACCUMULATE_INIT(_FLTC) \
Mat src2(srcSize, srcType); \
declare.in(src2);
#define PERF_ACCUMULATE_MASK_INIT_2(_FLTC) \
PERF_ACCUMULATE_MASK_INIT(_FLTC) \
Mat src2(srcSize, srcType); \
declare.in(src2);
PERF_TEST_P_ACCUMULATE
(
Product
,
MAT_TYPES_ACCUMLATE
,
PERF_ACCUMULATE_INIT_2
(
CV_32FC
),
accumulateProduct
(
src1
,
src2
,
dst
))
PERF_TEST_P_ACCUMULATE
(
ProductMask
,
MAT_TYPES_ACCUMLATE_C
,
PERF_ACCUMULATE_MASK_INIT_2
(
CV_32FC
),
accumulateProduct
(
src1
,
src2
,
dst
,
mask
))
PERF_TEST_P_ACCUMULATE
(
ProductDouble
,
MAT_TYPES_ACCUMLATE_D
,
PERF_ACCUMULATE_INIT_2
(
CV_64FC
),
accumulateProduct
(
src1
,
src2
,
dst
))
PERF_TEST_P_ACCUMULATE
(
ProductDoubleMask
,
MAT_TYPES_ACCUMLATE_D_C
,
PERF_ACCUMULATE_MASK_INIT_2
(
CV_64FC
),
accumulateProduct
(
src1
,
src2
,
dst
,
mask
))
///////////////////////////// AccumulateWeighted ///////////////////////////////////
PERF_TEST_P_ACCUMULATE
(
Weighted
,
MAT_TYPES_ACCUMLATE
,
PERF_ACCUMULATE_INIT
(
CV_32FC
),
accumulateWeighted
(
src1
,
dst
,
0.123
))
PERF_TEST_P_ACCUMULATE
(
WeightedMask
,
MAT_TYPES_ACCUMLATE_C
,
PERF_ACCUMULATE_MASK_INIT
(
CV_32FC
),
accumulateWeighted
(
src1
,
dst
,
0.123
,
mask
))
PERF_TEST_P_ACCUMULATE
(
WeightedDouble
,
MAT_TYPES_ACCUMLATE_D
,
PERF_ACCUMULATE_INIT
(
CV_64FC
),
accumulateWeighted
(
src1
,
dst
,
0.123456
))
PERF_TEST_P_ACCUMULATE
(
WeightedDoubleMask
,
MAT_TYPES_ACCUMLATE_D_C
,
PERF_ACCUMULATE_MASK_INIT
(
CV_64FC
),
accumulateWeighted
(
src1
,
dst
,
0.123456
,
mask
))
}
// namespace
modules/imgproc/src/accum.simd.hpp
浏览文件 @
1cc3f7ab
因为 它太大了无法显示 source diff 。你可以改为
查看blob
。
modules/imgproc/src/smooth.cpp
浏览文件 @
1cc3f7ab
...
...
@@ -1825,7 +1825,7 @@ void hlineSmooth1N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi
const
int
VECSZ
=
v_uint16
::
nlanes
;
v_uint16
v_mul
=
vx_setall_u16
(
*
((
uint16_t
*
)
m
));
for
(;
i
<=
lencn
-
VECSZ
;
i
+=
VECSZ
)
v_store
((
uint16_t
*
)
dst
+
i
,
v_mul
*
vx_load_expand
(
src
+
i
));
v_store
((
uint16_t
*
)
dst
+
i
,
v_mul
_wrap
(
v_mul
,
vx_load_expand
(
src
+
i
)
));
#endif
for
(;
i
<
lencn
;
i
++
)
dst
[
i
]
=
m
[
0
]
*
src
[
i
];
...
...
@@ -1915,7 +1915,9 @@ void hlineSmooth3N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi
v_uint16
v_mul1
=
vx_setall_u16
(
_m
[
1
]);
v_uint16
v_mul2
=
vx_setall_u16
(
_m
[
2
]);
for
(;
i
<=
lencn
-
VECSZ
;
i
+=
VECSZ
,
src
+=
VECSZ
,
dst
+=
VECSZ
)
v_store
((
uint16_t
*
)
dst
,
vx_load_expand
(
src
-
cn
)
*
v_mul0
+
vx_load_expand
(
src
)
*
v_mul1
+
vx_load_expand
(
src
+
cn
)
*
v_mul2
);
v_store
((
uint16_t
*
)
dst
,
v_mul_wrap
(
vx_load_expand
(
src
-
cn
),
v_mul0
)
+
v_mul_wrap
(
vx_load_expand
(
src
),
v_mul1
)
+
v_mul_wrap
(
vx_load_expand
(
src
+
cn
),
v_mul2
));
#endif
for
(;
i
<
lencn
;
i
++
,
src
++
,
dst
++
)
*
dst
=
m
[
0
]
*
src
[
-
cn
]
+
m
[
1
]
*
src
[
0
]
+
m
[
2
]
*
src
[
cn
];
...
...
@@ -2089,7 +2091,8 @@ void hlineSmooth3Naba<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const
v_uint16
v_mul0
=
vx_setall_u16
(
_m
[
0
]);
v_uint16
v_mul1
=
vx_setall_u16
(
_m
[
1
]);
for
(;
i
<=
lencn
-
VECSZ
;
i
+=
VECSZ
,
src
+=
VECSZ
,
dst
+=
VECSZ
)
v_store
((
uint16_t
*
)
dst
,
(
vx_load_expand
(
src
-
cn
)
+
vx_load_expand
(
src
+
cn
))
*
v_mul0
+
vx_load_expand
(
src
)
*
v_mul1
);
v_store
((
uint16_t
*
)
dst
,
v_mul_wrap
(
vx_load_expand
(
src
-
cn
)
+
vx_load_expand
(
src
+
cn
),
v_mul0
)
+
v_mul_wrap
(
vx_load_expand
(
src
),
v_mul1
));
#endif
for
(;
i
<
lencn
;
i
++
,
src
++
,
dst
++
)
*
((
uint16_t
*
)
dst
)
=
((
uint16_t
*
)
m
)[
1
]
*
src
[
0
]
+
((
uint16_t
*
)
m
)[
0
]
*
((
uint16_t
)(
src
[
-
cn
])
+
(
uint16_t
)(
src
[
cn
]));
...
...
@@ -2285,7 +2288,11 @@ void hlineSmooth5N<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufi
v_uint16
v_mul3
=
vx_setall_u16
(
_m
[
3
]);
v_uint16
v_mul4
=
vx_setall_u16
(
_m
[
4
]);
for
(;
i
<=
lencn
-
VECSZ
;
i
+=
VECSZ
,
src
+=
VECSZ
,
dst
+=
VECSZ
)
v_store
((
uint16_t
*
)
dst
,
vx_load_expand
(
src
-
2
*
cn
)
*
v_mul0
+
vx_load_expand
(
src
-
cn
)
*
v_mul1
+
vx_load_expand
(
src
)
*
v_mul2
+
vx_load_expand
(
src
+
cn
)
*
v_mul3
+
vx_load_expand
(
src
+
2
*
cn
)
*
v_mul4
);
v_store
((
uint16_t
*
)
dst
,
v_mul_wrap
(
vx_load_expand
(
src
-
2
*
cn
),
v_mul0
)
+
v_mul_wrap
(
vx_load_expand
(
src
-
cn
),
v_mul1
)
+
v_mul_wrap
(
vx_load_expand
(
src
),
v_mul2
)
+
v_mul_wrap
(
vx_load_expand
(
src
+
cn
),
v_mul3
)
+
v_mul_wrap
(
vx_load_expand
(
src
+
2
*
cn
),
v_mul4
));
#endif
for
(;
i
<
lencn
;
i
++
,
src
++
,
dst
++
)
*
dst
=
m
[
0
]
*
src
[
-
2
*
cn
]
+
m
[
1
]
*
src
[
-
cn
]
+
m
[
2
]
*
src
[
0
]
+
m
[
3
]
*
src
[
cn
]
+
m
[
4
]
*
src
[
2
*
cn
];
...
...
@@ -2488,7 +2495,7 @@ void hlineSmooth5N14641<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, cons
const
int
VECSZ
=
v_uint16
::
nlanes
;
v_uint16
v_6
=
vx_setall_u16
(
6
);
for
(;
i
<=
lencn
-
VECSZ
;
i
+=
VECSZ
,
src
+=
VECSZ
,
dst
+=
VECSZ
)
v_store
((
uint16_t
*
)
dst
,
(
v
x_load_expand
(
src
)
*
v_6
+
((
vx_load_expand
(
src
-
cn
)
+
vx_load_expand
(
src
+
cn
))
<<
2
)
+
vx_load_expand
(
src
-
2
*
cn
)
+
vx_load_expand
(
src
+
2
*
cn
))
<<
4
);
v_store
((
uint16_t
*
)
dst
,
(
v
_mul_wrap
(
vx_load_expand
(
src
),
v_6
)
+
((
vx_load_expand
(
src
-
cn
)
+
vx_load_expand
(
src
+
cn
))
<<
2
)
+
vx_load_expand
(
src
-
2
*
cn
)
+
vx_load_expand
(
src
+
2
*
cn
))
<<
4
);
#endif
for
(;
i
<
lencn
;
i
++
,
src
++
,
dst
++
)
*
((
uint16_t
*
)
dst
)
=
(
uint16_t
(
src
[
0
])
*
6
+
((
uint16_t
(
src
[
-
cn
])
+
uint16_t
(
src
[
cn
]))
<<
2
)
+
uint16_t
(
src
[
-
2
*
cn
])
+
uint16_t
(
src
[
2
*
cn
]))
<<
4
;
...
...
@@ -2689,7 +2696,9 @@ void hlineSmooth5Nabcba<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, cons
v_uint16
v_mul1
=
vx_setall_u16
(
_m
[
1
]);
v_uint16
v_mul2
=
vx_setall_u16
(
_m
[
2
]);
for
(;
i
<=
lencn
-
VECSZ
;
i
+=
VECSZ
,
src
+=
VECSZ
,
dst
+=
VECSZ
)
v_store
((
uint16_t
*
)
dst
,
(
vx_load_expand
(
src
-
2
*
cn
)
+
vx_load_expand
(
src
+
2
*
cn
))
*
v_mul0
+
(
vx_load_expand
(
src
-
cn
)
+
vx_load_expand
(
src
+
cn
))
*
v_mul1
+
vx_load_expand
(
src
)
*
v_mul2
);
v_store
((
uint16_t
*
)
dst
,
v_mul_wrap
(
vx_load_expand
(
src
-
2
*
cn
)
+
vx_load_expand
(
src
+
2
*
cn
),
v_mul0
)
+
v_mul_wrap
(
vx_load_expand
(
src
-
cn
)
+
vx_load_expand
(
src
+
cn
),
v_mul1
)
+
v_mul_wrap
(
vx_load_expand
(
src
),
v_mul2
));
#endif
for
(;
i
<
lencn
;
i
++
,
src
++
,
dst
++
)
*
((
uint16_t
*
)
dst
)
=
((
uint16_t
*
)
m
)[
0
]
*
((
uint16_t
)(
src
[
-
2
*
cn
])
+
(
uint16_t
)(
src
[
2
*
cn
]))
+
((
uint16_t
*
)
m
)[
1
]
*
((
uint16_t
)(
src
[
-
cn
])
+
(
uint16_t
)(
src
[
cn
]))
+
((
uint16_t
*
)
m
)[
2
]
*
src
[
0
];
...
...
@@ -2804,9 +2813,9 @@ void hlineSmooth<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, const ufixe
const
int
VECSZ
=
v_uint16
::
nlanes
;
for
(;
i
<=
lencn
-
VECSZ
;
i
+=
VECSZ
,
src
+=
VECSZ
,
dst
+=
VECSZ
)
{
v_uint16
v_res0
=
v
x_load_expand
(
src
)
*
vx_setall_u16
(
*
((
uint16_t
*
)
m
));
v_uint16
v_res0
=
v
_mul_wrap
(
vx_load_expand
(
src
),
vx_setall_u16
(
*
((
uint16_t
*
)
m
)
));
for
(
int
j
=
1
;
j
<
n
;
j
++
)
v_res0
+=
v
x_load_expand
(
src
+
j
*
cn
)
*
vx_setall_u16
(
*
((
uint16_t
*
)(
m
+
j
)));
v_res0
+=
v
_mul_wrap
(
vx_load_expand
(
src
+
j
*
cn
),
vx_setall_u16
(
*
((
uint16_t
*
)(
m
+
j
)
)));
v_store
((
uint16_t
*
)
dst
,
v_res0
);
}
#endif
...
...
@@ -2923,9 +2932,9 @@ void hlineSmoothONa_yzy_a<uint8_t, ufixedpoint16>(const uint8_t* src, int cn, co
const
int
VECSZ
=
v_uint16
::
nlanes
;
for
(;
i
<=
lencn
-
VECSZ
;
i
+=
VECSZ
,
src
+=
VECSZ
,
dst
+=
VECSZ
)
{
v_uint16
v_res0
=
v
x_load_expand
(
src
+
pre_shift
*
cn
)
*
vx_setall_u16
(
*
((
uint16_t
*
)(
m
+
pre_shift
)));
v_uint16
v_res0
=
v
_mul_wrap
(
vx_load_expand
(
src
+
pre_shift
*
cn
),
vx_setall_u16
(
*
((
uint16_t
*
)(
m
+
pre_shift
)
)));
for
(
int
j
=
0
;
j
<
pre_shift
;
j
++
)
v_res0
+=
(
vx_load_expand
(
src
+
j
*
cn
)
+
vx_load_expand
(
src
+
(
n
-
1
-
j
)
*
cn
))
*
vx_setall_u16
(
*
((
uint16_t
*
)(
m
+
j
)));
v_res0
+=
v_mul_wrap
(
vx_load_expand
(
src
+
j
*
cn
)
+
vx_load_expand
(
src
+
(
n
-
1
-
j
)
*
cn
),
vx_setall_u16
(
*
((
uint16_t
*
)(
m
+
j
)
)));
v_store
((
uint16_t
*
)
dst
,
v_res0
);
}
#endif
...
...
modules/video/src/lkpyramid.cpp
浏览文件 @
1cc3f7ab
...
...
@@ -93,7 +93,7 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
v_int16x8
s2
=
v_reinterpret_as_s16
(
v_load_expand
(
srow2
+
x
));
v_int16x8
t1
=
s2
-
s0
;
v_int16x8
t0
=
(
s0
+
s2
)
*
c3
+
s1
*
c10
;
v_int16x8
t0
=
v_mul_wrap
(
s0
+
s2
,
c3
)
+
v_mul_wrap
(
s1
,
c10
)
;
v_store
(
trow0
+
x
,
t0
);
v_store
(
trow1
+
x
,
t1
);
...
...
@@ -131,7 +131,7 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
v_int16x8
s4
=
v_load
(
trow1
+
x
+
cn
);
v_int16x8
t0
=
s1
-
s0
;
v_int16x8
t1
=
((
s2
+
s4
)
*
c3
)
+
(
s3
*
c10
);
v_int16x8
t1
=
v_mul_wrap
(
s2
+
s4
,
c3
)
+
v_mul_wrap
(
s3
,
c10
);
v_store_interleave
((
drow
+
x
*
2
),
t0
,
t1
);
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录