Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
8546c15d
MegEngine
项目概览
MegEngine 天元
/
MegEngine
大约 1 年 前同步成功
通知
396
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
8546c15d
编写于
6月 14, 2022
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
feat(gi): make elemwise apply gi class type
GitOrigin-RevId: 6ff1a8a55ce5e01a93b4c619833dcd70ebe2f735
上级
74fb63db
变更
28
展开全部
隐藏空白更改
内联
并排
Showing
28 changed file
with
1297 addition
and
838 deletion
+1297
-838
dnn/src/arm_common/elemwise_helper/elemwise_op.h
dnn/src/arm_common/elemwise_helper/elemwise_op.h
+46
-14
dnn/src/fallback/elemwise/gi_impl/gi_mathfun.cpp
dnn/src/fallback/elemwise/gi_impl/gi_mathfun.cpp
+1
-1
dnn/src/fallback/elemwise_helper/kimpl/abs.h
dnn/src/fallback/elemwise_helper/kimpl/abs.h
+32
-19
dnn/src/fallback/elemwise_helper/kimpl/add.h
dnn/src/fallback/elemwise_helper/kimpl/add.h
+47
-16
dnn/src/fallback/elemwise_helper/kimpl/exp.h
dnn/src/fallback/elemwise_helper/kimpl/exp.h
+22
-16
dnn/src/fallback/elemwise_helper/kimpl/fast_tanh.h
dnn/src/fallback/elemwise_helper/kimpl/fast_tanh.h
+9
-5
dnn/src/fallback/elemwise_helper/kimpl/fuse_add_h_swish.h
dnn/src/fallback/elemwise_helper/kimpl/fuse_add_h_swish.h
+31
-14
dnn/src/fallback/elemwise_helper/kimpl/fuse_add_relu.h
dnn/src/fallback/elemwise_helper/kimpl/fuse_add_relu.h
+43
-17
dnn/src/fallback/elemwise_helper/kimpl/fuse_add_sigmoid.h
dnn/src/fallback/elemwise_helper/kimpl/fuse_add_sigmoid.h
+11
-7
dnn/src/fallback/elemwise_helper/kimpl/fuse_add_tanh.h
dnn/src/fallback/elemwise_helper/kimpl/fuse_add_tanh.h
+11
-7
dnn/src/fallback/elemwise_helper/kimpl/fuse_mul_add3.h
dnn/src/fallback/elemwise_helper/kimpl/fuse_mul_add3.h
+30
-22
dnn/src/fallback/elemwise_helper/kimpl/hswish.h
dnn/src/fallback/elemwise_helper/kimpl/hswish.h
+51
-38
dnn/src/fallback/elemwise_helper/kimpl/max.h
dnn/src/fallback/elemwise_helper/kimpl/max.h
+29
-10
dnn/src/fallback/elemwise_helper/kimpl/min.h
dnn/src/fallback/elemwise_helper/kimpl/min.h
+29
-10
dnn/src/fallback/elemwise_helper/kimpl/mul.h
dnn/src/fallback/elemwise_helper/kimpl/mul.h
+29
-10
dnn/src/fallback/elemwise_helper/kimpl/none.h
dnn/src/fallback/elemwise_helper/kimpl/none.h
+20
-19
dnn/src/fallback/elemwise_helper/kimpl/op_base.h
dnn/src/fallback/elemwise_helper/kimpl/op_base.h
+206
-142
dnn/src/fallback/elemwise_helper/kimpl/relu.h
dnn/src/fallback/elemwise_helper/kimpl/relu.h
+67
-39
dnn/src/fallback/elemwise_helper/kimpl/sigmoid.h
dnn/src/fallback/elemwise_helper/kimpl/sigmoid.h
+27
-21
dnn/src/fallback/elemwise_helper/kimpl/sub.h
dnn/src/fallback/elemwise_helper/kimpl/sub.h
+29
-10
dnn/src/fallback/elemwise_helper/kimpl/tanh.h
dnn/src/fallback/elemwise_helper/kimpl/tanh.h
+52
-48
dnn/src/fallback/elemwise_helper/kimpl/true_div.h
dnn/src/fallback/elemwise_helper/kimpl/true_div.h
+11
-7
dnn/src/fallback/elemwise_helper/kimpl/typecvt.h
dnn/src/fallback/elemwise_helper/kimpl/typecvt.h
+16
-7
dnn/src/fallback/elemwise_helper/op_common.h
dnn/src/fallback/elemwise_helper/op_common.h
+182
-131
dnn/src/fallback/gi_intrinsic_helper.h
dnn/src/fallback/gi_intrinsic_helper.h
+3
-2
dnn/src/fallback/quantized_converter.h
dnn/src/fallback/quantized_converter.h
+7
-1
dnn/src/fallback/reduce/reducer.h
dnn/src/fallback/reduce/reducer.h
+195
-156
dnn/src/fallback/type_cvt/typecvt_helper.h
dnn/src/fallback/type_cvt/typecvt_helper.h
+61
-49
未找到文件。
dnn/src/arm_common/elemwise_helper/elemwise_op.h
浏览文件 @
8546c15d
...
...
@@ -12,7 +12,7 @@ using BcastType = megdnn::elemwise::BcastType;
///////////////////////////////// ParamElemVistor ///////////////////////////
#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix
)
\
#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix
, _neon_type_v2)
\
template <> \
struct ParamElemVisitor<_ctype> { \
_neon_type operator()(const _ctype* src) const { \
...
...
@@ -24,29 +24,61 @@ using BcastType = megdnn::elemwise::BcastType;
_neon_type operator()(const _ctype* src) const { \
return vdupq_n_##_fun_suffix(*reinterpret_cast<const _inner_ctype*>(src)); \
} \
}; \
template <> \
struct ParamElemVisitorV2<_ctype> { \
_neon_type_v2 operator()(const _ctype* src, const _ctype* src_1) const { \
_neon_type_v2 ret; \
ret.val[0] = \
vld1q_##_fun_suffix(reinterpret_cast<const _inner_ctype*>(src)); \
ret.val[1] = \
vld1q_##_fun_suffix(reinterpret_cast<const _inner_ctype*>(src_1)); \
return ret; \
} \
}; \
template <> \
struct ParamElemVisitorDupV2<_ctype> { \
_neon_type_v2 operator()(const _ctype* src) const { \
_neon_type_v2 ret; \
ret.val[0] = vdupq_n_##_fun_suffix( \
*reinterpret_cast<const _inner_ctype*>(src)); \
ret.val[1] = ret.val[0]; \
return ret; \
} \
}
cb
(
dt_quint8
,
uint8_t
,
uint8x16_t
,
u8
);
cb
(
dt_quint8
,
uint8_t
,
uint8x16_t
,
u8
,
uint8x16x2_t
);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
cb
(
__fp16
,
__fp16
,
float16x8_t
,
f16
);
cb
(
__fp16
,
__fp16
,
float16x8_t
,
f16
,
float16x8x2_t
);
#endif
cb
(
dt_int16
,
int16_t
,
int16x8_t
,
s16
);
cb
(
dt_int16
,
int16_t
,
int16x8_t
,
s16
,
int16x8x2_t
);
#undef cb
template
<
typename
ctype
>
struct
ParamElemVisitorBcast101x4
;
#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix, rel_suffix) \
template <> \
struct ParamElemVisitorBcast101x4<_ctype> { \
_neon_type operator()(const _ctype* src) const { \
return vreinterpretq_##_fun_suffix##_##rel_suffix(vld1q_dup_##rel_suffix( \
reinterpret_cast<const _inner_ctype*>(src))); \
} \
#define cb(_ctype, _inner_ctype, _neon_type, _fun_suffix, rel_suffix, _neon_type_v2) \
template <> \
struct ParamElemVisitorBcast101x4<_ctype> { \
_neon_type operator()(const _ctype* src) const { \
return vreinterpretq_##_fun_suffix##_##rel_suffix(vld1q_dup_##rel_suffix( \
reinterpret_cast<const _inner_ctype*>(src))); \
} \
}; \
template <> \
struct ParamElemVisitorBcast101x4V2<_ctype> { \
_neon_type_v2 operator()(const _ctype* src) const { \
_neon_type_v2 ret; \
ret.val[0] = \
vreinterpretq_##_fun_suffix##_##rel_suffix(vld1q_dup_##rel_suffix( \
reinterpret_cast<const _inner_ctype*>(src))); \
ret.val[1] = ret.val[0]; \
return ret; \
} \
}
cb
(
dt_quint8
,
uint32_t
,
uint8x16_t
,
u8
,
u32
);
cb
(
dt_int16
,
int64_t
,
int16x8_t
,
s16
,
s64
);
cb
(
dt_quint8
,
uint32_t
,
uint8x16_t
,
u8
,
u32
,
uint8x16x2_t
);
cb
(
dt_int16
,
int64_t
,
int16x8_t
,
s16
,
s64
,
int16x8x2_t
);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
cb
(
__fp16
,
uint64_t
,
float16x8_t
,
f16
,
u64
);
cb
(
__fp16
,
uint64_t
,
float16x8_t
,
f16
,
u64
,
float16x8x2_t
);
#endif
#undef cb
...
...
dnn/src/fallback/elemwise/gi_impl/gi_mathfun.cpp
浏览文件 @
8546c15d
...
...
@@ -283,7 +283,7 @@ v4sf GiCosPsFloat32(v4sf x) {
v4sf
GiTanPsFloat32
(
v4sf
x
)
{
v4sf
ysin
,
ycos
;
GiSinCosPsFloat32
(
x
,
&
ysin
,
&
ycos
);
return
ysin
/
ycos
;
return
GiDivFloat32
(
ysin
,
ycos
)
;
}
#undef c_exp_hi
...
...
dnn/src/fallback/elemwise_helper/kimpl/abs.h
浏览文件 @
8546c15d
...
...
@@ -20,22 +20,28 @@ struct AbsOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template
<
typename
src_ctype
,
typename
dst_ctype
=
src_ctype
>
struct
AbsOp
;
#define OP(_ctype, _gi_type, _func_suffix, _simd_width) \
template <> \
struct AbsOp<_ctype> : AbsOpBase<_ctype> { \
using AbsOpBase::AbsOpBase; \
using AbsOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _gi_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_gi_type operator()(const _gi_type& src) const { \
auto vitem0 = GiAbs##_func_suffix(src.val[0]); \
auto vitem1 = GiAbs##_func_suffix(src.val[1]); \
return {{vitem0, vitem1}}; \
} \
#define OP(_ctype, _gi_type, _func_suffix, _simd_width) \
template <> \
struct AbsOp<_ctype> : AbsOpBase<_ctype> { \
using AbsOpBase::AbsOpBase; \
using AbsOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _gi_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_gi_type operator()(const _gi_type& src) const { \
auto vitem0 = \
GiAbs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 0)); \
auto vitem1 = \
GiAbs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 1)); \
_gi_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
};
OP
(
dt_float32
,
GI_FLOAT32_V2_t
,
Float32
,
GI_SIMD_LEN_BYTE
/
sizeof
(
dt_float32
))
OP
(
dt_int32
,
GI_INT32_V2_t
,
Int32
,
GI_SIMD_LEN_BYTE
/
sizeof
(
dt_int32
))
...
...
@@ -64,11 +70,18 @@ struct AbsOp<dt_qint8, dt_qint8> : AbsOpBase<dt_qint8, dt_qint8> {
OPERATOR_UNARY_QINT8_FALLBACK
;
}
GI_INT8_t
operator
()(
const
GI_INT32_V2_t
&
vsrc
)
const
{
auto
vitem0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc
.
val
[
0
]),
this
->
vscale
);
auto
vitem1
=
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc
.
val
[
1
]),
this
->
vscale
);
auto
vitem0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale
));
auto
vitem1
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale
));
vitem0
=
GiAbsFloat32
(
vitem0
);
vitem1
=
GiAbsFloat32
(
vitem1
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
({{
vitem0
,
vitem1
}});
GI_FLOAT32_V2_t
tmp
;
GiSetSubVectorFloat32V2
(
tmp
,
0
,
vitem0
);
GiSetSubVectorFloat32V2
(
tmp
,
1
,
vitem1
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
(
tmp
);
}
};
...
...
dnn/src/fallback/elemwise_helper/kimpl/add.h
浏览文件 @
8546c15d
...
...
@@ -33,13 +33,21 @@ struct AddOp;
void operator()( \
const _gi_type2& src0, const _gi_type2& src1, dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_gi_type2 operator()(const _gi_type2& src0, const _gi_type2& src1) const { \
auto vitem0 = GiAdd##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiAdd##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiAdd##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiAdd##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_gi_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _gi_type& src0, const _gi_type& src1, dst_ctype* dst) const { \
...
...
@@ -82,13 +90,24 @@ struct AddOp<dt_qint8, dt_qint8> : AddOpBase<dt_qint8, dt_qint8> {
GI_INT8_t
operator
()(
const
GI_INT32_V2_t
&
vsrc0
,
const
GI_INT32_V2_t
&
vsrc1
)
const
{
auto
vitem0
=
GiAddFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
0
]),
this
->
vscale0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
0
]),
this
->
vscale1
));
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale1
)));
auto
vitem1
=
GiAddFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
1
]),
this
->
vscale0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
1
]),
this
->
vscale1
));
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
({{
vitem0
,
vitem1
}});
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale1
)));
GI_FLOAT32_V2_t
ret
;
GiSetSubVectorFloat32V2
(
ret
,
0
,
vitem0
);
GiSetSubVectorFloat32V2
(
ret
,
1
,
vitem1
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
(
ret
);
}
};
...
...
@@ -119,12 +138,24 @@ struct AddOp<dt_qint32, dt_qint8> : AddOpBase<dt_qint32, dt_qint8> {
GI_INT8_t
operator
()(
const
GI_INT32_V2_t
&
vsrc0
,
const
GI_INT32_V2_t
&
vsrc1
)
const
{
auto
vitem0
=
GiAddFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
0
]),
this
->
vscale0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
0
]),
this
->
vscale1
));
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale1
)));
auto
vitem1
=
GiAddFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
1
]),
this
->
vscale0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
1
]),
this
->
vscale1
));
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
({{
vitem0
,
vitem1
}});
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale1
)));
GI_FLOAT32_V2_t
ret
;
GiSetSubVectorFloat32V2
(
ret
,
0
,
vitem0
);
GiSetSubVectorFloat32V2
(
ret
,
1
,
vitem1
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
(
ret
);
}
};
...
...
dnn/src/fallback/elemwise_helper/kimpl/exp.h
浏览文件 @
8546c15d
...
...
@@ -23,22 +23,28 @@ struct ExpOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template
<
typename
src_ctype
,
typename
dst_ctype
=
src_ctype
>
struct
ExpOp
;
#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \
template <> \
struct ExpOp<_ctype> : ExpOpBase<_ctype> { \
using ExpOpBase::ExpOpBase; \
using ExpOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_simd_type operator()(const _simd_type& src) const { \
auto vitem0 = GiExpPs##_func_suffix(src.val[0]); \
auto vitem1 = GiExpPs##_func_suffix(src.val[1]); \
return {{vitem0, vitem1}}; \
} \
#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \
template <> \
struct ExpOp<_ctype> : ExpOpBase<_ctype> { \
using ExpOpBase::ExpOpBase; \
using ExpOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()(const _simd_type& src) const { \
auto vitem0 = \
GiExpPs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 0)); \
auto vitem1 = \
GiExpPs##_func_suffix(GiGetSubVector##_func_suffix##V2(src, 1)); \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
};
OP
(
dt_float32
,
GI_FLOAT32_V2_t
,
Float32
,
GI_SIMD_LEN_BYTE
/
sizeof
(
float
))
#undef OP
...
...
dnn/src/fallback/elemwise_helper/kimpl/fast_tanh.h
浏览文件 @
8546c15d
...
...
@@ -32,14 +32,15 @@ struct FastTanhOp;
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()(const _simd_type& src) const { \
auto val_27 = GiBroadcast##_func_suffix(27.f); \
auto val_9 = GiBroadcast##_func_suffix(9.f); \
auto valx =
src.val[0];
\
auto valx1 =
src.val[1];
\
auto valx =
GiGetSubVector##_func_suffix##V2(src, 0);
\
auto valx1 =
GiGetSubVector##_func_suffix##V2(src, 1);
\
auto valxp2 = GiMultiply##_fix_func_suffix(valx, valx); \
auto valx1p2 = GiMultiply##_fix_func_suffix(valx1, valx1); \
auto denominator = GiAdd##_fix_func_suffix(valxp2, val_27); \
...
...
@@ -58,7 +59,10 @@ struct FastTanhOp;
r_denominator1); \
valx = GiMultiply##_fix_func_suffix(valx, r_denominator); \
valx1 = GiMultiply##_fix_func_suffix(valx1, r_denominator1); \
return {{valx, valx1}}; \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, valx); \
GiSetSubVector##_func_suffix##V2(ret, 1, valx1); \
return ret; \
} \
};
OP
(
dt_float32
,
GI_FLOAT32_V2_t
,
Float32
,
Float32
,
GI_SIMD_LEN_BYTE
/
sizeof
(
float
))
...
...
dnn/src/fallback/elemwise_helper/kimpl/fuse_add_h_swish.h
浏览文件 @
8546c15d
...
...
@@ -36,19 +36,23 @@ struct FuseAddHSwishOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto val1 =
src0.val[0];
\
auto val2 =
src0.val[1];
\
auto val3 =
src1.val[0];
\
auto val4 =
src1.val[1];
\
auto val1 =
GiGetSubVector##_func_suffix##V2(src0, 0);
\
auto val2 =
GiGetSubVector##_func_suffix##V2(src0, 1);
\
auto val3 =
GiGetSubVector##_func_suffix##V2(src1, 0);
\
auto val4 =
GiGetSubVector##_func_suffix##V2(src1, 1);
\
val1 = GiAdd##_func_suffix(val1, val3); \
val2 = GiAdd##_func_suffix(val2, val4); \
H_SWISH_KERN_FALLBACK(_func_suffix, val1, val2); \
return {{val1, val2}}; \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
...
...
@@ -98,15 +102,28 @@ struct FuseAddHSwishOp<dt_qint32, dt_qint8> : FuseAddHSwishOpBase<dt_qint32, dt_
GI_FLOAT32_t
vitem0
,
vitem1
;
vitem0
=
GiAddFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
0
]),
this
->
vscale_src0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
0
]),
this
->
vscale_src1
));
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale_src0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale_src1
)));
vitem1
=
GiAddFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
1
]),
this
->
vscale_src0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
1
]),
this
->
vscale_src1
));
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale_src0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale_src1
)));
H_SWISH_KERN_FALLBACK
(
Float32
,
vitem0
,
vitem1
);
vitem0
=
GiMultiplyFloat32
(
vitem0
,
this
->
vscale_dst
);
vitem1
=
GiMultiplyFloat32
(
vitem1
,
this
->
vscale_dst
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
({{
vitem0
,
vitem1
}});
vitem0
=
GiMultiplyFloat32
(
vitem0
,
GiFixLenType2GiFloat32Type
(
this
->
vscale_dst
));
vitem1
=
GiMultiplyFloat32
(
vitem1
,
GiFixLenType2GiFloat32Type
(
this
->
vscale_dst
));
GI_FLOAT32_V2_t
ret
;
GiSetSubVectorFloat32V2
(
ret
,
0
,
vitem0
);
GiSetSubVectorFloat32V2
(
ret
,
1
,
vitem1
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
(
ret
);
}
};
...
...
dnn/src/fallback/elemwise_helper/kimpl/fuse_add_relu.h
浏览文件 @
8546c15d
...
...
@@ -35,17 +35,21 @@ struct FuseAddReluOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto val1 =
src0.val[0];
\
auto val2 =
src0.val[1];
\
auto val3 =
src1.val[0];
\
auto val4 =
src1.val[1];
\
auto val1 =
GiGetSubVector##_func_suffix##V2(src0, 0);
\
auto val2 =
GiGetSubVector##_func_suffix##V2(src0, 1);
\
auto val3 =
GiGetSubVector##_func_suffix##V2(src1, 0);
\
auto val4 =
GiGetSubVector##_func_suffix##V2(src1, 1);
\
FUSE_ADD_RELU_SIMD_PACK2_FALLBACK(val1, val2, val3, val4, _func_suffix); \
return {{val1, val2}}; \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
...
...
@@ -105,15 +109,26 @@ struct FuseAddReluOp<dt_qint8, dt_qint8> : FuseAddReluOpBase<dt_qint8, dt_qint8>
GI_INT8_t
operator
()(
const
GI_INT32_V2_t
&
vsrc0
,
const
GI_INT32_V2_t
&
vsrc1
)
const
{
auto
vitem0
=
GiAddFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
0
]),
this
->
vscale0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
0
]),
this
->
vscale1
));
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale1
)));
auto
vitem1
=
GiAddFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
1
]),
this
->
vscale0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
1
]),
this
->
vscale1
));
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale1
)));
vitem0
=
GiMaximumFloat32
(
vitem0
,
this
->
vzero
());
vitem1
=
GiMaximumFloat32
(
vitem1
,
this
->
vzero
());
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
({{
vitem0
,
vitem1
}});
GI_FLOAT32_V2_t
ret
;
GiSetSubVectorFloat32V2
(
ret
,
0
,
vitem0
);
GiSetSubVectorFloat32V2
(
ret
,
1
,
vitem1
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
(
ret
);
}
};
...
...
@@ -144,15 +159,26 @@ struct FuseAddReluOp<dt_qint32, dt_qint8> : FuseAddReluOpBase<dt_qint32, dt_qint
GI_INT8_t
operator
()(
const
GI_INT32_V2_t
&
vsrc0
,
const
GI_INT32_V2_t
&
vsrc1
)
const
{
auto
vitem0
=
GiAddFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
0
]),
this
->
vscale0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
0
]),
this
->
vscale1
));
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale1
)));
auto
vitem1
=
GiAddFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
1
]),
this
->
vscale0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
1
]),
this
->
vscale1
));
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale1
)));
vitem0
=
GiMaximumFloat32
(
vitem0
,
this
->
vzero
());
vitem1
=
GiMaximumFloat32
(
vitem1
,
this
->
vzero
());
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
({{
vitem0
,
vitem1
}});
GI_FLOAT32_V2_t
ret
;
GiSetSubVectorFloat32V2
(
ret
,
0
,
vitem0
);
GiSetSubVectorFloat32V2
(
ret
,
1
,
vitem1
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
(
ret
);
}
};
...
...
dnn/src/fallback/elemwise_helper/kimpl/fuse_add_sigmoid.h
浏览文件 @
8546c15d
...
...
@@ -36,19 +36,23 @@ struct FuseAddSigmoidOp;
const _simd_type& src0, const _simd_type& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()(const _simd_type& src0, const _simd_type& src1) const { \
auto val1 =
src0.val[0];
\
auto val2 =
src0.val[1];
\
auto val3 =
src1.val[0];
\
auto val4 =
src1.val[1];
\
auto val1 =
GiGetSubVector##_func_suffix##V2(src0, 0);
\
auto val2 =
GiGetSubVector##_func_suffix##V2(src0, 1);
\
auto val3 =
GiGetSubVector##_func_suffix##V2(src1, 0);
\
auto val4 =
GiGetSubVector##_func_suffix##V2(src1, 1);
\
val1 = GiAdd##_func_suffix(val1, val3); \
val2 = GiAdd##_func_suffix(val2, val4); \
val1 = GiSigmoidPs##_func_suffix(val1); \
val2 = GiSigmoidPs##_func_suffix(val2); \
return {{val1, val2}}; \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
};
OP
(
dt_float32
,
GI_FLOAT32_V2_t
,
Float32
,
GI_SIMD_LEN_BYTE
/
sizeof
(
float
))
...
...
dnn/src/fallback/elemwise_helper/kimpl/fuse_add_tanh.h
浏览文件 @
8546c15d
...
...
@@ -35,14 +35,15 @@ struct FuseAddTanhOp;
const _simd_type& src0, const _simd_type& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()(const _simd_type& src0, const _simd_type& src1) const { \
auto val1 =
src0.val[0];
\
auto val2 =
src0.val[1];
\
auto val3 =
src1.val[0];
\
auto val4 =
src1.val[1];
\
auto val1 =
GiGetSubVector##_func_suffix##V2(src0, 0);
\
auto val2 =
GiGetSubVector##_func_suffix##V2(src0, 1);
\
auto val3 =
GiGetSubVector##_func_suffix##V2(src1, 0);
\
auto val4 =
GiGetSubVector##_func_suffix##V2(src1, 1);
\
val1 = GiAdd##_func_suffix(val1, val3); \
val2 = GiAdd##_func_suffix(val2, val4); \
auto exp1 = GiExpPs##_func_suffix(val1); \
...
...
@@ -65,7 +66,10 @@ struct FuseAddTanhOp;
GiRecpeS##_func_suffix(exp2, rexp2), rexp2); \
val1 = GiMultiply##_func_suffix(val1, rexp1); \
val2 = GiMultiply##_func_suffix(val2, rexp2); \
return {{val1, val2}}; \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
};
OP
(
dt_float32
,
GI_FLOAT32_V2_t
,
Float32
,
GI_SIMD_LEN_BYTE
/
sizeof
(
float
))
...
...
dnn/src/fallback/elemwise_helper/kimpl/fuse_mul_add3.h
浏览文件 @
8546c15d
...
...
@@ -26,28 +26,36 @@ struct FuseMulAdd3OpBase : TernaryOpBase<src_ctype, dst_ctype> {
template
<
typename
src_ctype
,
typename
dst_ctype
=
src_ctype
>
struct
FuseMulAdd3Op
;
#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \
template <> \
struct FuseMulAdd3Op<_ctype> : FuseMulAdd3OpBase<_ctype> { \
using FuseMulAdd3OpBase::FuseMulAdd3OpBase; \
using FuseMulAdd3OpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
const _simd_type& src2, dst_ctype* dst) const { \
auto vitem = operator()(src0, src1, src2); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_simd_type operator()( \
const _simd_type& src0, const _simd_type& src1, \
const _simd_type& src2) const { \
auto vitem0 = GiMultiplyAdd##_func_suffix( \
src2.val[0], src0.val[0], src1.val[0]); \
auto vitem1 = GiMultiplyAdd##_func_suffix( \
src2.val[1], src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
} \
#define OP(_ctype, _simd_type, _func_suffix, _simd_width) \
template <> \
struct FuseMulAdd3Op<_ctype> : FuseMulAdd3OpBase<_ctype> { \
using FuseMulAdd3OpBase::FuseMulAdd3OpBase; \
using FuseMulAdd3OpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
const _simd_type& src2, dst_ctype* dst) const { \
auto vitem = operator()(src0, src1, src2); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type operator()( \
const _simd_type& src0, const _simd_type& src1, \
const _simd_type& src2) const { \
auto vitem0 = GiMultiplyAdd##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src2, 0), \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiMultiplyAdd##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src2, 1), \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
};
OP
(
dt_float32
,
GI_FLOAT32_V2_t
,
Float32
,
GI_SIMD_LEN_BYTE
/
sizeof
(
float
))
OP
(
dt_int32
,
GI_INT32_V2_t
,
Int32
,
GI_SIMD_LEN_BYTE
/
sizeof
(
int32_t
))
...
...
dnn/src/fallback/elemwise_helper/kimpl/hswish.h
浏览文件 @
8546c15d
...
...
@@ -26,39 +26,43 @@ struct HSwishOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template
<
typename
src_ctype
,
typename
dst_ctype
=
src_ctype
>
struct
HSwishOp
;
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct HSwishOp<_ctype> : HSwishOpBase<_ctype> { \
using HSwishOpBase::HSwishOpBase; \
using HSwishOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
auto val1 = src.val[0]; \
auto val2 = src.val[1]; \
H_SWISH_KERN_FALLBACK(_func_suffix, val1, val2); \
return {{val1, val2}}; \
} \
_simd_type operator()(const _simd_type& src) const { \
auto val_zero = GiBroadcast##_func_suffix(0.f); \
auto val_six = GiBroadcast##_func_suffix(6.f); \
auto val_three = GiBroadcast##_func_suffix(3.f); \
auto val_rec_six = GiBroadcast##_func_suffix(1.f / 6.f); \
auto clip1 = GiMaximum##_func_suffix( \
GiMinimum##_func_suffix( \
GiAdd##_func_suffix(src, val_three), val_six), \
val_zero); \
return GiMultiply##_func_suffix( \
GiMultiply##_func_suffix(src, clip1), val_rec_six); \
} \
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct HSwishOp<_ctype> : HSwishOpBase<_ctype> { \
using HSwishOpBase::HSwishOpBase; \
using HSwishOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
auto val1 = GiGetSubVector##_func_suffix##V2(src, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src, 1); \
H_SWISH_KERN_FALLBACK(_func_suffix, val1, val2); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
_simd_type operator()(const _simd_type& src) const { \
auto val_zero = GiBroadcast##_func_suffix(0.f); \
auto val_six = GiBroadcast##_func_suffix(6.f); \
auto val_three = GiBroadcast##_func_suffix(3.f); \
auto val_rec_six = GiBroadcast##_func_suffix(1.f / 6.f); \
auto clip1 = GiMaximum##_func_suffix( \
GiMinimum##_func_suffix( \
GiAdd##_func_suffix(src, val_three), val_six), \
val_zero); \
return GiMultiply##_func_suffix( \
GiMultiply##_func_suffix(src, clip1), val_rec_six); \
} \
};
OP
(
dt_float32
,
GI_FLOAT32_t
,
GI_FLOAT32_V2_t
,
Float32
,
GI_SIMD_LEN_BYTE
/
sizeof
(
float
))
...
...
@@ -90,14 +94,23 @@ struct HSwishOp<dt_qint32, dt_qint8> : HSwishOpBase<dt_qint32, dt_qint8> {
}
GI_INT8_t
operator
()(
const
GI_INT32_V2_t
&
vsrc
)
const
{
auto
vitem0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc
.
val
[
0
]),
this
->
vscale_src
);
auto
vitem1
=
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc
.
val
[
1
]),
this
->
vscale_src
);
auto
vitem0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale_src
));
auto
vitem1
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale_src
));
H_SWISH_KERN_FALLBACK
(
Float32
,
vitem0
,
vitem1
);
vitem0
=
GiMultiplyFloat32
(
vitem0
,
this
->
vscale_dst
);
vitem1
=
GiMultiplyFloat32
(
vitem1
,
this
->
vscale_dst
);
vitem0
=
GiMultiplyFloat32
(
vitem0
,
GiFixLenType2GiFloat32Type
(
this
->
vscale_dst
));
vitem1
=
GiMultiplyFloat32
(
vitem1
,
GiFixLenType2GiFloat32Type
(
this
->
vscale_dst
));
GI_FLOAT32_V2_t
tmp
;
GiSetSubVectorFloat32V2
(
tmp
,
0
,
vitem0
);
GiSetSubVectorFloat32V2
(
tmp
,
1
,
vitem1
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
(
{{
vitem0
,
vitem1
}}
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
(
tmp
);
}
};
...
...
dnn/src/fallback/elemwise_helper/kimpl/max.h
浏览文件 @
8546c15d
...
...
@@ -32,14 +32,22 @@ struct MaxOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto vitem0 = GiMaximum##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiMaximum##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiMaximum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiMaximum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
...
...
@@ -87,12 +95,23 @@ struct MaxOp<dt_qint8, dt_qint8> : MaxOpBase<dt_qint8, dt_qint8> {
GI_INT8_t
operator
()(
const
GI_INT32_V2_t
&
vsrc0
,
const
GI_INT32_V2_t
&
vsrc1
)
const
{
auto
vitem0
=
GiMaximumFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
0
]),
this
->
vscale0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
0
]),
this
->
vscale1
));
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale1
)));
auto
vitem1
=
GiMaximumFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
1
]),
this
->
vscale0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
1
]),
this
->
vscale1
));
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
({{
vitem0
,
vitem1
}});
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale1
)));
GI_FLOAT32_V2_t
tmp
;
GiSetSubVectorFloat32V2
(
tmp
,
0
,
vitem0
);
GiSetSubVectorFloat32V2
(
tmp
,
1
,
vitem1
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
(
tmp
);
}
};
...
...
dnn/src/fallback/elemwise_helper/kimpl/min.h
浏览文件 @
8546c15d
...
...
@@ -33,14 +33,22 @@ struct MinOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto vitem0 = GiMinimum##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiMinimum##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiMinimum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiMinimum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
...
...
@@ -84,12 +92,23 @@ struct MinOp<dt_qint8, dt_qint8> : MinOpBase<dt_qint8, dt_qint8> {
GI_INT8_t
operator
()(
const
GI_INT32_V2_t
&
vsrc0
,
const
GI_INT32_V2_t
&
vsrc1
)
const
{
auto
vitem0
=
GiMinimumFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
0
]),
this
->
vscale0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
0
]),
this
->
vscale1
));
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale1
)));
auto
vitem1
=
GiMinimumFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
1
]),
this
->
vscale0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
1
]),
this
->
vscale1
));
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
({{
vitem0
,
vitem1
}});
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale1
)));
GI_FLOAT32_V2_t
tmp
;
GiSetSubVectorFloat32V2
(
tmp
,
0
,
vitem0
);
GiSetSubVectorFloat32V2
(
tmp
,
1
,
vitem1
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
(
tmp
);
}
};
...
...
dnn/src/fallback/elemwise_helper/kimpl/mul.h
浏览文件 @
8546c15d
...
...
@@ -33,14 +33,22 @@ struct MulOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto vitem0 = GiMultiply##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiMultiply##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiMultiply##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiMultiply##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
...
...
@@ -83,13 +91,24 @@ struct MulOp<dt_qint8, dt_qint8> : MulOpBase<dt_qint8, dt_qint8> {
GI_INT8_t
operator
()(
const
GI_INT32_V2_t
&
vsrc0
,
const
GI_INT32_V2_t
&
vsrc1
)
const
{
auto
vitem0
=
GiMultiplyFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
0
]),
this
->
vscale_src0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
0
]),
this
->
vscale1
));
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale_src0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale1
)));
auto
vitem1
=
GiMultiplyFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
1
]),
this
->
vscale_src0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
1
]),
this
->
vscale1
));
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale_src0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale1
)));
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
({{
vitem0
,
vitem1
}});
GI_FLOAT32_V2_t
tmp
;
GiSetSubVectorFloat32V2
(
tmp
,
0
,
vitem0
);
GiSetSubVectorFloat32V2
(
tmp
,
1
,
vitem1
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
(
tmp
);
}
};
...
...
dnn/src/fallback/elemwise_helper/kimpl/none.h
浏览文件 @
8546c15d
...
...
@@ -16,23 +16,24 @@ struct NoneOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template
<
typename
src_ctype
,
typename
dst_type
=
src_ctype
>
struct
NoneOp
;
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct NoneOp<_ctype> : NoneOpBase<_ctype> { \
NoneOp(){}; \
NoneOp(float, float){}; \
using NoneOpBase::NoneOpBase; \
using NoneOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
_simd_type2 operator()(const _simd_type2& src) const { return src; } \
void operator()(const _simd_type2& src, _ctype* dst) const { \
GiStore##_func_suffix(dst, src.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, src.val[1]); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
GiStore##_func_suffix(dst, src); \
} \
_simd_type operator()(const _simd_type& src) const { return src; } \
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct NoneOp<_ctype> : NoneOpBase<_ctype> { \
NoneOp(){}; \
NoneOp(float, float){}; \
using NoneOpBase::NoneOpBase; \
using NoneOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
_simd_type2 operator()(const _simd_type2& src) const { return src; } \
void operator()(const _simd_type2& src, _ctype* dst) const { \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(src, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(src, 1)); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
GiStore##_func_suffix(dst, src); \
} \
_simd_type operator()(const _simd_type& src) const { return src; } \
};
OP
(
dt_float32
,
GI_FLOAT32_t
,
GI_FLOAT32_V2_t
,
Float32
,
GI_SIMD_LEN_BYTE
/
sizeof
(
float
))
...
...
@@ -61,8 +62,8 @@ struct NoneOp<dt_qint32, dt_qint8> : NoneOpBase<dt_qint32, dt_qint8> {
constexpr
static
size_t
SIMD_WIDTH
=
GI_SIMD_LEN_BYTE
/
sizeof
(
int32_t
);
void
operator
()(
const
GI_INT32_V2_t
&
vsrc
,
dt_qint8
*
dst
)
const
{
GiStoreInt32
(
dst
,
vsrc
.
val
[
0
]
);
GiStoreInt32
(
dst
+
16
,
vsrc
.
val
[
1
]
);
GiStoreInt32
(
dst
,
GiGetSubVectorInt32V2
(
vsrc
,
0
)
);
GiStoreInt32
(
dst
+
16
,
GiGetSubVectorInt32V2
(
vsrc
,
1
)
);
}
void
operator
()(
const
GI_INT32_t
&
src
,
dt_qint8
*
dst
)
const
{
GiStoreInt32
(
dst
,
src
);
...
...
dnn/src/fallback/elemwise_helper/kimpl/op_base.h
浏览文件 @
8546c15d
此差异已折叠。
点击以展开。
dnn/src/fallback/elemwise_helper/kimpl/relu.h
浏览文件 @
8546c15d
...
...
@@ -20,37 +20,43 @@ struct ReluOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template
<
typename
src_ctype
,
typename
dst_type
=
src_ctype
>
struct
ReluOp
;
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width, zero) \
template <> \
struct ReluOp<_ctype> : ReluOpBase<_ctype> { \
using ReluOpBase::ReluOpBase; \
using ReluOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
auto vitem0 = GiMaximum##_func_suffix(src.val[0], zero); \
auto vitem1 = GiMaximum##_func_suffix(src.val[1], zero); \
return {{vitem0, vitem1}}; \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type operator()(const _simd_type& src) const { \
return GiMaximum##_func_suffix(src, zero); \
} \
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width, zero_num) \
template <> \
struct ReluOp<_ctype> : ReluOpBase<_ctype> { \
using ReluOpBase::ReluOpBase; \
using ReluOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
_simd_type zero = GiBroadcast##_func_suffix(zero_num); \
auto vitem0 = GiMaximum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src, 0), zero); \
auto vitem1 = GiMaximum##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src, 1), zero); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type operator()(const _simd_type& src) const { \
_simd_type zero = GiBroadcast##_func_suffix(zero_num); \
return GiMaximum##_func_suffix(src, zero); \
} \
};
OP
(
dt_float32
,
GI_FLOAT32_t
,
GI_FLOAT32_V2_t
,
Float32
,
GI_SIMD_LEN_BYTE
/
sizeof
(
float
),
vfzero
)
OP
(
dt_int32
,
GI_INT32_t
,
GI_INT32_V2_t
,
Int32
,
GI_SIMD_LEN_BYTE
/
sizeof
(
int32_t
),
vzero
)
OP
(
dt_int8
,
GI_INT8_t
,
GI_INT8_V2_t
,
Int8
,
GI_SIMD_LEN_BYTE
/
sizeof
(
int8_t
),
vzero_int8
)
0.0
f
)
OP
(
dt_int32
,
GI_INT32_t
,
GI_INT32_V2_t
,
Int32
,
GI_SIMD_LEN_BYTE
/
sizeof
(
int32_t
),
0
)
OP
(
dt_int8
,
GI_INT8_t
,
GI_INT8_V2_t
,
Int8
,
GI_SIMD_LEN_BYTE
/
sizeof
(
int8_t
),
0
)
#undef OP
template
<
>
...
...
@@ -76,11 +82,19 @@ struct ReluOp<dt_qint8, dt_qint8> : ReluOpBase<dt_qint8, dt_qint8> {
OPERATOR_UNARY_QINT8_FALLBACK
;
}
GI_INT8_t
operator
()(
const
GI_INT32_V2_t
&
vsrc
)
const
{
auto
vitem0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc
.
val
[
0
]),
this
->
vscale
);
auto
vitem1
=
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc
.
val
[
1
]),
this
->
vscale
);
GI_FLOAT32_t
vfzero
=
GiBroadcastFloat32
(
0.0
f
);
auto
vitem0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale
));
auto
vitem1
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale
));
vitem0
=
GiMaximumFloat32
(
vitem0
,
vfzero
);
vitem1
=
GiMaximumFloat32
(
vitem1
,
vfzero
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
({{
vitem0
,
vitem1
}});
GI_FLOAT32_V2_t
tmp
;
GiSetSubVectorFloat32V2
(
tmp
,
0
,
vitem0
);
GiSetSubVectorFloat32V2
(
tmp
,
1
,
vitem1
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
(
tmp
);
}
};
...
...
@@ -104,6 +118,8 @@ template <>
struct
ReluOp
<
dt_qint32
,
dt_qint8
>
:
ReluOpBase
<
dt_qint32
,
dt_qint8
>
,
FixupBase
{
using
ReluOpBase
::
operator
();
constexpr
static
size_t
SIMD_WIDTH
=
4
;
GI_INT32_t
vzero
=
GiBroadcastInt32
(
0
);
GI_FLOAT32_t
vfzero
=
GiBroadcastFloat32
(
0.0
f
);
ReluOp
(
DType
src_dtype
,
DType
dst_dtype
)
:
ReluOpBase
(
src_dtype
,
dst_dtype
),
FixupBase
(
scale
)
{}
...
...
@@ -115,8 +131,8 @@ struct ReluOp<dt_qint32, dt_qint8> : ReluOpBase<dt_qint32, dt_qint8>, FixupBase
vst1_s8
(
reinterpret_cast
<
int8_t
*>
(
dst
),
vget_low_s8
(
operator
()(
vsrc
)));
}
int8x16_t
operator
()(
const
int32x4x2_t
&
vsrc
)
const
{
int32x4_t
vitem0
=
vqrdmulhq_s32
(
vsrc
.
val
[
0
]
,
vmultiplier
);
int32x4_t
vitem1
=
vqrdmulhq_s32
(
vsrc
.
val
[
1
]
,
vmultiplier
);
int32x4_t
vitem0
=
vqrdmulhq_s32
(
GiGetSubVectorInt32V2
(
vsrc
,
0
)
,
vmultiplier
);
int32x4_t
vitem1
=
vqrdmulhq_s32
(
GiGetSubVectorInt32V2
(
vsrc
,
1
)
,
vmultiplier
);
vitem0
=
vmaxq_s32
(
vitem0
,
vzero
);
vitem1
=
vmaxq_s32
(
vitem1
,
vzero
);
auto
tmp
=
vqmovn_s16
(
vcombine_s16
(
...
...
@@ -158,24 +174,36 @@ struct ReluOp<dt_qint32, dt_qint8> : ReluOpBase<dt_qint32, dt_qint8> {
}
void
operator
()(
const
GI_INT32_t
&
src
,
dt_qint8
*
dst
)
const
{
GiStoreLane0Int32
(
reinterpret_cast
<
int32_t
*>
(
dst
),
(
GI_INT32_t
)(
operator
()(
src
)));
reinterpret_cast
<
int32_t
*>
(
dst
),
GiReinterpretInt8AsInt32
(
operator
()(
src
)));
}
GI_INT8_t
operator
()(
const
GI_INT32_V2_t
&
vsrc
)
const
{
auto
vitem0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc
.
val
[
0
]),
this
->
vscale
);
auto
vitem1
=
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc
.
val
[
1
]),
this
->
vscale
);
auto
vitem0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale
));
auto
vitem1
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale
));
GI_FLOAT32_t
vfzero
=
GiBroadcastFloat32
(
0.0
f
);
vitem0
=
GiMaximumFloat32
(
vitem0
,
vfzero
);
vitem1
=
GiMaximumFloat32
(
vitem1
,
vfzero
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
({{
vitem0
,
vitem1
}});
GI_FLOAT32_V2_t
tmp
;
GiSetSubVectorFloat32V2
(
tmp
,
0
,
vitem0
);
GiSetSubVectorFloat32V2
(
tmp
,
1
,
vitem1
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
(
tmp
);
}
GI_INT8_t
operator
()(
const
GI_INT32_t
&
src
)
const
{
auto
vitem0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
src
),
this
->
vscale
);
auto
vitem0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
src
),
GiFixLenType2GiFloat32Type
(
this
->
vscale
));
GI_FLOAT32_t
vfzero
=
GiBroadcastFloat32
(
0.0
f
);
vitem0
=
GiMaximumFloat32
(
vitem0
,
vfzero
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_t
>
(
vitem0
);
}
GI_INT8_t
operator
()(
const
GI_FLOAT32_t
&
src
)
const
{
auto
vitem0
=
GiMultiplyFloat32
(
src
,
this
->
vscale
);
auto
vitem0
=
GiMultiplyFloat32
(
src
,
GiFixLenType2GiFloat32Type
(
this
->
vscale
));
GI_FLOAT32_t
vfzero
=
GiBroadcastFloat32
(
0.0
f
);
vitem0
=
GiMaximumFloat32
(
vitem0
,
vfzero
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_t
>
(
vitem0
);
}
...
...
dnn/src/fallback/elemwise_helper/kimpl/sigmoid.h
浏览文件 @
8546c15d
...
...
@@ -25,27 +25,33 @@ struct SigmoidOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template
<
typename
src_ctype
,
typename
dst_ctype
=
src_ctype
>
struct
SigmoidOp
;
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct SigmoidOp<_ctype> : SigmoidOpBase<_ctype> { \
using SigmoidOpBase::SigmoidOpBase; \
using SigmoidOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
return {{operator()(src.val[0]), operator()(src.val[1])}}; \
} \
_simd_type operator()(const _simd_type& src) const { \
return GiSigmoidPs##_func_suffix(src); \
} \
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct SigmoidOp<_ctype> : SigmoidOpBase<_ctype> { \
using SigmoidOpBase::SigmoidOpBase; \
using SigmoidOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
void operator()(const _simd_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2( \
ret, 0, operator()(GiGetSubVector##_func_suffix##V2(src, 0))); \
GiSetSubVector##_func_suffix##V2( \
ret, 1, operator()(GiGetSubVector##_func_suffix##V2(src, 1))); \
return ret; \
} \
_simd_type operator()(const _simd_type& src) const { \
return GiSigmoidPs##_func_suffix(src); \
} \
};
OP
(
dt_float32
,
GI_FLOAT32_t
,
GI_FLOAT32_V2_t
,
Float32
,
GI_SIMD_LEN_BYTE
/
sizeof
(
float
))
#undef OP
...
...
dnn/src/fallback/elemwise_helper/kimpl/sub.h
浏览文件 @
8546c15d
...
...
@@ -33,14 +33,22 @@ struct SubOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto vitem0 = GiSubtract##_func_suffix(src0.val[0], src1.val[0]); \
auto vitem1 = GiSubtract##_func_suffix(src0.val[1], src1.val[1]); \
return {{vitem0, vitem1}}; \
auto vitem0 = GiSubtract##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 0), \
GiGetSubVector##_func_suffix##V2(src1, 0)); \
auto vitem1 = GiSubtract##_func_suffix( \
GiGetSubVector##_func_suffix##V2(src0, 1), \
GiGetSubVector##_func_suffix##V2(src1, 1)); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, vitem0); \
GiSetSubVector##_func_suffix##V2(ret, 1, vitem1); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
...
...
@@ -82,12 +90,23 @@ struct SubOp<dt_qint8, dt_qint8> : SubOpBase<dt_qint8, dt_qint8> {
}
GI_INT8_t
operator
()(
const
GI_INT32_V2_t
&
vsrc0
,
const
GI_INT32_V2_t
&
vsrc1
)
const
{
auto
vitem0
=
GiSubtractFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
0
]),
this
->
vscale0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
0
]),
this
->
vscale1
));
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale1
)));
auto
vitem1
=
GiSubtractFloat32
(
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc0
.
val
[
1
]),
this
->
vscale0
),
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc1
.
val
[
1
]),
this
->
vscale1
));
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
({{
vitem0
,
vitem1
}});
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc0
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale0
)),
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc1
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale1
)));
GI_FLOAT32_V2_t
tmp
;
GiSetSubVectorFloat32V2
(
tmp
,
0
,
vitem0
);
GiSetSubVectorFloat32V2
(
tmp
,
1
,
vitem1
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
(
tmp
);
}
};
...
...
dnn/src/fallback/elemwise_helper/kimpl/tanh.h
浏览文件 @
8546c15d
...
...
@@ -23,54 +23,58 @@ struct TanhOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template
<
typename
src_ctype
,
typename
dst_type
=
src_ctype
>
struct
TanhOp
;
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct TanhOp<_ctype> : TanhOpBase<_ctype> { \
using TanhOpBase::TanhOpBase; \
using TanhOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
auto one_val = GiBroadcast##_func_suffix(1.f); \
auto two_val = GiBroadcast##_func_suffix(2.f); \
auto val1 = src.val[0]; \
auto val2 = src.val[1]; \
val1 = GiMultiply##_func_suffix(two_val, val1); \
val2 = GiMultiply##_func_suffix(two_val, val2); \
val1 = GiExpPs##_func_suffix(val1); \
val2 = GiExpPs##_func_suffix(val2); \
val1 = GiAdd##_func_suffix(one_val, val1); \
val2 = GiAdd##_func_suffix(one_val, val2); \
auto rval1 = GiRecpe##_func_suffix(val1); \
auto rval2 = GiRecpe##_func_suffix(val2); \
rval1 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val1, rval1), rval1); \
rval2 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val2, rval2), rval2); \
val1 = GiMultiply##_func_suffix(two_val, rval1); \
val2 = GiMultiply##_func_suffix(two_val, rval2); \
val1 = GiSubtract##_func_suffix(one_val, val1); \
val2 = GiSubtract##_func_suffix(one_val, val2); \
return {{val1, val2}}; \
} \
_simd_type operator()(const _simd_type& src) const { \
auto one_val = GiBroadcast##_func_suffix(1.f); \
auto two_val = GiBroadcast##_func_suffix(2.f); \
auto val1 = src; \
val1 = GiMultiply##_func_suffix(two_val, val1); \
val1 = GiExpPs##_func_suffix(val1); \
val1 = GiAdd##_func_suffix(one_val, val1); \
auto rval1 = GiRecpe##_func_suffix(val1); \
rval1 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val1, rval1), rval1); \
val1 = GiMultiply##_func_suffix(two_val, rval1); \
val1 = GiSubtract##_func_suffix(one_val, val1); \
return val1; \
} \
#define OP(_ctype, _simd_type, _simd_type2, _func_suffix, _simd_width) \
template <> \
struct TanhOp<_ctype> : TanhOpBase<_ctype> { \
using TanhOpBase::TanhOpBase; \
using TanhOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _simd_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()(const _simd_type2& src) const { \
auto one_val = GiBroadcast##_func_suffix(1.f); \
auto two_val = GiBroadcast##_func_suffix(2.f); \
auto val1 = GiGetSubVector##_func_suffix##V2(src, 0); \
auto val2 = GiGetSubVector##_func_suffix##V2(src, 1); \
val1 = GiMultiply##_func_suffix(two_val, val1); \
val2 = GiMultiply##_func_suffix(two_val, val2); \
val1 = GiExpPs##_func_suffix(val1); \
val2 = GiExpPs##_func_suffix(val2); \
val1 = GiAdd##_func_suffix(one_val, val1); \
val2 = GiAdd##_func_suffix(one_val, val2); \
auto rval1 = GiRecpe##_func_suffix(val1); \
auto rval2 = GiRecpe##_func_suffix(val2); \
rval1 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val1, rval1), rval1); \
rval2 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val2, rval2), rval2); \
val1 = GiMultiply##_func_suffix(two_val, rval1); \
val2 = GiMultiply##_func_suffix(two_val, rval2); \
val1 = GiSubtract##_func_suffix(one_val, val1); \
val2 = GiSubtract##_func_suffix(one_val, val2); \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
_simd_type operator()(const _simd_type& src) const { \
auto one_val = GiBroadcast##_func_suffix(1.f); \
auto two_val = GiBroadcast##_func_suffix(2.f); \
auto val1 = src; \
val1 = GiMultiply##_func_suffix(two_val, val1); \
val1 = GiExpPs##_func_suffix(val1); \
val1 = GiAdd##_func_suffix(one_val, val1); \
auto rval1 = GiRecpe##_func_suffix(val1); \
rval1 = GiMultiply##_func_suffix( \
GiRecpeS##_func_suffix(val1, rval1), rval1); \
val1 = GiMultiply##_func_suffix(two_val, rval1); \
val1 = GiSubtract##_func_suffix(one_val, val1); \
return val1; \
} \
};
OP
(
dt_float32
,
GI_FLOAT32_t
,
GI_FLOAT32_V2_t
,
Float32
,
GI_SIMD_LEN_BYTE
/
sizeof
(
float
))
#undef OP
...
...
dnn/src/fallback/elemwise_helper/kimpl/true_div.h
浏览文件 @
8546c15d
...
...
@@ -36,18 +36,22 @@ struct TrueDivOp;
const _simd_type2& src0, const _simd_type2& src1, \
dst_ctype* dst) const { \
auto vitem = operator()(src0, src1); \
GiStore##_func_suffix(dst, vitem.val[0]); \
GiStore##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
GiStore##_func_suffix(dst, GiGetSubVector##_func_suffix##V2(vitem, 0)); \
GiStore##_func_suffix( \
dst + SIMD_WIDTH, GiGetSubVector##_func_suffix##V2(vitem, 1)); \
} \
_simd_type2 operator()( \
const _simd_type2& src0, const _simd_type2& src1) const { \
auto val1 =
src0.val[0];
\
auto val2 =
src0.val[1];
\
auto val3 =
src1.val[0];
\
auto val4 =
src1.val[1];
\
auto val1 =
GiGetSubVector##_func_suffix##V2(src0, 0);
\
auto val2 =
GiGetSubVector##_func_suffix##V2(src0, 1);
\
auto val3 =
GiGetSubVector##_func_suffix##V2(src1, 0);
\
auto val4 =
GiGetSubVector##_func_suffix##V2(src1, 1);
\
val1 = GiDivide##_func_suffix(val1, val3); \
val2 = GiDivide##_func_suffix(val2, val4); \
return {{val1, val2}}; \
_simd_type2 ret; \
GiSetSubVector##_func_suffix##V2(ret, 0, val1); \
GiSetSubVector##_func_suffix##V2(ret, 1, val2); \
return ret; \
} \
void operator()( \
const _simd_type& src0, const _simd_type& src1, \
...
...
dnn/src/fallback/elemwise_helper/kimpl/typecvt.h
浏览文件 @
8546c15d
...
...
@@ -21,7 +21,8 @@ struct TypeCvtOp<dt_qint32, dt_qint8> : UnaryOpBase<dt_qint32, dt_qint8> {
}
void
operator
()(
const
GI_INT32_t
&
vsrc
,
dt_qint8
*
dst
)
const
{
GiStoreLane0Int32
(
reinterpret_cast
<
int32_t
*>
(
dst
),
(
GI_INT32_t
)(
operator
()(
vsrc
)));
reinterpret_cast
<
int32_t
*>
(
dst
),
GiReinterpretInt8AsInt32
(
operator
()(
vsrc
)));
}
void
operator
()(
const
src_ctype
&
src
,
dst_ctype
*
dst
)
const
{
*
dst
=
operator
()(
src
);
...
...
@@ -32,17 +33,25 @@ struct TypeCvtOp<dt_qint32, dt_qint8> : UnaryOpBase<dt_qint32, dt_qint8> {
}
GI_INT8_t
operator
()(
const
GI_INT32_V2_t
&
vsrc
)
const
{
auto
vitem0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc
.
val
[
0
]),
this
->
vscale
);
auto
vitem1
=
GiMultiplyFloat32
(
GiCastToFloat32
(
vsrc
.
val
[
1
]),
this
->
vscale
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
({{
vitem0
,
vitem1
}});
auto
vitem0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc
,
0
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale
));
auto
vitem1
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiGetSubVectorInt32V2
(
vsrc
,
1
)),
GiFixLenType2GiFloat32Type
(
this
->
vscale
));
GI_FLOAT32_V2_t
tmp
;
GiSetSubVectorFloat32V2
(
tmp
,
0
,
vitem0
);
GiSetSubVectorFloat32V2
(
tmp
,
1
,
vitem1
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
(
tmp
);
}
GI_INT8_t
operator
()(
const
GI_INT32_t
&
src
)
const
{
auto
vitem0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
src
),
this
->
vscale
);
auto
vitem0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
src
),
GiFixLenType2GiFloat32Type
(
this
->
vscale
));
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_t
>
(
vitem0
);
}
GI_INT8_t
operator
()(
const
GI_FLOAT32_t
&
src
)
const
{
auto
vitem0
=
GiMultiplyFloat32
(
src
,
this
->
vscale
);
auto
vitem0
=
GiMultiplyFloat32
(
src
,
GiFixLenType2GiFloat32Type
(
this
->
vscale
)
);
return
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_t
>
(
vitem0
);
}
};
...
...
dnn/src/fallback/elemwise_helper/op_common.h
浏览文件 @
8546c15d
此差异已折叠。
点击以展开。
dnn/src/fallback/gi_intrinsic_helper.h
浏览文件 @
8546c15d
...
...
@@ -11,8 +11,9 @@ struct LoadHelper {
static
GI_FORCEINLINE
void
impl
(
T
&
weight
,
T2
ptr
,
int
oc_offset
,
XT
...
args
);
};
#define WEIGHT_CB(step) \
src[step] = Func::impl(ptr + base_offset + step * ptr_step, args...);
#define WEIGHT_CB(step) \
src[step] = GiFloat32Type2FixLenType( \
Func::impl(ptr + base_offset + step * ptr_step, args...));
#define LOAD_HELPER(step) \
template < \
...
...
dnn/src/fallback/quantized_converter.h
浏览文件 @
8546c15d
...
...
@@ -38,7 +38,13 @@ template <>
inline
GI_FLOAT32_V2_t
QConverter
::
convert
(
const
GI_INT16_t
&
vsrc
)
{
GI_INT32_t
vhi
=
GiMoveHighLongInt16
(
vsrc
);
GI_INT32_t
vlo
=
GiMoveLowLongInt16
(
vsrc
);
return
{{
GiCastToFloat32
(
vlo
),
GiCastToFloat32
(
vhi
)}};
GI_FLOAT32_t
fhi
=
GiCastToFloat32
(
vhi
);
GI_FLOAT32_t
flo
=
GiCastToFloat32
(
vlo
);
GI_FLOAT32_V2_t
ret
;
GiSetSubVectorFloat32V2
(
ret
,
0
,
flo
);
GiSetSubVectorFloat32V2
(
ret
,
1
,
fhi
);
return
ret
;
}
template
<
>
...
...
dnn/src/fallback/reduce/reducer.h
浏览文件 @
8546c15d
此差异已折叠。
点击以展开。
dnn/src/fallback/type_cvt/typecvt_helper.h
浏览文件 @
8546c15d
...
...
@@ -18,22 +18,26 @@ struct QuantizedTypeCvter<int32_t, int8_t> {
static
constexpr
size_t
SIMD_WIDTH
=
GI_SIMD_LEN_BYTE
/
sizeof
(
int32_t
)
*
2
;
static
constexpr
size_t
SIMD_STEP
=
GI_SIMD_LEN_BYTE
/
sizeof
(
int32_t
);
float
scale
;
GI_FLOAT32_t
vscale
;
GI_FLOAT32_
FIXLEN_
t
vscale
;
QuantizedTypeCvter
(
DType
src_dtype
,
DType
dst_dtype
)
{
float
src_scale
=
src_dtype
.
param
<
dtype
::
QuantizedS32
>
().
scale
;
float
dst_scale
=
dst_dtype
.
param
<
dtype
::
QuantizedS8
>
().
scale
;
scale
=
src_scale
/
dst_scale
;
vscale
=
Gi
BroadcastFloat32
(
scale
);
vscale
=
Gi
Float32Type2FixLenType
(
GiBroadcastFloat32
(
scale
)
);
}
void
cvt
(
const
int32_t
*
src
,
int8_t
*
dst
)
{
GI_FLOAT32_t
vitem0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiLoadInt32
(
src
)),
vscale
);
GI_FLOAT32_t
vitem1
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiLoadInt32
(
src
+
SIMD_STEP
)),
vscale
);
auto
vres
=
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
({{
vitem0
,
vitem1
}});
GI_FLOAT32_t
t
;
t
=
GiFixLenType2GiFloat32Type
(
vscale
);
GI_FLOAT32_t
vitem0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiLoadInt32
(
src
)),
t
);
GI_FLOAT32_t
vitem1
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiLoadInt32
(
src
+
SIMD_STEP
)),
t
);
GI_FLOAT32_V2_t
v2
;
GiSetSubVectorFloat32V2
(
v2
,
0
,
vitem0
);
GiSetSubVectorFloat32V2
(
v2
,
1
,
vitem1
);
auto
vres
=
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
(
v2
);
GiStoreLowInt8
(
dst
,
vres
);
}
...
...
@@ -48,27 +52,29 @@ struct QuantizedTypeCvter<int8_t, int32_t> {
using
dst_type
=
int32_t
;
static
constexpr
size_t
SIMD_WIDTH
=
GI_SIMD_LEN_BYTE
/
sizeof
(
int8_t
);
float
scale
;
GI_FLOAT32_t
vscale
;
GI_FLOAT32_
FIXLEN_
t
vscale
;
QuantizedTypeCvter
(
DType
src_dtype
,
DType
dst_dtype
)
{
float
src_scale
=
src_dtype
.
param
<
dtype
::
QuantizedS8
>
().
scale
;
float
dst_scale
=
dst_dtype
.
param
<
dtype
::
QuantizedS32
>
().
scale
;
scale
=
src_scale
/
dst_scale
;
vscale
=
Gi
BroadcastFloat32
(
scale
);
vscale
=
Gi
Float32Type2FixLenType
(
GiBroadcastFloat32
(
scale
)
);
}
void
cvt
(
const
int8_t
*
src
,
int32_t
*
dst
)
{
GI_FLOAT32_t
t
;
t
=
GiFixLenType2GiFloat32Type
(
vscale
);
GI_INT8_t
data
=
GiLoadInt8
(
src
);
GI_INT16_t
vitem0
=
GiMoveLowLongInt8
(
data
);
GI_INT16_t
vitem1
=
GiMoveHighLongInt8
(
data
);
auto
vret0
=
QConverter
::
round
<
GI_INT32_t
,
GI_FLOAT32_t
>
(
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveLowLongInt16
(
vitem0
)),
vscale
));
auto
vret1
=
QConverter
::
round
<
GI_INT32_t
,
GI_FLOAT32_t
>
(
GiMultiplyFloat32
(
Gi
CastToFloat32
(
GiMoveHighLongInt16
(
vitem0
)),
vscale
));
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveLowLongInt16
(
vitem0
)),
t
));
auto
vret1
=
QConverter
::
round
<
GI_INT32_t
,
GI_FLOAT32_t
>
(
Gi
MultiplyFloat32
(
GiCastToFloat32
(
GiMoveHighLongInt16
(
vitem0
)),
t
));
auto
vret2
=
QConverter
::
round
<
GI_INT32_t
,
GI_FLOAT32_t
>
(
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveLowLongInt16
(
vitem1
)),
vscale
));
auto
vret3
=
QConverter
::
round
<
GI_INT32_t
,
GI_FLOAT32_t
>
(
GiMultiplyFloat32
(
Gi
CastToFloat32
(
GiMoveHighLongInt16
(
vitem1
)),
vscale
));
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveLowLongInt16
(
vitem1
)),
t
));
auto
vret3
=
QConverter
::
round
<
GI_INT32_t
,
GI_FLOAT32_t
>
(
Gi
MultiplyFloat32
(
GiCastToFloat32
(
GiMoveHighLongInt16
(
vitem1
)),
t
));
constexpr
size_t
step
=
GI_SIMD_LEN_BYTE
/
sizeof
(
int32_t
);
GiStoreInt32
(
dst
,
vret0
);
...
...
@@ -90,21 +96,26 @@ struct QuantizedTypeCvter<float, int8_t> {
static
constexpr
size_t
SIMD_WIDTH
=
GI_SIMD_LEN_BYTE
/
sizeof
(
float
)
*
2
;
static
constexpr
size_t
SIMD_STEP
=
GI_SIMD_LEN_BYTE
/
sizeof
(
float
);
float
scale
;
GI_FLOAT32_t
vscale
;
GI_FLOAT32_
FIXLEN_
t
vscale
;
QuantizedTypeCvter
(
DType
src_dtype
,
DType
dst_dtype
)
{
MEGDNN_MARK_USED_VAR
(
src_dtype
);
float
src_scale
=
1
;
float
dst_scale
=
dst_dtype
.
param
<
dtype
::
QuantizedS8
>
().
scale
;
scale
=
src_scale
/
dst_scale
;
vscale
=
Gi
BroadcastFloat32
(
scale
);
vscale
=
Gi
Float32Type2FixLenType
(
GiBroadcastFloat32
(
scale
)
);
}
void
cvt
(
const
float
*
src
,
int8_t
*
dst
)
{
GI_FLOAT32_t
vitem0
=
GiMultiplyFloat32
(
GiLoadFloat32
(
src
),
vscale
);
GI_FLOAT32_t
vitem1
=
GiMultiplyFloat32
(
GiLoadFloat32
(
src
+
SIMD_STEP
),
vscale
);
auto
vres
=
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
({{
vitem0
,
vitem1
}});
GI_FLOAT32_t
t
;
t
=
GiFixLenType2GiFloat32Type
(
vscale
);
GI_FLOAT32_t
vitem0
=
GiMultiplyFloat32
(
GiLoadFloat32
(
src
),
t
);
GI_FLOAT32_t
vitem1
=
GiMultiplyFloat32
(
GiLoadFloat32
(
src
+
SIMD_STEP
),
t
);
GI_FLOAT32_V2_t
v2
;
GiSetSubVectorFloat32V2
(
v2
,
0
,
vitem0
);
GiSetSubVectorFloat32V2
(
v2
,
1
,
vitem1
);
auto
vres
=
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V2_t
>
(
v2
);
GiStoreLowInt8
(
dst
,
vres
);
}
...
...
@@ -119,18 +130,19 @@ struct QuantizedTypeCvter<int32_t, int32_t> {
using
dst_type
=
int32_t
;
static
constexpr
size_t
SIMD_WIDTH
=
GI_SIMD_LEN_BYTE
/
sizeof
(
int32_t
);
float
scale
;
GI_FLOAT32_t
vscale
;
GI_FLOAT32_
FIXLEN_
t
vscale
;
QuantizedTypeCvter
(
DType
src_dtype
,
DType
dst_dtype
)
{
float
src_scale
=
src_dtype
.
param
<
dtype
::
QuantizedS32
>
().
scale
;
float
dst_scale
=
dst_dtype
.
param
<
dtype
::
QuantizedS32
>
().
scale
;
scale
=
src_scale
/
dst_scale
;
vscale
=
Gi
BroadcastFloat32
(
scale
);
vscale
=
Gi
Float32Type2FixLenType
(
GiBroadcastFloat32
(
scale
)
);
}
void
cvt
(
const
int32_t
*
src
,
int32_t
*
dst
)
{
GI_FLOAT32_t
vitem
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiLoadInt32
(
src
)),
vscale
);
GI_FLOAT32_t
t
;
t
=
GiFixLenType2GiFloat32Type
(
vscale
);
GI_FLOAT32_t
vitem
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiLoadInt32
(
src
)),
t
);
auto
vres
=
QConverter
::
round
<
GI_INT32_t
,
GI_FLOAT32_t
>
(
vitem
);
GiStoreInt32
(
dst
,
vres
);
...
...
@@ -148,30 +160,32 @@ struct QuantizedTypeCvter<int8_t, int8_t> {
using
dst_type
=
int8_t
;
static
constexpr
size_t
SIMD_WIDTH
=
GI_SIMD_LEN_BYTE
/
sizeof
(
int8_t
);
float
scale
;
GI_FLOAT32_t
vscale
;
GI_FLOAT32_
FIXLEN_
t
vscale
;
QuantizedTypeCvter
(
DType
src_dtype
,
DType
dst_dtype
)
{
float
src_scale
=
src_dtype
.
param
<
dtype
::
QuantizedS8
>
().
scale
;
float
dst_scale
=
dst_dtype
.
param
<
dtype
::
QuantizedS8
>
().
scale
;
scale
=
src_scale
/
dst_scale
;
vscale
=
Gi
BroadcastFloat32
(
scale
);
vscale
=
Gi
Float32Type2FixLenType
(
GiBroadcastFloat32
(
scale
)
);
}
void
cvt
(
const
int8_t
*
src
,
int8_t
*
dst
)
{
GI_FLOAT32_t
t
;
t
=
GiFixLenType2GiFloat32Type
(
vscale
);
GI_INT8_t
data
=
GiLoadInt8
(
src
);
GI_INT16_t
vitem0
=
GiMoveLowLongInt8
(
data
);
GI_INT16_t
vitem1
=
GiMoveHighLongInt8
(
data
);
auto
vret0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveLowLongInt16
(
vitem0
)),
vscale
);
auto
vret
1
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveHighLongInt16
(
vitem0
)),
vscale
);
auto
vret2
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveLowLongInt16
(
vitem1
)),
vscale
)
;
auto
vret3
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveHighLongInt16
(
vitem1
)),
vscale
);
auto
vres
=
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V4_t
>
(
{{
vret0
,
vret1
,
vret2
,
vret3
}}
);
auto
vret0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveLowLongInt16
(
vitem0
)),
t
);
auto
vret1
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveHighLongInt16
(
vitem0
)),
t
);
auto
vret
2
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveLowLongInt16
(
vitem1
)),
t
);
auto
vret3
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveHighLongInt16
(
vitem1
)),
t
);
GI_FLOAT32_V4_t
v4
;
GiSetSubVectorFloat32V4
(
v4
,
0
,
vret0
);
GiSetSubVectorFloat32V4
(
v4
,
1
,
vret1
);
GiSetSubVectorFloat32V4
(
v4
,
2
,
vret2
);
GiSetSubVectorFloat32V4
(
v4
,
3
,
vret3
);
auto
vres
=
QConverter
::
convert
<
GI_INT8_t
,
GI_FLOAT32_V4_t
>
(
v4
);
GiStoreInt8
(
dst
,
vres
);
}
...
...
@@ -245,26 +259,24 @@ struct Quan2FloatTypeCvter<int8_t, float> {
static
constexpr
size_t
SIMD_WIDTH
=
GI_SIMD_LEN_BYTE
/
sizeof
(
int8_t
);
static
constexpr
size_t
SIMD_STEP
=
GI_SIMD_LEN_BYTE
/
sizeof
(
float
);
float
_scale
=
0.0
f
;
GI_FLOAT32_t
vscale
;
GI_FLOAT32_
FIXLEN_
t
vscale
;
Quan2FloatTypeCvter
(
DType
src_dtype
,
DType
dst_dtype
)
{
_scale
=
src_dtype
.
param
<
dtype
::
QuantizedS8
>
().
scale
;
vscale
=
Gi
BroadcastFloat32
(
_scale
);
vscale
=
Gi
Float32Type2FixLenType
(
GiBroadcastFloat32
(
_scale
)
);
MEGDNN_MARK_USED_VAR
(
dst_dtype
);
}
void
cvt
(
const
int8_t
*
src
,
float
*
dst
)
{
GI_FLOAT32_t
t
;
t
=
GiFixLenType2GiFloat32Type
(
vscale
);
GI_INT8_t
data
=
GiLoadInt8
(
src
);
GI_INT16_t
vitem0
=
GiMoveLowLongInt8
(
data
);
GI_INT16_t
vitem1
=
GiMoveHighLongInt8
(
data
);
auto
vret0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveLowLongInt16
(
vitem0
)),
vscale
);
auto
vret1
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveHighLongInt16
(
vitem0
)),
vscale
);
auto
vret2
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveLowLongInt16
(
vitem1
)),
vscale
);
auto
vret3
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveHighLongInt16
(
vitem1
)),
vscale
);
auto
vret0
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveLowLongInt16
(
vitem0
)),
t
);
auto
vret1
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveHighLongInt16
(
vitem0
)),
t
);
auto
vret2
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveLowLongInt16
(
vitem1
)),
t
);
auto
vret3
=
GiMultiplyFloat32
(
GiCastToFloat32
(
GiMoveHighLongInt16
(
vitem1
)),
t
);
GiStoreFloat32
(
dst
,
vret0
);
GiStoreFloat32
(
dst
+
SIMD_STEP
,
vret1
);
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录