Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
MegEngine 天元
MegEngine
提交
23c1fda7
MegEngine
项目概览
MegEngine 天元
/
MegEngine
1 年多 前同步成功
通知
403
Star
4705
Fork
582
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
MegEngine
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
提交
23c1fda7
编写于
10月 25, 2021
作者:
M
Megvii Engine Team
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
perf(arm_common): optimize sigmoid
GitOrigin-RevId: 7cb248a15b447f4fbb0008e419cdf07cf6387309
上级
b20cda6b
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
114 addition
and
41 deletion
+114
-41
dnn/src/arm_common/elemwise/neon_mathfun.cpp
dnn/src/arm_common/elemwise/neon_mathfun.cpp
+63
-0
dnn/src/arm_common/elemwise/neon_mathfun.h
dnn/src/arm_common/elemwise/neon_mathfun.h
+27
-0
dnn/src/arm_common/elemwise_helper/kimpl/fuse_add_sigmoid.h
dnn/src/arm_common/elemwise_helper/kimpl/fuse_add_sigmoid.h
+2
-12
dnn/src/arm_common/elemwise_helper/kimpl/sigmoid.h
dnn/src/arm_common/elemwise_helper/kimpl/sigmoid.h
+21
-28
imperative/python/test/unit/utils/test_network.py
imperative/python/test/unit/utils/test_network.py
+1
-1
未找到文件。
dnn/src/arm_common/elemwise/neon_mathfun.cpp
浏览文件 @
23c1fda7
...
...
@@ -371,6 +371,69 @@ v4sf tan_ps_f32(v4sf x) {
#undef c_cephes_log_q1
#undef c_cephes_log_q2
/*!
 * Constant table consumed by sigmoid_ps_f32.
 *
 * The members are listed in the exact order of the aggregate initializer
 * below, so the two must be kept in sync when either is edited.
 */
static const struct {
    //! clamp bounds applied to the input before the polynomials are evaluated
    float lower_range, upper_range;
    //! numerator coefficients; the suffix is the power of x the term ends up
    //! multiplying once the numerator has been scaled by x
    float alpha_9, alpha_7, alpha_5, alpha_3, alpha_1;
    //! denominator coefficients; the suffix is the power of x of the term
    float beta_10, beta_8, beta_6, beta_4, beta_2, beta_0;
    //! offset added to the quotient to form the final result
    float one_half;
} sigmoid_constants = {
        // lower_range, upper_range
        -18.0f,
        18.0f,
        // alpha_9 .. alpha_1
        4.37031012579801e-11f,
        1.15627324459942e-07f,
        6.08574864600143e-05f,
        8.51377133304701e-03f,
        2.48287947061529e-01f,
        // beta_10 .. beta_0
        6.10247389755681e-13f,
        5.76102136993427e-09f,
        6.29106785017040e-06f,
        1.70198817374094e-03f,
        1.16817656904453e-01f,
        9.93151921023180e-01f,
        // one_half
        0.5f,
};
/*!
 * \brief Sigmoid of four packed floats via a rational approximation.
 *
 * The input is clamped to [lower_range, upper_range]. An odd numerator
 * polynomial (alpha_* coefficients) and an even denominator polynomial
 * (beta_* coefficients) are then evaluated in x^2 with Horner's scheme, and
 * the result is numerator / denominator + one_half.
 *
 * \param src four input lanes
 * \return the approximated sigmoid of every lane
 */
v4sf sigmoid_ps_f32(v4sf src) {
    const auto& c = sigmoid_constants;

    // Clamp first from below, then from above.
    const auto clamped = vminq_f32(
            vdupq_n_f32(c.upper_range),
            vmaxq_f32(vdupq_n_f32(c.lower_range), src));
    const auto x2 = vmulq_f32(clamped, clamped);

    // Numerator: Horner evaluation in x^2, then one multiply by x makes the
    // polynomial odd. vmlaq_f32(a, b, c) yields a + b * c.
    auto numer = vmlaq_f32(vdupq_n_f32(c.alpha_7), x2, vdupq_n_f32(c.alpha_9));
    numer = vmlaq_f32(vdupq_n_f32(c.alpha_5), numer, x2);
    numer = vmlaq_f32(vdupq_n_f32(c.alpha_3), numer, x2);
    numer = vmlaq_f32(vdupq_n_f32(c.alpha_1), numer, x2);
    numer = vmulq_f32(numer, clamped);

    // Denominator: even polynomial, Horner evaluation in x^2.
    auto denom = vmlaq_f32(vdupq_n_f32(c.beta_8), x2, vdupq_n_f32(c.beta_10));
    denom = vmlaq_f32(vdupq_n_f32(c.beta_6), denom, x2);
    denom = vmlaq_f32(vdupq_n_f32(c.beta_4), denom, x2);
    denom = vmlaq_f32(vdupq_n_f32(c.beta_2), denom, x2);
    denom = vmlaq_f32(vdupq_n_f32(c.beta_0), denom, x2);

    const auto quotient = div_ps_f32(numer, denom);
    return vaddq_f32(quotient, vdupq_n_f32(c.one_half));
}
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/*!
 * \brief Sigmoid of eight packed half-precision values.
 *
 * Each four-lane half of the input is widened to float32, evaluated with
 * sigmoid_ps_f32 and narrowed back to float16; the two halves are then
 * recombined in their original order.
 *
 * \param x eight input lanes
 * \return the approximated sigmoid of every lane
 */
float16x8_t sigmoid_ps_f16(float16x8_t x) {
    const float32x4_t lower_half = sigmoid_ps_f32(vcvt_f32_f16(vget_low_f16(x)));
    const float32x4_t upper_half =
            sigmoid_ps_f32(vcvt_f32_f16(vget_high_f16(x)));
    return vcombine_f16(vcvt_f16_f32(lower_half), vcvt_f16_f32(upper_half));
}
#endif
}
// namespace arm_common
}
// namespace megdnn
...
...
dnn/src/arm_common/elemwise/neon_mathfun.h
浏览文件 @
23c1fda7
...
...
@@ -54,11 +54,38 @@ v4sf cos_ps_f32(v4sf x);
v4sf
tan_ps_f32
(
v4sf
x
);
/*!
 * \brief Lane-wise x / y for four packed floats.
 *
 * Uses the vector divide intrinsic when MEGDNN_AARCH64 is set; otherwise the
 * quotient is formed as x times a refined reciprocal estimate of y.
 */
static inline v4sf div_ps_f32(v4sf x, v4sf y) {
#if MEGDNN_AARCH64
    return vdivq_f32(x, y);
#else
    //! armv7 does not support vdiv, so estimate the reciprocal of y and
    //! refine the estimate with one vrecpsq step before multiplying
    const float32x4_t estimate = vrecpeq_f32(y);
    const float32x4_t refined = vmulq_f32(vrecpsq_f32(y, estimate), estimate);
    return vmulq_f32(x, refined);
#endif
}
v4sf
sigmoid_ps_f32
(
v4sf
x
);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/**
 * \brief computes 8 half-precision values at once; internally it just invokes exp_ps_f32 twice
*/
float16x8_t
exp_ps_f16
(
float16x8_t
x
);
/*!
 * \brief Lane-wise x / y for eight packed half-precision values.
 *
 * Uses the vector divide intrinsic when MEGDNN_AARCH64 is set; otherwise the
 * quotient is formed as x times a refined reciprocal estimate of y.
 */
static inline float16x8_t div_ps_f16(float16x8_t x, float16x8_t y) {
#if MEGDNN_AARCH64
    return vdivq_f16(x, y);
#else
    //! armv7 does not support vdiv, so estimate the reciprocal of y and
    //! refine the estimate with one vrecpsq step before multiplying
    const float16x8_t estimate = vrecpeq_f16(y);
    const float16x8_t refined = vmulq_f16(vrecpsq_f16(y, estimate), estimate);
    return vmulq_f16(x, refined);
#endif
}
float16x8_t
sigmoid_ps_f16
(
float16x8_t
x
);
#endif
}
// namespace arm_common
...
...
dnn/src/arm_common/elemwise_helper/kimpl/fuse_add_sigmoid.h
浏览文件 @
23c1fda7
...
...
@@ -47,24 +47,14 @@ struct FuseAddSigmoidOp;
vst1q_##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
_neon_type operator()(const _neon_type& src0, const _neon_type& src1) const { \
auto zero_val = vdupq_n_##_func_suffix(0.f); \
auto one_val = vdupq_n_##_func_suffix(1.f); \
auto val1 = src0.val[0]; \
auto val2 = src0.val[1]; \
auto val3 = src1.val[0]; \
auto val4 = src1.val[1]; \
val1 = vaddq_##_func_suffix(val1, val3); \
val2 = vaddq_##_func_suffix(val2, val4); \
val1 = vsubq_##_func_suffix(zero_val, val1); \
val2 = vsubq_##_func_suffix(zero_val, val2); \
val1 = exp_ps_##_func_suffix(val1); \
val2 = exp_ps_##_func_suffix(val2); \
auto recipe1 = vaddq_##_func_suffix(one_val, val1); \
auto recipe2 = vaddq_##_func_suffix(one_val, val2); \
val1 = vrecpeq_##_func_suffix(recipe1); \
val2 = vrecpeq_##_func_suffix(recipe2); \
val1 = vmulq_##_func_suffix(vrecpsq_##_func_suffix(recipe1, val1), val1); \
val2 = vmulq_##_func_suffix(vrecpsq_##_func_suffix(recipe2, val2), val2); \
val1 = sigmoid_ps_##_func_suffix(val1); \
val2 = sigmoid_ps_##_func_suffix(val2); \
return {{val1, val2}}; \
} \
};
...
...
dnn/src/arm_common/elemwise_helper/kimpl/sigmoid.h
浏览文件 @
23c1fda7
...
...
@@ -33,34 +33,27 @@ struct SigmoidOpBase : UnaryOpBase<src_ctype, dst_ctype> {
template
<
typename
src_ctype
,
typename
dst_ctype
=
src_ctype
>
struct
SigmoidOp
;
#define OP(_ctype, _neon_type, _neon_type2, _func_suffix, _simd_width) \
template <> \
struct SigmoidOp<_ctype> : SigmoidOpBase<_ctype> { \
using SigmoidOpBase::SigmoidOpBase; \
using SigmoidOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _neon_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
vst1q_##_func_suffix(dst, vitem.val[0]); \
vst1q_##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
void operator()(const _neon_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
vst1q_##_func_suffix(dst, vitem); \
} \
_neon_type2 operator()(const _neon_type2& src) const { \
return {{operator()(src.val[0]), operator()(src.val[1])}}; \
} \
_neon_type operator()(const _neon_type& src) const { \
auto zero_val = vdupq_n_##_func_suffix(0.f); \
auto one_val = vdupq_n_##_func_suffix(1.f); \
auto val1 = vsubq_##_func_suffix(zero_val, src); \
val1 = exp_ps_##_func_suffix(val1); \
auto recipe1 = vaddq_##_func_suffix(one_val, val1); \
val1 = vrecpeq_##_func_suffix(recipe1); \
val1 = vmulq_##_func_suffix(vrecpsq_##_func_suffix(recipe1, val1), val1); \
return val1; \
} \
//! Generates the SigmoidOp specialization for one element type.
//!   _ctype       element type the specialization is declared for
//!   _neon_type   single-vector type accepted and returned by operator()
//!   _neon_type2  two-vector aggregate whose halves are reached via .val[0]
//!                and .val[1]
//!   _func_suffix suffix pasted onto vst1q_ and sigmoid_ps_ to pick the
//!                matching store and sigmoid routines
//!   _simd_width  value of SIMD_WIDTH, used as the element offset of the
//!                second store in the two-vector overload
//! The single-vector operator() forwards to sigmoid_ps_<suffix>; the
//! two-vector operator() applies it to each half; the overloads that take a
//! dst pointer compute the result and store it with vst1q_<suffix>.
#define OP(_ctype, _neon_type, _neon_type2, _func_suffix, _simd_width) \
template <> \
struct SigmoidOp<_ctype> : SigmoidOpBase<_ctype> { \
using SigmoidOpBase::SigmoidOpBase; \
using SigmoidOpBase::operator(); \
constexpr static size_t SIMD_WIDTH = _simd_width; \
void operator()(const _neon_type2& src, _ctype* dst) const { \
auto vitem = operator()(src); \
vst1q_##_func_suffix(dst, vitem.val[0]); \
vst1q_##_func_suffix(dst + SIMD_WIDTH, vitem.val[1]); \
} \
void operator()(const _neon_type& src, _ctype* dst) const { \
auto vitem = operator()(src); \
vst1q_##_func_suffix(dst, vitem); \
} \
_neon_type2 operator()(const _neon_type2& src) const { \
return {{operator()(src.val[0]), operator()(src.val[1])}}; \
} \
_neon_type operator()(const _neon_type& src) const { \
return sigmoid_ps_##_func_suffix(src); \
} \
};
OP
(
dt_float32
,
float32x4_t
,
float32x4x2_t
,
f32
,
4
)
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
...
...
imperative/python/test/unit/utils/test_network.py
浏览文件 @
23c1fda7
...
...
@@ -318,7 +318,7 @@ def test_add_remove_output():
out
=
g
.
run
(
a
.
numpy
(),
b
.
numpy
())
np
.
testing
.
assert_equal
(
out
[
"new_o1"
],
((
a
+
b
)
*
3
).
numpy
())
np
.
testing
.
assert_equal
(
out
[
"new_o2"
],
(
F
.
sigmoid
((
a
+
b
))).
numpy
())
np
.
testing
.
assert_
almost_
equal
(
out
[
"new_o2"
],
(
F
.
sigmoid
((
a
+
b
))).
numpy
())
def
test_query
():
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录