Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Xiaomi
Mace
提交
c6efac34
Mace
项目概览
Xiaomi
/
Mace
通知
106
Star
40
Fork
27
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
c6efac34
编写于
3月 18, 2019
作者:
L
liyin
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Optimize quantize and dequantize ops
上级
f0c7717e
变更
5
隐藏空白更改
内联
并排
Showing
5 changed files
with
164 additions
and
9 deletions
+164
-9
mace/core/BUILD.bazel
mace/core/BUILD.bazel
+6
-2
mace/core/runtime/cpu/cpu_runtime.cc
mace/core/runtime/cpu/cpu_runtime.cc
+24
-3
mace/python/tools/converter_tool/transformer.py
mace/python/tools/converter_tool/transformer.py
+9
-1
mace/utils/BUILD.bazel
mace/utils/BUILD.bazel
+19
-1
mace/utils/quantize.h
mace/utils/quantize.h
+106
-2
未找到文件。
mace/core/BUILD.bazel
浏览文件 @
c6efac34
...
...
@@ -10,11 +10,12 @@ licenses(["notice"]) # Apache 2.0
load
(
"//mace:mace.bzl"
,
"if_android"
,
"if_android_armv7"
,
"if_hexagon_enabled"
,
"if_not_hexagon_enabled"
,
"if_openmp_enabled"
,
"if_neon_enabled"
,
"if_not_hexagon_enabled"
,
"if_opencl_enabled"
,
"if_openmp_enabled"
,
"if_quantize_enabled"
,
)
...
...
@@ -58,6 +59,9 @@ cc_library(
"-DMACE_ENABLE_HEXAGON"
,
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon"
,
"-mfloat-abi=softfp"
,
]),
linkopts
=
[
"-ldl"
],
deps
=
[
...
...
mace/core/runtime/cpu/cpu_runtime.cc
浏览文件 @
c6efac34
...
...
@@ -40,19 +40,33 @@ struct CPUFreq {
float
freq
;
};
// Selects which OpenMP runtime schedule SetOpenMPThreadsAndAffinityCPUs
// installs through omp_set_schedule().
enum SchedulePolicy {
  // Mapped to omp_set_schedule(omp_sched_static, 0).
  SCHED_STATIC = 0,
  // Mapped to omp_set_schedule(omp_sched_guided, 1).
  SCHED_GUIDED = 1
};
namespace
{
MaceStatus
SetOpenMPThreadsAndAffinityCPUs
(
int
omp_num_threads
,
const
std
::
vector
<
size_t
>
&
cpu_ids
)
{
const
std
::
vector
<
size_t
>
&
cpu_ids
,
SchedulePolicy
schedule_policy
)
{
MaceOpenMPThreadCount
=
omp_num_threads
;
#ifdef MACE_ENABLE_OPENMP
VLOG
(
1
)
<<
"Set OpenMP threads number: "
<<
omp_num_threads
<<
", CPU core IDs: "
<<
MakeString
(
cpu_ids
);
omp_set_schedule
(
omp_sched_guided
,
1
);
if
(
schedule_policy
==
SCHED_GUIDED
)
{
omp_set_schedule
(
omp_sched_guided
,
1
);
}
else
if
(
schedule_policy
==
SCHED_STATIC
)
{
omp_set_schedule
(
omp_sched_static
,
0
);
}
else
{
LOG
(
WARNING
)
<<
"Unknown schedule policy: "
<<
schedule_policy
;
}
omp_set_num_threads
(
omp_num_threads
);
#else
MACE_UNUSED
(
omp_num_threads
);
MACE_UNUSED
(
schedule_policy
);
LOG
(
WARNING
)
<<
"Set OpenMP threads number failed: OpenMP not enabled."
;
#endif
...
...
@@ -148,6 +162,7 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
}
else
{
cores_to_use
=
num_threads_hint
;
}
MACE_CHECK
(
cores_to_use
>
0
,
"number of cores to use should > 0"
);
VLOG
(
2
)
<<
"Use "
<<
num_threads_hint
<<
" threads"
;
std
::
vector
<
size_t
>
cpu_ids
(
cores_to_use
);
...
...
@@ -156,6 +171,10 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
<<
cpu_freq
[
i
].
freq
;
cpu_ids
[
i
]
=
cpu_freq
[
i
].
core_id
;
}
SchedulePolicy
sched_policy
=
SCHED_GUIDED
;
if
(
std
::
abs
(
cpu_freq
[
0
].
freq
-
cpu_freq
[
cores_to_use
-
1
].
freq
)
<
1e-6
)
{
sched_policy
=
SCHED_STATIC
;
}
#ifdef MACE_ENABLE_QUANTIZE
if
(
gemm_context
)
{
...
...
@@ -164,7 +183,9 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
}
#endif // MACE_ENABLE_QUANTIZE
return
SetOpenMPThreadsAndAffinityCPUs
(
num_threads_hint
,
cpu_ids
);
return
SetOpenMPThreadsAndAffinityCPUs
(
num_threads_hint
,
cpu_ids
,
sched_policy
);
}
}
// namespace mace
...
...
mace/python/tools/converter_tool/transformer.py
浏览文件 @
c6efac34
...
...
@@ -1942,13 +1942,21 @@ class Transformer(base_converter.ConverterInterface):
continue
quantized_inputs_names
=
[]
should_quantize
=
False
has_const
=
False
for
idx
,
input_tensor
in
enumerate
(
op
.
input
):
if
input_tensor
in
self
.
_consts
:
has_const
=
True
break
if
not
has_const
:
continue
for
idx
,
input_tensor
in
enumerate
(
op
.
input
):
if
self
.
get_tensor_data_type
(
input_tensor
)
\
==
mace_pb2
.
DT_FLOAT
:
should_quantize
=
True
break
if
not
should_quantize
:
continue
else
:
...
...
mace/utils/BUILD.bazel
浏览文件 @
c6efac34
...
...
@@ -7,6 +7,14 @@ package(
licenses
([
"notice"
])
# Apache 2.0
load
(
"//mace:mace.bzl"
,
"if_android"
,
"if_android_armv7"
,
"if_neon_enabled"
,
"if_openmp_enabled"
,
)
cc_library
(
name
=
"utils_hdrs"
,
hdrs
=
glob
([
...
...
@@ -37,7 +45,17 @@ cc_library(
"-Werror"
,
"-Wextra"
,
"-Wno-missing-field-initializers"
,
],
]
+
if_openmp_enabled
([
"-fopenmp"
,
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon"
,
"-mfloat-abi=softfp"
,
]),
linkopts
=
if_android
([
"-llog"
,
]),
deps
=
[
":utils_hdrs"
,
],
...
...
mace/utils/quantize.h
浏览文件 @
c6efac34
...
...
@@ -19,6 +19,10 @@
#include <cmath>
#include <limits>
#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif // MACE_ENABLE_NEON
#include "mace/utils/logging.h"
namespace
mace
{
...
...
@@ -156,6 +160,106 @@ inline void Dequantize(const T *input,
}
}
#if defined(MACE_ENABLE_NEON)
// NEON specialization of QuantizeWithScaleAndZeropoint for uint8_t output.
//
// For every i in [0, size) this writes
//   output[i] = saturate_to_uint8(round(zero_point + input[i] / scale)).
//
//   input       float values to quantize (at least `size` elements are read)
//   size        number of elements to process
//   scale       quantization step; only its reciprocal is used
//   zero_point  offset added to the scaled value before rounding
//   output      receives `size` quantized bytes
template<>
inline void QuantizeWithScaleAndZeropoint<uint8_t>(const float *input,
                                                   const index_t size,
                                                   float scale,
                                                   int32_t zero_point,
                                                   uint8_t *output) {
  // The 0.5 rounding bias is folded into the zero-point vector so a single
  // multiply-accumulate yields "zero_point + x / scale + 0.5". The float->int
  // conversion below rounds toward zero (see the Arm intrinsics reference),
  // which gives round-half-up for non-negative values; negative values are
  // clamped to 0 by the saturating narrowing steps anyway.
  const float32x4_t vround = vdupq_n_f32(0.5);
  const float32x4_t vzero =
      vaddq_f32(vround, vcvtq_f32_s32(vdupq_n_s32(zero_point)));
  const float recip_scale = 1.f / scale;
  const float32x4_t vrecip_scale = vdupq_n_f32(recip_scale);
  // Each vector iteration consumes 16 floats and produces 16 bytes.
  const index_t block_count = size / 16;

#pragma omp parallel for schedule(runtime)
  for (index_t i = 0; i < block_count; ++i) {
    const float *in_block = input + i * 16;

    float32x4_t vi0 = vld1q_f32(in_block);
    float32x4_t vi1 = vld1q_f32(in_block + 4);
    float32x4_t vi2 = vld1q_f32(in_block + 8);
    float32x4_t vi3 = vld1q_f32(in_block + 12);

    int32x4_t vo0_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi0, vrecip_scale));
    int32x4_t vo1_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi1, vrecip_scale));
    int32x4_t vo2_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi2, vrecip_scale));
    int32x4_t vo3_s32 = vcvtq_s32_f32(vmlaq_f32(vzero, vi3, vrecip_scale));

    // Saturating narrow: s32 -> s16 -> u8.
    uint8x8_t vo0_u8 = vqmovun_s16(
        vcombine_s16(vqmovn_s32(vo0_s32), vqmovn_s32(vo1_s32)));
    uint8x8_t vo1_u8 = vqmovun_s16(
        vcombine_s16(vqmovn_s32(vo2_s32), vqmovn_s32(vo3_s32)));

    vst1q_u8(output + i * 16, vcombine_u8(vo0_u8, vo1_u8));
  }

  // Scalar tail for the remaining (size % 16) elements. It is intentionally
  // run serially: with at most 15 iterations, opening an OpenMP parallel
  // region would cost more than the work itself. This also matches the
  // serial tails of the NEON Dequantize specializations in this file.
  for (index_t i = block_count * 16; i < size; ++i) {
    output[i] =
        Saturate<uint8_t>(roundf(zero_point + recip_scale * input[i]));
  }
}
// NEON specialization of Dequantize for int32_t input.
//
// Writes output[i] = scale * (input[i] - zero_point) for every i in
// [0, size). Full groups of four elements go through NEON; whatever is left
// over is handled by a scalar loop.
template<>
inline void Dequantize<int32_t>(const int32_t *input,
                                const index_t size,
                                const float scale,
                                const int32_t zero_point,
                                float *output) {
  const float32x4_t scale_x4 = vdupq_n_f32(scale);
  const int32x4_t zero_point_x4 = vdupq_n_s32(zero_point);
  const index_t vector_count = size / 4;
  const index_t tail_begin = vector_count * 4;

#pragma omp parallel for schedule(runtime)
  for (index_t v = 0; v < vector_count; ++v) {
    const index_t offset = v * 4;
    const int32x4_t quantized = vld1q_s32(input + offset);
    const int32x4_t centered = vsubq_s32(quantized, zero_point_x4);
    const float32x4_t real = vmulq_f32(scale_x4, vcvtq_f32_s32(centered));
    vst1q_f32(output + offset, real);
  }

  // Scalar remainder (size % 4 elements).
  for (index_t k = tail_begin; k < size; ++k) {
    output[k] = scale * (input[k] - zero_point);
  }
}
// NEON specialization of Dequantize for uint8_t input.
//
// Writes output[i] = scale * (input[i] - zero_point) for every i in
// [0, size). Sixteen bytes are loaded per vector iteration, widened in two
// steps (u8 -> u16 -> u32), reinterpreted as s32, shifted by the zero point,
// converted to float and scaled. The remaining size % 16 elements are
// processed by a scalar loop.
template<>
inline void Dequantize<uint8_t>(const uint8_t *input,
                                const index_t size,
                                const float scale,
                                const int32_t zero_point,
                                float *output) {
  const float32x4_t scale_x4 = vdupq_n_f32(scale);
  const int32x4_t zero_point_x4 = vdupq_n_s32(zero_point);
  const index_t vector_count = size / 16;
  const index_t tail_begin = vector_count * 16;

#pragma omp parallel for schedule(runtime)
  for (index_t v = 0; v < vector_count; ++v) {
    const index_t base = v * 16;
    const uint8x16_t packed = vld1q_u8(input + base);

    // u8 -> u16: lower and upper eight bytes.
    const uint16x8_t lo_u16 = vmovl_u8(vget_low_u8(packed));
    const uint16x8_t hi_u16 = vmovl_u8(vget_high_u8(packed));

    // u16 -> u32, viewed as s32 so the zero point can be subtracted.
    const int32x4_t q0 =
        vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(lo_u16)));
    const int32x4_t q1 =
        vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(lo_u16)));
    const int32x4_t q2 =
        vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(hi_u16)));
    const int32x4_t q3 =
        vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(hi_u16)));

    const float32x4_t r0 =
        vmulq_f32(scale_x4, vcvtq_f32_s32(vsubq_s32(q0, zero_point_x4)));
    const float32x4_t r1 =
        vmulq_f32(scale_x4, vcvtq_f32_s32(vsubq_s32(q1, zero_point_x4)));
    const float32x4_t r2 =
        vmulq_f32(scale_x4, vcvtq_f32_s32(vsubq_s32(q2, zero_point_x4)));
    const float32x4_t r3 =
        vmulq_f32(scale_x4, vcvtq_f32_s32(vsubq_s32(q3, zero_point_x4)));

    float *dst = output + base;
    vst1q_f32(dst, r0);
    vst1q_f32(dst + 4, r1);
    vst1q_f32(dst + 8, r2);
    vst1q_f32(dst + 12, r3);
  }

  // Scalar remainder (size % 16 elements).
  for (index_t k = tail_begin; k < size; ++k) {
    output[k] = scale * (input[k] - zero_point);
  }
}
#endif // MACE_ENABLE_NEON
template
<
typename
T
>
inline
void
DeQuantize
(
const
Tensor
&
input
,
const
float
min_in
,
...
...
@@ -175,8 +279,8 @@ inline void DeQuantize(const Tensor &input,
}
inline
void
QuantizeMultiplier
(
double
multiplier
,
int32_t
*
output_multiplier
,
int32_t
*
shift
)
{
int32_t
*
output_multiplier
,
int32_t
*
shift
)
{
const
double
q
=
std
::
frexp
(
multiplier
,
shift
);
auto
qint
=
static_cast
<
int64_t
>
(
roundl
(
q
*
(
1ll
<<
31
)));
if
(
qint
==
(
1ll
<<
31
))
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录