Xiaomi / Mace — Commit d4f1368d

Add quantized eltwise sub

Authored Feb 21, 2019 by Bin Li
Parent: f2f05c0d
Showing 14 changed files with 338 additions and 107 deletions (+338 −107):
mace/ops/BUILD                                       +5    -5
mace/ops/arm/q8/eltwise.cc                           +157  -0
mace/ops/arm/q8/eltwise.h                            +48   -0
mace/ops/common/eltwise_type.h                       +40   -0
mace/ops/common/gemmlowp_util.h                      +3    -3
mace/ops/conv_2d.cc                                  +1    -1
mace/ops/eltwise.cc                                  +34   -55
mace/ops/eltwise.h                                   +2    -16
mace/ops/eltwise_benchmark.cc                        +15   -6
mace/ops/eltwise_test.cc                             +9    -6
mace/ops/matmul.cc                                   +1    -1
mace/ops/softmax.cc                                  +1    -1
mace/python/tools/converter_tool/transformer.py      +20   -8
mace/utils/quantize.h                                +2    -5
mace/ops/BUILD

@@ -84,7 +84,7 @@ cc_library(
     ]),
     deps = [
         "//mace/core",
-        "@gtest//:gtest",
+        "@gtest",
     ],
 )

@@ -254,7 +254,7 @@ cc_library(
         ":arm_neon_kernels",
         ":ref_kernels",
         ":testing",
-        "@gtest//:gtest",
+        "@gtest",
     ],
     alwayslink = 1,
 )

@@ -289,7 +289,7 @@ cc_library(
         ":opencl_kernels",
         ":ref_kernels",
         ":testing",
-        "@gtest//:gtest",
+        "@gtest",
     ],
     alwayslink = 1,
 )
@@ -329,12 +329,12 @@ cc_library(
         "ops_registry.h",
         "ops_test_util.h",
-        "fixpoint.h",
-        "gemmlowp_util.h",
-        "quantization_util.h",
-    ],
-)
+    ] + if_quantize_enabled(glob([
+        "fixpoint.h",
+        "common/gemmlowp_util.h",
+        "quantization_util.h",
+    ])),
     copts = [
mace/ops/arm/q8/eltwise.cc  (new file, mode 100644)

// Copyright 2019 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "mace/ops/arm/q8/eltwise.h"

#include <arm_neon.h>
#include <algorithm>

#include "mace/ops/common/gemmlowp_util.h"
#include "mace/utils/logging.h"

namespace mace {
namespace ops {
namespace arm {
namespace q8 {

MaceStatus Eltwise::Compute(const OpContext *context,
                            const Tensor *input0,
                            const Tensor *input1,
                            Tensor *output) {
  MACE_UNUSED(context);
  MACE_CHECK(type_ == SUM || type_ == SUB,
             "Quantized Elementwise only support SUM and SUB now.");
  constexpr int left_shift = 20;
  const double doubled_scale = 2 * std::max(input0->scale(), input1->scale());
  const double adjusted_input0_scale = input0->scale() / doubled_scale;
  const double adjusted_input1_scale = input1->scale() / doubled_scale;
  const double adjusted_output_scale =
      doubled_scale / ((1 << left_shift) * output->scale());

  int32_t input0_multiplier;
  int32_t input1_multiplier;
  int32_t output_multiplier;
  int32_t input0_shift;
  int32_t input1_shift;
  int32_t output_shift;
  QuantizeMultiplier(adjusted_input0_scale, &input0_multiplier, &input0_shift);
  QuantizeMultiplier(adjusted_input1_scale, &input1_multiplier, &input1_shift);
  QuantizeMultiplier(adjusted_output_scale, &output_multiplier, &output_shift);

  Tensor::MappingGuard input0_guard(input0);
  Tensor::MappingGuard input1_guard(input1);
  Tensor::MappingGuard output_guard(output);

  auto input0_ptr = input0->data<uint8_t>();
  auto input1_ptr = input1->data<uint8_t>();
  auto output_ptr = output->mutable_data<uint8_t>();

#pragma omp parallel for schedule(runtime)
  for (index_t i = 0; i <= output->size() - 8; i += 8) {
    const auto input0_val = vld1_u8(input0_ptr + i);
    const auto input1_val = vld1_u8(input1_ptr + i);
    const auto input0_val_s16 = vreinterpretq_s16_u16(vmovl_u8(input0_val));
    const auto input1_val_s16 = vreinterpretq_s16_u16(vmovl_u8(input1_val));
    const auto offset_input0 =
        vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point()));
    const auto offset_input1 =
        vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point()));
    auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0));
    auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0));
    auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1));
    auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1));
    const auto left_shift_dup = vdupq_n_s32(left_shift);
    input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup);
    input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup);
    input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup);
    input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup);
    input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier);
    input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier);
    input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier);
    input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier);
    const auto input0_shift_dup = vdupq_n_s32(input0_shift);
    const auto input1_shift_dup = vdupq_n_s32(input1_shift);
    input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup);
    input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup);
    input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup);
    input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup);
    int32x4_t res_low, res_high;
    if (type_ == SUM) {
      res_low = vaddq_s32(input0_low_s32, input1_low_s32);
      res_high = vaddq_s32(input0_high_s32, input1_high_s32);
    } else {
      res_low = vsubq_s32(input0_low_s32, input1_low_s32);
      res_high = vsubq_s32(input0_high_s32, input1_high_s32);
    }
    res_low = vqrdmulhq_n_s32(res_low, output_multiplier);
    res_high = vqrdmulhq_n_s32(res_high, output_multiplier);
    res_low = gemmlowp::RoundingDivideByPOT(res_low, -output_shift);
    res_high = gemmlowp::RoundingDivideByPOT(res_high, -output_shift);
    const auto res_low_s16 = vmovn_s32(res_low);
    const auto res_high_s16 = vmovn_s32(res_high);
    const auto output_val = vaddq_s16(vcombine_s16(res_low_s16, res_high_s16),
                                      vdupq_n_s16(output->zero_point()));
    vst1_u8(output_ptr + i, vqmovun_s16(output_val));
  }

  index_t handled_output_size = output->size() - output->size() % 8;
#pragma omp parallel for schedule(runtime)
  for (index_t i = handled_output_size; i < output->size(); ++i) {
    const int32_t offset_input0 = input0_ptr[i] - input0->zero_point();
    const int32_t offset_input1 = input1_ptr[i] - input1->zero_point();
    const int32_t shifted_input0 = offset_input0 * (1 << left_shift);
    const int32_t shifted_input1 = offset_input1 * (1 << left_shift);
    const int32_t multiplied_input0 = gemmlowp::RoundingDivideByPOT(
        gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input0,
                                                    input0_multiplier),
        -input0_shift);
    const int32_t multiplied_input1 = gemmlowp::RoundingDivideByPOT(
        gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1,
                                                    input1_multiplier),
        -input1_shift);
    int32_t res;
    if (type_ == SUM) {
      res = multiplied_input0 + multiplied_input1;
    } else {
      res = multiplied_input0 - multiplied_input1;
    }
    const int32_t output_val = gemmlowp::RoundingDivideByPOT(
        gemmlowp::SaturatingRoundingDoublingHighMul(res, output_multiplier),
        -output_shift) + output->zero_point();
    output_ptr[i] = Saturate<uint8_t>(output_val);
  }

  return MaceStatus::MACE_SUCCESS;
}

}  // namespace q8
}  // namespace arm
}  // namespace ops
}  // namespace mace
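The kernel never touches floating point on the data path: both inputs are rescaled to the shared scale 2·max(s0, s1) with 20 bits of headroom (left_shift), added or subtracted in 32-bit fixed point, then requantized to the output scale. As a reference for what that pipeline computes, here is the same dequantize-combine-requantize round trip in plain floats (a minimal sketch for illustration only; QParams and EltwiseRef are hypothetical names, not MACE API):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Affine quantization: real = scale * (quantized - zero_point).
struct QParams { float scale; int32_t zero_point; };

uint8_t EltwiseRef(uint8_t a, QParams qa, uint8_t b, QParams qb,
                   QParams qo, bool is_sub) {
  const float real_a = qa.scale * (a - qa.zero_point);  // dequantize
  const float real_b = qb.scale * (b - qb.zero_point);
  const float real_out = is_sub ? real_a - real_b : real_a + real_b;
  const int32_t q = static_cast<int32_t>(std::lround(real_out / qo.scale))
                    + qo.zero_point;                    // requantize
  return static_cast<uint8_t>(std::min(255, std::max(0, q)));  // saturate
}

int main() {
  const QParams qa{0.5f, 128}, qb{0.25f, 100}, qo{1.0f, 128};
  // real_a = 36.0, real_b = -12.5, a - b = 48.5 -> round(48.5) + 128 = 177
  std::printf("%d\n", EltwiseRef(200, qa, 50, qb, qo, /*is_sub=*/true));
}

The fixed-point path replaces the divisions by scale with precomputed Q31 multipliers (via QuantizeMultiplier, see mace/utils/quantize.h below), so the inner loop runs entirely on vqrdmulhq_n_s32 and shifts.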
mace/ops/arm/q8/eltwise.h  (new file, mode 100644)

// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// This implements matrix-vector multiplication described as
// https://github.com/google/gemmlowp/blob/master/todo/fast-gemv.txt

#ifndef MACE_OPS_ARM_Q8_ELTWISE_H_
#define MACE_OPS_ARM_Q8_ELTWISE_H_

#include "mace/core/op_context.h"
#include "mace/core/types.h"
#include "mace/ops/common/eltwise_type.h"

namespace mace {
namespace ops {
namespace arm {
namespace q8 {

class Eltwise {
 public:
  explicit Eltwise(const EltwiseType type) : type_(type) {}

  MaceStatus Compute(const OpContext *context,
                     const Tensor *input0,
                     const Tensor *input1,
                     Tensor *output);

 private:
  EltwiseType type_;
};

}  // namespace q8
}  // namespace arm
}  // namespace ops
}  // namespace mace

#endif  // MACE_OPS_ARM_Q8_ELTWISE_H_
mace/ops/common/eltwise_type.h  (new file, mode 100644)

// Copyright 2018 The MACE Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_OPS_COMMON_ELTWISE_TYPE_H_
#define MACE_OPS_COMMON_ELTWISE_TYPE_H_

namespace mace {
namespace ops {

enum EltwiseType {
  SUM = 0,
  SUB = 1,
  PROD = 2,
  DIV = 3,
  MIN = 4,
  MAX = 5,
  NEG = 6,
  ABS = 7,
  SQR_DIFF = 8,
  POW = 9,
  EQUAL = 10,
  FLOOR_DIV = 11,
  NONE = 12,
};

}  // namespace ops
}  // namespace mace

#endif  // MACE_OPS_COMMON_ELTWISE_TYPE_H_
mace/ops/gemmlowp_util.h → mace/ops/common/gemmlowp_util.h  (renamed)

@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#ifndef MACE_OPS_GEMMLOWP_UTIL_H_
-#define MACE_OPS_GEMMLOWP_UTIL_H_
+#ifndef MACE_OPS_COMMON_GEMMLOWP_UTIL_H_
+#define MACE_OPS_COMMON_GEMMLOWP_UTIL_H_

 #include <tuple>

@@ -75,4 +75,4 @@ struct GemmlowpOutputPipeline {
 };

 }  // namespace mace
-#endif  // MACE_OPS_GEMMLOWP_UTIL_H_
+#endif  // MACE_OPS_COMMON_GEMMLOWP_UTIL_H_
mace/ops/conv_2d.cc

@@ -41,7 +41,7 @@
 #endif  // MACE_ENABLE_NEON

 #ifdef MACE_ENABLE_QUANTIZE
-#include "mace/ops/gemmlowp_util.h"
+#include "mace/ops/common/gemmlowp_util.h"
 #include "mace/ops/quantization_util.h"
 #endif  // MACE_ENABLE_QUANTIZE
mace/ops/eltwise.cc

@@ -12,6 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#ifdef MACE_ENABLE_NEON
+#ifdef MACE_ENABLE_QUANTIZE
+#include "mace/ops/arm/q8/eltwise.h"
+#endif  // MACE_ENABLE_QUANTIZE
+#endif  // MACE_ENABLE_NEON
+
 #include "mace/ops/eltwise.h"

 #include <algorithm>
@@ -1035,19 +1041,30 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
         scalar_input_index_(Operation::GetOptionalArg<int32_t>(
             "scalar_input_index", 1)),
         data_format_(static_cast<DataFormat>(Operation::GetOptionalArg<int>(
-            "data_format", 0))) {}
+            "data_format", 0)))
+#ifdef MACE_ENABLE_NEON
+        , eltwise_(static_cast<ops::EltwiseType>(
+              Operation::GetOptionalArg<int>(
+                  "type", static_cast<int>(ops::EltwiseType::NONE))))
+#endif
+  {}

   MaceStatus Run(OpContext *context) override {
     MACE_UNUSED(context);
     const Tensor *input0 = this->Input(0);
-    const Tensor *input1 = this->InputSize() == 2 ? this->Input(1) : nullptr;
+    MACE_CHECK(this->InputSize() == 2,
+               "Quantized Elementwise don't support broadcast now.");
+    const Tensor *input1 = this->Input(1);
     Tensor *output = this->Output(0);
-    MACE_CHECK(type_ == SUM, "Only support Elementwise SUM now. ");
+    MACE_CHECK(type_ == SUM || type_ == SUB,
+               "Quantized Elementwise only support SUM and SUB now.");
     MACE_CHECK(input0->size() == input1->size(),
                "input0 and input1 must have the same shape.");
     MACE_CHECK(output->scale() != 0);
     MACE_RETURN_IF_ERROR(output->Resize(input0->shape()));

+#ifdef MACE_ENABLE_NEON
+    eltwise_.Compute(context, input0, input1, output);
+#else
     constexpr int left_shift = 20;
     const double doubled_scale = 2 * std::max(input0->scale(),
                                               input1->scale());
     const double adjusted_input0_scale = input0->scale() / doubled_scale;
@@ -1078,57 +1095,8 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
     auto input0_ptr = input0->data<uint8_t>();
     auto input1_ptr = input1->data<uint8_t>();
     auto output_ptr = output->mutable_data<uint8_t>();

-    index_t handled_output_size = 0;
-#ifdef MACE_ENABLE_NEON
-#pragma omp parallel for schedule(runtime)
-    for (index_t i = handled_output_size; i <= output->size() - 8; i += 8) {
-      const auto input0_val = vld1_u8(input0_ptr + i);
-      const auto input1_val = vld1_u8(input1_ptr + i);
-      const auto input0_val_s16 = vreinterpretq_s16_u16(vmovl_u8(input0_val));
-      const auto input1_val_s16 = vreinterpretq_s16_u16(vmovl_u8(input1_val));
-      const auto offset_input0 =
-          vaddq_s16(input0_val_s16, vdupq_n_s16(-input0->zero_point()));
-      const auto offset_input1 =
-          vaddq_s16(input1_val_s16, vdupq_n_s16(-input1->zero_point()));
-      auto input0_low_s32 = vmovl_s16(vget_low_s16(offset_input0));
-      auto input0_high_s32 = vmovl_s16(vget_high_s16(offset_input0));
-      auto input1_low_s32 = vmovl_s16(vget_low_s16(offset_input1));
-      auto input1_high_s32 = vmovl_s16(vget_high_s16(offset_input1));
-      const auto left_shift_dup = vdupq_n_s32(left_shift);
-      input0_low_s32 = vshlq_s32(input0_low_s32, left_shift_dup);
-      input0_high_s32 = vshlq_s32(input0_high_s32, left_shift_dup);
-      input1_low_s32 = vshlq_s32(input1_low_s32, left_shift_dup);
-      input1_high_s32 = vshlq_s32(input1_high_s32, left_shift_dup);
-      input0_low_s32 = vqrdmulhq_n_s32(input0_low_s32, input0_multiplier);
-      input0_high_s32 = vqrdmulhq_n_s32(input0_high_s32, input0_multiplier);
-      input1_low_s32 = vqrdmulhq_n_s32(input1_low_s32, input1_multiplier);
-      input1_high_s32 = vqrdmulhq_n_s32(input1_high_s32, input1_multiplier);
-      const auto input0_shift_dup = vdupq_n_s32(input0_shift);
-      const auto input1_shift_dup = vdupq_n_s32(input1_shift);
-      input0_low_s32 = vshlq_s32(input0_low_s32, input0_shift_dup);
-      input0_high_s32 = vshlq_s32(input0_high_s32, input0_shift_dup);
-      input1_low_s32 = vshlq_s32(input1_low_s32, input1_shift_dup);
-      input1_high_s32 = vshlq_s32(input1_high_s32, input1_shift_dup);
-      auto sum_low = vaddq_s32(input0_low_s32, input1_low_s32);
-      auto sum_high = vaddq_s32(input0_high_s32, input1_high_s32);
-      sum_low = vqrdmulhq_n_s32(sum_low, output_multiplier);
-      sum_high = vqrdmulhq_n_s32(sum_high, output_multiplier);
-      sum_low = gemmlowp::RoundingDivideByPOT(sum_low, -output_shift);
-      sum_high = gemmlowp::RoundingDivideByPOT(sum_high, -output_shift);
-      const auto sum_low_s16 = vmovn_s32(sum_low);
-      const auto sum_high_s16 = vmovn_s32(sum_high);
-      const auto output_val =
-          vaddq_s16(vcombine_s16(sum_low_s16, sum_high_s16),
-                    vdupq_n_s16(output->zero_point()));
-      vst1_u8(output_ptr + i, vqmovun_s16(output_val));
-    }
-    handled_output_size = output->size() - output->size() % 8;
-#endif  // NEON
-#pragma omp parallel for schedule(runtime)
-    for (index_t i = handled_output_size; i < output->size(); ++i) {
+    for (index_t i = 0; i < output->size(); ++i) {
       const int32_t offset_input0 = input0_ptr[i] - input0->zero_point();
       const int32_t offset_input1 = input1_ptr[i] - input1->zero_point();
       const int32_t shifted_input0 = offset_input0 * (1 << left_shift);
@@ -1143,14 +1111,22 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
          gemmlowp::SaturatingRoundingDoublingHighMul(shifted_input1,
                                                      input1_multiplier),
          -input1_shift);
-      const int32_t sum = multiplied_input0 + multiplied_input1;
+      int32_t res;
+      if (type_ == SUM) {
+        res = multiplied_input0 + multiplied_input1;
+      } else {
+        res = multiplied_input0 - multiplied_input1;
+      }
       const int32_t output_val = gemmlowp::RoundingDivideByPOT(
-          gemmlowp::SaturatingRoundingDoublingHighMul(sum,
+          gemmlowp::SaturatingRoundingDoublingHighMul(res,
                                                       output_multiplier),
           -output_shift) + output->zero_point();
       output_ptr[i] = Saturate<uint8_t>(output_val);
     }
+#endif  // NEON

     return MaceStatus::MACE_SUCCESS;
   }
@@ -1162,6 +1138,9 @@ class EltwiseOp<DeviceType::CPU, uint8_t> : public Operation {
   int32_t scalar_input_index_;
   DataFormat data_format_;
   Tensor scalar_tensor_;
+#ifdef MACE_ENABLE_NEON
+  arm::q8::Eltwise eltwise_;
+#endif
 };

 #endif  // MACE_ENABLE_QUANTIZE
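Both the new arm/q8 kernel and the inline block deleted above share one iteration pattern: a main loop that consumes 8 lanes per step, then handled_output_size = size - size % 8 hands the 0-7 leftover elements to a scalar tail. The same skeleton stripped of intrinsics (a sketch; AddVec8 is an illustrative name, not MACE code):

#include <cstdint>
#include <cstdio>

void AddVec8(const uint8_t *a, const uint8_t *b, uint8_t *out, int64_t n) {
  int64_t i = 0;
  // "Vector" body: 8 lanes per iteration (vld1_u8/vst1_u8 territory).
  for (; i + 8 <= n; i += 8) {
    for (int lane = 0; lane < 8; ++lane) {
      out[i + lane] = static_cast<uint8_t>(a[i + lane] + b[i + lane]);
    }
  }
  // Scalar tail: the n % 8 elements the vector loop could not cover.
  for (; i < n; ++i) {
    out[i] = static_cast<uint8_t>(a[i] + b[i]);
  }
}

int main() {
  uint8_t a[11], b[11], out[11];
  for (int i = 0; i < 11; ++i) {
    a[i] = static_cast<uint8_t>(i);
    b[i] = static_cast<uint8_t>(2 * i);
  }
  AddVec8(a, b, out, 11);
  std::printf("%d %d\n", out[0], out[10]);  // 0 30
}

The MACE loops write the bound as i <= output->size() - 8, which is the same condition as i + 8 <= n for the signed index_t.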
mace/ops/eltwise.h

@@ -15,25 +15,11 @@
 #ifndef MACE_OPS_ELTWISE_H_
 #define MACE_OPS_ELTWISE_H_

+#include "mace/ops/common/eltwise_type.h"
+
 namespace mace {
 namespace ops {

-enum EltwiseType {
-  SUM = 0,
-  SUB = 1,
-  PROD = 2,
-  DIV = 3,
-  MIN = 4,
-  MAX = 5,
-  NEG = 6,
-  ABS = 7,
-  SQR_DIFF = 8,
-  POW = 9,
-  EQUAL = 10,
-  FLOOR_DIV = 11,
-  NONE = 12,
-};
-
 inline bool IsLogicalType(EltwiseType type) { return type == EQUAL; }

 }  // namespace ops
mace/ops/eltwise_benchmark.cc

@@ -30,12 +30,12 @@ void EltwiseBenchmark(
   OpsTestNet net;

   // Add input data
-  if (D == DeviceType::GPU) {
+  if (D == DeviceType::CPU && DataTypeToEnum<T>::value != DT_UINT8) {
+    net.AddRandomInput<D, T>("Input0", {n, c, h, w});
+    net.AddRandomInput<D, T>("Input1", {n, c, h, w});
+  } else {
     net.AddRandomInput<D, T>("Input0", {n, h, w, c});
     net.AddRandomInput<D, T>("Input1", {n, h, w, c});
-  } else {
-    net.AddRandomInput<D, T>("Input0", {n, c, h, w});
-    net.AddRandomInput<D, T>("Input1", {n, c, h, w});
   }

   OpDefBuilder("Eltwise", "EltwiseTest")

@@ -47,15 +47,21 @@ void EltwiseBenchmark(
       .Output("Output")
       .Finalize(net.NewOperatorDef());

+  net.Setup(D);
+
+  if (D == DeviceType::CPU && DataTypeToEnum<T>::value == DT_UINT8) {
+    net.GetTensor("Output")->SetScale(0.1);
+  }
+
   // Warm-up
   for (int i = 0; i < 5; ++i) {
-    net.RunOp(D);
+    net.Run();
     net.Sync();
   }

   mace::testing::StartTiming();
   while (iters--) {
-    net.RunOp(D);
+    net.Run();
     net.Sync();
   }
 }

@@ -86,6 +92,9 @@ MACE_BM_ELTWISE(0, 1, 240, 240, 256);
 MACE_BM_ELTWISE(5, 1, 128, 128, 32);
 MACE_BM_ELTWISE(5, 1, 240, 240, 256);

+MACE_BM_ELTWISE_MACRO(0, 1, 128, 128, 32, uint8_t, CPU);
+MACE_BM_ELTWISE_MACRO(1, 1, 128, 128, 32, uint8_t, CPU);
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
mace/ops/eltwise_test.cc

@@ -729,7 +729,8 @@ void RandomTensorEltwise(const ops::EltwiseType type,
   }
 }

-void QuantizedSum(const std::vector<index_t> &shape) {
+void Quantized(const std::vector<index_t> &shape,
+               const ops::EltwiseType type) {
   // Construct graph
   OpsTestNet net;

@@ -753,7 +754,7 @@ void QuantizedSum(const std::vector<index_t> &shape) {
   OpDefBuilder("Eltwise", "EltwiseTest")
       .Input("TInput0")
       .Input("TInput1")
-      .AddIntArg("type", static_cast<int>(ops::EltwiseType::SUM))
+      .AddIntArg("type", static_cast<int>(type))
       .AddIntArg("data_format", DataFormat::NCHW)
       .Output("TOutput")
       .Finalize(net.NewOperatorDef());

@@ -794,7 +795,7 @@ void QuantizedSum(const std::vector<index_t> &shape) {
       .Input("QuantizedInput0")
       .Input("QuantizedInput1")
      .Output("QuantizedOutput")
-      .AddIntArg("type", static_cast<int>(ops::EltwiseType::SUM))
+      .AddIntArg("type", static_cast<int>(type))
       .AddIntArg("T", static_cast<int>(DT_UINT8))
       .Finalize(net.NewOperatorDef());

   net.Setup(DeviceType::CPU);

@@ -1009,9 +1010,11 @@ TEST_F(EltwiseOpTest, TensorGeneralBroadcastGPU) {
       {1, 1, 2, 1}, {2, 3}, {1, 1, 2, 5},
       {4, 1, 0, 1, 4, 4, 9, 16, 25, 36});
 }

-TEST_F(EltwiseOpTest, QuantizedSum) {
-  QuantizedSum({1, 32, 32, 16});
-  QuantizedSum({1, 31, 31, 17});
+TEST_F(EltwiseOpTest, Quantized) {
+  Quantized({1, 32, 32, 16}, ops::EltwiseType::SUM);
+  Quantized({1, 31, 31, 17}, ops::EltwiseType::SUM);
+  Quantized({1, 32, 32, 16}, ops::EltwiseType::SUB);
+  Quantized({1, 31, 31, 17}, ops::EltwiseType::SUB);
 }

 }  // namespace test
mace/ops/matmul.cc

@@ -38,7 +38,7 @@
 #endif  // MACE_ENABLE_NEON

 #ifdef MACE_ENABLE_QUANTIZE
-#include "mace/ops/gemmlowp_util.h"
+#include "mace/ops/common/gemmlowp_util.h"
 #endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
mace/ops/softmax.cc

@@ -22,7 +22,7 @@
 #ifdef MACE_ENABLE_QUANTIZE
 #include "mace/ops/fixpoint.h"
-#include "mace/ops/gemmlowp_util.h"
+#include "mace/ops/common/gemmlowp_util.h"
 #endif  // MACE_ENABLE_QUANTIZE

 #ifdef MACE_ENABLE_OPENCL
mace/python/tools/converter_tool/transformer.py

@@ -1423,8 +1423,9 @@ class Transformer(base_converter.ConverterInterface):
             else:
                 mace_check(op.type == MaceOp.Quantize.name,
                            "Quantization only support float ops, "
-                           "but get %s(%s)"
-                           % (op.name, op.type))
+                           "but get %s(%s, %s)"
+                           % (op.name, op.type,
+                              mace_pb2.DataType.Name(data_type_arg.i)))

         for input_node in self._option.input_nodes.values():
             new_input_name = self.input_name_map[input_node.name]

@@ -1725,18 +1726,29 @@ class Transformer(base_converter.ConverterInterface):
                 self.add_quantize_info(op, 0.0, 1.0)
                 self._quantize_activation_info[op.output[0]] = quantize_info
             elif (op.type == MaceOp.Eltwise.name
-                  and ConverterUtil.get_arg(op, MaceKeyword.mace_element_type_str).i == EltwiseType.SUM.value  # noqa
                   and not op.quantize_info
                   and len(op.input) == 2
                   and len(op.input[0]) not in self._consts
                   and len(op.input[1]) not in self._consts):
                 del op.quantize_info[:]
                 producer_op0 = self._producer[op.input[0]]
                 producer_op1 = self._producer[op.input[1]]
-                minval = producer_op0.quantize_info[0].minval \
-                    + producer_op1.quantize_info[0].minval
-                maxval = producer_op0.quantize_info[0].maxval \
-                    + producer_op1.quantize_info[0].maxval
+                if ConverterUtil.get_arg(
+                        op, MaceKeyword.mace_element_type_str).i \
+                        == EltwiseType.SUM.value:
+                    minval = producer_op0.quantize_info[0].minval \
+                        + producer_op1.quantize_info[0].minval
+                    maxval = producer_op0.quantize_info[0].maxval \
+                        + producer_op1.quantize_info[0].maxval
+                elif ConverterUtil.get_arg(
+                        op, MaceKeyword.mace_element_type_str).i \
+                        == EltwiseType.SUB.value:
+                    minval = producer_op0.quantize_info[0].minval \
+                        - producer_op1.quantize_info[0].maxval
+                    maxval = producer_op0.quantize_info[0].maxval \
+                        - producer_op1.quantize_info[0].minval
+                else:
+                    mace_check(False, "Quantized Elementwise only support:"
+                               " SUM and SUB now.")
                 quantize_info = \
                     self.add_quantize_info(op, minval, maxval)
                 self._quantize_activation_info[op.output[0]] = quantize_info
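The new branch is interval arithmetic on the producers' quantization ranges: for SUM the bounds add componentwise, while for SUB the result a - b is smallest when a is at its minimum and b at its maximum, and largest in the opposite corner. In miniature (a sketch with illustrative names, written in C++ for consistency with the kernels above):

#include <cstdio>

struct Range { float minval; float maxval; };

// Output range of elementwise a + b or a - b, given the input ranges.
Range PropagateRange(const Range &a, const Range &b, bool is_sub) {
  if (is_sub) {
    return {a.minval - b.maxval, a.maxval - b.minval};
  }
  return {a.minval + b.minval, a.maxval + b.maxval};
}

int main() {
  const Range a{-1.0f, 2.0f};
  const Range b{0.0f, 3.0f};
  const Range r = PropagateRange(a, b, /*is_sub=*/true);
  std::printf("[%g, %g]\n", r.minval, r.maxval);  // [-4, 2]
}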
mace/utils/quantize.h

@@ -19,6 +19,8 @@
 #include <cmath>
 #include <limits>

+#include "mace/utils/logging.h"
+
 namespace mace {

 template<typename T>

@@ -138,11 +140,6 @@ inline void Dequantize(const T *input,
 inline void QuantizeMultiplier(double multiplier,
                                int32_t *output_multiplier,
                                int32_t *shift) {
-  if (multiplier == 0.f) {
-    *output_multiplier = 0;
-    *shift = 0;
-    return;
-  }
   const double q = std::frexp(multiplier, shift);
   auto qint = static_cast<int64_t>(roundl(q * (1ll << 31)));
   if (qint == (1ll << 31)) {
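QuantizeMultiplier, called three times by the new kernel, decomposes a real-valued scale ratio into a Q31 integer multiplier plus a power-of-two exponent, so that multiplier ≈ (m / 2^31) * 2^exp and the hot loop can use vqrdmulhq_n_s32 plus a rounding shift instead of float math. A standalone sketch of the same decomposition (DecomposeMultiplier is an illustrative name; the sketch assumes multiplier > 0):

#include <cmath>
#include <cstdint>
#include <cstdio>

void DecomposeMultiplier(double multiplier, int32_t *m, int32_t *exp) {
  int e = 0;
  // frexp: multiplier = q * 2^e with q in [0.5, 1).
  const double q = std::frexp(multiplier, &e);
  int64_t qint = static_cast<int64_t>(std::llround(q * (1ll << 31)));
  if (qint == (1ll << 31)) {  // q rounded up to exactly 1.0: renormalize
    qint /= 2;
    ++e;
  }
  *m = static_cast<int32_t>(qint);
  *exp = e;
}

int main() {
  int32_t m = 0, exp = 0;
  DecomposeMultiplier(0.75, &m, &exp);
  // 0.75 == (1610612736 / 2^31) * 2^0
  std::printf("m=%d exp=%d recon=%f\n", m, exp,
              std::ldexp(static_cast<double>(m) / (1ll << 31), exp));
}

In the kernel, the input exponents are applied with vshlq_s32 (negative values shift right) and the output exponent with gemmlowp::RoundingDivideByPOT, which rounds instead of truncating.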