Xiaomi / Mace · Commit f4230919

Authored Aug 08, 2018 by 李寅

Commit message: Implement quantized softmax

Parent commit: f291a048
Showing 10 changed files with 463 additions and 14 deletions (+463, -14):
mace/kernels/arm/conv_2d_neon_1x1.cc   +0   -4
mace/kernels/conv_2d.h                 +3   -9
mace/kernels/fixpoint.h                +64  -0
mace/kernels/fixpoint_test.cc          +54  -0
mace/kernels/gemmlowp_util.h           +0   -1
mace/kernels/quantize.h                +18  -0
mace/kernels/softmax.h                 +232 -0
mace/ops/softmax.cc                    +5   -0
mace/ops/softmax_benchmark.cc          +37  -0
mace/ops/softmax_test.cc               +50  -0
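Before the per-file diffs, the shape of the change: the commit adds a uint8 SoftmaxFunctor that computes a max-subtracted softmax directly on quantized values and pins the output quantization to scale 1/255, zero point 0. A minimal float sketch of these semantics (it mirrors the short-depth fallback path added in mace/kernels/softmax.h below; the function name here is illustrative, not from the commit):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

// Reference semantics of the quantized softmax: inputs are uint8 codes with a
// per-tensor scale (the zero point cancels in the max subtraction); the output
// is fixed to scale 1/255 and zero point 0, so a probability p maps to p * 255.
std::vector<uint8_t> QuantizedSoftmaxReference(
    const std::vector<uint8_t> &input, float input_scale) {
  const uint8_t max_value = *std::max_element(input.begin(), input.end());
  std::vector<float> exps(input.size());
  float sum = 0.f;
  for (size_t d = 0; d < input.size(); ++d) {
    exps[d] = std::exp((static_cast<int>(input[d]) - max_value) * input_scale);
    sum += exps[d];
  }
  sum = std::max(sum, std::numeric_limits<float>::min());
  std::vector<uint8_t> output(input.size());
  for (size_t d = 0; d < input.size(); ++d) {
    output[d] = static_cast<uint8_t>(exps[d] / sum * 255);
  }
  return output;
}
```

For larger depths the commit performs the same computation entirely in 32-bit fixed point using gemmlowp's FixedPoint helpers; the per-file diffs follow.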
mace/kernels/arm/conv_2d_neon_1x1.cc

```diff
@@ -12,10 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#if defined(MACE_ENABLE_NEON) && defined(__aarch64__)
-#include <arm_neon.h>
-#endif
-
 #include "mace/kernels/arm/conv_2d_neon.h"
 #include "mace/kernels/gemm.h"
```
mace/kernels/conv_2d.h

```diff
@@ -32,6 +32,7 @@
 #include "mace/kernels/arm/conv_2d_neon.h"
 #include "mace/kernels/arm/conv_winograd.h"
 #include "mace/kernels/gemmlowp_util.h"
+#include "mace/kernels/quantize.h"
 #include "mace/utils/utils.h"
 
 #ifdef MACE_ENABLE_OPENCL
@@ -826,18 +827,11 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
                                    int32_t *quantized_multiplier,
                                    int *right_shift) {
     float real_multiplier = lhs_scale * rhs_scale / output_scale;
     MACE_CHECK(real_multiplier > 0.f && real_multiplier < 1.f, real_multiplier);
     int exponent;
-    const double significand = std::frexp(real_multiplier, &exponent);
+    QuantizeMultiplier(real_multiplier,
+                       quantized_multiplier, &exponent);
     *right_shift = -exponent;
-    int64_t q = static_cast<int64_t>(std::round(significand * (1ll << 31)));
-    MACE_CHECK(q <= (1ll << 31));
-    if (q == (1ll << 31)) {
-      q /= 2;
-      (*right_shift)--;
-    }
     MACE_CHECK(*right_shift >= 0);
-    MACE_CHECK(q <= std::numeric_limits<int32_t>::max());
-    *quantized_multiplier = static_cast<int32_t>(q);
   }
 
   typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col>
```
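The refactor above delegates the significand/exponent split to the new QuantizeMultiplier helper in mace/kernels/quantize.h (shown later in this diff), keeping only the right_shift bookkeeping here. For context, the resulting (quantized_multiplier, right_shift) pair is gemmlowp's usual requantization parameterization; a sketch of how such a pair is typically consumed (Requantize is a hypothetical helper, not code from this commit):

```cpp
#include "public/gemmlowp.h"  // gemmlowp fixed-point helpers, as used by MACE

// Hypothetical helper: rescale an int32 accumulator by
// real_multiplier = quantized_multiplier / 2^31 * 2^-right_shift,
// i.e. multiply by the Q0.31 significand, then round-shift right.
inline int32_t Requantize(int32_t acc,
                          int32_t quantized_multiplier,
                          int right_shift) {
  return gemmlowp::RoundingDivideByPOT(
      gemmlowp::SaturatingRoundingDoublingHighMul(acc, quantized_multiplier),
      right_shift);
}
```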
mace/kernels/fixpoint.h (new file, mode 100644)

```cpp
// Copyright 2018 Xiaomi, Inc.  All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef MACE_KERNELS_FIXPOINT_H_
#define MACE_KERNELS_FIXPOINT_H_

#if defined(MACE_ENABLE_NEON)
#include <arm_neon.h>
#endif

#include <algorithm>

#include "mace/core/types.h"

namespace mace {
namespace kernels {

inline uint8_t FindMax(const uint8_t *xs, const index_t size) {
  uint8_t max_value = 0;
  index_t i = 0;
#if defined(MACE_ENABLE_NEON)
  uint8x16_t vmax16_0 = vdupq_n_u8(0);
  uint8x16_t vmax16_1 = vdupq_n_u8(0);
  for (; i <= size - 32; i += 32) {
    vmax16_0 = vmaxq_u8(vmax16_0, vld1q_u8(xs + i + 0));
    vmax16_1 = vmaxq_u8(vmax16_1, vld1q_u8(xs + i + 16));
  }
  uint8x16_t vmax16 = vmaxq_u8(vmax16_0, vmax16_1);
  if (i <= size - 16) {
    vmax16 = vmaxq_u8(vmax16, vld1q_u8(xs + i));
    i += 16;
  }
  uint8x8_t vmax8 = vmax_u8(vget_low_u8(vmax16), vget_high_u8(vmax16));
  if (i <= size - 8) {
    vmax8 = vmax_u8(vmax8, vld1_u8(xs + i));
    i += 8;
  }
  uint8x8_t vmax4 = vmax_u8(vmax8, vext_u8(vmax8, vmax8, 4));
  uint8x8_t vmax2 = vmax_u8(vmax4, vext_u8(vmax4, vmax4, 2));
  uint8x8_t vmax1 = vpmax_u8(vmax2, vmax2);
  max_value = vget_lane_u8(vmax1, 0);
#endif
  for (; i < size; ++i) {
    max_value = std::max(max_value, xs[i]);
  }
  return max_value;
}

}  // namespace kernels
}  // namespace mace

#endif  // MACE_KERNELS_FIXPOINT_H_
```
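A note on the reduction tail of FindMax above: after the 32-wide main loop and the 16/8-wide mop-up, the remaining 8-lane vector is folded with two rotate-and-max steps (vext_u8) and one pairwise max (vpmax_u8). A comment trace with illustrative lane values:

```cpp
// vmax8                          = [a  b  c  d  e  f  g  h]
// vext_u8(vmax8, vmax8, 4)       = [e  f  g  h  a  b  c  d]
// vmax4 = vmax_u8(vmax8, ...)    : each lane = max of elements 4 apart
// vext_u8(vmax4, vmax4, 2)       : rotate by two lanes
// vmax2 = vmax_u8(vmax4, ...)    : each lane = max of elements 2 apart
// vmax1 = vpmax_u8(vmax2, vmax2) : adjacent-pair max; lane 0 = max of all 8
```

Also worth noting: index_t is a signed type in MACE, so for sizes below 32 the comparison `i <= size - 32` is simply false rather than wrapping around, and the scalar tail loop handles everything.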
mace/kernels/fixpoint_test.cc (new file, mode 100644)

```cpp
// Copyright 2018 Xiaomi, Inc.  All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <gtest/gtest.h>
#include <random>
#include <vector>
#include <algorithm>

#include "mace/kernels/fixpoint.h"

namespace mace {
namespace kernels {
namespace test {

namespace {
void TestFindMax(int test_count) {
  static unsigned int seed = time(NULL);
  std::vector<uint8_t> input(test_count);
  uint8_t expected_max = 0;
  for (int i = 0; i < test_count; ++i) {
    input[i] = rand_r(&seed) % 255;
    expected_max = std::max(expected_max, input[i]);
  }
  uint8_t actual_max = FindMax(input.data(), input.size());
  EXPECT_EQ(expected_max, actual_max);
}
}  // namespace

TEST(FixpointTest, FindMax) {
  TestFindMax(1);
  TestFindMax(2);
  TestFindMax(4);
  TestFindMax(8);
  TestFindMax(32);
  TestFindMax(33);
  TestFindMax(127);
}

}  // namespace test
}  // namespace kernels
}  // namespace mace
```
mace/kernels/gemmlowp_util.h

```diff
@@ -15,7 +15,6 @@
 #ifndef MACE_KERNELS_GEMMLOWP_UTIL_H_
 #define MACE_KERNELS_GEMMLOWP_UTIL_H_
 
-#include <iostream>
 #include "public/gemmlowp.h"
 
 namespace mace {
```
mace/kernels/quantize.h

```diff
@@ -137,6 +137,24 @@ inline void Dequantize(const T *input,
   }
 }
 
+inline void QuantizeMultiplier(double multiplier,
+                               int32_t *output_multiplier,
+                               int32_t *shift) {
+  if (multiplier == 0.f) {
+    *output_multiplier = 0;
+    *shift = 0;
+    return;
+  }
+  const double q = std::frexp(multiplier, shift);
+  auto qint = static_cast<int64_t>(roundl(q * (1ll << 31)));
+  if (qint == (1ll << 31)) {
+    qint /= 2;
+    ++*shift;
+  }
+  *output_multiplier = static_cast<int32_t>(qint);
+  MACE_CHECK(*output_multiplier <= std::numeric_limits<int32_t>::max());
+}
+
 template<DeviceType D, typename T>
 struct QuantizeFunctor;
```
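To make the helper concrete, a standalone check of the decomposition it performs (illustrative values; not part of the commit): std::frexp splits the multiplier into a significand in [0.5, 1) and a power-of-two exponent, and the significand is then stored as a Q0.31 integer.

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
  const double multiplier = 0.375;
  int shift;
  const double q = std::frexp(multiplier, &shift);  // q = 0.75, shift = -1
  auto qint = static_cast<int64_t>(roundl(q * (1ll << 31)));  // 1610612736
  // The stored pair reconstructs the multiplier as qint / 2^31 * 2^shift.
  const double reconstructed =
      static_cast<double>(qint) / (1ll << 31) * std::ldexp(1.0, shift);
  assert(std::abs(reconstructed - multiplier) < 1e-9);
  return 0;
}
```

The `qint == (1ll << 31)` branch handles the corner case where rounding pushes the significand to exactly 1.0, which no longer fits in an int32.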
mace/kernels/softmax.h

```diff
@@ -25,6 +25,9 @@
 #include "mace/core/tensor.h"
 #include "mace/public/mace.h"
 #include "mace/utils/utils.h"
+#include "mace/kernels/fixpoint.h"
+#include "mace/kernels/gemmlowp_util.h"
+#include "mace/kernels/quantize.h"
 
 #ifdef MACE_ENABLE_OPENCL
 #include "mace/core/runtime/opencl/cl2_header.h"
```

@@ -120,6 +123,235 @@ struct SoftmaxFunctor<DeviceType::CPU, float> { (the uint8 functor below is all newly added)

```cpp
  }
};

static const int kInputDeltaIntBits = 5;
static const int kSumExpIntBits = 12;

template<>
struct SoftmaxFunctor<DeviceType::CPU, uint8_t> {
  MaceStatus operator()(const Tensor *input,
                        Tensor *output,
                        StatsFuture *future) {
    MACE_UNUSED(future);
    // Ignore range stat, fix range to [0, 1]. For large depth, each softmax
    // output may be too small (<<1), which causes precision issue. But it is
    // fine when doing classification inference.
    output->SetScale(1.f / 255);
    output->SetZeroPoint(0);

    using FixPointInputDelta =
        gemmlowp::FixedPoint<int32_t, kInputDeltaIntBits>;
    using FixPointSumExp = gemmlowp::FixedPoint<int32_t, kSumExpIntBits>;
    using FixPoint0 = gemmlowp::FixedPoint<int32_t, 0>;

    MACE_CHECK(input->dim_size() == 2 || input->dim_size() == 4,
               "Softmax does not support dim size: ", input->dim_size());
    index_t batch;
    index_t depth;
    if (input->dim_size() == 2) {
      batch = input->dim(0);
      depth = input->dim(1);
    } else {
      batch = input->dim(0) * input->dim(1) * input->dim(2);
      depth = input->dim(3);
    }

    Tensor::MappingGuard input_guard(input);
    Tensor::MappingGuard output_guard(output);
    const uint8_t *input_data = input->data<uint8_t>();
    float input_scale = input->scale();
    uint8_t *output_data = output->mutable_data<uint8_t>();

    // If depth is short, do it using float32. Float computation should not
    // be here, but as long as it is on CPU, it is fine.
    if (depth < 32) {
#pragma omp parallel for
      for (index_t b = 0; b < batch; ++b) {
        const uint8_t *input_ptr = input_data + b * depth;
        uint8_t *output_ptr = output_data + b * depth;

        uint8_t max_value = FindMax(input_ptr, depth);
        float sum = 0;
        std::vector<float> depth_cache(depth);
        for (index_t d = 0; d < depth; ++d) {
          float exp_value = ::exp(
              (static_cast<int>(input_ptr[d]) - max_value) * input_scale);
          sum += exp_value;
          depth_cache[d] = exp_value;
        }
        sum = std::max(sum, std::numeric_limits<float>::min());
        for (index_t d = 0; d < depth; ++d) {
          double output_f = depth_cache[d] / sum;
          output_ptr[d] = static_cast<uint8_t>(output_f * 255);
        }
      }

      return MACE_SUCCESS;
    }

    int32_t scale_q = static_cast<int32_t>(std::min(
        static_cast<double>(input_scale) * (1 << (31 - kInputDeltaIntBits)),
        (1ll << 31) - 1.0));
    int32_t input_delta_limit = -((1ll << 31) - 1) / scale_q;

#pragma omp parallel for
    for (index_t b = 0; b < batch; ++b) {
      const uint8_t *input_ptr = input_data + b * depth;
      uint8_t *output_ptr = output_data + b * depth;

      FixPointSumExp sum = FixPointSumExp::Zero();
      uint8_t max_value = FindMax(input_ptr, depth);
      index_t d = 0;
      // Neon optimization is not useful so far as we benchmark.
      // Enable it when we find a case that proves it useful.
#if 0 && defined(MACE_ENABLE_NEON)
      using FixPointInputDeltaInt32x4 =
          gemmlowp::FixedPoint<int32x4_t, kInputDeltaIntBits>;
      using FixPointSumExpInt32x4 =
          gemmlowp::FixedPoint<int32x4_t, kSumExpIntBits>;
      using FixPoint0Int32x4 = gemmlowp::FixedPoint<int32x4_t, 0>;

      int16x8_t vmax_value_s16 = vdupq_n_s16(max_value);
      int32x4_t vinput_delta_limit_s32 = vdupq_n_s32(input_delta_limit);
      FixPointSumExpInt32x4 vsum_s32_fp_0 = FixPointSumExpInt32x4::Zero();
      FixPointSumExpInt32x4 vsum_s32_fp_1 = FixPointSumExpInt32x4::Zero();
      FixPointSumExpInt32x4 vzero_s32_fp = FixPointSumExpInt32x4::Zero();
      int32_t scale_q_multipler, scale_q_shift;
      QuantizeMultiplier(scale_q, &scale_q_multipler, &scale_q_shift);
      FixPointInputDeltaInt32x4 vscale_s32_fp =
          FixPointInputDeltaInt32x4::FromScalarRaw(scale_q);
      FixPoint0Int32x4 vscale_s32_fp_multiplier =
          FixPoint0Int32x4::FromScalarRaw(scale_q_multipler);
      for (; d <= depth - 8; d += 8) {
        uint16x8_t vinput_u16 = vmovl_u8(vld1_u8(input_ptr + d));
        int16x8_t vinput_delta_s16 =
            vsubq_s16(vreinterpretq_s16_u16(vinput_u16), vmax_value_s16);
        int32x4_t input_delta_s32_0 =
            vmovl_s16(vget_low_s16(vinput_delta_s16));
        int32x4_t input_delta_s32_1 =
            vmovl_s16(vget_high_s16(vinput_delta_s16));
        int32x4_t vmask_s32_0 =
            gemmlowp::MaskIfGreaterThanOrEqual(input_delta_s32_0,
                                               vinput_delta_limit_s32);
        int32x4_t vmask_s32_1 =
            gemmlowp::MaskIfGreaterThanOrEqual(input_delta_s32_1,
                                               vinput_delta_limit_s32);
        FixPointInputDeltaInt32x4 vscaled_input_delta_s32_fp_0 =
            vscale_s32_fp_multiplier *
                FixPointInputDeltaInt32x4::FromRaw(
                    gemmlowp::ShiftLeft(input_delta_s32_0, scale_q_shift));
        FixPointInputDeltaInt32x4 vscaled_input_delta_s32_fp_1 =
            vscale_s32_fp_multiplier *
                FixPointInputDeltaInt32x4::FromRaw(
                    gemmlowp::ShiftLeft(input_delta_s32_1, scale_q_shift));
        FixPointSumExpInt32x4 vexp_s32_fp_0 =
            gemmlowp::Rescale<kSumExpIntBits>(
                exp_on_negative_values(vscaled_input_delta_s32_fp_0));
        FixPointSumExpInt32x4 vexp_s32_fp_1 =
            gemmlowp::Rescale<kSumExpIntBits>(
                exp_on_negative_values(vscaled_input_delta_s32_fp_1));
        FixPointSumExpInt32x4 vmasked_exp_s32_fp_0 =
            SelectUsingMask(vmask_s32_0, vexp_s32_fp_0, vzero_s32_fp);
        FixPointSumExpInt32x4 vmasked_exp_s32_fp_1 =
            SelectUsingMask(vmask_s32_1, vexp_s32_fp_1, vzero_s32_fp);
        vsum_s32_fp_0 = vsum_s32_fp_0 + vmasked_exp_s32_fp_0;
        vsum_s32_fp_1 = vsum_s32_fp_1 + vmasked_exp_s32_fp_1;
      }
      int32x4_t vsum_s32 = (vsum_s32_fp_0 + vsum_s32_fp_1).raw();
      int32x2_t vsum_reduced_2_s32 =
          vadd_s32(vget_low_s32(vsum_s32), vget_high_s32(vsum_s32));
      int32x2_t vsum_reduced_1_s32 =
          vpadd_s32(vsum_reduced_2_s32, vsum_reduced_2_s32);
      sum = FixPointSumExp::FromRaw(vget_lane_s32(vsum_reduced_1_s32, 0));
#endif
      for (; d < depth; ++d) {
        int32_t input_delta = static_cast<int32_t>(input_ptr[d]) - max_value;
        if (input_delta >= input_delta_limit) {
          int32_t scaled_input_delta_q = scale_q * input_delta;
          FixPointInputDelta scaled_input_delta_fp =
              FixPointInputDelta::FromRaw(scaled_input_delta_q);
          sum = sum + gemmlowp::Rescale<kSumExpIntBits>(
              exp_on_negative_values(scaled_input_delta_fp));
        }
      }

      int32_t sum_q = sum.raw();
      int left_zero_count = __builtin_clz(static_cast<uint32_t>(sum_q));
      int tail_count = kSumExpIntBits - left_zero_count;
      int32_t fractional_q0 = static_cast<int32_t>(
          (static_cast<uint32_t>(sum_q) << left_zero_count) -
              (static_cast<uint32_t>(1) << 31));
      FixPoint0 recip_sum_q0 = gemmlowp::one_over_one_plus_x_for_x_in_0_1(
          FixPoint0::FromRaw(fractional_q0));

      d = 0;
      // Neon optimization is not useful so far as we benchmark.
      // Enable it when we find a case that proves it useful.
#if 0 && defined(MACE_ENABLE_NEON)
      FixPoint0Int32x4 vrecip_sum_q0_s32_fp =
          FixPoint0Int32x4::FromScalarRaw(recip_sum_q0.raw());
      int16x8_t vinput_delta_limit_s16 = vdupq_n_s16(input_delta_limit);
      for (; d <= depth - 8; d += 8) {
        uint16x8_t vinput_u16 = vmovl_u8(vld1_u8(input_ptr + d));
        int16x8_t vinput_delta_s16 =
            vsubq_s16(vreinterpretq_s16_u16(vinput_u16), vmax_value_s16);
        int32x4_t input_delta_s32_0 =
            vmovl_s16(vget_low_s16(vinput_delta_s16));
        int32x4_t input_delta_s32_1 =
            vmovl_s16(vget_high_s16(vinput_delta_s16));
        int16x8_t vmask_s16 = gemmlowp::MaskIfGreaterThanOrEqual(
            vinput_delta_s16, vinput_delta_limit_s16);
        FixPointInputDeltaInt32x4 vscaled_input_delta_s32_fp_0 =
            vscale_s32_fp_multiplier *
                FixPointInputDeltaInt32x4::FromRaw(
                    gemmlowp::ShiftLeft(input_delta_s32_0, scale_q_shift));
        FixPointInputDeltaInt32x4 vscaled_input_delta_s32_fp_1 =
            vscale_s32_fp_multiplier *
                FixPointInputDeltaInt32x4::FromRaw(
                    gemmlowp::ShiftLeft(input_delta_s32_1, scale_q_shift));
        FixPoint0Int32x4 vexp_s32_fp_0 =
            exp_on_negative_values(vscaled_input_delta_s32_fp_0);
        FixPoint0Int32x4 vexp_s32_fp_1 =
            exp_on_negative_values(vscaled_input_delta_s32_fp_1);
        int32x4_t voutput_data_s32_0 = gemmlowp::RoundingDivideByPOT(
            (vrecip_sum_q0_s32_fp * vexp_s32_fp_0).raw(), tail_count + 31 - 8);
        int32x4_t voutput_data_s32_1 = gemmlowp::RoundingDivideByPOT(
            (vrecip_sum_q0_s32_fp * vexp_s32_fp_1).raw(), tail_count + 31 - 8);
        int16x8_t voutput_data_s16 =
            vcombine_s16(vqmovn_s32(voutput_data_s32_0),
                         vqmovn_s32(voutput_data_s32_1));
        int16x8_t masked_voutput_data_s16 =
            gemmlowp::SelectUsingMask(vmask_s16, voutput_data_s16,
                                      vdupq_n_s16(0));
        uint8x8_t voutput_u8 = vqmovun_s16(masked_voutput_data_s16);
        vst1_u8(output_ptr + d, voutput_u8);
      }
#endif
      for (; d < depth; ++d) {
        int32_t input_delta = static_cast<int32_t>(input_ptr[d]) - max_value;
        if (input_delta >= input_delta_limit) {
          int32_t scaled_input_delta_q = scale_q * input_delta;
          FixPointInputDelta scaled_input_delta_fp =
              FixPointInputDelta::FromRaw(scaled_input_delta_q);
          FixPoint0 exp = exp_on_negative_values(scaled_input_delta_fp);
          int32_t output_data = gemmlowp::RoundingDivideByPOT(
              (recip_sum_q0 * exp).raw(), tail_count + 31 - 8);
          output_ptr[d] = std::max(std::min(output_data, 255), 0);
        }
      }
    }

    return MACE_SUCCESS;
  }
};

#ifdef MACE_ENABLE_OPENCL
template<typename T>
struct SoftmaxFunctor<DeviceType::GPU, T> {
```
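A note on the integer path above: after max subtraction every input delta is at most 0; scale_q converts it to Q5.26 fixed point (kInputDeltaIntBits = 5), gemmlowp's exp_on_negative_values produces e^delta in Q0.31, and the per-row sum accumulates in Q12.19 (kSumExpIntBits = 12). The reciprocal step normalizes sum_q so that one_over_one_plus_x_for_x_in_0_1 applies; the identity it relies on, sum_q = (1 + x) * 2^(31 - clz) with x in [0, 1), can be checked numerically (a standalone sketch with an illustrative value, not code from the commit):

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
  const int kSumExpIntBits = 12;
  const int32_t sum_q = 0x000A1234;  // some positive Q12.19 raw sum
  const int left_zero_count = __builtin_clz(static_cast<uint32_t>(sum_q));
  const int tail_count = kSumExpIntBits - left_zero_count;
  // Shift the leading one up to bit 31: sum_q = (1 + x) * 2^(31 - clz),
  // where x lands in [0, 1) and is stored as a Q0.31 raw value.
  const int32_t fractional_q0 = static_cast<int32_t>(
      (static_cast<uint32_t>(sum_q) << left_zero_count) -
      (static_cast<uint32_t>(1) << 31));
  const double x = static_cast<double>(fractional_q0) / (1u << 31);
  const double reconstructed =
      (1.0 + x) * std::ldexp(1.0, 31 - left_zero_count);
  assert(std::abs(reconstructed - sum_q) < 1.0);
  // 1/sum then equals one_over_one_plus_x_for_x_in_0_1(x) * 2^-tail_count;
  // RoundingDivideByPOT(..., tail_count + 31 - 8) folds that factor back in
  // and rescales the Q0.31 probability to the uint8 range [0, 255].
  (void)tail_count;
  return 0;
}
```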
mace/ops/softmax.cc

```diff
@@ -23,6 +23,11 @@ void Register_Softmax(OperatorRegistryBase *op_registry) {
                              .TypeConstraint<float>("T")
                              .Build(),
                      SoftmaxOp<DeviceType::CPU, float>);
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
+                             .Device(DeviceType::CPU)
+                             .TypeConstraint<uint8_t>("T")
+                             .Build(),
+                     SoftmaxOp<DeviceType::CPU, uint8_t>);
 
 #ifdef MACE_ENABLE_OPENCL
   MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("Softmax")
```
mace/ops/softmax_benchmark.cc

@@ -68,6 +68,42 @@ void SoftmaxBenchmark( (the uint8 specialization below is newly added)

```cpp
  }
  net.Sync();
}

template<>
void SoftmaxBenchmark<CPU, uint8_t>(
    int iters, int batch, int channels, int height, int width) {
  mace::testing::StopTiming();

  OpsTestNet net;

  // Add input data
  net.AddRandomInput<DeviceType::CPU, uint8_t>(
      "Input", {batch, height, width, channels});

  OpDefBuilder("Softmax", "SoftmaxBM")
      .Input("Input")
      .Output("Output")
      .AddIntArg("T", DT_UINT8)
      .Finalize(net.NewOperatorDef());

  net.Setup(DeviceType::CPU);
  Tensor *output = net.GetTensor("Output");
  output->SetScale(0);
  output->SetZeroPoint(1);

  // Warm-up
  for (int i = 0; i < 2; ++i) {
    net.Run();
  }
  net.Sync();

  mace::testing::StartTiming();
  while (iters--) {
    net.Run();
  }
  net.Sync();
}
}  // namespace

#define MACE_BM_SOFTMAX_MACRO(N, C, H, W, TYPE, DEVICE) \
```

```diff
@@ -82,6 +118,7 @@ void SoftmaxBenchmark(
 #define MACE_BM_SOFTMAX(N, C, H, W)                 \
   MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, CPU);    \
+  MACE_BM_SOFTMAX_MACRO(N, C, H, W, uint8_t, CPU);  \
   MACE_BM_SOFTMAX_MACRO(N, C, H, W, float, GPU);    \
   MACE_BM_SOFTMAX_MACRO(N, C, H, W, half, GPU);
```
mace/ops/softmax_test.cc

@@ -155,6 +155,56 @@ TEST_F(SoftmaxOpTest, OPENCLAlignedRank2) { (TestQuantizedSoftmax and QuantizeTest below are newly added)

```cpp
  Complex<DeviceType::GPU>({3, 1001});
}

namespace {
void TestQuantizedSoftmax(const std::vector<index_t> &input_shape) {
  OpsTestNet net;
  net.AddRandomInput<CPU, float>("Input", input_shape, false, true);
  OpDefBuilder("Softmax", "SoftmaxTest")
      .Input("Input")
      .Output("Output")
      .Finalize(net.NewOperatorDef());
  net.RunOp();

  OpDefBuilder("Quantize", "QuantizeInput")
      .Input("Input")
      .Output("QuantizedInput")
      .OutputType({DT_UINT8})
      .AddIntArg("T", DT_UINT8)
      .Finalize(net.NewOperatorDef());
  net.RunOp();

  OpDefBuilder("Softmax", "SoftmaxQuantizeTest")
      .Input("QuantizedInput")
      .Output("QuantizedOutput")
      .OutputType({DT_UINT8})
      .AddIntArg("T", DT_UINT8)
      .Finalize(net.NewOperatorDef());
  net.Setup(DeviceType::CPU);
  Tensor *q_output = net.GetTensor("QuantizedOutput");
  q_output->SetScale(1.0f / 255);
  q_output->SetZeroPoint(0);
  net.Run();

  OpDefBuilder("Dequantize", "DeQuantizeTest")
      .Input("QuantizedOutput")
      .Output("DequantizedOutput")
      .OutputType({DT_FLOAT})
      .AddIntArg("T", DT_UINT8)
      .Finalize(net.NewOperatorDef());
  net.RunOp();

  // Check
  ExpectTensorSimilar<float>(*net.GetOutput("Output"),
                             *net.GetTensor("DequantizedOutput"), 0.1);
}
}  // namespace

TEST_F(SoftmaxOpTest, QuantizeTest) {
  TestQuantizedSoftmax({5, 10});
  TestQuantizedSoftmax({50, 100});
  TestQuantizedSoftmax({1, 31});
}

}  // namespace test
}  // namespace ops
}  // namespace mace
```
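Shape choice in QuantizeTest: {5, 10} and {1, 31} have depth below 32 and therefore exercise the float fallback path in the uint8 functor, while {50, 100} crosses the threshold and exercises the fixed-point path. The harness runs float softmax as the reference, then quantize, uint8 softmax, and dequantize, and compares the results with ExpectTensorSimilar at a 0.1 tolerance.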