Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
毕竟曾有刹那
Mace
提交
3e9bb73e
Mace
项目概览
毕竟曾有刹那
/
Mace
与 Fork 源项目一致
Fork自
Xiaomi / Mace
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
Mace
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
3e9bb73e
编写于
4月 29, 2019
作者:
叶
叶剑武
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'fp16' into 'master'
Fp16 See merge request !1073
上级
a33e507c
7b147bbd
变更
16
隐藏空白更改
内联
并排
Showing
16 changed file
with
508 addition
and
44 deletion
+508
-44
mace/core/BUILD.bazel
mace/core/BUILD.bazel
+1
-1
mace/core/tensor.h
mace/core/tensor.h
+15
-16
mace/core/types.cc
mace/core/types.cc
+4
-0
mace/core/types.h
mace/core/types.h
+6
-0
mace/ops/BUILD.bazel
mace/ops/BUILD.bazel
+26
-12
mace/ops/arm/fp16/gemv.h
mace/ops/arm/fp16/gemv.h
+120
-0
mace/ops/cast.cc
mace/ops/cast.cc
+8
-0
mace/ops/gather.cc
mace/ops/gather.cc
+4
-0
mace/ops/matmul.cc
mace/ops/matmul.cc
+96
-0
mace/ops/matmul_benchmark.cc
mace/ops/matmul_benchmark.cc
+39
-6
mace/ops/matmul_test.cc
mace/ops/matmul_test.cc
+76
-0
mace/ops/ops_test_util.h
mace/ops/ops_test_util.h
+2
-1
mace/proto/mace.proto
mace/proto/mace.proto
+1
-0
mace/python/tools/converter_tool/base_converter.py
mace/python/tools/converter_tool/base_converter.py
+2
-0
mace/python/tools/converter_tool/transformer.py
mace/python/tools/converter_tool/transformer.py
+101
-6
mace/python/tools/model_saver.py
mace/python/tools/model_saver.py
+7
-2
未找到文件。
mace/core/BUILD.bazel
浏览文件 @
3e9bb73e
...
...
@@ -71,7 +71,7 @@ cc_library(
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon"
,
"-mfpu=neon
-fp16
"
,
"-mfloat-abi=softfp"
,
]),
linkopts
=
[
"-ldl"
],
...
...
mace/core/tensor.h
浏览文件 @
3e9bb73e
...
...
@@ -46,28 +46,28 @@ namespace mace {
break; \
}
#ifdef MACE_ENABLE_OPENCL
#define MACE_TYPE_ENUM_SWITCH( \
TYPE_ENUM, STATEMENTS, INVALID_STATEMENTS, DEFAULT_STATEMENTS) \
switch (TYPE_ENUM) { \
MACE_CASE(half, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_CASE(float, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_CASE(uint8_t, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_CASE(int32_t, MACE_SINGLE_ARG(STATEMENTS)) \
case DT_INVALID: \
INVALID_STATEMENTS; \
break; \
default: \
DEFAULT_STATEMENTS; \
break; \
}
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
#define MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS) \
MACE_CASE(float16_t, MACE_SINGLE_ARG(STATEMENTS))
#else
#define MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS)
#endif
#if MACE_ENABLE_OPENCL
#define MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS) \
MACE_CASE(half, MACE_SINGLE_ARG(STATEMENTS))
#else
#define MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS)
#endif
#define MACE_TYPE_ENUM_SWITCH( \
TYPE_ENUM, STATEMENTS, INVALID_STATEMENTS, DEFAULT_STATEMENTS) \
switch (TYPE_ENUM) { \
MACE_CASE(float, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_CASE(uint8_t, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_CASE(int32_t, MACE_SINGLE_ARG(STATEMENTS)) \
MACE_TYPE_ENUM_SWITCH_CASE_NEON(STATEMENTS) \
MACE_TYPE_ENUM_SWITCH_CASE_OPENCL(STATEMENTS) \
case DT_INVALID: \
INVALID_STATEMENTS; \
break; \
...
...
@@ -75,7 +75,6 @@ namespace mace {
DEFAULT_STATEMENTS; \
break; \
}
#endif
// `TYPE_ENUM` will be converted to template `T` in `STATEMENTS`
#define MACE_RUN_WITH_TYPE_ENUM(TYPE_ENUM, STATEMENTS) \
...
...
mace/core/types.cc
浏览文件 @
3e9bb73e
...
...
@@ -47,6 +47,10 @@ size_t GetEnumTypeSize(const DataType dt) {
return
sizeof
(
float
);
case
DT_HALF
:
return
sizeof
(
half
);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
case
DT_FLOAT16
:
return
sizeof
(
float16_t
);
#endif
case
DT_UINT8
:
return
sizeof
(
uint8_t
);
case
DT_INT32
:
...
...
mace/core/types.h
浏览文件 @
3e9bb73e
...
...
@@ -17,6 +17,9 @@
#include <cstdint>
#include <string>
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
#include <arm_neon.h>
#endif
#include "mace/proto/mace.pb.h"
#include "include/half.hpp"
...
...
@@ -51,6 +54,9 @@ struct EnumToDataType;
};
MACE_MAPPING_DATA_TYPE_AND_ENUM
(
half
,
DT_HALF
);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_MAPPING_DATA_TYPE_AND_ENUM
(
float16_t
,
DT_FLOAT16
);
#endif
MACE_MAPPING_DATA_TYPE_AND_ENUM
(
float
,
DT_FLOAT
);
MACE_MAPPING_DATA_TYPE_AND_ENUM
(
uint8_t
,
DT_UINT8
);
MACE_MAPPING_DATA_TYPE_AND_ENUM
(
int32_t
,
DT_INT32
);
...
...
mace/ops/BUILD.bazel
浏览文件 @
3e9bb73e
...
...
@@ -38,7 +38,7 @@ cc_library(
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon"
,
"-mfpu=neon
-fp16
"
,
"-mfloat-abi=softfp"
,
])
+
if_opencl_enabled
([
"-DMACE_ENABLE_OPENCL"
,
...
...
@@ -61,7 +61,20 @@ cc_library(
"-Werror"
,
"-Wextra"
,
"-Wno-missing-field-initializers"
,
],
]
+
if_openmp_enabled
([
"-fopenmp"
,
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon-fp16"
,
"-mfloat-abi=softfp"
,
])
+
if_opencl_enabled
([
"-DMACE_ENABLE_OPENCL"
,
])
+
if_quantize_enabled
([
"-DMACE_ENABLE_QUANTIZE"
,
])
+
if_hexagon_enabled
([
"-DMACE_ENABLE_HEXAGON"
,
]),
deps
=
[
"//mace/core"
,
"@gtest"
,
...
...
@@ -89,7 +102,7 @@ cc_library(
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon"
,
"-mfpu=neon
-fp16
"
,
"-mfloat-abi=softfp"
,
])
+
if_opencl_enabled
([
"-DMACE_ENABLE_OPENCL"
,
...
...
@@ -111,6 +124,7 @@ cc_library(
srcs
=
glob
(
[
"arm/fp32/*.cc"
,
"arm/fp16/gemv.h"
,
],
exclude
=
[
"arm/fp32/*_test.cc"
,
...
...
@@ -141,7 +155,7 @@ cc_library(
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon"
,
"-mfpu=neon
-fp16
"
,
"-mfloat-abi=softfp"
,
])
+
if_opencl_enabled
([
"-DMACE_ENABLE_OPENCL"
,
...
...
@@ -186,7 +200,7 @@ cc_library(
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon"
,
"-mfpu=neon
-fp16
"
,
"-mfloat-abi=softfp"
,
])
+
if_opencl_enabled
([
"-DMACE_ENABLE_OPENCL"
,
...
...
@@ -221,7 +235,7 @@ cc_library(
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon"
,
"-mfpu=neon
-fp16
"
,
"-mfloat-abi=softfp"
,
])
+
if_opencl_enabled
([
"-DMACE_ENABLE_OPENCL"
,
...
...
@@ -256,7 +270,7 @@ cc_library(
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon"
,
"-mfpu=neon
-fp16
"
,
"-mfloat-abi=softfp"
,
])
+
if_opencl_enabled
([
"-DMACE_ENABLE_OPENCL"
,
...
...
@@ -324,7 +338,7 @@ cc_library(
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon"
,
"-mfpu=neon
-fp16
"
,
"-mfloat-abi=softfp"
,
])
+
if_opencl_enabled
([
"-DMACE_ENABLE_OPENCL"
,
...
...
@@ -368,7 +382,7 @@ cc_library(
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon"
,
"-mfpu=neon
-fp16
"
,
"-mfloat-abi=softfp"
,
])
+
if_opencl_enabled
([
"-DMACE_ENABLE_OPENCL"
,
...
...
@@ -398,7 +412,7 @@ cc_library(
]
+
if_openmp_enabled
([
"-fopenmp"
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon"
,
"-mfpu=neon
-fp16
"
,
])
+
if_android_armv7
([
"-mfloat-abi=softfp"
,
])
+
if_opencl_enabled
([
...
...
@@ -439,7 +453,7 @@ cc_test(
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon"
,
"-mfpu=neon
-fp16
"
,
"-mfloat-abi=softfp"
,
])
+
if_opencl_enabled
([
"-DMACE_ENABLE_OPENCL"
,
...
...
@@ -476,7 +490,7 @@ cc_test(
])
+
if_neon_enabled
([
"-DMACE_ENABLE_NEON"
,
])
+
if_android_armv7
([
"-mfpu=neon"
,
"-mfpu=neon
-fp16
"
,
"-mfloat-abi=softfp"
,
])
+
if_opencl_enabled
([
"-DMACE_ENABLE_OPENCL"
,
...
...
mace/ops/arm/fp16/gemv.h
0 → 100644
浏览文件 @
3e9bb73e
// Copyright 2018 Xiaomi, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef MACE_OPS_ARM_FP16_GEMV_H_
#define MACE_OPS_ARM_FP16_GEMV_H_
#include "mace/core/types.h"
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
#include <arm_neon.h>
#endif
#if defined(MACE_ENABLE_NEON) && !defined(__aarch64__) && defined(__ANDROID__)
#define vaddvq_f32(v) ((v)[0] + (v)[1] + (v)[2] + (v)[3])
#endif
namespace
mace
{
namespace
ops
{
template
<
typename
INPUT_TYPE_LEFT
,
typename
INPUT_TYPE_RIGHT
,
typename
OUTPUT_TYPE
>
void
FP16Gemv
(
const
INPUT_TYPE_LEFT
*
m_ptr
,
const
INPUT_TYPE_RIGHT
*
v_ptr
,
const
index_t
height
,
const
index_t
width
,
OUTPUT_TYPE
*
result
);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
template
<
>
void
FP16Gemv
<
float16_t
,
float
,
float
>
(
const
float16_t
*
m_ptr
,
const
float
*
v_ptr
,
const
index_t
height
,
const
index_t
width
,
float
*
out_ptr
)
{
#pragma omp parallel for
for
(
index_t
h
=
0
;
h
<
height
;
++
h
)
{
const
float16_t
*
m_ptr0
=
m_ptr
+
h
*
width
;
const
float
*
v_ptr0
=
v_ptr
;
float
*
out_ptr0
=
out_ptr
+
h
;
float
sum0
=
0
;
float32x4_t
vm0
,
vm1
,
vm2
,
vm3
;
float32x4_t
vv0
,
vv1
,
vv2
,
vv3
;
float32x4_t
vsum0
=
vdupq_n_f32
(
0.
f
);
float32x4_t
vsum1
=
vdupq_n_f32
(
0.
f
);
float32x4_t
vsum2
=
vdupq_n_f32
(
0.
f
);
float32x4_t
vsum3
=
vdupq_n_f32
(
0.
f
);
index_t
w
;
for
(
w
=
0
;
w
+
15
<
width
;
w
+=
16
)
{
vm0
=
vcvt_f32_f16
(
vld1_f16
(
m_ptr0
));
vv0
=
vld1q_f32
(
v_ptr0
);
vm1
=
vcvt_f32_f16
(
vld1_f16
(
m_ptr0
+
4
));
vv1
=
vld1q_f32
(
v_ptr0
+
4
);
vm2
=
vcvt_f32_f16
(
vld1_f16
(
m_ptr0
+
8
));
vv2
=
vld1q_f32
(
v_ptr0
+
8
);
vm3
=
vcvt_f32_f16
(
vld1_f16
(
m_ptr0
+
12
));
vv3
=
vld1q_f32
(
v_ptr0
+
12
);
vsum0
=
vmlaq_f32
(
vsum0
,
vm0
,
vv0
);
vsum1
=
vmlaq_f32
(
vsum1
,
vm1
,
vv1
);
vsum2
=
vmlaq_f32
(
vsum2
,
vm2
,
vv2
);
vsum3
=
vmlaq_f32
(
vsum3
,
vm3
,
vv3
);
m_ptr0
+=
16
;
v_ptr0
+=
16
;
}
for
(;
w
+
7
<
width
;
w
+=
8
)
{
vm0
=
vcvt_f32_f16
(
vld1_f16
(
m_ptr0
));
vv0
=
vld1q_f32
(
v_ptr0
);
vm1
=
vcvt_f32_f16
(
vld1_f16
(
m_ptr0
+
4
));
vv1
=
vld1q_f32
(
v_ptr0
+
4
);
vsum0
=
vmlaq_f32
(
vsum0
,
vm0
,
vv0
);
vsum1
=
vmlaq_f32
(
vsum1
,
vm1
,
vv1
);
m_ptr0
+=
8
;
v_ptr0
+=
8
;
}
for
(;
w
+
3
<
width
;
w
+=
4
)
{
vm0
=
vcvt_f32_f16
(
vld1_f16
(
m_ptr0
));
vv0
=
vld1q_f32
(
v_ptr0
);
vsum0
=
vmlaq_f32
(
vsum0
,
vm0
,
vv0
);
m_ptr0
+=
4
;
v_ptr0
+=
4
;
}
vsum0
+=
vsum1
;
vsum2
+=
vsum3
;
vsum0
+=
vsum2
;
sum0
=
vaddvq_f32
(
vsum0
);
for
(;
w
<
width
;
++
w
)
{
sum0
+=
m_ptr0
[
0
]
*
v_ptr0
[
0
];
m_ptr0
++
;
v_ptr0
++
;
}
*
out_ptr0
++
=
sum0
;
}
}
#endif
}
// namespace ops
}
// namespace mace
#endif // MACE_OPS_ARM_FP16_GEMV_H_
mace/ops/cast.cc
浏览文件 @
3e9bb73e
...
...
@@ -14,6 +14,10 @@
#include "mace/core/operator.h"
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
#include <arm_neon.h>
#endif
namespace
mace
{
namespace
ops
{
...
...
@@ -55,6 +59,10 @@ void RegisterCast(OpRegistryBase *op_registry) {
DeviceType
::
CPU
,
float
);
MACE_REGISTER_OP
(
op_registry
,
"Cast"
,
CastOp
,
DeviceType
::
CPU
,
int32_t
);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_REGISTER_OP
(
op_registry
,
"Cast"
,
CastOp
,
DeviceType
::
CPU
,
float16_t
);
#endif
}
}
// namespace ops
...
...
mace/ops/gather.cc
浏览文件 @
3e9bb73e
...
...
@@ -93,6 +93,10 @@ void RegisterGather(OpRegistryBase *op_registry) {
MACE_REGISTER_OP
(
op_registry
,
"Gather"
,
GatherOp
,
DeviceType
::
CPU
,
uint8_t
);
#endif // MACE_ENABLE_QUANTIZE
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_REGISTER_OP
(
op_registry
,
"Gather"
,
GatherOp
,
DeviceType
::
CPU
,
float16_t
);
#endif
}
}
// namespace ops
...
...
mace/ops/matmul.cc
浏览文件 @
3e9bb73e
...
...
@@ -44,6 +44,9 @@
#include "mace/ops/opencl/buffer_transformer.h"
#include "mace/ops/opencl/image/matmul.h"
#endif // MACE_ENABLE_OPENCL
#ifdef MACE_ENABLE_NEON
#include "mace/ops/arm/fp16/gemv.h"
#endif
namespace
mace
{
namespace
ops
{
...
...
@@ -510,6 +513,86 @@ class MatMulOp<DeviceType::GPU, T> : public MatMulOpBase {
};
#endif // MACE_ENABLE_OPENCL
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
template
<
>
class
MatMulOp
<
CPU
,
float16_t
>
:
public
MatMulOpBase
{
public:
explicit
MatMulOp
(
OpConstructContext
*
context
)
:
MatMulOpBase
(
context
)
{}
MaceStatus
Run
(
OpContext
*
context
)
override
{
MACE_CHECK_NOTNULL
(
context
);
Validate
();
const
Tensor
*
A
=
this
->
Input
(
INPUT_A
);
const
Tensor
*
B
=
this
->
Input
(
INPUT_B
);
Tensor
*
C
=
this
->
Output
(
OUTPUT
);
index_t
batch
;
index_t
height
;
index_t
K
;
index_t
width
;
index_t
rank
=
A
->
dim_size
();
height
=
A
->
dim
(
rank
-
2
);
K
=
A
->
dim
(
rank
-
1
);
if
(
transpose_a_
)
{
std
::
swap
(
height
,
K
);
}
if
(
transpose_b_
)
{
width
=
B
->
dim
(
rank
-
2
);
}
else
{
width
=
B
->
dim
(
rank
-
1
);
}
batch
=
std
::
accumulate
(
A
->
shape
().
begin
(),
A
->
shape
().
end
()
-
2
,
1
,
std
::
multiplies
<
index_t
>
());
std
::
vector
<
index_t
>
c_shape
=
A
->
shape
();
c_shape
[
rank
-
2
]
=
height
;
c_shape
[
rank
-
1
]
=
width
;
MACE_RETURN_IF_ERROR
(
C
->
Resize
(
c_shape
));
Tensor
::
MappingGuard
guarda
(
A
);
Tensor
::
MappingGuard
guardb
(
B
);
Tensor
::
MappingGuard
guardc
(
C
);
auto
*
c_ptr_base
=
C
->
mutable_data
<
float
>
();
MACE_CHECK
(
batch
==
1
,
"matmul fp16 only support batch = 1 now"
);
if
(
width
==
1
&&
!
transpose_a_
&&
A
->
dtype
()
==
DT_FLOAT16
&&
B
->
dtype
()
==
DT_FLOAT
)
{
auto
*
a_ptr_base
=
A
->
data
<
float16_t
>
();
auto
*
b_ptr_base
=
B
->
data
<
float
>
();
FP16Gemv
(
a_ptr_base
,
b_ptr_base
,
height
,
K
,
c_ptr_base
);
return
MaceStatus
::
MACE_SUCCESS
;
}
else
if
(
height
==
1
&&
transpose_b_
&&
A
->
dtype
()
==
DT_FLOAT
&&
B
->
dtype
()
==
DT_FLOAT16
)
{
auto
*
b_ptr_base
=
B
->
data
<
float16_t
>
();
auto
*
a_ptr_base
=
A
->
data
<
float
>
();
FP16Gemv
(
b_ptr_base
,
a_ptr_base
,
width
,
K
,
c_ptr_base
);
return
MaceStatus
::
MACE_SUCCESS
;
}
else
{
LOG
(
INFO
)
<<
"Matmul fp16 gemv args: "
<<
height
<<
" "
<<
width
<<
" "
<<
transpose_a_
<<
" "
<<
transpose_b_
;
LOG
(
FATAL
)
<<
"Matmul fp16 Op only support fp32[1,k]·fp16[w,k]T or"
" fp16[w,k]·fp32[k,1] now!"
;
return
MaceStatus
::
MACE_INVALID_ARGS
;
}
}
private:
};
#endif // MACE_ENABLE_NEON
void
RegisterMatMul
(
OpRegistryBase
*
op_registry
)
{
MACE_REGISTER_OP
(
op_registry
,
"MatMul"
,
MatMulOp
,
DeviceType
::
CPU
,
float
);
...
...
@@ -518,6 +601,19 @@ void RegisterMatMul(OpRegistryBase *op_registry) {
MACE_REGISTER_OP
(
op_registry
,
"MatMul"
,
MatMulOp
,
DeviceType
::
CPU
,
uint8_t
);
#endif // MACE_ENABLE_QUANTIZE
#ifdef MACE_ENABLE_OPENCL
MACE_REGISTER_OP
(
op_registry
,
"MatMul"
,
MatMulOp
,
DeviceType
::
GPU
,
float
);
MACE_REGISTER_OP
(
op_registry
,
"MatMul"
,
MatMulOp
,
DeviceType
::
GPU
,
half
);
#endif // MACE_ENABLE_OPENCL
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
MACE_REGISTER_OP
(
op_registry
,
"MatMul"
,
MatMulOp
,
DeviceType
::
CPU
,
float16_t
);
#endif // MACE_ENABLE_NEON
}
}
// namespace ops
...
...
mace/ops/matmul_benchmark.cc
浏览文件 @
3e9bb73e
...
...
@@ -263,8 +263,17 @@ void MatMulBenchmark(
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
D
,
T
>
(
"A"
,
{
batch
,
height
,
channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"B"
,
{
batch
,
channels
,
out_width
});
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
if
(
DataTypeToEnum
<
T
>::
value
==
DT_FLOAT16
)
{
net
.
AddRandomInput
<
D
,
float16_t
>
(
"A"
,
{
batch
,
height
,
channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"B"
,
{
batch
,
channels
,
out_width
});
}
else
{
#endif
net
.
AddRandomInput
<
D
,
T
>
(
"A"
,
{
batch
,
height
,
channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"B"
,
{
batch
,
channels
,
out_width
});
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
}
#endif
net
.
GetTensor
(
"A"
)
->
SetIsWeight
(
true
);
net
.
GetTensor
(
"B"
)
->
SetIsWeight
(
true
);
if
(
DataTypeToEnum
<
T
>::
value
==
DT_UINT8
)
{
...
...
@@ -305,8 +314,17 @@ void MatMulTransposeBenchmark(
OpsTestNet
net
;
// Add input data
net
.
AddRandomInput
<
D
,
T
>
(
"A"
,
{
batch
,
height
,
channels
});
net
.
AddRandomInput
<
D
,
T
>
(
"B"
,
{
batch
,
out_width
,
channels
});
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
if
(
DataTypeToEnum
<
T
>::
value
==
DT_FLOAT16
)
{
net
.
AddRandomInput
<
D
,
float
>
(
"A"
,
{
batch
,
height
,
channels
});
net
.
AddRandomInput
<
D
,
float16_t
>
(
"B"
,
{
batch
,
out_width
,
channels
});
}
else
{
#endif
net
.
AddRandomInput
<
D
,
T
>
(
"A"
,
{
batch
,
height
,
channels
});
net
.
AddRandomInput
<
D
,
float
>
(
"B"
,
{
batch
,
out_width
,
channels
});
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
}
#endif
net
.
GetTensor
(
"A"
)
->
SetIsWeight
(
true
);
net
.
GetTensor
(
"B"
)
->
SetIsWeight
(
true
);
if
(
DataTypeToEnum
<
T
>::
value
==
DT_UINT8
)
{
...
...
@@ -381,10 +399,10 @@ void MatMulTransposeBenchmark(
#ifdef MACE_ENABLE_QUANTIZE
#define MACE_BM_MATMUL_TRANPOSE(N, H, C, W) \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU); \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU)
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, uint8_t, CPU)
;
#else
#define MACE_BM_MATMUL_TRANPOSE(N, H, C, W) \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU)
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float, CPU)
;
#endif
MACE_BM_MATMUL_OP
(
1
,
30000
,
256
,
1
);
...
...
@@ -407,6 +425,21 @@ MACE_BM_MATMUL_TRANPOSE(16, 128, 128, 49);
MACE_BM_MATMUL_TRANPOSE
(
16
,
128
,
128
,
961
);
MACE_BM_MATMUL_TRANPOSE
(
16
,
128
,
128
,
3969
);
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
#define MACE_BM_MATMUL_TRANPOSE_FP16(N, H, C, W) \
MACE_BM_MATMUL_TRANSPOSE_MACRO(N, H, C, W, float16_t, CPU);
MACE_BM_MATMUL_TRANPOSE_FP16
(
1
,
1
,
256
,
30000
);
MACE_BM_MATMUL_TRANPOSE_FP16
(
1
,
1
,
256
,
256
);
MACE_BM_MATMUL_TRANPOSE_FP16
(
1
,
1
,
256
,
2048
);
MACE_BM_MATMUL_TRANPOSE_FP16
(
1
,
1
,
2048
,
256
);
MACE_BM_MATMUL_TRANPOSE_FP16
(
1
,
1
,
512
,
30000
);
MACE_BM_MATMUL_TRANPOSE_FP16
(
1
,
1
,
512
,
512
);
MACE_BM_MATMUL_TRANPOSE_FP16
(
1
,
1
,
512
,
2048
);
MACE_BM_MATMUL_TRANPOSE_FP16
(
1
,
1
,
2048
,
512
);
#endif // MACE_ENABLE_NEON
}
// namespace test
}
// namespace ops
}
// namespace mace
mace/ops/matmul_test.cc
浏览文件 @
3e9bb73e
...
...
@@ -330,6 +330,69 @@ void QuantOutputInt32(const std::vector<index_t> &batch,
}
}
// namespace
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
namespace
{
void
FloatOutput16
(
const
std
::
vector
<
index_t
>
&
batch
,
const
index_t
rows
,
const
index_t
depth
,
const
index_t
cols
,
const
bool
transpose_lhs
,
const
bool
transpose_rhs
,
const
bool
lhs_batched
=
true
,
const
bool
rhs_batched
=
true
)
{
// Construct graph
OpsTestNet
net
;
index_t
lhs_rows
=
transpose_lhs
?
depth
:
rows
;
index_t
lhs_cols
=
transpose_lhs
?
rows
:
depth
;
index_t
rhs_rows
=
transpose_rhs
?
cols
:
depth
;
index_t
rhs_cols
=
transpose_rhs
?
depth
:
cols
;
std
::
vector
<
index_t
>
lhs_shape
=
{
lhs_rows
,
lhs_cols
};
std
::
vector
<
index_t
>
rhs_shape
=
{
rhs_rows
,
rhs_cols
};
if
(
lhs_batched
)
{
lhs_shape
.
insert
(
lhs_shape
.
begin
(),
batch
.
begin
(),
batch
.
end
());
}
if
(
rhs_batched
)
{
rhs_shape
.
insert
(
rhs_shape
.
begin
(),
batch
.
begin
(),
batch
.
end
());
}
net
.
AddRandomInput
<
CPU
,
float
>
(
"A"
,
lhs_shape
);
net
.
AddRandomInput
<
CPU
,
float
>
(
"B"
,
rhs_shape
);
OpDefBuilder
(
"MatMul"
,
"MatMulTest"
)
.
Input
(
"A"
)
.
AddIntArg
(
"transpose_a"
,
transpose_lhs
?
1
:
0
)
.
Input
(
"B"
)
.
AddIntArg
(
"transpose_b"
,
transpose_rhs
?
1
:
0
)
.
Output
(
"Output"
)
.
AddIntArg
(
"T"
,
DT_FLOAT
)
.
Finalize
(
net
.
NewOperatorDef
());
net
.
RunOp
(
CPU
);
OpDefBuilder
(
"Cast"
,
"CastTest"
)
.
Input
(
"B"
)
.
Output
(
"HalveB"
)
.
OutputType
({
DT_FLOAT16
})
.
AddIntArg
(
"T"
,
DT_FLOAT
)
.
Finalize
(
net
.
NewOperatorDef
());
net
.
RunOp
();
OpDefBuilder
(
"MatMul"
,
"Float16MatMulTest"
)
.
Input
(
"A"
)
.
AddIntArg
(
"transpose_a"
,
transpose_lhs
?
1
:
0
)
.
Input
(
"HalveB"
)
.
AddIntArg
(
"transpose_b"
,
transpose_rhs
?
1
:
0
)
.
Output
(
"Float16Output"
)
.
AddIntArg
(
"T"
,
DT_FLOAT16
)
.
OutputType
({
DT_FLOAT
})
.
Finalize
(
net
.
NewOperatorDef
());
net
.
RunOp
();
// Check
ExpectTensorSimilar
<
float
>
(
*
net
.
GetOutput
(
"Output"
),
*
net
.
GetTensor
(
"Float16Output"
),
0.01
);
}
}
// namespace
#endif // MACE_ENABLE_NEON
TEST_F
(
MatMulOpTest
,
QuantOutputUint8
)
{
QuantOutputUint8
({
1
},
64
,
128
,
32
,
false
,
false
);
QuantOutputUint8
({
1
},
64
,
32
,
128
,
false
,
false
);
...
...
@@ -381,6 +444,19 @@ TEST_F(MatMulOpTest, QuantOutputInt32) {
QuantOutputInt32
({
2
,
3
},
31
,
61
,
67
,
true
,
true
,
false
,
true
);
}
#if defined(MACE_ENABLE_NEON) && defined(__ANDROID__)
TEST_F
(
MatMulOpTest
,
FloatOutput16
)
{
FloatOutput16
({
1
},
1
,
512
,
30745
,
false
,
true
,
false
,
false
);
FloatOutput16
({
1
},
1
,
256
,
30000
,
false
,
true
,
false
,
false
);
FloatOutput16
({
1
},
1
,
256
,
2048
,
false
,
true
,
false
,
false
);
FloatOutput16
({
1
},
1
,
2048
,
256
,
false
,
true
,
false
,
false
);
FloatOutput16
({
1
},
1
,
512
,
30000
,
false
,
true
,
false
,
false
);
FloatOutput16
({
1
},
1
,
512
,
512
,
false
,
true
,
false
,
false
);
FloatOutput16
({
1
},
1
,
512
,
2048
,
false
,
true
,
false
,
false
);
FloatOutput16
({
1
},
1
,
2048
,
512
,
false
,
true
,
false
,
false
);
}
#endif // MACE_ENABLE_NEON
}
// namespace test
}
// namespace ops
}
// namespace mace
mace/ops/ops_test_util.h
浏览文件 @
3e9bb73e
...
...
@@ -162,7 +162,8 @@ class OpsTestNet {
std
::
random_device
rd
;
std
::
mt19937
gen
(
rd
());
std
::
normal_distribution
<
float
>
nd
(
0
,
1
);
if
(
DataTypeToEnum
<
T
>::
value
==
DT_HALF
)
{
if
(
DataTypeToEnum
<
T
>::
value
==
DT_HALF
||
DataTypeToEnum
<
T
>::
value
==
DT_FLOAT16
)
{
std
::
generate
(
input_data
,
input_data
+
input
->
size
(),
[
&
gen
,
&
nd
,
positive
,
truncate
,
truncate_min
,
truncate_max
]
{
...
...
mace/proto/mace.proto
浏览文件 @
3e9bb73e
...
...
@@ -13,6 +13,7 @@ enum DataType {
DT_UINT8
=
2
;
DT_HALF
=
3
;
DT_INT32
=
4
;
DT_FLOAT16
=
5
;
}
enum
MemoryType
{
...
...
mace/python/tools/converter_tool/base_converter.py
浏览文件 @
3e9bb73e
...
...
@@ -314,6 +314,8 @@ class TransformerRule(Enum):
TRANSFORM_CHANNEL_SHUFFLE
=
38
UPDATE_DATA_FORMAT
=
39
QUANTIZE_SPECIFIC_OPS_ONLY
=
40
FP16_MATMUL_WEIGHT
=
41
FP16_GATHER_WEIGHT
=
42
class
ConverterInterface
(
object
):
...
...
mace/python/tools/converter_tool/transformer.py
浏览文件 @
3e9bb73e
...
...
@@ -106,6 +106,10 @@ class Transformer(base_converter.ConverterInterface):
self
.
transform_channel_shuffle
,
TransformerRule
.
QUANTIZE_SPECIFIC_OPS_ONLY
:
self
.
quantize_specific_ops_only
,
TransformerRule
.
FP16_MATMUL_WEIGHT
:
self
.
fp16_matmul_weight
,
TransformerRule
.
FP16_GATHER_WEIGHT
:
self
.
fp16_gather_weight
,
}
self
.
_option
=
option
...
...
@@ -1853,6 +1857,102 @@ class Transformer(base_converter.ConverterInterface):
op
.
quantize_info
[
i
].
minval
,
op
.
quantize_info
[
i
].
maxval
))
def
fp16_gather_weight
(
self
):
for
op
in
self
.
_model
.
op
:
if
op
.
type
!=
MaceOp
.
Gather
.
name
:
continue
if
op
.
input
[
0
]
not
in
self
.
_consts
:
raise
KeyError
(
"Not in const tensor: "
+
str
(
op
.
input
[
0
]))
const_tensor
=
self
.
_consts
[
op
.
input
[
0
]]
if
const_tensor
.
data_type
==
mace_pb2
.
DT_FLOAT16
:
print
(
str
(
const_tensor
.
name
)
+
" is alreay float16"
)
continue
print
(
"FP16 Embedding Lookup Weights: %s"
%
const_tensor
.
name
)
op_outputs
=
[
x
for
x
in
op
.
output
]
new_gather_name
=
op
.
name
+
'_fp16'
new_gather_output_name
=
new_gather_name
+
":0"
dehalve_name
=
op
.
name
# fp16 weights
const_tensor
.
data_type
=
mace_pb2
.
DT_FLOAT16
# change gather
op
.
name
=
new_gather_name
op
.
output
[:]
=
[
new_gather_output_name
]
# op.output.extend([new_gather_output_name])
data_type_arg
=
ConverterUtil
.
get_arg
(
op
,
MaceKeyword
.
mace_op_data_type_str
)
# noqa
if
data_type_arg
is
None
:
data_type_arg
=
op
.
arg
.
add
()
data_type_arg
.
name
=
MaceKeyword
.
mace_op_data_type_str
data_type_arg
.
i
=
mace_pb2
.
DT_FLOAT16
# add dehalve
dehalve_op
=
self
.
_model
.
op
.
add
()
dehalve_op
.
name
=
dehalve_name
dehalve_op
.
type
=
MaceOp
.
Cast
.
name
dehalve_op
.
input
.
extend
([
new_gather_output_name
])
dehalve_op
.
output
.
extend
(
op_outputs
)
dehalve_op
.
output_shape
.
extend
(
op
.
output_shape
)
dehalve_op
.
output_type
.
extend
([
mace_pb2
.
DT_FLOAT
])
data_type_arg
=
dehalve_op
.
arg
.
add
()
data_type_arg
.
name
=
MaceKeyword
.
mace_op_data_type_str
data_type_arg
.
i
=
mace_pb2
.
DT_FLOAT16
def
fp16_matmul_weight
(
self
):
if
self
.
_option
.
device
!=
DeviceType
.
CPU
.
value
:
return
print
(
'Convert matmul weights to fp16 for specific matmul: activation + weights'
)
# noqa
for
op
in
self
.
_model
.
op
:
if
op
.
type
!=
MaceOp
.
MatMul
.
name
:
continue
if
op
.
input
[
0
]
not
in
self
.
_consts
and
op
.
input
[
1
]
not
in
self
.
_consts
:
# noqa
continue
if
op
.
input
[
0
]
in
self
.
_consts
and
op
.
input
[
1
]
in
self
.
_consts
:
continue
# Matmul fp16 Op only support fp32[1,k] x fp16[w,k]T or fp16[w,k] x fp32[k,1] now! # noqa
transpose_a_arg
=
ConverterUtil
.
get_arg
(
op
,
MaceKeyword
.
mace_transpose_a_str
)
# noqa
transpose_b_arg
=
ConverterUtil
.
get_arg
(
op
,
MaceKeyword
.
mace_transpose_b_str
)
# noqa
transpose_a
=
transpose_a_arg
is
not
None
and
transpose_a_arg
.
i
==
1
# noqa
transpose_b
=
transpose_b_arg
is
not
None
and
transpose_b_arg
.
i
==
1
# noqa
left_tensor
=
op
.
input
[
0
]
right_tensor
=
op
.
input
[
1
]
left_shape
=
self
.
get_tensor_shape
(
left_tensor
)
right_shape
=
self
.
get_tensor_shape
(
right_tensor
)
height
=
left_shape
[
-
1
]
if
transpose_a
else
left_shape
[
-
2
]
width
=
right_shape
[
-
2
]
if
transpose_b
else
right_shape
[
-
1
]
batch
=
reduce
(
lambda
x
,
y
:
x
*
y
,
left_shape
[:
-
2
],
1
)
if
batch
!=
1
:
continue
if
left_tensor
in
self
.
_consts
:
if
width
!=
1
or
transpose_a
:
continue
const_tensor
=
self
.
_consts
[
left_tensor
]
else
:
if
height
!=
1
or
not
transpose_b
:
continue
const_tensor
=
self
.
_consts
[
right_tensor
]
print
(
'Convert Matmul Weights to fp16: %s'
%
op
.
name
)
const_tensor
.
data_type
=
mace_pb2
.
DT_FLOAT16
data_type_arg
=
ConverterUtil
.
get_arg
(
op
,
MaceKeyword
.
mace_op_data_type_str
)
# noqa
if
data_type_arg
is
None
:
data_type_arg
=
op
.
arg
.
add
()
data_type_arg
.
name
=
MaceKeyword
.
mace_op_data_type_str
data_type_arg
.
i
=
mace_pb2
.
DT_FLOAT16
op
.
output_type
.
extend
([
mace_pb2
.
DT_FLOAT
])
def
add_opencl_informations
(
self
):
print
(
"Add OpenCL informations"
)
...
...
@@ -2052,10 +2152,7 @@ class Transformer(base_converter.ConverterInterface):
data_type_arg
=
quantize_op
.
arg
.
add
()
data_type_arg
.
name
=
MaceKeyword
.
mace_non_zero
if
non_zero
:
data_type_arg
.
i
=
1
else
:
data_type_arg
.
i
=
0
data_type_arg
.
i
=
0
find_range_arg
=
quantize_op
.
arg
.
add
()
find_range_arg
.
name
=
\
...
...
@@ -2064,8 +2161,6 @@ class Transformer(base_converter.ConverterInterface):
quantized_inputs_names
[
-
1
]
=
quantize_output_name
non_zero
=
False
del
op
.
input
[:]
op
.
input
.
extend
(
quantized_inputs_names
)
...
...
mace/python/tools/model_saver.py
浏览文件 @
3e9bb73e
...
...
@@ -117,6 +117,9 @@ class TensorInfo:
elif
tensor
.
data_type
==
mace_pb2
.
DT_UINT8
:
self
.
data
=
bytearray
(
np
.
array
(
tensor
.
int32_data
).
astype
(
np
.
uint8
).
tolist
())
elif
tensor
.
data_type
==
mace_pb2
.
DT_FLOAT16
:
self
.
data
=
bytearray
(
np
.
array
(
tensor
.
float_data
).
astype
(
np
.
float16
).
tobytes
())
else
:
raise
Exception
(
'Tensor data type %s not supported'
%
tensor
.
data_type
)
...
...
@@ -139,7 +142,8 @@ def update_tensor_infos(net_def, data_type):
offset
+=
padding
if
tensor
.
data_type
==
mace_pb2
.
DT_FLOAT
\
or
tensor
.
data_type
==
mace_pb2
.
DT_HALF
:
or
tensor
.
data_type
==
mace_pb2
.
DT_HALF
\
or
tensor
.
data_type
==
mace_pb2
.
DT_FLOAT16
:
tensor
.
data_size
=
len
(
tensor
.
float_data
)
elif
tensor
.
data_type
==
mace_pb2
.
DT_INT32
:
tensor
.
data_size
=
len
(
tensor
.
int32_data
)
...
...
@@ -178,7 +182,8 @@ def save_model_data(net_def, model_tag, output_dir):
def
save_model_to_proto
(
net_def
,
model_tag
,
output_dir
):
for
tensor
in
net_def
.
tensors
:
if
tensor
.
data_type
==
mace_pb2
.
DT_FLOAT
\
or
tensor
.
data_type
==
mace_pb2
.
DT_HALF
:
or
tensor
.
data_type
==
mace_pb2
.
DT_HALF
\
or
tensor
.
data_type
==
mace_pb2
.
DT_FLOAT16
:
del
tensor
.
float_data
[:]
elif
tensor
.
data_type
==
mace_pb2
.
DT_INT32
:
del
tensor
.
int32_data
[:]
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录