慢慢CG / Mace (forked from Xiaomi / Mace)
Commit ddc1a005
Authored Aug 22, 2018 by 李寅

Merge branch 'gemmlowp' into 'master'

Add quantized fully connected

See merge request !756

Parents: 56e814b6 092bd2b7
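All six files below build on the affine quantization scheme that gemmlowp assumes: a uint8 tensor carries a float scale and an integer zero point, with

    real_value = scale * (quantized_value - zero_point)

A product of two quantized tensors therefore accumulates in int32 and has to be rescaled by lhs_scale * rhs_scale / output_scale before it can be stored as uint8 again; the GemmlowpOutputPipeline introduced in this commit performs exactly that rescaling.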
Showing 6 changed files with 245 additions and 34 deletions (+245 −34).
mace/kernels/conv_2d.h                 +1   −29
mace/kernels/fully_connected.h         +66  −4
mace/kernels/gemmlowp_util.h           +32  −0
mace/ops/fully_connected.cc            +5   −0
mace/ops/fully_connected_benchmark.cc  +39  −1
mace/ops/fully_connected_test.cc       +102 −0
mace/kernels/conv_2d.h

@@ -822,34 +822,6 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
     }
   }
 
-  typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col>
-      ColVectorMap;
-  typedef std::tuple<
-      gemmlowp::OutputStageBiasAddition<ColVectorMap>,
-      gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
-      gemmlowp::OutputStageSaturatingCastToUint8> Pipeline;
-
-  inline Pipeline MakeOutputPipeline(const int32_t *bias_data,
-                                     const index_t channels,
-                                     const float lhs_scale,
-                                     const float rhs_scale,
-                                     const float output_scale,
-                                     const int32_t output_zero_point) {
-    ColVectorMap bias_vector(bias_data, channels);
-    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
-    bias_addition_stage.bias_vector = bias_vector;
-    int32_t quantized_multiplier;
-    int32_t right_shift;
-    GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
-                                &quantized_multiplier, &right_shift);
-    gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
-        quantize_down_stage;
-    quantize_down_stage.result_offset_after_shift = output_zero_point;
-    quantize_down_stage.result_fixedpoint_multiplier = quantized_multiplier;
-    quantize_down_stage.result_shift = right_shift;
-    gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
-    return std::make_tuple(bias_addition_stage, quantize_down_stage,
-                           saturating_cast_stage);
-  }
-
   MaceStatus operator()(const Tensor *input,   // NHWC
                         const Tensor *filter,  // OHWI
                         const Tensor *bias,

@@ -959,7 +931,7 @@ struct Conv2dFunctor<DeviceType::CPU, uint8_t> : Conv2dFunctorBase {
     gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor>
         output_matrix(output_data, gemm_output_rows, gemm_output_cols);
 
-    const auto &output_pipeline = MakeOutputPipeline(
+    const auto &output_pipeline = GemmlowpOutputPipeline::Make(
         bias_data, channels, filter->scale(), input->scale(),
         output->scale(), output->zero_point());
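The output pipeline assembled here chains three gemmlowp stages: per-channel bias addition, fixed-point requantization, and a saturating cast to uint8. As a rough scalar model of what those stages do to each int32 accumulator (illustrative only: gemmlowp's stages are vectorized and use saturating rounding-doubling-high-multiply semantics, so rounding details differ), with RequantizeSketch as a hypothetical helper rather than MACE or gemmlowp code:

#include <algorithm>
#include <cstdint>

// Scalar model of the three-stage output pipeline.
// acc: one int32 GEMM accumulator; bias: the per-channel bias value.
uint8_t RequantizeSketch(int32_t acc, int32_t bias,
                         int32_t quantized_multiplier,  // Q0.31 fixed point
                         int32_t right_shift,
                         int32_t output_zero_point) {
  // Stage 1: OutputStageBiasAddition.
  int64_t v = static_cast<int64_t>(acc) + bias;
  // Stage 2: OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint.
  // Multiply by quantized_multiplier / 2^31, apply a rounding right
  // shift, then add the output zero point.
  v = (v * quantized_multiplier + (1ll << 30)) >> 31;
  if (right_shift > 0) {
    v = (v + (1ll << (right_shift - 1))) >> right_shift;
  }
  v += output_zero_point;
  // Stage 3: OutputStageSaturatingCastToUint8.
  return static_cast<uint8_t>(std::min<int64_t>(255, std::max<int64_t>(0, v)));
}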
mace/kernels/fully_connected.h

@@ -22,6 +22,7 @@
 #include "mace/core/tensor.h"
 #include "mace/kernels/activation.h"
 #include "mace/kernels/gemm.h"
+#include "mace/kernels/gemmlowp_util.h"
 
 namespace mace {
 namespace kernels {

@@ -46,10 +47,10 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
       : FullyConnectedBase(activation, relux_max_limit) {}
 
   MaceStatus operator()(const Tensor *input,
-                       const Tensor *weight,
-                       const Tensor *bias,
-                       Tensor *output,
-                       StatsFuture *future) {
+                        const Tensor *weight,
+                        const Tensor *bias,
+                        Tensor *output,
+                        StatsFuture *future) {
     MACE_UNUSED(future);
     std::vector<index_t> output_shape = {input->dim(0), weight->dim(0), 1, 1};
     MACE_RETURN_IF_ERROR(output->Resize(output_shape));

@@ -83,6 +84,67 @@ struct FullyConnectedFunctor<DeviceType::CPU, float>: FullyConnectedBase {
   }
 };
 
+template <>
+struct FullyConnectedFunctor<DeviceType::CPU, uint8_t>: FullyConnectedBase {
+  FullyConnectedFunctor(const ActivationType activation,
+                        const float relux_max_limit)
+      : FullyConnectedBase(activation, relux_max_limit) {}
+
+  MaceStatus operator()(const Tensor *input,
+                        const Tensor *weight,
+                        const Tensor *bias,
+                        Tensor *output,
+                        StatsFuture *future) {
+    MACE_UNUSED(future);
+    gemmlowp::GemmContext& gemm_context = GetGemmlowpContext();
+
+    std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
+    MACE_RETURN_IF_ERROR(output->Resize(output_shape));
+    const int N = static_cast<int>(output->dim(0));
+    const int input_size =
+        static_cast<int>(weight->dim(1) * weight->dim(2) * weight->dim(3));
+    const int output_size = static_cast<int>(weight->dim(0));
+
+    Tensor::MappingGuard guard_input(input);
+    Tensor::MappingGuard guard_weight(weight);
+    Tensor::MappingGuard guard_output(output);
+    auto input_ptr = input->data<uint8_t>();
+    auto weight_ptr = weight->data<uint8_t>();
+    auto output_ptr = output->mutable_data<uint8_t>();
+
+    std::vector<index_t> bias_shape{output_size};
+    std::unique_ptr<Tensor> zero_bias;
+    const int32_t *bias_ptr = nullptr;
+    if (bias == nullptr) {
+      zero_bias.reset(
+          new Tensor(GetDeviceAllocator(DeviceType::CPU), DT_INT32));
+      zero_bias->Resize(bias_shape);
+      zero_bias->Clear();
+      bias_ptr = zero_bias->data<int32_t>();
+    } else {
+      bias_ptr = bias->data<int32_t>();
+    }
+
+    gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor>
+        weight_matrix(weight_ptr, output_size, input_size);
+    gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor>
+        input_matrix(input_ptr, input_size, N);
+    gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::ColMajor>
+        output_matrix(output_ptr, output_size, N);
+
+    const auto &output_pipeline = GemmlowpOutputPipeline::Make(
+        bias_ptr, output_size, weight->scale(), input->scale(),
+        output->scale(), output->zero_point());
+
+    using BitDepthParams = gemmlowp::L8R8WithLhsNonzeroBitDepthParams;
+    gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, BitDepthParams>(
+        &gemm_context, weight_matrix, input_matrix, &output_matrix,
+        -weight->zero_point(), -input->zero_point(), output_pipeline);
+
+    return MACE_SUCCESS;
+  }
+};
+
 #ifdef MACE_ENABLE_OPENCL
 template <typename T>
 struct FullyConnectedFunctor<DeviceType::GPU, T> : FullyConnectedBase {
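In this functor the weight matrix (output_size x input_size, row-major) is the GEMM lhs and the flattened input batch (input_size x N, column-major) is the rhs; passing -weight->zero_point() and -input->zero_point() makes gemmlowp subtract each tensor's zero point from its elements before multiplying. A scalar reference of the accumulation under those layout assumptions (QuantFCAccumulate is a hypothetical helper for intuition, not MACE code):

#include <cstdint>
#include <vector>

// Computes the int32 accumulators that gemmlowp produces before its
// output pipeline runs: sum over k of (w - w_zp) * (x - x_zp).
std::vector<int32_t> QuantFCAccumulate(const std::vector<uint8_t> &weight,
                                       const std::vector<uint8_t> &input,
                                       int output_size, int input_size,
                                       int batch, int32_t weight_zp,
                                       int32_t input_zp) {
  std::vector<int32_t> acc(static_cast<size_t>(output_size) * batch, 0);
  for (int o = 0; o < output_size; ++o) {
    for (int b = 0; b < batch; ++b) {
      int32_t sum = 0;
      for (int k = 0; k < input_size; ++k) {
        // weight is row-major (o, k); input is column-major (k, b).
        sum += (static_cast<int32_t>(weight[o * input_size + k]) - weight_zp) *
               (static_cast<int32_t>(input[b * input_size + k]) - input_zp);
      }
      acc[b * output_size + o] = sum;  // output is column-major (o, b)
    }
  }
  return acc;
}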
mace/kernels/gemmlowp_util.h

@@ -15,12 +15,44 @@
 #ifndef MACE_KERNELS_GEMMLOWP_UTIL_H_
 #define MACE_KERNELS_GEMMLOWP_UTIL_H_
 
+#include <tuple>
+
 #include "public/gemmlowp.h"
+#include "mace/kernels/quantize.h"
 
 namespace mace {
 
 gemmlowp::GemmContext& GetGemmlowpContext();
 
+struct GemmlowpOutputPipeline {
+  typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col>
+      ColVectorMap;
+  typedef std::tuple<
+      gemmlowp::OutputStageBiasAddition<ColVectorMap>,
+      gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
+      gemmlowp::OutputStageSaturatingCastToUint8> Pipeline;
+
+  static Pipeline Make(const int32_t *bias_data,
+                       const index_t channels,
+                       const float lhs_scale,
+                       const float rhs_scale,
+                       const float output_scale,
+                       const int32_t output_zero_point) {
+    ColVectorMap bias_vector(bias_data, channels);
+    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
+    bias_addition_stage.bias_vector = bias_vector;
+    int32_t quantized_multiplier;
+    int32_t right_shift;
+    kernels::GetOutputMultiplierAndShift(lhs_scale, rhs_scale, output_scale,
+                                         &quantized_multiplier, &right_shift);
+    gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint
+        quantize_down_stage;
+    quantize_down_stage.result_offset_after_shift = output_zero_point;
+    quantize_down_stage.result_fixedpoint_multiplier = quantized_multiplier;
+    quantize_down_stage.result_shift = right_shift;
+    gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
+    return std::make_tuple(bias_addition_stage, quantize_down_stage,
+                           saturating_cast_stage);
+  }
+};
+
 }  // namespace mace
 
 #endif  // MACE_KERNELS_GEMMLOWP_UTIL_H_
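GemmlowpOutputPipeline::Make relies on kernels::GetOutputMultiplierAndShift, which is why mace/kernels/quantize.h is now included, to express the real rescale factor lhs_scale * rhs_scale / output_scale as an integer multiplier plus a right shift. That function is not part of this diff; the following is a minimal sketch of the standard frexp-based derivation (names and edge-case handling are illustrative, not MACE's exact code):

#include <cassert>
#include <cmath>
#include <cstdint>

// Sketch: decompose real_multiplier = lhs_scale * rhs_scale / output_scale
// (assumed to lie in (0, 1)) into a Q0.31 fixed-point multiplier and a
// non-negative right shift, so that
//   real_multiplier ~= quantized_multiplier * 2^-31 * 2^-right_shift.
void GetOutputMultiplierAndShiftSketch(float lhs_scale, float rhs_scale,
                                       float output_scale,
                                       int32_t *quantized_multiplier,
                                       int32_t *right_shift) {
  const double real_multiplier =
      static_cast<double>(lhs_scale) * rhs_scale / output_scale;
  assert(real_multiplier > 0.0 && real_multiplier < 1.0);

  // Decompose real_multiplier = mantissa * 2^exponent, mantissa in [0.5, 1).
  int exponent = 0;
  const double mantissa = std::frexp(real_multiplier, &exponent);

  int64_t q = static_cast<int64_t>(std::llround(mantissa * (1ll << 31)));
  if (q == (1ll << 31)) {  // mantissa rounded up to 1.0; renormalize
    q /= 2;
    ++exponent;
  }
  *quantized_multiplier = static_cast<int32_t>(q);
  *right_shift = -exponent;  // exponent <= 0 because real_multiplier < 1
}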
mace/ops/fully_connected.cc

@@ -24,6 +24,11 @@ void Register_FullyConnected(OperatorRegistryBase *op_registry) {
                         .Build(),
                     FullyConnectedOp<DeviceType::CPU, float>);
 
+  MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
+                                          .Device(DeviceType::CPU)
+                                          .TypeConstraint<uint8_t>("T")
+                                          .Build(),
+                         FullyConnectedOp<DeviceType::CPU, uint8_t>);
+
 #ifdef MACE_ENABLE_OPENCL
   MACE_REGISTER_OPERATOR(op_registry, OpKeyBuilder("FullyConnected")
                                           .Device(DeviceType::GPU)
mace/ops/fully_connected_benchmark.cc

@@ -80,6 +80,43 @@ void FCBenchmark(
   }
   net.Sync();
 }
 
+template <>
+void FCBenchmark<CPU, uint8_t>(int iters, int batch, int height, int width,
+                               int channel, int out_channel) {
+  mace::testing::StopTiming();
+
+  OpsTestNet net;
+
+  // Add input data
+  net.AddRandomInput<CPU, uint8_t>("Input", {batch, height, width, channel});
+  net.GetTensor("Input")->SetScale(0.1);
+  net.AddRandomInput<CPU, uint8_t>(
+      "Weight", {out_channel, height, width, channel});
+  net.GetTensor("Weight")->SetScale(0.1);
+  net.AddRandomInput<CPU, uint8_t>("Bias", {out_channel});
+
+  OpDefBuilder("FullyConnected", "FullyConnectedTest")
+      .Input("Input")
+      .Input("Weight")
+      .Input("Bias")
+      .Output("Output")
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+
+  net.Setup(CPU);
+  net.GetTensor("Output")->SetScale(0.1);
+
+  // Warm-up
+  for (int i = 0; i < 2; ++i) {
+    net.Run();
+  }
+
+  mace::testing::StartTiming();
+  while (iters--) {
+    net.Run();
+  }
+}
 }  // namespace
 
 #define MACE_BM_FC_MACRO(N, H, W, C, OC, TYPE, DEVICE) \

@@ -98,7 +135,8 @@ void FCBenchmark(
 #define MACE_BM_FC(N, H, W, C, OC)                 \
   MACE_BM_FC_MACRO(N, H, W, C, OC, float, CPU);    \
   MACE_BM_FC_MACRO(N, H, W, C, OC, float, GPU);    \
-  MACE_BM_FC_MACRO(N, H, W, C, OC, half, GPU);
+  MACE_BM_FC_MACRO(N, H, W, C, OC, half, GPU);     \
+  MACE_BM_FC_MACRO(N, H, W, C, OC, uint8_t, CPU);
 
 MACE_BM_FC(1, 16, 16, 32, 32);
 MACE_BM_FC(1, 8, 8, 32, 1000);
mace/ops/fully_connected_test.cc

@@ -15,6 +15,7 @@
 #include <fstream>
 
 #include "mace/core/operator.h"
+#include "mace/kernels/quantize.h"
 #include "mace/ops/ops_test_util.h"
 
 namespace mace {

@@ -216,6 +217,107 @@ TEST_F(FullyConnectedOpTest, ComplexHalfWidthFormatAligned) {
   Random<half>(1, 14, 14, 13, 23);
 }
 
+namespace {
+void QuantRandom(const index_t batch,
+                 const index_t height,
+                 const index_t width,
+                 const index_t channels,
+                 const index_t out_channel) {
+  // Construct graph
+  OpsTestNet net;
+
+  // Add input data
+  net.AddRandomInput<CPU, float>("Input", {batch, height, width, channels});
+  net.AddRandomInput<CPU, float>(
+      "Weight", {out_channel, height, width, channels});
+  net.AddRandomInput<CPU, float>("Bias", {out_channel});
+
+  net.TransformDataFormat<CPU, float>("Input", NHWC, "InputNCHW", NCHW);
+  net.TransformDataFormat<CPU, float>("Weight", OHWI, "WeightOIHW", OIHW);
+
+  OpDefBuilder("FullyConnected", "FullyConnectedTest")
+      .Input("InputNCHW")
+      .Input("WeightOIHW")
+      .Input("Bias")
+      .Output("OutputNCHW")
+      .AddIntArg("T", DT_FLOAT)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+  net.TransformDataFormat<CPU, float>("OutputNCHW", NCHW, "Output", NHWC);
+
+  OpDefBuilder("Quantize", "QuantizeWeight")
+      .Input("Weight")
+      .Output("QuantizedWeight")
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .AddIntArg("non_zero", true)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  OpDefBuilder("Quantize", "QuantizeInput")
+      .Input("Input")
+      .Output("QuantizedInput")
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .AddIntArg("non_zero", true)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  OpDefBuilder("Quantize", "QuantizeOutput")
+      .Input("Output")
+      .Output("ExpectedQuantizedOutput")
+      .OutputType({DT_UINT8})
+      .AddIntArg("T", DT_UINT8)
+      .AddIntArg("non_zero", true)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  Tensor *q_weight = net.GetTensor("QuantizedWeight");
+  Tensor *q_input = net.GetTensor("QuantizedInput");
+  Tensor *bias = net.GetTensor("Bias");
+  auto bias_data = bias->data<float>();
+  std::vector<int32_t> q_bias(bias->size());
+  kernels::QuantizeWithScaleAndZeropoint(
+      bias_data, bias->size(), q_input->scale() * q_weight->scale(), 0,
+      q_bias.data());
+  net.AddInputFromArray<DeviceType::CPU, int32_t>(
+      "QuantizedBias", {out_channel}, q_bias);
+
+  OpDefBuilder("FullyConnected", "QuantizeFullyConnectedTest")
+      .Input("QuantizedInput")
+      .Input("QuantizedWeight")
+      .Input("QuantizedBias")
+      .Output("QuantizedOutput")
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.Setup(DeviceType::CPU);
+  Tensor *eq_output = net.GetTensor("ExpectedQuantizedOutput");
+  Tensor *q_output = net.GetTensor("QuantizedOutput");
+  q_output->SetScale(eq_output->scale());
+  q_output->SetZeroPoint(eq_output->zero_point());
+  net.Run();
+
+  OpDefBuilder("Dequantize", "DeQuantizeTest")
+      .Input("QuantizedOutput")
+      .Output("DequantizedOutput")
+      .OutputType({DT_FLOAT})
+      .AddIntArg("T", DT_UINT8)
+      .Finalize(net.NewOperatorDef());
+  net.RunOp();
+
+  // Check
+  ExpectTensorSimilar<float>(*net.GetOutput("Output"),
+                             *net.GetTensor("DequantizedOutput"), 0.01);
+}
+}  // namespace
+
+TEST_F(FullyConnectedOpTest, Quant) {
+  QuantRandom(1, 16, 16, 32, 16);
+  QuantRandom(1, 7, 7, 32, 16);
+  QuantRandom(1, 7, 7, 512, 128);
+  QuantRandom(1, 1, 1, 2048, 1024);
+}
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace
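QuantRandom validates the quantized path end to end: it runs the float FC, quantizes the input, weight, and reference output, runs the uint8 FC with the expected output scale and zero point, dequantizes, and requires the result to match the float output within a 0.01 similarity tolerance. For intuition, the affine conversions that the Quantize and Dequantize ops perform are, in scalar form, roughly the following (a sketch of the standard scheme with hypothetical helper names, not MACE's exact kernels):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Affine quantization: map a real value to uint8 given scale and zero point.
uint8_t QuantizeOne(float real, float scale, int32_t zero_point) {
  const int32_t q =
      zero_point + static_cast<int32_t>(std::lround(real / scale));
  return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, q)));
}

// Dequantization: recover an approximate real value from its uint8 code.
float DequantizeOne(uint8_t q, float scale, int32_t zero_point) {
  return scale * (static_cast<int32_t>(q) - zero_point);
}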