Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
6ce11736
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
6ce11736
编写于
12月 06, 2018
作者:
J
Jiaying Zhao
提交者:
GitHub
12月 06, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1336 from wzzju/add_fusion_fc_int8_op
add fusion fc int8_t op and its UT.
上级
ce969f68
3dbf9966
变更
20
隐藏空白更改
内联
并排
Showing
20 changed files
with
696 additions
and
262 deletions
+696
-262
src/common/types.cpp
src/common/types.cpp
+3
-1
src/common/types.h
src/common/types.h
+1
-0
src/operators/fusion_conv_add_relu_int8_op.h
src/operators/fusion_conv_add_relu_int8_op.h
+7
-7
src/operators/fusion_fc_int8_op.cpp
src/operators/fusion_fc_int8_op.cpp
+61
-0
src/operators/fusion_fc_int8_op.h
src/operators/fusion_fc_int8_op.h
+50
-0
src/operators/kernel/arm/fusion_fc_kernel.cpp
src/operators/kernel/arm/fusion_fc_kernel.cpp
+18
-1
src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
...perators/kernel/central-arm-func/conv_add_relu_arm_func.h
+1
-2
src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
+30
-17
src/operators/math/gemm.cpp
src/operators/math/gemm.cpp
+0
-2
src/operators/math/gemm.h
src/operators/math/gemm.h
+48
-14
src/operators/math/gemm_int8.cpp
src/operators/math/gemm_int8.cpp
+137
-15
src/operators/math/math_function.h
src/operators/math/math_function.h
+1
-1
src/operators/math/math_function_int8.cpp
src/operators/math/math_function_int8.cpp
+10
-10
src/operators/op_param.h
src/operators/op_param.h
+14
-2
test/common/test_gemm_accuracy.cpp
test/common/test_gemm_accuracy.cpp
+7
-5
test/common/test_gemm_int8_accuracy.cpp
test/common/test_gemm_int8_accuracy.cpp
+98
-41
test/common/test_gemm_perf.cpp
test/common/test_gemm_perf.cpp
+56
-14
test/operators/test_fusion_conv_add_relu_int8_op.cpp
test/operators/test_fusion_conv_add_relu_int8_op.cpp
+7
-3
test/operators/test_fusion_fc_op.cpp
test/operators/test_fusion_fc_op.cpp
+143
-127
tools/op.cmake
tools/op.cmake
+4
-0
未找到文件。
src/common/types.cpp
浏览文件 @
6ce11736
...
...
@@ -32,6 +32,7 @@ const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU = "fusion_conv_bn_add_relu";
const
char
*
G_OP_TYPE_FUSION_DWCONV_BN_RELU
=
"fusion_dwconv_bn_relu"
;
const
char
*
G_OP_TYPE_FUSION_CONV_BN_RELU
=
"fusion_conv_bn_relu"
;
const
char
*
G_OP_TYPE_FC
=
"fusion_fc"
;
const
char
*
G_OP_TYPE_FC_INT8
=
"fusion_fc_int8"
;
const
char
*
G_OP_TYPE_FUSION_CONV_ADD
=
"fusion_conv_add"
;
const
char
*
G_OP_TYPE_LRN
=
"lrn"
;
const
char
*
G_OP_TYPE_MUL
=
"mul"
;
...
...
@@ -111,12 +112,13 @@ std::unordered_map<
{
G_OP_TYPE_MULTICLASS_NMS
,
{{
"BBoxes"
,
"Scores"
},
{
"Out"
}}},
{
G_OP_TYPE_POLYGON_BOX_TRANSFORM
,
{{
"Input"
},
{
"Output"
}}},
{
G_OP_TYPE_FC
,
{{
"X"
,
"Y"
,
"Z"
},
{
"Out"
}}},
{
G_OP_TYPE_FC_INT8
,
{{
"X"
,
"Y"
,
"Z"
,
"Scale"
},
{
"Out"
}}},
{
G_OP_TYPE_RESHAPE
,
{{
"X"
},
{
"Out"
}}},
{
G_OP_TYPE_RESHAPE2
,
{{
"X"
},
{
"Out"
,
"XShape"
}}},
{
G_OP_TYPE_DEPTHWISE_CONV
,
{{
"Input"
},
{
"Output"
}}},
{
G_OP_TYPE_FILL_CONSTANT
,
{{},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_RELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8
,
{{
"Input"
,
"Scale"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_PRELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_IM2SEQUENCE
,
{{
"X"
},
{
"Out"
}}},
...
...
src/common/types.h
浏览文件 @
6ce11736
...
...
@@ -103,6 +103,7 @@ extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_PRELU
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU
;
extern
const
char
*
G_OP_TYPE_FC
;
extern
const
char
*
G_OP_TYPE_FC_INT8
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_BN_RELU
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_BN_ADD_RELU
;
...
...
src/operators/fusion_conv_add_relu_int8_op.h
浏览文件 @
6ce11736
...
...
@@ -22,19 +22,19 @@ namespace paddle_mobile {
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
FusionConvAddReluInt8Op
:
public
framework
::
OperatorWithKernel
<
DeviceType
,
FusionConvAddReluParam
<
DeviceType
>
,
operators
::
ConvAddReluKernel
<
DeviceType
,
T
>>
{
:
public
framework
::
OperatorWithKernel
<
DeviceType
,
FusionConvAddReluParam
<
DeviceType
>
,
ConvAddReluKernel
<
DeviceType
,
T
>>
{
public:
FusionConvAddReluInt8Op
(
const
std
::
string
&
type
,
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
,
std
::
shared_ptr
<
framework
::
Scope
>
scope
)
:
framework
::
OperatorWithKernel
<
DeviceType
,
FusionConvAddReluParam
<
DeviceType
>
,
operators
::
ConvAddReluKernel
<
DeviceType
,
T
>>
(
type
,
inputs
,
outputs
,
attrs
,
scope
)
{}
:
framework
::
OperatorWithKernel
<
DeviceType
,
FusionConvAddReluParam
<
DeviceType
>
,
ConvAddReluKernel
<
DeviceType
,
T
>>
(
type
,
inputs
,
outputs
,
attrs
,
scope
)
{}
void
InferShape
()
const
override
;
};
}
// namespace operators
...
...
src/operators/fusion_fc_int8_op.cpp
0 → 100644
浏览文件 @
6ce11736
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_INT8_OP
#include "operators/fusion_fc_int8_op.h"
namespace
paddle_mobile
{
namespace
operators
{
// Infers the output shape of the int8 fused fully-connected operator.
//
// The output shape is the first XNumColDims() dimensions of X followed by the
// dimensions of Y from index YNumColDims() onward; it is applied to the output
// tensor through Resize().
//
// Shape preconditions are checked with assert() only, so nothing is enforced
// when the translation unit is compiled with NDEBUG.
template <typename Dtype, typename T>
void FusionFcInt8Op<Dtype, T>::InferShape() const {
  auto lhs_dims = this->param_.InputX()->dims();
  auto rhs_dims = this->param_.InputY()->dims();
  int lhs_col_dims = this->param_.XNumColDims();
  int rhs_col_dims = this->param_.YNumColDims();

  // Each operand needs more dimensions than the split index.
  assert(lhs_dims.size() > lhs_col_dims);
  assert(rhs_dims.size() > rhs_col_dims);

  // Example: (1,2,3,4) with x_num_col_dims = 2 is viewed as (2,12).
  auto lhs_mat_dims = framework::flatten_to_2d(lhs_dims, lhs_col_dims);
  auto rhs_mat_dims = framework::flatten_to_2d(rhs_dims, rhs_col_dims);

  // Inner dimensions of the two 2-D views must agree for the product.
  assert(lhs_mat_dims[1] == rhs_mat_dims[0]);

  const auto out_rank = lhs_col_dims + rhs_dims.size() - rhs_col_dims;
  std::vector<int64_t> out_shape;
  out_shape.reserve(static_cast<size_t>(out_rank));

  // Leading dimensions come from X ...
  int axis = 0;
  while (axis < lhs_col_dims) {
    out_shape.push_back(lhs_dims[axis]);
    ++axis;
  }

  // ... trailing dimensions come from Y.
  axis = rhs_col_dims;
  while (axis < rhs_dims.size()) {
    out_shape.push_back(rhs_dims[axis]);
    ++axis;
  }

  framework::DDim out_ddim = framework::make_ddim(out_shape);
  this->param_.Out()->Resize(out_ddim);
}
}
// namespace operators
}
// namespace paddle_mobile
namespace
ops
=
paddle_mobile
::
operators
;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU_INT8
(
fusion_fc_int8
,
ops
::
FusionFcInt8Op
);
#endif
#endif // FUSION_FC_INT8_OP
src/operators/fusion_fc_int8_op.h
0 → 100644
浏览文件 @
6ce11736
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_INT8_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/fusion_fc_kernel.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
// Operator wrapper for the int8 fused fully-connected op.
//
// It binds FusionFcParam<DeviceType> and FusionFcKernel<DeviceType, T> to the
// generic framework::OperatorWithKernel machinery. The only behaviour declared
// here is InferShape(); its definition lives in the matching .cpp file.
template <typename DeviceType, typename T>
class FusionFcInt8Op
    : public framework::OperatorWithKernel<DeviceType,
                                           FusionFcParam<DeviceType>,
                                           FusionFcKernel<DeviceType, T>> {
  // Short name for the (long) base type, used by the constructor below.
  using KernelOperator =
      framework::OperatorWithKernel<DeviceType, FusionFcParam<DeviceType>,
                                    FusionFcKernel<DeviceType, T>>;

 public:
  // Forwards every argument unchanged to the OperatorWithKernel base.
  FusionFcInt8Op(const std::string &type, const VariableNameMap &inputs,
                 const VariableNameMap &outputs,
                 const framework::AttributeMap &attrs,
                 std::shared_ptr<framework::Scope> scope)
      : KernelOperator(type, inputs, outputs, attrs, scope) {}

  // Computes the output tensor shape from the X / Y input shapes.
  void InferShape() const override;
};
}
// namespace operators
}
// namespace paddle_mobile
#endif // FUSION_FC_INT8_OP
src/operators/kernel/arm/fusion_fc_kernel.cpp
浏览文件 @
6ce11736
...
...
@@ -27,10 +27,27 @@ bool FusionFcKernel<CPU, float>::Init(FusionFcParam<CPU> *param) {
template
<
>
void
FusionFcKernel
<
CPU
,
float
>::
Compute
(
const
FusionFcParam
<
CPU
>
&
param
)
{
FusionFcCompute
<
float
>
(
param
);
FusionFcCompute
<
float
,
float
>
(
param
);
param
.
Out
()
->
set_lod
(
param
.
InputX
()
->
lod
());
}
template
class
FusionFcKernel
<
CPU
,
float
>;
#ifdef FUSION_FC_INT8_OP
// Initialisation hook of the CPU int8 fused-FC kernel.
// Nothing is prepared here: `param` is not inspected and success is always
// reported.
template <>
bool FusionFcKernel<CPU, int8_t>::Init(FusionFcParam<CPU> *param) {
  return true;
}
// Runs the CPU int8 fused-FC computation.
// FusionFcCompute is instantiated with <int8_t, int32_t>; afterwards the LoD
// of input X is copied onto the output tensor.
template <>
void FusionFcKernel<CPU, int8_t>::Compute(const FusionFcParam<CPU> &param) {
  FusionFcCompute<int8_t, int32_t>(param);

  // Propagate the LoD of X to the result.
  auto *result = param.Out();
  const auto &source_lod = param.InputX()->lod();
  result->set_lod(source_lod);
}
template
class
FusionFcKernel
<
CPU
,
int8_t
>;
#endif
}
// namespace operators
}
// namespace paddle_mobile
...
...
src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
浏览文件 @
6ce11736
...
...
@@ -39,8 +39,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> ¶m) {
float
beta
=
1.0
f
;
#ifdef FUSION_CONVADDRELU_INT8_OP
Tensor
scale
=
*
param
.
InputScale
();
alpha
=
scale
.
data
<
float
>
()[
0
];
alpha
=
param
.
InputScale
()
->
data
<
float
>
()[
0
];
beta
=
0.0
f
;
#endif
...
...
src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
浏览文件 @
6ce11736
...
...
@@ -15,23 +15,29 @@ limitations under the License. */
#ifdef FUSION_FC_OP
#pragma once
#include <type_traits>
#include "operators/math/math_function.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
P
>
template
<
typename
P
,
typename
S
>
void
FusionFcCompute
(
const
FusionFcParam
<
CPU
>
&
param
)
{
const
Tensor
*
input_x
=
param
.
InputX
();
const
Tensor
*
input_y
=
param
.
InputY
();
const
Tensor
*
input_z
=
param
.
InputZ
();
auto
*
input_z_data
=
input_z
->
data
<
float
>
();
Tensor
*
input_z
=
param
.
InputZ
();
S
*
input_z_data
=
input_z
->
data
<
S
>
();
int
axis
=
param
.
Axis
();
Tensor
*
out
=
param
.
Out
();
// int m = out->dims()[0];
// int n = out->dims()[1];
auto
*
out_data
=
out
->
mutable_data
<
float
>
();
auto
*
out_data
=
out
->
mutable_data
<
P
>
();
float
alpha
=
1.0
f
;
float
beta
=
1.0
f
;
const
Tensor
x_matrix
=
input_x
->
dims
().
size
()
>
2
?
framework
::
ReshapeToMatrix
(
*
input_x
,
param
.
XNumColDims
())
...
...
@@ -51,21 +57,28 @@ void FusionFcCompute(const FusionFcParam<CPU> ¶m) {
axis
=
(
axis
==
-
1
?
out_dim
.
size
()
-
input_z
->
dims
().
size
()
:
axis
);
PADDLE_MOBILE_ENFORCE
(
axis
==
1
,
" to fit broadcast, axis = 1. "
);
int64_t
classes
=
input_z
->
numel
();
for
(
int
i
=
0
;
i
<
out_dim
[
0
];
i
++
)
{
memory
::
Copy
(
out_data
+
i
*
classes
,
input_z_data
,
sizeof
(
float
)
*
classes
);
}
if
(
std
::
is_same
<
P
,
int8_t
>::
value
)
{
#ifdef FUSION_FC_INT8_OP
alpha
=
param
.
InputScale
()
->
data
<
float
>
()[
0
];
beta
=
0.0
f
;
math
::
matmul
(
x_matrix
,
false
,
y_matrix
,
false
,
alpha
,
out
,
beta
,
false
,
input_z_data
,
true
);
#endif
}
else
{
// bias_data的维度和out的第二个维度一致
int64_t
classes
=
input_z
->
numel
();
for
(
int
i
=
0
;
i
<
out_dim
[
0
];
i
++
)
{
memory
::
Copy
(
out_data
+
i
*
classes
,
input_z_data
,
sizeof
(
float
)
*
classes
);
}
// for (int i = 0; i < out->numel(); i++) {
// DLOG << out_data[i];
// }
// bias_data的维度和out的维度一致
math
::
matmul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
1
),
false
);
math
::
matmul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
alpha
,
out
,
beta
,
false
);
}
PADDLE_MOBILE_ENFORCE
(
out_dim
.
size
()
==
2
,
" out_dim.size must be 2."
);
//
if (out_dim.size() != 2) {
//
out->Resize(out_dim);
//
}
// if (out_dim.size() != 2) {
// out->Resize(out_dim);
// }
}
}
// namespace operators
...
...
src/operators/math/gemm.cpp
浏览文件 @
6ce11736
...
...
@@ -2924,7 +2924,6 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
#endif // __ARM_NEON
// 32位 float 矩阵乘法
template
<
>
void
Gemm
::
Sgemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
)
{
...
...
@@ -3147,7 +3146,6 @@ void Gemm::SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
}
// 32位 float 矩阵乘法
template
<
>
void
Gemm
::
Sgemm_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
)
{
...
...
src/operators/math/gemm.h
浏览文件 @
6ce11736
...
...
@@ -167,14 +167,25 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *new_bias);
*/
// 32位 float 矩阵乘法
void
Sgemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
);
// 32位 float 矩阵乘法, 并对结果进行 batchnorm
void
SgemmWithBn
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
);
void
SgemmWithPRelu
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
// 32位 float 矩阵乘法(openmp 多线程版本)
void
Sgemm_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
);
// 32位 float 矩阵乘法, 并对结果进行 batchnorm(openmp 多线程版本)
void
SgemmWithBn_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
...
...
@@ -202,7 +213,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
template
<
typename
Otype
>
void
InnerKernelWithBias
(
int32_t
mc
,
int32_t
nc
,
float
alpha
,
const
int8_t
*
a
,
const
int8_t
*
b
,
float
beta
,
int32_t
*
c
,
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
);
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
,
bool
addOnRow
=
false
);
// 8 bits int pack function
void
PackMatrixA_4r
(
int32_t
m
,
int32_t
k
,
int32_t
m_tail
,
const
int8_t
*
A
,
...
...
@@ -228,28 +240,32 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
template
<
typename
Itype
,
typename
Btype
,
typename
Otype
>
void
Sgemm_omp
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
const
Itype
*
A
,
int32_t
lda
,
const
Itype
*
B
,
int32_t
ldb
,
float
beta
,
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
Btype
*
bias
);
int32_t
ldc
,
bool
relu
,
Btype
*
bias
,
bool
addOnRow
=
false
);
template
<
typename
Otype
>
void
Sgemm_omp
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
const
int8_t
*
A
,
int32_t
lda
,
const
int8_t
*
B
,
int32_t
ldb
,
float
beta
,
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
);
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
,
bool
addOnRow
=
false
);
template
<
typename
Itype
,
typename
Btype
,
typename
Otype
>
void
Sgemm
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
const
Itype
*
A
,
int32_t
lda
,
const
Itype
*
B
,
int32_t
ldb
,
float
beta
,
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
Btype
*
bias
);
int32_t
ldc
,
bool
relu
,
Btype
*
bias
,
bool
addOnRow
=
false
);
template
<
typename
Otype
>
void
Sgemm
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
const
int8_t
*
A
,
int32_t
lda
,
const
int8_t
*
B
,
int32_t
ldb
,
float
beta
,
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
);
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
,
bool
addOnRow
=
false
);
// 8 bits int write back
// C = A * B
void
WriteBasic
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int32_t
*
C
,
int32_t
ldc
);
// C = A * B + bias, scale * relu(C)
void
WriteWithAddReluScale
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int8_t
*
C
,
int32_t
ldc
,
int32_t
*
bias
,
float
scale
);
// C = A * B + bias, scale * C
// C = A * B + bias, scale * C
, bias is added on column
void
WriteWithAddScale
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int8_t
*
C
,
int32_t
ldc
,
int32_t
*
bias
,
float
scale
);
// C = A * B + bias, scale * C, bias is added on row
void
WriteWithAddScaleT
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int8_t
*
C
,
int32_t
ldc
,
int32_t
*
bias
,
float
scale
);
private:
int
MC
=
0
;
...
...
@@ -273,7 +289,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
template
<
typename
Otype
>
void
Gemm
::
Sgemm
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
const
int8_t
*
A
,
int32_t
lda
,
const
int8_t
*
B
,
int32_t
ldb
,
float
beta
,
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
)
{
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
,
bool
addOnRow
)
{
// L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int32_t
L1
=
32
*
1024
;
...
...
@@ -322,8 +339,15 @@ void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
InnerKernel
(
mc
,
nc
,
alpha
,
packedA_int8
,
packedB_int8
,
beta
,
packedC_int32
,
&
C
(
i
,
j
),
ldc
,
relu
);
}
else
{
InnerKernelWithBias
(
mc
,
nc
,
alpha
,
packedA_int8
,
packedB_int8
,
beta
,
packedC_int32
,
&
C
(
i
,
j
),
ldc
,
relu
,
bias
+
i
);
if
(
addOnRow
)
{
InnerKernelWithBias
(
mc
,
nc
,
alpha
,
packedA_int8
,
packedB_int8
,
beta
,
packedC_int32
,
&
C
(
i
,
j
),
ldc
,
relu
,
bias
+
j
,
addOnRow
);
}
else
{
InnerKernelWithBias
(
mc
,
nc
,
alpha
,
packedA_int8
,
packedB_int8
,
beta
,
packedC_int32
,
&
C
(
i
,
j
),
ldc
,
relu
,
bias
+
i
,
addOnRow
);
}
}
}
}
...
...
@@ -339,7 +363,7 @@ template <typename Otype>
void
Gemm
::
Sgemm_omp
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
const
int8_t
*
A
,
int32_t
lda
,
const
int8_t
*
B
,
int32_t
ldb
,
float
beta
,
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
)
{
int32_t
*
bias
,
bool
addOnRow
)
{
#ifdef _OPENMP
int32_t
max_threads
=
omp_get_max_threads
();
#else
...
...
@@ -422,8 +446,13 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
InnerKernel
(
mc
,
n
,
alpha
,
local_A
,
packedB_int8
,
beta
,
local_C
,
&
C
(
i
,
0
),
ldc
,
relu
);
}
else
{
InnerKernelWithBias
(
mc
,
n
,
alpha
,
local_A
,
packedB_int8
,
beta
,
local_C
,
&
C
(
i
,
0
),
ldc
,
relu
,
bias
+
i
);
if
(
addOnRow
)
{
InnerKernelWithBias
(
mc
,
n
,
alpha
,
local_A
,
packedB_int8
,
beta
,
local_C
,
&
C
(
i
,
0
),
ldc
,
relu
,
bias
,
addOnRow
);
}
else
{
InnerKernelWithBias
(
mc
,
n
,
alpha
,
local_A
,
packedB_int8
,
beta
,
local_C
,
&
C
(
i
,
0
),
ldc
,
relu
,
bias
+
i
,
addOnRow
);
}
}
}
}
else
{
...
...
@@ -447,8 +476,13 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
InnerKernel
(
m
,
nc
,
alpha
,
packedA_int8
,
local_B
,
beta
,
local_C
,
&
C
(
0
,
j
),
ldc
,
relu
);
}
else
{
InnerKernelWithBias
(
m
,
nc
,
alpha
,
packedA_int8
,
local_B
,
beta
,
local_C
,
&
C
(
0
,
j
),
ldc
,
relu
,
bias
);
if
(
addOnRow
)
{
InnerKernelWithBias
(
m
,
nc
,
alpha
,
packedA_int8
,
local_B
,
beta
,
local_C
,
&
C
(
0
,
j
),
ldc
,
relu
,
bias
+
j
,
addOnRow
);
}
else
{
InnerKernelWithBias
(
m
,
nc
,
alpha
,
packedA_int8
,
local_B
,
beta
,
local_C
,
&
C
(
0
,
j
),
ldc
,
relu
,
bias
,
addOnRow
);
}
}
}
}
...
...
src/operators/math/gemm_int8.cpp
浏览文件 @
6ce11736
...
...
@@ -699,7 +699,7 @@ template <>
void
Gemm
::
InnerKernelWithBias
(
int32_t
mc
,
int32_t
nc
,
float
alpha
,
const
int8_t
*
a
,
const
int8_t
*
b
,
float
beta
,
int32_t
*
c
,
int8_t
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
)
{
int32_t
*
bias
,
bool
addOnRow
)
{
#pragma omp parallel for
for
(
int32_t
j
=
0
;
j
<
nc
;
j
+=
NR_INT8
)
{
for
(
int32_t
i
=
0
;
i
<
mc
;
i
+=
MR_INT8
)
{
...
...
@@ -716,7 +716,11 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
WriteWithAddReluScale
(
mc
,
nc
,
c
,
C
,
ldc
,
bias
,
alpha
);
return
;
}
else
{
WriteWithAddScale
(
mc
,
nc
,
c
,
C
,
ldc
,
bias
,
alpha
);
if
(
addOnRow
)
{
WriteWithAddScaleT
(
mc
,
nc
,
c
,
C
,
ldc
,
bias
,
alpha
);
}
else
{
WriteWithAddScale
(
mc
,
nc
,
c
,
C
,
ldc
,
bias
,
alpha
);
}
}
}
...
...
@@ -724,7 +728,7 @@ template <>
void
Gemm
::
InnerKernelWithBias
(
int32_t
mc
,
int32_t
nc
,
float
alpha
,
const
int8_t
*
a
,
const
int8_t
*
b
,
float
beta
,
int32_t
*
c
,
int32_t
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
)
{}
int32_t
*
bias
,
bool
addOnRow
)
{}
// 8 bits int PackMatrixA_4r
void
Gemm
::
PackMatrixA_4r_16
(
int32_t
m
,
int32_t
k
,
int32_t
m_tail
,
...
...
@@ -1159,14 +1163,13 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
#endif // __ARM_NEON
}
// C = A * B + bias, scale * C
// C = A * B + bias, scale * C
, bias is added on column
void
Gemm
::
WriteWithAddScale
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int8_t
*
C
,
int32_t
ldc
,
int32_t
*
bias
,
float
scale
)
{
#if __ARM_NEON
#if __aarch64__
// TODO
#else
int32_t
zero
=
0
;
int8_t
narrow
=
-
128
;
int32_t
nc1
=
nc
>>
3
;
int32_t
_nc1
=
nc
&
7
;
...
...
@@ -1184,7 +1187,6 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
"subs %[mc], %[mc], #1
\n\t
"
"blt end_mc_%=
\n\t
"
"vdup.32 q15, %[scale]
\n\t
"
"vdup.32 q14, %[zero]
\n\t
"
"vdup.8 d24, %[narrow]
\n\t
"
"loop_mc_%=:
\n\t
"
"vld1.32 {d26[0]}, [%[bias_ptr]]!
\n\t
"
...
...
@@ -1222,9 +1224,9 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
:
:
[
C_ptr
]
"r"
(
C_ptr
),
[
c_ptr
]
"r"
(
c_ptr
),
[
mc
]
"r"
(
m
),
[
nc1
]
"r"
(
n
),
[
step
]
"r"
(
step
),
[
step1
]
"r"
(
step1
),
[
bias_ptr
]
"r"
(
bias_ptr
),
[
scale
]
"r"
(
scale
),
[
zero
]
"r"
(
zero
),
[
narrow
]
"r"
(
narrow
)
[
scale
]
"r"
(
scale
),
[
narrow
]
"r"
(
narrow
)
:
"cc"
,
"memory"
,
"r5"
,
"r6"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q12"
,
"q13"
,
"q1
4"
,
"q1
5"
);
"q7"
,
"q12"
,
"q13"
,
"q15"
);
}
int32_t
nc_left
;
...
...
@@ -1239,7 +1241,6 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
nc_left
=
_nc1
;
asm
volatile
(
"vdup.32 q15, %[scale]
\n\t
"
"vdup.32 q14, %[zero]
\n\t
"
"vdup.8 d24, %[narrow]
\n\t
"
"vdup.32 q13, %[bias_v]
\n\t
"
"cmp %[_nc1], #4
\n\t
"
...
...
@@ -1260,7 +1261,7 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
"subs %[_nc1], %[_nc1], #4
\n\t
"
"beq process_over_%=
\n\t
"
"less_four_%=:
\n\t
"
"vld1.32 {q0}, [%[c0]]
!
\n\t
"
"vld1.32 {q0}, [%[c0]]
\n\t
"
"vqadd.s32 q0, q0, q13
\n\t
"
"vcvt.f32.s32 q1, q0
\n\t
"
"vmul.f32 q1, q1, q15
\n\t
"
...
...
@@ -1277,17 +1278,138 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
"process_over_%=:
\n\t
"
:
:
[
_nc1
]
"r"
(
nc_left
),
[
C0
]
"r"
(
C0
),
[
c0
]
"r"
(
c0
),
[
bias_v
]
"r"
(
bias_v
),
[
scale
]
"r"
(
scale
),
[
zero
]
"r"
(
zero
),
[
narrow
]
"r"
(
narrow
)
:
"cc"
,
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q12"
,
"q13"
,
"q14"
,
"q15"
);
[
bias_v
]
"r"
(
bias_v
),
[
scale
]
"r"
(
scale
),
[
narrow
]
"r"
(
narrow
)
:
"cc"
,
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q12"
,
"q13"
,
"q15"
);
}
}
#endif // __aarch64__
#endif // __ARM_NEON
}
// C = A * B + bias, scale * C, bias is added on row
void
Gemm
::
WriteWithAddScaleT
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int8_t
*
C
,
int32_t
ldc
,
int32_t
*
bias
,
float
scale
)
{
#if __ARM_NEON
#if __aarch64__
// TODO
#else
int8_t
narrow
=
-
128
;
int32_t
nc1
=
nc
>>
3
;
int32_t
_nc1
=
nc
&
7
;
int32_t
step
=
sizeof
(
int8_t
)
*
ldc
;
int32_t
step1
=
sizeof
(
int32_t
)
*
(
NC
-
(
nc1
<<
3
));
int32_t
volatile
m
=
mc
;
int32_t
volatile
n
=
nc1
;
int32_t
*
volatile
c_ptr
,
*
volatile
bias_ptr
;
int8_t
*
volatile
C_ptr
;
c_ptr
=
c
;
C_ptr
=
C
;
bias_ptr
=
bias
;
if
(
nc1
>
0
)
{
asm
volatile
(
"subs %[mc], %[mc], #1
\n\t
"
"blt end_mc_%=
\n\t
"
"vdup.32 q15, %[scale]
\n\t
"
"vdup.8 d24, %[narrow]
\n\t
"
"loop_mc_%=:
\n\t
"
"mov r4, %[bias_ptr]
\n\t
"
"mov r6, %[C_ptr]
\n\t
"
"mov r5, %[nc1]
\n\t
"
"subs r5, r5, #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"vld1.32 {q13, q14}, [r4]!
\n\t
"
"vld1.32 {q0, q1}, [%[c_ptr]]!
\n\t
"
"vqadd.s32 q0, q0, q13
\n\t
"
"vqadd.s32 q1, q1, q14
\n\t
"
"vcvt.f32.s32 q2, q0
\n\t
"
"vcvt.f32.s32 q3, q1
\n\t
"
"vmul.f32 q2, q2, q15
\n\t
"
"vmul.f32 q3, q3, q15
\n\t
"
"vcvt.s32.f32 q4, q2
\n\t
"
"vcvt.s32.f32 q5, q3
\n\t
"
"vqmovn.s32 d12, q4
\n\t
"
"vqmovn.s32 d13, q5
\n\t
"
"vqmovn.s16 d14, q6
\n\t
"
"vceq.s8 d15, d14, d24
\n\t
"
"vsub.s8 d14, d14, d15
\n\t
"
"vst1.8 {d14}, [r6]!
\n\t
"
"subs r5, r5, #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
"add %[C_ptr], %[C_ptr], %[step]
\n\t
"
"add %[c_ptr], %[c_ptr], %[step1]
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"bge loop_mc_%=
\n\t
"
"end_mc_%=:
\n\t
"
:
:
[
C_ptr
]
"r"
(
C_ptr
),
[
c_ptr
]
"r"
(
c_ptr
),
[
mc
]
"r"
(
m
),
[
nc1
]
"r"
(
n
),
[
step
]
"r"
(
step
),
[
step1
]
"r"
(
step1
),
[
bias_ptr
]
"r"
(
bias_ptr
),
[
scale
]
"r"
(
scale
),
[
narrow
]
"r"
(
narrow
)
:
"cc"
,
"memory"
,
"r4"
,
"r5"
,
"r6"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q12"
,
"q13"
,
"q15"
);
}
int32_t
nc_left
;
int32_t
*
c0
;
int8_t
*
C0
;
int32_t
*
volatile
bias0
=
bias_ptr
+
nc1
*
8
;
if
(
_nc1
!=
0
)
{
for
(
int32_t
i
=
0
;
i
<
mc
;
i
++
)
{
C0
=
C_ptr
+
nc1
*
8
+
i
*
ldc
;
c0
=
c_ptr
+
nc1
*
8
+
i
*
NC
;
nc_left
=
_nc1
;
asm
volatile
(
"vdup.32 q15, %[scale]
\n\t
"
"vdup.8 d24, %[narrow]
\n\t
"
"cmp %[_nc1], #4
\n\t
"
"blt less_four_%=
\n\t
"
"vld1.32 {q0}, [%[c0]]!
\n\t
"
"vld1.32 {q13}, [%[bias0]]!
\n\t
"
"vqadd.s32 q0, q0, q13
\n\t
"
"vcvt.f32.s32 q1, q0
\n\t
"
"vmul.f32 q1, q1, q15
\n\t
"
"vcvt.s32.f32 q2, q1
\n\t
"
"vqmovn.s32 d6, q2
\n\t
"
"vqmovn.s16 d8, q3
\n\t
"
"vceq.s8 d9, d8, d24
\n\t
"
"vsub.s8 d8, d8, d9
\n\t
"
"vst1.8 {d8[0]}, [%[C0]]!
\n\t
"
"vst1.8 {d8[1]}, [%[C0]]!
\n\t
"
"vst1.8 {d8[2]}, [%[C0]]!
\n\t
"
"vst1.8 {d8[3]}, [%[C0]]!
\n\t
"
"subs %[_nc1], %[_nc1], #4
\n\t
"
"beq process_over_%=
\n\t
"
"less_four_%=:
\n\t
"
"vld1.32 {q0}, [%[c0]]
\n\t
"
"vld1.32 {q13}, [%[bias0]]
\n\t
"
"vqadd.s32 q0, q0, q13
\n\t
"
"vcvt.f32.s32 q1, q0
\n\t
"
"vmul.f32 q1, q1, q15
\n\t
"
"vcvt.s32.f32 q2, q1
\n\t
"
"vqmovn.s32 d6, q2
\n\t
"
"vqmovn.s16 d8, q3
\n\t
"
"vceq.s8 d9, d8, d24
\n\t
"
"vsub.s8 d8, d8, d9
\n\t
"
"loop_save_%=:
\n\t
"
"vst1.8 {d8[0]}, [%[C0]]!
\n\t
"
"vext.8 d8, d8, d8, #1
\n\t
"
"subs %[_nc1], %[_nc1], #1
\n\t
"
"bgt loop_save_%=
\n\t
"
"process_over_%=:
\n\t
"
:
:
[
_nc1
]
"r"
(
nc_left
),
[
C0
]
"r"
(
C0
),
[
c0
]
"r"
(
c0
),
[
bias0
]
"r"
(
bias0
),
[
scale
]
"r"
(
scale
),
[
narrow
]
"r"
(
narrow
)
:
"cc"
,
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q12"
,
"q13"
,
"q15"
);
}
}
#endif // __aarch64__
#endif // __ARM_NEON
}
// C = A * B + bias, scale * relu(C)
// C = A * B + bias, scale * relu(C)
, bias is added on column
void
Gemm
::
WriteWithAddReluScale
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int8_t
*
C
,
int32_t
ldc
,
int32_t
*
bias
,
float
scale
)
{
#if __ARM_NEON
...
...
src/operators/math/math_function.h
浏览文件 @
6ce11736
...
...
@@ -34,7 +34,7 @@ template <typename T, typename S>
void
matmul
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
T
alpha
,
framework
::
Tensor
*
matrix_out
,
T
beta
,
bool
relu
=
false
,
S
*
bias
=
nullptr
);
S
*
bias
=
nullptr
,
bool
addOnRow
=
false
);
template
<
typename
T
>
void
matmulWithBn
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
...
...
src/operators/math/math_function_int8.cpp
浏览文件 @
6ce11736
...
...
@@ -24,8 +24,8 @@ namespace math {
template
<
>
void
matmul
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
int32_t
*
bias
)
{
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
int32_t
*
bias
,
bool
addOnRow
)
{
auto
dim_a
=
matrix_a
.
dims
();
auto
dim_b
=
matrix_b
.
dims
();
auto
dim_out
=
matrix_out
->
dims
();
...
...
@@ -55,18 +55,18 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
#ifdef _OPENMP
if
(
bias
!=
nullptr
)
{
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int8_t
>
(),
N
,
relu
,
bias
);
matrix_out
->
data
<
int8_t
>
(),
N
,
relu
,
bias
,
addOnRow
);
}
else
{
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int32_t
>
(),
N
,
relu
,
bias
);
matrix_out
->
data
<
int32_t
>
(),
N
,
relu
,
bias
,
addOnRow
);
}
#else
if
(
bias
!=
nullptr
)
{
gemm
.
Sgemm
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int8_t
>
(),
N
,
relu
,
bias
);
matrix_out
->
data
<
int8_t
>
(),
N
,
relu
,
bias
,
addOnRow
);
}
else
{
gemm
.
Sgemm
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int32_t
>
(),
N
,
relu
,
bias
);
matrix_out
->
data
<
int32_t
>
(),
N
,
relu
,
bias
,
addOnRow
);
}
#endif
}
else
{
...
...
@@ -74,21 +74,21 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
if
(
bias
!=
nullptr
)
{
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
int8_t
>
(),
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int8_t
>
(),
N
,
relu
,
bias
);
matrix_out
->
data
<
int8_t
>
(),
N
,
relu
,
bias
,
addOnRow
);
}
else
{
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
int8_t
>
(),
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int32_t
>
(),
N
,
relu
,
bias
);
matrix_out
->
data
<
int32_t
>
(),
N
,
relu
,
bias
,
addOnRow
);
}
#else
if
(
bias
!=
nullptr
)
{
gemm
.
Sgemm
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
int8_t
>
(),
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int8_t
>
(),
N
,
relu
,
bias
);
N
,
relu
,
bias
,
addOnRow
);
}
else
{
gemm
.
Sgemm
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
int8_t
>
(),
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int32_t
>
(),
N
,
relu
,
bias
);
N
,
relu
,
bias
,
addOnRow
);
}
#endif
}
...
...
src/operators/op_param.h
浏览文件 @
6ce11736
...
...
@@ -1632,6 +1632,10 @@ class FusionFcParam : public OpParam {
x_num_col_dims_
=
GetAttr
<
int
>
(
"x_num_col_dims"
,
attrs
);
y_num_col_dims_
=
GetAttr
<
int
>
(
"y_num_col_dims"
,
attrs
);
axis_
=
GetAttr
<
int
>
(
"axis"
,
attrs
);
#ifdef FUSION_FC_INT8_OP
scale_
=
InputScaleFrom
<
GType
>
(
inputs
,
scope
);
#endif
}
GType
*
InputX
()
const
{
return
input_x_
;
}
...
...
@@ -1655,8 +1659,16 @@ class FusionFcParam : public OpParam {
int
x_num_col_dims_
;
int
y_num_col_dims_
;
int
axis_
;
#ifdef PADDLE_MOBILE_FPGA
#ifdef FUSION_FC_INT8_OP
public:
const
RType
*
InputScale
()
const
{
return
scale_
;
}
private:
RType
*
scale_
;
#endif
#ifdef PADDLE_MOBILE_FPGA
private:
fpga
::
SplitConvArgs
fpga_conv_args
;
...
...
@@ -1717,7 +1729,7 @@ class FusionConvAddReluParam : public FusionConvAddParam<DeviceType> {
typedef
typename
DtypeTensorTrait
<
DeviceType
>::
rtype
RType
;
const
RType
*
InputScale
()
const
{
return
scale_
;
}
pr
otected
:
pr
ivate
:
RType
*
scale_
;
#endif
};
...
...
test/common/test_gemm_accuracy.cpp
浏览文件 @
6ce11736
...
...
@@ -25,7 +25,7 @@ limitations under the License. */
#define c(i, j) c[(i)*ldc + (j)]
#define c1(i, j) c1[(i)*ldc + (j)]
void
print_mat
ir
x
(
int
m
,
int
n
,
int
ldc
,
float
*
c
)
{
void
print_mat
ri
x
(
int
m
,
int
n
,
int
ldc
,
float
*
c
)
{
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
std
::
cout
<<
c
(
i
,
0
);
for
(
int
j
=
1
;
j
<
n
;
++
j
)
{
...
...
@@ -98,18 +98,20 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
if
(
pr
>
0
)
{
std
::
cout
<<
"A:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
k
,
lda
,
a
);
print_mat
ri
x
(
m
,
k
,
lda
,
a
);
std
::
cout
<<
"B:"
<<
std
::
endl
;
print_mat
ir
x
(
k
,
n
,
ldb
,
b
);
print_mat
ri
x
(
k
,
n
,
ldb
,
b
);
std
::
cout
<<
"C:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
n
,
ldc
,
c
);
print_mat
ri
x
(
m
,
n
,
ldc
,
c
);
std
::
cout
<<
"C1:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
n
,
ldc
,
c1
);
print_mat
ri
x
(
m
,
n
,
ldc
,
c1
);
}
std
::
cout
<<
"mnk="
<<
m
<<
" "
<<
n
<<
" "
<<
k
<<
" relu="
<<
relu
<<
" eq="
<<
eq
<<
" neq="
<<
neq
<<
std
::
endl
;
PADDLE_MOBILE_ENFORCE
(
neq
==
0
,
"The execution of do_sgemm is failed!"
);
paddle_mobile
::
memory
::
Free
(
a
);
paddle_mobile
::
memory
::
Free
(
b
);
paddle_mobile
::
memory
::
Free
(
c
);
...
...
test/common/test_gemm_int8_accuracy.cpp
浏览文件 @
6ce11736
...
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include <iostream>
#include <limits>
#include <random>
#include <type_traits>
#include "../test_helper.h"
#include "common/log.h"
#include "memory/t_malloc.h"
...
...
@@ -33,24 +34,32 @@ limitations under the License. */
using
std
::
default_random_engine
;
using
std
::
uniform_int_distribution
;
void
print_matirx
(
int
m
,
int
n
,
int
ldc
,
int32_t
*
c
)
{
template
<
typename
T
>
void
print_matrix
(
int
m
,
int
n
,
int
ldc
,
T
*
c
)
{
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
std
::
cout
<<
c
(
i
,
0
);
for
(
int
j
=
1
;
j
<
n
;
++
j
)
{
std
::
cout
<<
" | "
<<
c
(
i
,
j
);
if
(
std
::
is_same
<
T
,
int8_t
>::
value
)
{
std
::
cout
.
setf
(
std
::
ios
::
left
);
std
::
cout
.
width
(
4
);
std
::
cout
<<
static_cast
<
int32_t
>
(
c
(
i
,
0
));
}
else
{
std
::
cout
.
setf
(
std
::
ios
::
left
);
std
::
cout
.
width
(
6
);
std
::
cout
<<
c
(
i
,
0
);
}
std
::
cout
<<
std
::
endl
;
}
std
::
cout
<<
std
::
endl
;
}
void
print_matirx
(
int
m
,
int
n
,
int
ldc
,
int8_t
*
c
)
{
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
std
::
cout
<<
static_cast
<
int32_t
>
(
c
(
i
,
0
));
for
(
int
j
=
1
;
j
<
n
;
++
j
)
{
std
::
cout
<<
" | "
<<
static_cast
<
int32_t
>
(
c
(
i
,
j
));
if
(
std
::
is_same
<
T
,
int8_t
>::
value
)
{
std
::
cout
<<
" | "
;
std
::
cout
.
setf
(
std
::
ios
::
left
);
std
::
cout
.
width
(
4
);
std
::
cout
<<
static_cast
<
int32_t
>
(
c
(
i
,
j
));
}
else
{
std
::
cout
<<
" | "
;
std
::
cout
.
setf
(
std
::
ios
::
left
);
std
::
cout
.
width
(
6
);
std
::
cout
<<
c
(
i
,
j
);
}
}
std
::
cout
<<
std
::
endl
;
std
::
cout
<<
"
\n
"
;
}
std
::
cout
<<
std
::
endl
;
}
...
...
@@ -138,18 +147,20 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
if
(
pr
>
0
)
{
std
::
cout
<<
"A:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
k
,
lda
,
a
);
print_mat
ri
x
(
m
,
k
,
lda
,
a
);
std
::
cout
<<
"B:"
<<
std
::
endl
;
print_mat
ir
x
(
k
,
n
,
ldb
,
b
);
print_mat
ri
x
(
k
,
n
,
ldb
,
b
);
std
::
cout
<<
"C:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
n
,
ldc
,
c
);
print_mat
ri
x
(
m
,
n
,
ldc
,
c
);
std
::
cout
<<
"C1:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
n
,
ldc
,
c1
);
print_mat
ri
x
(
m
,
n
,
ldc
,
c1
);
}
std
::
cout
<<
"mnk="
<<
m
<<
" "
<<
n
<<
" "
<<
k
<<
" relu="
<<
relu
<<
" eq="
<<
eq
<<
" neq="
<<
neq
<<
std
::
endl
;
PADDLE_MOBILE_ENFORCE
(
neq
==
0
,
"The execution of do_sgemm is failed!"
);
paddle_mobile
::
memory
::
Free
(
a
);
paddle_mobile
::
memory
::
Free
(
b
);
paddle_mobile
::
memory
::
Free
(
c
);
...
...
@@ -158,7 +169,8 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
return
0
;
}
int
do_sgemm_with_bias
(
int
m
,
int
n
,
int
k
,
bool
relu
,
int
pr
)
{
int
do_sgemm_with_bias
(
int
m
,
int
n
,
int
k
,
bool
relu
,
int
pr
,
bool
addOnRow
=
false
)
{
int
lda
=
k
;
int
ldb
=
n
;
int
ldc
=
n
;
...
...
@@ -174,8 +186,14 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
int8_t
*
c1
=
static_cast
<
int8_t
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
int8_t
)
*
m
*
n
));
int32_t
*
bias
=
static_cast
<
int32_t
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
int32_t
)
*
m
));
int32_t
*
bias
=
nullptr
;
if
(
addOnRow
)
{
bias
=
static_cast
<
int32_t
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
int32_t
)
*
n
));
}
else
{
bias
=
static_cast
<
int32_t
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
int32_t
)
*
m
));
}
for
(
int
i
=
0
;
i
<
m
*
k
;
++
i
)
{
a
[
i
]
=
pixel
(
e
);
...
...
@@ -183,29 +201,48 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
for
(
int
i
=
0
;
i
<
k
*
n
;
++
i
)
{
b
[
i
]
=
pixel
(
e
);
}
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
bias
[
i
]
=
static_cast
<
int32_t
>
(
pixel
(
e
));
}
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
int32_t
bias_v
=
bias
[
i
];
for
(
int
j
=
0
;
j
<
n
;
++
j
)
{
int32_t
r
=
0
;
for
(
int
p
=
0
;
p
<
k
;
p
++
)
{
r
+=
static_cast
<
int32_t
>
(
a
(
i
,
p
))
*
static_cast
<
int32_t
>
(
b
(
p
,
j
));
if
(
addOnRow
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
bias
[
i
]
=
static_cast
<
int32_t
>
(
pixel
(
e
));
}
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
for
(
int
j
=
0
;
j
<
n
;
++
j
)
{
int32_t
bias_v
=
bias
[
j
];
int32_t
r
=
0
;
for
(
int
p
=
0
;
p
<
k
;
p
++
)
{
r
+=
static_cast
<
int32_t
>
(
a
(
i
,
p
))
*
static_cast
<
int32_t
>
(
b
(
p
,
j
));
}
r
=
qadd_int32
(
r
,
bias_v
);
if
(
relu
)
r
=
std
::
max
(
0
,
r
);
c1
(
i
,
j
)
=
qscale_int32
(
r
,
scale
);
}
}
}
else
{
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
bias
[
i
]
=
static_cast
<
int32_t
>
(
pixel
(
e
));
}
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
int32_t
bias_v
=
bias
[
i
];
for
(
int
j
=
0
;
j
<
n
;
++
j
)
{
int32_t
r
=
0
;
for
(
int
p
=
0
;
p
<
k
;
p
++
)
{
r
+=
static_cast
<
int32_t
>
(
a
(
i
,
p
))
*
static_cast
<
int32_t
>
(
b
(
p
,
j
));
}
r
=
qadd_int32
(
r
,
bias_v
);
if
(
relu
)
r
=
std
::
max
(
0
,
r
);
c1
(
i
,
j
)
=
qscale_int32
(
r
,
scale
);
}
r
=
qadd_int32
(
r
,
bias_v
);
if
(
relu
)
r
=
std
::
max
(
0
,
r
);
c1
(
i
,
j
)
=
qscale_int32
(
r
,
scale
);
}
}
paddle_mobile
::
operators
::
math
::
Gemm
gemm
;
#ifdef _OPENMP
gemm
.
Sgemm_omp
(
m
,
n
,
k
,
scale
,
a
,
lda
,
b
,
ldb
,
static_cast
<
float
>
(
0
),
c
,
ldc
,
relu
,
bias
);
relu
,
bias
,
addOnRow
);
#else
gemm
.
Sgemm
(
m
,
n
,
k
,
scale
,
a
,
lda
,
b
,
ldb
,
static_cast
<
float
>
(
0
),
c
,
ldc
,
relu
,
bias
);
relu
,
bias
,
addOnRow
);
#endif
int
eq
=
0
;
int
neq
=
0
;
...
...
@@ -219,20 +256,27 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
if
(
pr
>
0
)
{
std
::
cout
<<
"A:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
k
,
lda
,
a
);
print_mat
ri
x
(
m
,
k
,
lda
,
a
);
std
::
cout
<<
"B:"
<<
std
::
endl
;
print_mat
ir
x
(
k
,
n
,
ldb
,
b
);
print_mat
ri
x
(
k
,
n
,
ldb
,
b
);
std
::
cout
<<
"Bias:"
<<
std
::
endl
;
print_matirx
(
m
,
1
,
1
,
bias
);
if
(
addOnRow
)
{
print_matrix
(
1
,
n
,
n
,
bias
);
}
else
{
print_matrix
(
m
,
1
,
1
,
bias
);
}
std
::
cout
<<
"C:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
n
,
ldc
,
c
);
print_mat
ri
x
(
m
,
n
,
ldc
,
c
);
std
::
cout
<<
"C1:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
n
,
ldc
,
c1
);
print_mat
ri
x
(
m
,
n
,
ldc
,
c1
);
}
std
::
cout
<<
"mnk="
<<
m
<<
" "
<<
n
<<
" "
<<
k
<<
" relu="
<<
relu
<<
" eq="
<<
eq
<<
" neq="
<<
neq
<<
std
::
endl
;
PADDLE_MOBILE_ENFORCE
(
neq
==
0
,
"The execution of do_sgemm_with_bias is failed!"
);
paddle_mobile
::
memory
::
Free
(
a
);
paddle_mobile
::
memory
::
Free
(
b
);
paddle_mobile
::
memory
::
Free
(
c
);
...
...
@@ -261,7 +305,7 @@ int main() {
std
::
cout
<<
"
\n\n
******************************************************
\n\n
"
<<
std
::
endl
;
std
::
cout
<<
"Test gemm with bias:"
<<
std
::
endl
;
std
::
cout
<<
"Test gemm with bias
(bias is added on column)
:"
<<
std
::
endl
;
do_sgemm_with_bias
(
9
,
9
,
9
,
false
,
1
);
do_sgemm_with_bias
(
10
,
6
,
12
,
false
,
0
);
do_sgemm_with_bias
(
512
,
256
,
384
,
false
,
0
);
...
...
@@ -272,6 +316,19 @@ int main() {
do_sgemm_with_bias
(
333
,
797
,
939
,
false
,
0
);
do_sgemm_with_bias
(
1024
,
1024
,
1024
,
false
,
0
);
std
::
cout
<<
"
\n\n
******************************************************
\n\n
"
<<
std
::
endl
;
std
::
cout
<<
"Test gemm with bias(bias is added on row):"
<<
std
::
endl
;
do_sgemm_with_bias
(
9
,
9
,
9
,
false
,
1
,
true
);
do_sgemm_with_bias
(
10
,
6
,
12
,
false
,
0
,
true
);
do_sgemm_with_bias
(
512
,
256
,
384
,
false
,
0
,
true
);
do_sgemm_with_bias
(
1366
,
768
,
256
,
false
,
0
,
true
);
do_sgemm_with_bias
(
1255
,
755
,
333
,
false
,
0
,
true
);
do_sgemm_with_bias
(
599
,
1133
,
393
,
false
,
0
,
true
);
do_sgemm_with_bias
(
777
,
555
,
999
,
false
,
0
,
true
);
do_sgemm_with_bias
(
333
,
797
,
939
,
false
,
0
,
true
);
do_sgemm_with_bias
(
1024
,
1024
,
1024
,
false
,
0
,
true
);
std
::
cout
<<
"
\n\n
******************************************************
\n\n
"
<<
std
::
endl
;
std
::
cout
<<
"Test gemm with relu and bias:"
<<
std
::
endl
;
...
...
test/common/test_gemm_perf.cpp
浏览文件 @
6ce11736
...
...
@@ -49,7 +49,8 @@ int main() {
auto
bbptr_int8
=
bb_int8
.
mutable_data
<
int8_t
>
({
k
,
n
});
auto
ccptr_int32
=
cc_int32
.
mutable_data
<
int32_t
>
({
m
,
n
});
auto
ccptr_int8
=
cc_int8
.
mutable_data
<
int8_t
>
({
m
,
n
});
int32_t
*
bias_data
=
new
int32_t
[
m
];
int32_t
*
bias_data_col
=
new
int32_t
[
m
];
int32_t
*
bias_data_row
=
new
int32_t
[
n
];
for
(
int
i
=
0
;
i
<
m
*
k
;
++
i
)
{
aaptr_int8
[
i
]
=
static_cast
<
int8_t
>
(
2
);
...
...
@@ -62,7 +63,11 @@ int main() {
}
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
bias_data
[
i
]
=
2
;
bias_data_col
[
i
]
=
2
;
}
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
bias_data_row
[
i
]
=
2
;
}
// float
...
...
@@ -73,14 +78,15 @@ int main() {
false
,
nullptr
);
}
auto
time
1
=
time
();
auto
time
_start0
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matmul
<
float
>
(
aa
,
false
,
bb
,
false
,
static_cast
<
float
>
(
1
),
&
cc
,
static_cast
<
float
>
(
0
),
false
,
nullptr
);
}
auto
time2
=
time
();
std
::
cout
<<
"float gemm cost :"
<<
time_diff
(
time1
,
time2
)
/
10
<<
"ms
\n
"
;
auto
time_end0
=
time
();
std
::
cout
<<
"float gemm cost :"
<<
time_diff
(
time_start0
,
time_end0
)
/
10
<<
"ms
\n
"
;
// int8_t without bias
// warm-up 10 times
...
...
@@ -90,33 +96,69 @@ int main() {
static_cast
<
float
>
(
0
));
}
auto
time
3
=
time
();
auto
time
_start1
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matmul
<
float
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
1
),
&
cc_int32
,
static_cast
<
float
>
(
0
));
}
auto
time4
=
time
();
std
::
cout
<<
"int8_t gemm cost :"
<<
time_diff
(
time3
,
time4
)
/
10
<<
"ms
\n
"
;
auto
time_end1
=
time
();
std
::
cout
<<
"int8_t gemm cost :"
<<
time_diff
(
time_start1
,
time_end1
)
/
10
<<
"ms
\n
"
;
// int8_t with bias, column element wise add
// warm-up 10 times
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matmul
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
false
,
bias_data_col
,
false
);
}
auto
time_start2
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matmul
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
false
,
bias_data_col
,
false
);
}
auto
time_end2
=
time
();
std
::
cout
<<
"int8_t gemm_with_bias(column add) cost :"
<<
time_diff
(
time_start2
,
time_end2
)
/
10
<<
"ms
\n
"
;
// int8_t with bias, row element wise add
// warm-up 10 times
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matmul
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
false
,
bias_data_row
,
true
);
}
auto
time_start3
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matmul
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
false
,
bias_data_row
,
true
);
}
auto
time_end3
=
time
();
std
::
cout
<<
"int8_t gemm_with_bias(row add) cost :"
<<
time_diff
(
time_start3
,
time_end3
)
/
10
<<
"ms
\n
"
;
// int8_t with bias&relu
// warm-up 10 times
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matmul
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
true
,
bias_data
);
static_cast
<
float
>
(
0
),
true
,
bias_data
_col
,
false
);
}
auto
time
5
=
time
();
auto
time
_start4
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matmul
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
true
,
bias_data
);
static_cast
<
float
>
(
0
),
true
,
bias_data
_col
,
false
);
}
auto
time
6
=
time
();
auto
time
_end4
=
time
();
std
::
cout
<<
"int8_t gemm_with_bias_relu cost :"
<<
time_diff
(
time
5
,
time6
)
/
10
<<
"ms
\n
"
;
<<
time_diff
(
time
_start4
,
time_end4
)
/
10
<<
"ms
\n
"
;
delete
[]
bias_data
;
delete
[]
bias_data_row
;
delete
[]
bias_data_col
;
return
0
;
}
test/operators/test_fusion_conv_add_relu_int8_op.cpp
浏览文件 @
6ce11736
...
...
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#include <iostream>
#ifdef FUSION_CONVADDRELU_INT8_OP
#include <limits>
#include "../test_helper.h"
#include "../test_include.h"
...
...
@@ -356,5 +356,9 @@ int main(int argc, char *argv[]) {
paddle_mobile
::
TestConvOp
<
int8_t
,
5
,
2
,
1
>
(
in_channels
,
in_height
,
in_width
,
out_channels
);
}
#else
int
main
()
{
std
::
cout
<<
"FUSION_CONVADDRELU_INT8_OP is not defined!"
<<
std
::
endl
;
return
0
;
}
#endif
test/operators/test_fusion_fc_op.cpp
浏览文件 @
6ce11736
...
...
@@ -12,147 +12,163 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <framework/program/program-optimize/program_optimize.h>
#include <iostream>
#include <type_traits>
#include "../test_helper.h"
#include "../test_include.h"
#include "framework/operator.h"
#include "operators/fusion_fc_int8_op.h"
#include "operators/fusion_fc_op.h"
#define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)]
#define c(i, j) c[(i)*ldc + (j)]
namespace
paddle_mobile
{
namespace
framework
{
using
framework
::
AttributeMap
;
using
framework
::
DDim
;
using
framework
::
Scope
;
using
framework
::
make_ddim
;
int32_t
qadd_int32
(
int32_t
l
,
int32_t
r
)
{
int64_t
res
=
static_cast
<
int64_t
>
(
l
)
+
static_cast
<
int64_t
>
(
r
);
if
(
res
>
std
::
numeric_limits
<
int32_t
>::
max
())
return
std
::
numeric_limits
<
int32_t
>::
max
();
else
if
(
res
<
std
::
numeric_limits
<
int32_t
>::
min
())
return
std
::
numeric_limits
<
int32_t
>::
min
();
else
return
static_cast
<
int32_t
>
(
res
);
}
template
<
typename
Dtype
>
class
TestFcOp
{
public:
explicit
TestFcOp
(
const
Program
<
Dtype
>
p
)
:
program_
(
p
)
{
use_optimize_
=
true
;
if
(
use_optimize_
)
{
to_predict_program_
=
program_
.
optimizeProgram
;
}
else
{
to_predict_program_
=
program_
.
originProgram
;
}
// round to zero
float
round2zero
(
float
v
)
{
float
res
;
if
(
v
>
0
)
res
=
std
::
floor
(
v
);
else
if
(
v
<
0
)
res
=
std
::
ceil
(
v
);
return
res
;
}
const
std
::
vector
<
std
::
shared_ptr
<
BlockDesc
>>
blocks
=
to_predict_program_
->
Blocks
();
// DLOG << " **block size " << blocks.size();
for
(
int
i
=
0
;
i
<
blocks
.
size
();
++
i
)
{
std
::
shared_ptr
<
BlockDesc
>
block_desc
=
blocks
[
i
];
std
::
vector
<
std
::
shared_ptr
<
OpDesc
>>
ops
=
block_desc
->
Ops
();
// DLOG << " ops " << ops.size();
for
(
int
j
=
0
;
j
<
ops
.
size
();
++
j
)
{
std
::
shared_ptr
<
OpDesc
>
op
=
ops
[
j
];
if
(
op
->
Type
()
==
"fc"
&&
op
->
Input
(
"X"
)[
0
]
==
"pool2d_13.tmp_0"
)
{
DLOG
<<
" fc attr size: "
<<
op
->
GetAttrMap
().
size
();
DLOG
<<
" inputs size: "
<<
op
->
GetInputs
().
size
();
DLOG
<<
" outputs size: "
<<
op
->
GetOutputs
().
size
();
DLOG
<<
" Input X is : "
<<
op
->
Input
(
"X"
)[
0
];
DLOG
<<
" Input Y is : "
<<
op
->
Input
(
"Y"
)[
0
];
DLOG
<<
" Input Y is : "
<<
op
->
Input
(
"Z"
)[
0
];
DLOG
<<
" Output Out is : "
<<
op
->
Output
(
"Out"
)[
0
];
std
::
shared_ptr
<
operators
::
FusionFcOp
<
Dtype
,
float
>>
testOp
=
std
::
make_shared
<
operators
::
FusionFcOp
<
Dtype
,
float
>>
(
op
->
Type
(),
op
->
GetInputs
(),
op
->
GetOutputs
(),
op
->
GetAttrMap
(),
program_
.
scope
);
ops_of_block_
[
*
block_desc
.
get
()].
push_back
(
testOp
);
int8_t
qscale_int32
(
int32_t
v
,
float
scale
)
{
float
res
=
static_cast
<
float
>
(
v
)
*
scale
;
res
=
round2zero
(
res
);
if
(
res
>
127
)
return
static_cast
<
int8_t
>
(
127
);
else
if
(
res
<
-
127
)
return
static_cast
<
int8_t
>
(
-
127
);
else
return
static_cast
<
int8_t
>
(
res
);
}
template
<
typename
T
,
typename
S
>
int
TestFcOP
()
{
int32_t
m
=
377
;
int32_t
n
=
1363
;
int32_t
k
=
577
;
int32_t
lda
=
k
;
int32_t
ldb
=
n
;
int32_t
ldc
=
n
;
DDim
inputA_shape
=
make_ddim
({
m
,
k
});
DDim
inputB_shape
=
make_ddim
({
k
,
n
});
DDim
bias_shape
=
make_ddim
({
n
});
VariableNameMap
inputs
;
VariableNameMap
outputs
;
auto
scope
=
std
::
make_shared
<
Scope
>
();
inputs
[
"X"
]
=
std
::
vector
<
std
::
string
>
({
"inputA"
});
inputs
[
"Y"
]
=
std
::
vector
<
std
::
string
>
({
"inputB"
});
inputs
[
"Z"
]
=
std
::
vector
<
std
::
string
>
({
"bias"
});
inputs
[
"Scale"
]
=
std
::
vector
<
std
::
string
>
({
"scale"
});
outputs
[
"Out"
]
=
std
::
vector
<
std
::
string
>
({
"output"
});
auto
inputA_var
=
scope
.
get
()
->
Var
(
"inputA"
);
auto
inputA
=
inputA_var
->
template
GetMutable
<
framework
::
LoDTensor
>();
SetupTensor
<
T
>
(
inputA
,
inputA_shape
,
-
127
,
127
);
auto
inputB_var
=
scope
.
get
()
->
Var
(
"inputB"
);
auto
inputB
=
inputB_var
->
template
GetMutable
<
framework
::
LoDTensor
>();
SetupTensor
<
T
>
(
inputB
,
inputB_shape
,
-
127
,
127
);
auto
bias_var
=
scope
.
get
()
->
Var
(
"bias"
);
auto
bias
=
bias_var
->
template
GetMutable
<
framework
::
LoDTensor
>();
SetupTensor
<
S
>
(
bias
,
bias_shape
,
-
127
,
127
);
auto
scale_var
=
scope
.
get
()
->
Var
(
"scale"
);
auto
scale
=
scale_var
->
template
GetMutable
<
framework
::
LoDTensor
>();
scale
->
Resize
(
framework
::
make_ddim
({
1
}));
float
scale_v
=
0.000828
f
;
scale
->
mutable_data
<
float
>
()[
0
]
=
scale_v
;
auto
output_var
=
scope
.
get
()
->
Var
(
"output"
);
AttributeMap
attrs
;
attrs
[
"x_num_col_dims"
].
Set
<
int
>
(
1
);
attrs
[
"y_num_col_dims"
].
Set
<
int
>
(
1
);
attrs
[
"axis"
].
Set
<
int
>
(
1
);
operators
::
OperatorBase
<
CPU
>
*
op
=
nullptr
;
#ifdef FUSION_FC_INT8_OP
if
(
std
::
is_same
<
T
,
int8_t
>::
value
)
{
op
=
new
operators
::
FusionFcInt8Op
<
CPU
,
T
>
(
"fusion_fc_int8"
,
inputs
,
outputs
,
attrs
,
scope
);
}
else
{
op
=
new
operators
::
FusionFcOp
<
CPU
,
T
>
(
"fusion_fc"
,
inputs
,
outputs
,
attrs
,
scope
);
}
#else
op
=
new
operators
::
FusionFcOp
<
CPU
,
T
>
(
"fusion_fc"
,
inputs
,
outputs
,
attrs
,
scope
);
#endif
op
->
InferShape
();
op
->
Run
();
auto
output
=
output_var
->
template
Get
<
framework
::
LoDTensor
>();
const
T
*
output_data
=
output
->
data
<
T
>
();
// compare
T
*
c
=
static_cast
<
T
*>
(
memory
::
Alloc
(
sizeof
(
T
)
*
m
*
n
));
T
*
a
=
inputA
->
data
<
T
>
();
T
*
b
=
inputB
->
data
<
T
>
();
S
*
bias_data
=
bias
->
data
<
S
>
();
for
(
int32_t
i
=
0
;
i
<
m
;
++
i
)
{
for
(
int32_t
j
=
0
;
j
<
n
;
++
j
)
{
S
bias_v
=
bias_data
[
j
];
if
(
std
::
is_same
<
T
,
int8_t
>::
value
)
{
int32_t
r
=
0
;
for
(
int32_t
p
=
0
;
p
<
k
;
p
++
)
{
r
+=
static_cast
<
int32_t
>
(
a
(
i
,
p
))
*
static_cast
<
int32_t
>
(
b
(
p
,
j
));
}
r
=
qadd_int32
(
r
,
bias_v
);
c
(
i
,
j
)
=
qscale_int32
(
r
,
scale_v
);
}
else
{
T
r
=
0
;
for
(
int32_t
p
=
0
;
p
<
k
;
p
++
)
{
r
+=
a
(
i
,
p
)
*
b
(
p
,
j
);
}
r
+=
bias_v
;
c
(
i
,
j
)
=
r
;
}
}
}
std
::
shared_ptr
<
Tensor
>
predict
(
const
Tensor
&
t1
,
const
Tensor
&
t2
,
const
Tensor
&
t3
)
{
// feed
auto
scope
=
program_
.
scope
;
Variable
*
x_feed_value
=
scope
->
Var
(
"pool2d_13.tmp_0"
);
auto
tensor_x
=
x_feed_value
->
GetMutable
<
LoDTensor
>
();
tensor_x
->
ShareDataWith
(
t1
);
Variable
*
y_feed_value
=
scope
->
Var
(
"loss3_classifier-loc_weights"
);
auto
tensor_y
=
y_feed_value
->
GetMutable
<
LoDTensor
>
();
tensor_y
->
ShareDataWith
(
t2
);
Variable
*
z_feed_value
=
scope
->
Var
(
"loss3_classifier-loc_biases"
);
auto
tensor_z
=
z_feed_value
->
GetMutable
<
LoDTensor
>
();
tensor_z
->
ShareDataWith
(
t3
);
Variable
*
con_output
=
scope
->
Var
(
"loss3_classifier-loc.tmp_1"
);
auto
*
output_tensor
=
con_output
->
GetMutable
<
LoDTensor
>
();
output_tensor
->
mutable_data
<
float
>
({
3
,
10
});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std
::
shared_ptr
<
LoDTensor
>
out_tensor
=
std
::
make_shared
<
LoDTensor
>
();
out_tensor
.
reset
(
output_tensor
);
predict
(
t1
,
t2
,
t3
,
0
);
return
out_tensor
;
}
private:
const
framework
::
Program
<
Dtype
>
program_
;
std
::
shared_ptr
<
ProgramDesc
>
to_predict_program_
;
std
::
map
<
framework
::
BlockDesc
,
std
::
vector
<
std
::
shared_ptr
<
OperatorBase
<
Dtype
>>>>
ops_of_block_
;
bool
use_optimize_
=
false
;
void
predict
(
const
Tensor
&
t1
,
const
Tensor
&
t2
,
const
Tensor
&
t3
,
int
block_id
)
{
std
::
shared_ptr
<
BlockDesc
>
to_predict_block
=
to_predict_program_
->
Block
(
block_id
);
for
(
int
j
=
0
;
j
<
ops_of_block_
[
*
to_predict_block
.
get
()].
size
();
++
j
)
{
auto
op
=
ops_of_block_
[
*
to_predict_block
.
get
()][
j
];
DLOG
<<
"op -> run()"
;
op
->
Run
();
int32_t
eq
=
0
;
int32_t
neq
=
0
;
for
(
int32_t
i
=
0
;
i
<
m
*
n
;
++
i
)
{
PADDLE_MOBILE_ENFORCE
(
output_data
[
i
]
==
c
[
i
],
"The execution of test_fusion_fc_op is failed!"
);
if
(
output_data
[
i
]
==
c
[
i
])
{
++
eq
;
}
else
{
++
neq
;
}
}
};
template
class
TestFcOp
<
CPU
>;
}
// namespace framework
std
::
cout
<<
"mnk="
<<
m
<<
" "
<<
n
<<
" "
<<
k
<<
" eq="
<<
eq
<<
" neq="
<<
neq
<<
std
::
endl
;
delete
op
;
return
0
;
}
}
// namespace paddle_mobile
int
main
()
{
DLOG
<<
"----------**********----------"
;
DLOG
<<
"begin to run Fc Test"
;
paddle_mobile
::
framework
::
Loader
<
paddle_mobile
::
CPU
>
loader
;
// "../../../test/models/googlenet"
auto
program
=
loader
.
Load
(
g_googlenet
);
paddle_mobile
::
framework
::
ProgramOptimize
optimize
;
// program.originProgram->Description("origin");
auto
optimize_program
=
optimize
.
FusionOptimize
(
program
.
originProgram
);
program
.
optimizeProgram
=
optimize_program
;
if
(
optimize_program
!=
nullptr
)
{
optimize_program
->
Description
(
"optimize"
);
}
else
{
LOG
(
paddle_mobile
::
kLOG_ERROR
)
<<
"optimize_program is null"
;
}
/// input x (1,3,224,224)
paddle_mobile
::
framework
::
LoDTensor
inputx
;
SetupTensor
<
float
>
(
&
inputx
,
{
3
,
64
,
1
,
1
},
static_cast
<
float
>
(
1
),
static_cast
<
float
>
(
1
));
auto
*
inputx_ptr
=
inputx
.
data
<
float
>
();
/// input y (224,)
paddle_mobile
::
framework
::
LoDTensor
inputy
;
SetupTensor
<
float
>
(
&
inputy
,
{
64
,
10
},
static_cast
<
float
>
(
1.5
),
static_cast
<
float
>
(
1.5
));
auto
*
inputy_ptr
=
inputy
.
data
<
float
>
();
paddle_mobile
::
framework
::
LoDTensor
inputz
;
SetupTensor
<
float
>
(
&
inputz
,
{
10
},
static_cast
<
float
>
(
0
),
static_cast
<
float
>
(
1
));
auto
*
inputz_ptr
=
inputz
.
data
<
float
>
();
paddle_mobile
::
framework
::
TestFcOp
<
paddle_mobile
::
CPU
>
testFcOp
(
program
);
auto
output
=
testFcOp
.
predict
(
inputx
,
inputy
,
inputz
);
auto
*
output_ptr
=
output
->
data
<
float
>
();
for
(
int
j
=
0
;
j
<
output
->
numel
();
++
j
)
{
DLOG
<<
"value of output: "
<<
output_ptr
[
j
];
}
DLOG
<<
"1 (3,64) * 2 (64,10) = 96(3,10)"
;
DLOG
<<
"output : 96(3,10) + bias(10)"
;
int
main
()
{
paddle_mobile
::
PaddleMobile
<
paddle_mobile
::
CPU
>
paddle_mobile
;
paddle_mobile
.
SetThreadNum
(
4
);
#ifdef FUSION_FC_INT8_OP
paddle_mobile
::
TestFcOP
<
int8_t
,
int32_t
>
();
#endif
paddle_mobile
::
TestFcOP
<
float
,
float
>
();
return
0
;
}
tools/op.cmake
浏览文件 @
6ce11736
...
...
@@ -214,6 +214,7 @@ if(NOT FOUND_MATCH)
set
(
FUSION_CONVADDPRELU_OP ON
)
set
(
FUSION_CONVADDRELU_OP ON
)
set
(
FUSION_CONVADDRELU_INT8_OP ON
)
set
(
FUSION_FC_INT8_OP ON
)
set
(
FUSION_FC_OP ON
)
set
(
LRN_OP ON
)
set
(
MUL_OP ON
)
...
...
@@ -322,6 +323,9 @@ endif()
if
(
FUSION_FC_OP
)
add_definitions
(
-DFUSION_FC_OP
)
endif
()
if
(
FUSION_FC_INT8_OP
)
add_definitions
(
-DFUSION_FC_INT8_OP
)
endif
()
if
(
LRN_OP
)
add_definitions
(
-DLRN_OP
)
endif
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录