Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
6ce11736
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
337
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
6ce11736
编写于
12月 06, 2018
作者:
J
Jiaying Zhao
提交者:
GitHub
12月 06, 2018
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1336 from wzzju/add_fusion_fc_int8_op
add fusion fc int8_t op and its UT.
上级
ce969f68
3dbf9966
变更
20
隐藏空白更改
内联
并排
Showing
20 changed file
with
696 addition
and
262 deletion
+696
-262
src/common/types.cpp
src/common/types.cpp
+3
-1
src/common/types.h
src/common/types.h
+1
-0
src/operators/fusion_conv_add_relu_int8_op.h
src/operators/fusion_conv_add_relu_int8_op.h
+7
-7
src/operators/fusion_fc_int8_op.cpp
src/operators/fusion_fc_int8_op.cpp
+61
-0
src/operators/fusion_fc_int8_op.h
src/operators/fusion_fc_int8_op.h
+50
-0
src/operators/kernel/arm/fusion_fc_kernel.cpp
src/operators/kernel/arm/fusion_fc_kernel.cpp
+18
-1
src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
...perators/kernel/central-arm-func/conv_add_relu_arm_func.h
+1
-2
src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
+30
-17
src/operators/math/gemm.cpp
src/operators/math/gemm.cpp
+0
-2
src/operators/math/gemm.h
src/operators/math/gemm.h
+48
-14
src/operators/math/gemm_int8.cpp
src/operators/math/gemm_int8.cpp
+137
-15
src/operators/math/math_function.h
src/operators/math/math_function.h
+1
-1
src/operators/math/math_function_int8.cpp
src/operators/math/math_function_int8.cpp
+10
-10
src/operators/op_param.h
src/operators/op_param.h
+14
-2
test/common/test_gemm_accuracy.cpp
test/common/test_gemm_accuracy.cpp
+7
-5
test/common/test_gemm_int8_accuracy.cpp
test/common/test_gemm_int8_accuracy.cpp
+98
-41
test/common/test_gemm_perf.cpp
test/common/test_gemm_perf.cpp
+56
-14
test/operators/test_fusion_conv_add_relu_int8_op.cpp
test/operators/test_fusion_conv_add_relu_int8_op.cpp
+7
-3
test/operators/test_fusion_fc_op.cpp
test/operators/test_fusion_fc_op.cpp
+143
-127
tools/op.cmake
tools/op.cmake
+4
-0
未找到文件。
src/common/types.cpp
浏览文件 @
6ce11736
...
@@ -32,6 +32,7 @@ const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU = "fusion_conv_bn_add_relu";
...
@@ -32,6 +32,7 @@ const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU = "fusion_conv_bn_add_relu";
const
char
*
G_OP_TYPE_FUSION_DWCONV_BN_RELU
=
"fusion_dwconv_bn_relu"
;
const
char
*
G_OP_TYPE_FUSION_DWCONV_BN_RELU
=
"fusion_dwconv_bn_relu"
;
const
char
*
G_OP_TYPE_FUSION_CONV_BN_RELU
=
"fusion_conv_bn_relu"
;
const
char
*
G_OP_TYPE_FUSION_CONV_BN_RELU
=
"fusion_conv_bn_relu"
;
const
char
*
G_OP_TYPE_FC
=
"fusion_fc"
;
const
char
*
G_OP_TYPE_FC
=
"fusion_fc"
;
const
char
*
G_OP_TYPE_FC_INT8
=
"fusion_fc_int8"
;
const
char
*
G_OP_TYPE_FUSION_CONV_ADD
=
"fusion_conv_add"
;
const
char
*
G_OP_TYPE_FUSION_CONV_ADD
=
"fusion_conv_add"
;
const
char
*
G_OP_TYPE_LRN
=
"lrn"
;
const
char
*
G_OP_TYPE_LRN
=
"lrn"
;
const
char
*
G_OP_TYPE_MUL
=
"mul"
;
const
char
*
G_OP_TYPE_MUL
=
"mul"
;
...
@@ -111,12 +112,13 @@ std::unordered_map<
...
@@ -111,12 +112,13 @@ std::unordered_map<
{
G_OP_TYPE_MULTICLASS_NMS
,
{{
"BBoxes"
,
"Scores"
},
{
"Out"
}}},
{
G_OP_TYPE_MULTICLASS_NMS
,
{{
"BBoxes"
,
"Scores"
},
{
"Out"
}}},
{
G_OP_TYPE_POLYGON_BOX_TRANSFORM
,
{{
"Input"
},
{
"Output"
}}},
{
G_OP_TYPE_POLYGON_BOX_TRANSFORM
,
{{
"Input"
},
{
"Output"
}}},
{
G_OP_TYPE_FC
,
{{
"X"
,
"Y"
,
"Z"
},
{
"Out"
}}},
{
G_OP_TYPE_FC
,
{{
"X"
,
"Y"
,
"Z"
},
{
"Out"
}}},
{
G_OP_TYPE_FC_INT8
,
{{
"X"
,
"Y"
,
"Z"
,
"Scale"
},
{
"Out"
}}},
{
G_OP_TYPE_RESHAPE
,
{{
"X"
},
{
"Out"
}}},
{
G_OP_TYPE_RESHAPE
,
{{
"X"
},
{
"Out"
}}},
{
G_OP_TYPE_RESHAPE2
,
{{
"X"
},
{
"Out"
,
"XShape"
}}},
{
G_OP_TYPE_RESHAPE2
,
{{
"X"
},
{
"Out"
,
"XShape"
}}},
{
G_OP_TYPE_DEPTHWISE_CONV
,
{{
"Input"
},
{
"Output"
}}},
{
G_OP_TYPE_DEPTHWISE_CONV
,
{{
"Input"
},
{
"Output"
}}},
{
G_OP_TYPE_FILL_CONSTANT
,
{{},
{
"Out"
}}},
{
G_OP_TYPE_FILL_CONSTANT
,
{{},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_RELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_RELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8
,
{{
"Input"
,
"Scale"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_PRELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_PRELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_IM2SEQUENCE
,
{{
"X"
},
{
"Out"
}}},
{
G_OP_TYPE_IM2SEQUENCE
,
{{
"X"
},
{
"Out"
}}},
...
...
src/common/types.h
浏览文件 @
6ce11736
...
@@ -103,6 +103,7 @@ extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8;
...
@@ -103,6 +103,7 @@ extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_PRELU
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_PRELU
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU
;
extern
const
char
*
G_OP_TYPE_FC
;
extern
const
char
*
G_OP_TYPE_FC
;
extern
const
char
*
G_OP_TYPE_FC_INT8
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_BN_RELU
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_BN_RELU
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_BN_ADD_RELU
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_BN_ADD_RELU
;
...
...
src/operators/fusion_conv_add_relu_int8_op.h
浏览文件 @
6ce11736
...
@@ -22,19 +22,19 @@ namespace paddle_mobile {
...
@@ -22,19 +22,19 @@ namespace paddle_mobile {
namespace
operators
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
template
<
typename
DeviceType
,
typename
T
>
class
FusionConvAddReluInt8Op
class
FusionConvAddReluInt8Op
:
public
framework
::
OperatorWithKernel
<
:
public
framework
::
OperatorWithKernel
<
DeviceType
,
DeviceType
,
FusionConvAddReluParam
<
DeviceType
>
,
FusionConvAddReluParam
<
DeviceType
>
,
operators
::
ConvAddReluKernel
<
DeviceType
,
T
>>
{
ConvAddReluKernel
<
DeviceType
,
T
>>
{
public:
public:
FusionConvAddReluInt8Op
(
const
std
::
string
&
type
,
FusionConvAddReluInt8Op
(
const
std
::
string
&
type
,
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
const
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
,
const
framework
::
AttributeMap
&
attrs
,
std
::
shared_ptr
<
framework
::
Scope
>
scope
)
std
::
shared_ptr
<
framework
::
Scope
>
scope
)
:
framework
::
OperatorWithKernel
<
:
framework
::
OperatorWithKernel
<
DeviceType
,
DeviceType
,
FusionConvAddReluParam
<
DeviceType
>
,
FusionConvAddReluParam
<
DeviceType
>
,
operators
::
ConvAddReluKernel
<
DeviceType
,
T
>>
(
type
,
inputs
,
outputs
,
ConvAddReluKernel
<
DeviceType
,
T
>>
(
attrs
,
scope
)
{}
type
,
inputs
,
outputs
,
attrs
,
scope
)
{}
void
InferShape
()
const
override
;
void
InferShape
()
const
override
;
};
};
}
// namespace operators
}
// namespace operators
...
...
src/operators/fusion_fc_int8_op.cpp
0 → 100644
浏览文件 @
6ce11736
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_INT8_OP
#include "operators/fusion_fc_int8_op.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
Dtype
,
typename
T
>
void
FusionFcInt8Op
<
Dtype
,
T
>::
InferShape
()
const
{
auto
x_dims
=
this
->
param_
.
InputX
()
->
dims
();
auto
y_dims
=
this
->
param_
.
InputY
()
->
dims
();
int
x_num_col_dims
=
this
->
param_
.
XNumColDims
();
int
y_num_col_dims
=
this
->
param_
.
YNumColDims
();
assert
(
x_dims
.
size
()
>
x_num_col_dims
);
assert
(
y_dims
.
size
()
>
y_num_col_dims
);
/// (1,2,3,4) , x_num_col_dims = 2 -> (2,12)
auto
x_mat_dims
=
framework
::
flatten_to_2d
(
x_dims
,
x_num_col_dims
);
auto
y_mat_dims
=
framework
::
flatten_to_2d
(
y_dims
,
y_num_col_dims
);
assert
(
x_mat_dims
[
1
]
==
y_mat_dims
[
0
]);
std
::
vector
<
int64_t
>
output_dims
;
output_dims
.
reserve
(
static_cast
<
size_t
>
(
x_num_col_dims
+
y_dims
.
size
()
-
y_num_col_dims
));
for
(
int
i
=
0
;
i
<
x_num_col_dims
;
++
i
)
{
output_dims
.
push_back
(
x_dims
[
i
]);
}
for
(
int
i
=
y_num_col_dims
;
i
<
y_dims
.
size
();
++
i
)
{
output_dims
.
push_back
(
y_dims
[
i
]);
}
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_dims
);
this
->
param_
.
Out
()
->
Resize
(
ddim
);
}
}
// namespace operators
}
// namespace paddle_mobile
namespace
ops
=
paddle_mobile
::
operators
;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU_INT8
(
fusion_fc_int8
,
ops
::
FusionFcInt8Op
);
#endif
#endif // FUSION_FC_INT8_OP
src/operators/fusion_fc_int8_op.h
0 → 100644
浏览文件 @
6ce11736
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_INT8_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/fusion_fc_kernel.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
DeviceType
,
typename
T
>
class
FusionFcInt8Op
:
public
framework
::
OperatorWithKernel
<
DeviceType
,
FusionFcParam
<
DeviceType
>
,
FusionFcKernel
<
DeviceType
,
T
>>
{
public:
FusionFcInt8Op
(
const
std
::
string
&
type
,
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
,
std
::
shared_ptr
<
framework
::
Scope
>
scope
)
:
framework
::
OperatorWithKernel
<
DeviceType
,
FusionFcParam
<
DeviceType
>
,
FusionFcKernel
<
DeviceType
,
T
>>
(
type
,
inputs
,
outputs
,
attrs
,
scope
)
{}
void
InferShape
()
const
override
;
};
}
// namespace operators
}
// namespace paddle_mobile
#endif // FUSION_FC_INT8_OP
src/operators/kernel/arm/fusion_fc_kernel.cpp
浏览文件 @
6ce11736
...
@@ -27,10 +27,27 @@ bool FusionFcKernel<CPU, float>::Init(FusionFcParam<CPU> *param) {
...
@@ -27,10 +27,27 @@ bool FusionFcKernel<CPU, float>::Init(FusionFcParam<CPU> *param) {
template
<
>
template
<
>
void
FusionFcKernel
<
CPU
,
float
>::
Compute
(
const
FusionFcParam
<
CPU
>
&
param
)
{
void
FusionFcKernel
<
CPU
,
float
>::
Compute
(
const
FusionFcParam
<
CPU
>
&
param
)
{
FusionFcCompute
<
float
>
(
param
);
FusionFcCompute
<
float
,
float
>
(
param
);
param
.
Out
()
->
set_lod
(
param
.
InputX
()
->
lod
());
param
.
Out
()
->
set_lod
(
param
.
InputX
()
->
lod
());
}
}
template
class
FusionFcKernel
<
CPU
,
float
>;
#ifdef FUSION_FC_INT8_OP
template
<
>
bool
FusionFcKernel
<
CPU
,
int8_t
>::
Init
(
FusionFcParam
<
CPU
>
*
param
)
{
return
true
;
}
template
<
>
void
FusionFcKernel
<
CPU
,
int8_t
>::
Compute
(
const
FusionFcParam
<
CPU
>
&
param
)
{
FusionFcCompute
<
int8_t
,
int32_t
>
(
param
);
param
.
Out
()
->
set_lod
(
param
.
InputX
()
->
lod
());
}
template
class
FusionFcKernel
<
CPU
,
int8_t
>;
#endif
}
// namespace operators
}
// namespace operators
}
// namespace paddle_mobile
}
// namespace paddle_mobile
...
...
src/operators/kernel/central-arm-func/conv_add_relu_arm_func.h
浏览文件 @
6ce11736
...
@@ -39,8 +39,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> ¶m) {
...
@@ -39,8 +39,7 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> ¶m) {
float
beta
=
1.0
f
;
float
beta
=
1.0
f
;
#ifdef FUSION_CONVADDRELU_INT8_OP
#ifdef FUSION_CONVADDRELU_INT8_OP
Tensor
scale
=
*
param
.
InputScale
();
alpha
=
param
.
InputScale
()
->
data
<
float
>
()[
0
];
alpha
=
scale
.
data
<
float
>
()[
0
];
beta
=
0.0
f
;
beta
=
0.0
f
;
#endif
#endif
...
...
src/operators/kernel/central-arm-func/fusion_fc_arm_func.h
浏览文件 @
6ce11736
...
@@ -15,23 +15,29 @@ limitations under the License. */
...
@@ -15,23 +15,29 @@ limitations under the License. */
#ifdef FUSION_FC_OP
#ifdef FUSION_FC_OP
#pragma once
#pragma once
#include <type_traits>
#include "operators/math/math_function.h"
#include "operators/math/math_function.h"
#include "operators/op_param.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
template
<
typename
P
>
template
<
typename
P
,
typename
S
>
void
FusionFcCompute
(
const
FusionFcParam
<
CPU
>
&
param
)
{
void
FusionFcCompute
(
const
FusionFcParam
<
CPU
>
&
param
)
{
const
Tensor
*
input_x
=
param
.
InputX
();
const
Tensor
*
input_x
=
param
.
InputX
();
const
Tensor
*
input_y
=
param
.
InputY
();
const
Tensor
*
input_y
=
param
.
InputY
();
const
Tensor
*
input_z
=
param
.
InputZ
();
Tensor
*
input_z
=
param
.
InputZ
();
auto
*
input_z_data
=
input_z
->
data
<
float
>
();
S
*
input_z_data
=
input_z
->
data
<
S
>
();
int
axis
=
param
.
Axis
();
int
axis
=
param
.
Axis
();
Tensor
*
out
=
param
.
Out
();
Tensor
*
out
=
param
.
Out
();
// int m = out->dims()[0];
// int m = out->dims()[0];
// int n = out->dims()[1];
// int n = out->dims()[1];
auto
*
out_data
=
out
->
mutable_data
<
float
>
();
auto
*
out_data
=
out
->
mutable_data
<
P
>
();
float
alpha
=
1.0
f
;
float
beta
=
1.0
f
;
const
Tensor
x_matrix
=
const
Tensor
x_matrix
=
input_x
->
dims
().
size
()
>
2
input_x
->
dims
().
size
()
>
2
?
framework
::
ReshapeToMatrix
(
*
input_x
,
param
.
XNumColDims
())
?
framework
::
ReshapeToMatrix
(
*
input_x
,
param
.
XNumColDims
())
...
@@ -51,21 +57,28 @@ void FusionFcCompute(const FusionFcParam<CPU> ¶m) {
...
@@ -51,21 +57,28 @@ void FusionFcCompute(const FusionFcParam<CPU> ¶m) {
axis
=
(
axis
==
-
1
?
out_dim
.
size
()
-
input_z
->
dims
().
size
()
:
axis
);
axis
=
(
axis
==
-
1
?
out_dim
.
size
()
-
input_z
->
dims
().
size
()
:
axis
);
PADDLE_MOBILE_ENFORCE
(
axis
==
1
,
" to fit broadcast, axis = 1. "
);
PADDLE_MOBILE_ENFORCE
(
axis
==
1
,
" to fit broadcast, axis = 1. "
);
int64_t
classes
=
input_z
->
numel
();
if
(
std
::
is_same
<
P
,
int8_t
>::
value
)
{
for
(
int
i
=
0
;
i
<
out_dim
[
0
];
i
++
)
{
#ifdef FUSION_FC_INT8_OP
memory
::
Copy
(
out_data
+
i
*
classes
,
input_z_data
,
sizeof
(
float
)
*
classes
);
alpha
=
param
.
InputScale
()
->
data
<
float
>
()[
0
];
}
beta
=
0.0
f
;
math
::
matmul
(
x_matrix
,
false
,
y_matrix
,
false
,
alpha
,
out
,
beta
,
false
,
input_z_data
,
true
);
#endif
}
else
{
// bias_data的维度和out的第二个维度一致
int64_t
classes
=
input_z
->
numel
();
for
(
int
i
=
0
;
i
<
out_dim
[
0
];
i
++
)
{
memory
::
Copy
(
out_data
+
i
*
classes
,
input_z_data
,
sizeof
(
float
)
*
classes
);
}
// for (int i = 0; i < out->numel(); i++) {
math
::
matmul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
alpha
,
out
,
beta
,
// DLOG << out_data[i];
false
);
// }
}
// bias_data的维度和out的维度一致
math
::
matmul
<
float
>
(
x_matrix
,
false
,
y_matrix
,
false
,
static_cast
<
float
>
(
1
),
out
,
static_cast
<
float
>
(
1
),
false
);
PADDLE_MOBILE_ENFORCE
(
out_dim
.
size
()
==
2
,
" out_dim.size must be 2."
);
PADDLE_MOBILE_ENFORCE
(
out_dim
.
size
()
==
2
,
" out_dim.size must be 2."
);
//
if (out_dim.size() != 2) {
// if (out_dim.size() != 2) {
//
out->Resize(out_dim);
// out->Resize(out_dim);
//
}
// }
}
}
}
// namespace operators
}
// namespace operators
...
...
src/operators/math/gemm.cpp
浏览文件 @
6ce11736
...
@@ -2924,7 +2924,6 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
...
@@ -2924,7 +2924,6 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
#endif // __ARM_NEON
#endif // __ARM_NEON
// 32位 float 矩阵乘法
// 32位 float 矩阵乘法
template
<
>
void
Gemm
::
Sgemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
void
Gemm
::
Sgemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
)
{
bool
relu
,
float
*
bias
)
{
...
@@ -3147,7 +3146,6 @@ void Gemm::SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
...
@@ -3147,7 +3146,6 @@ void Gemm::SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
}
}
// 32位 float 矩阵乘法
// 32位 float 矩阵乘法
template
<
>
void
Gemm
::
Sgemm_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
void
Gemm
::
Sgemm_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
)
{
bool
relu
,
float
*
bias
)
{
...
...
src/operators/math/gemm.h
浏览文件 @
6ce11736
...
@@ -167,14 +167,25 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
...
@@ -167,14 +167,25 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
float *new_bias);
float *new_bias);
*/
*/
// 32位 float 矩阵乘法
void
Sgemm
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
);
// 32位 float 矩阵乘法, 并对结果进行 batchnrom
// 32位 float 矩阵乘法, 并对结果进行 batchnrom
void
SgemmWithBn
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
void
SgemmWithBn
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
);
bool
relu
,
float
*
new_scale
,
float
*
new_bias
,
float
*
bias
);
void
SgemmWithPRelu
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
void
SgemmWithPRelu
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
*
C
,
int
ldc
,
float
*
p
,
const
float
*
B
,
int
ldb
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
// 32位 float 矩阵乘法(openmp 多线程版本)
void
Sgemm_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
bias
);
// 32位 float 矩阵乘法, 并对结果进行 batchnrom(openmp 多线程版本)
// 32位 float 矩阵乘法, 并对结果进行 batchnrom(openmp 多线程版本)
void
SgemmWithBn_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
void
SgemmWithBn_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
...
@@ -202,7 +213,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
...
@@ -202,7 +213,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
template
<
typename
Otype
>
template
<
typename
Otype
>
void
InnerKernelWithBias
(
int32_t
mc
,
int32_t
nc
,
float
alpha
,
const
int8_t
*
a
,
void
InnerKernelWithBias
(
int32_t
mc
,
int32_t
nc
,
float
alpha
,
const
int8_t
*
a
,
const
int8_t
*
b
,
float
beta
,
int32_t
*
c
,
Otype
*
C
,
const
int8_t
*
b
,
float
beta
,
int32_t
*
c
,
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
);
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
,
bool
addOnRow
=
false
);
// 8 bits int pack function
// 8 bits int pack function
void
PackMatrixA_4r
(
int32_t
m
,
int32_t
k
,
int32_t
m_tail
,
const
int8_t
*
A
,
void
PackMatrixA_4r
(
int32_t
m
,
int32_t
k
,
int32_t
m_tail
,
const
int8_t
*
A
,
...
@@ -228,28 +240,32 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
...
@@ -228,28 +240,32 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
template
<
typename
Itype
,
typename
Btype
,
typename
Otype
>
template
<
typename
Itype
,
typename
Btype
,
typename
Otype
>
void
Sgemm_omp
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
const
Itype
*
A
,
void
Sgemm_omp
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
const
Itype
*
A
,
int32_t
lda
,
const
Itype
*
B
,
int32_t
ldb
,
float
beta
,
Otype
*
C
,
int32_t
lda
,
const
Itype
*
B
,
int32_t
ldb
,
float
beta
,
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
Btype
*
bias
);
int32_t
ldc
,
bool
relu
,
Btype
*
bias
,
bool
addOnRow
=
false
);
template
<
typename
Otype
>
template
<
typename
Otype
>
void
Sgemm_omp
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
const
int8_t
*
A
,
void
Sgemm_omp
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
const
int8_t
*
A
,
int32_t
lda
,
const
int8_t
*
B
,
int32_t
ldb
,
float
beta
,
int32_t
lda
,
const
int8_t
*
B
,
int32_t
ldb
,
float
beta
,
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
);
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
,
bool
addOnRow
=
false
);
template
<
typename
Itype
,
typename
Btype
,
typename
Otype
>
template
<
typename
Itype
,
typename
Btype
,
typename
Otype
>
void
Sgemm
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
const
Itype
*
A
,
void
Sgemm
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
const
Itype
*
A
,
int32_t
lda
,
const
Itype
*
B
,
int32_t
ldb
,
float
beta
,
Otype
*
C
,
int32_t
lda
,
const
Itype
*
B
,
int32_t
ldb
,
float
beta
,
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
Btype
*
bias
);
int32_t
ldc
,
bool
relu
,
Btype
*
bias
,
bool
addOnRow
=
false
);
template
<
typename
Otype
>
template
<
typename
Otype
>
void
Sgemm
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
const
int8_t
*
A
,
void
Sgemm
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
const
int8_t
*
A
,
int32_t
lda
,
const
int8_t
*
B
,
int32_t
ldb
,
float
beta
,
Otype
*
C
,
int32_t
lda
,
const
int8_t
*
B
,
int32_t
ldb
,
float
beta
,
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
);
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
,
bool
addOnRow
=
false
);
// 8 bits int write back
// 8 bits int write back
// C = A * B
// C = A * B
void
WriteBasic
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int32_t
*
C
,
int32_t
ldc
);
void
WriteBasic
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int32_t
*
C
,
int32_t
ldc
);
// C = A * B + bias, scale * relu(C)
// C = A * B + bias, scale * relu(C)
void
WriteWithAddReluScale
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int8_t
*
C
,
void
WriteWithAddReluScale
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int8_t
*
C
,
int32_t
ldc
,
int32_t
*
bias
,
float
scale
);
int32_t
ldc
,
int32_t
*
bias
,
float
scale
);
// C = A * B + bias, scale * C
// C = A * B + bias, scale * C
, bias is added on column
void
WriteWithAddScale
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int8_t
*
C
,
void
WriteWithAddScale
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int8_t
*
C
,
int32_t
ldc
,
int32_t
*
bias
,
float
scale
);
int32_t
ldc
,
int32_t
*
bias
,
float
scale
);
// C = A * B + bias, scale * C, bias is added on row
void
WriteWithAddScaleT
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int8_t
*
C
,
int32_t
ldc
,
int32_t
*
bias
,
float
scale
);
private:
private:
int
MC
=
0
;
int
MC
=
0
;
...
@@ -273,7 +289,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
...
@@ -273,7 +289,8 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
template
<
typename
Otype
>
template
<
typename
Otype
>
void
Gemm
::
Sgemm
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
const
int8_t
*
A
,
void
Gemm
::
Sgemm
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
const
int8_t
*
A
,
int32_t
lda
,
const
int8_t
*
B
,
int32_t
ldb
,
float
beta
,
int32_t
lda
,
const
int8_t
*
B
,
int32_t
ldb
,
float
beta
,
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
)
{
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
,
bool
addOnRow
)
{
// L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
// L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
// L2 cache is 0.5~4 Mib (Contex-A72 cluster)
// L2 cache is 0.5~4 Mib (Contex-A72 cluster)
int32_t
L1
=
32
*
1024
;
int32_t
L1
=
32
*
1024
;
...
@@ -322,8 +339,15 @@ void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
...
@@ -322,8 +339,15 @@ void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
InnerKernel
(
mc
,
nc
,
alpha
,
packedA_int8
,
packedB_int8
,
beta
,
InnerKernel
(
mc
,
nc
,
alpha
,
packedA_int8
,
packedB_int8
,
beta
,
packedC_int32
,
&
C
(
i
,
j
),
ldc
,
relu
);
packedC_int32
,
&
C
(
i
,
j
),
ldc
,
relu
);
}
else
{
}
else
{
InnerKernelWithBias
(
mc
,
nc
,
alpha
,
packedA_int8
,
packedB_int8
,
beta
,
if
(
addOnRow
)
{
packedC_int32
,
&
C
(
i
,
j
),
ldc
,
relu
,
bias
+
i
);
InnerKernelWithBias
(
mc
,
nc
,
alpha
,
packedA_int8
,
packedB_int8
,
beta
,
packedC_int32
,
&
C
(
i
,
j
),
ldc
,
relu
,
bias
+
j
,
addOnRow
);
}
else
{
InnerKernelWithBias
(
mc
,
nc
,
alpha
,
packedA_int8
,
packedB_int8
,
beta
,
packedC_int32
,
&
C
(
i
,
j
),
ldc
,
relu
,
bias
+
i
,
addOnRow
);
}
}
}
}
}
}
}
...
@@ -339,7 +363,7 @@ template <typename Otype>
...
@@ -339,7 +363,7 @@ template <typename Otype>
void
Gemm
::
Sgemm_omp
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
void
Gemm
::
Sgemm_omp
(
int32_t
m
,
int32_t
n
,
int32_t
k
,
float
alpha
,
const
int8_t
*
A
,
int32_t
lda
,
const
int8_t
*
B
,
int32_t
ldb
,
const
int8_t
*
A
,
int32_t
lda
,
const
int8_t
*
B
,
int32_t
ldb
,
float
beta
,
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
float
beta
,
Otype
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
)
{
int32_t
*
bias
,
bool
addOnRow
)
{
#ifdef _OPENMP
#ifdef _OPENMP
int32_t
max_threads
=
omp_get_max_threads
();
int32_t
max_threads
=
omp_get_max_threads
();
#else
#else
...
@@ -422,8 +446,13 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
...
@@ -422,8 +446,13 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
InnerKernel
(
mc
,
n
,
alpha
,
local_A
,
packedB_int8
,
beta
,
local_C
,
InnerKernel
(
mc
,
n
,
alpha
,
local_A
,
packedB_int8
,
beta
,
local_C
,
&
C
(
i
,
0
),
ldc
,
relu
);
&
C
(
i
,
0
),
ldc
,
relu
);
}
else
{
}
else
{
InnerKernelWithBias
(
mc
,
n
,
alpha
,
local_A
,
packedB_int8
,
beta
,
local_C
,
if
(
addOnRow
)
{
&
C
(
i
,
0
),
ldc
,
relu
,
bias
+
i
);
InnerKernelWithBias
(
mc
,
n
,
alpha
,
local_A
,
packedB_int8
,
beta
,
local_C
,
&
C
(
i
,
0
),
ldc
,
relu
,
bias
,
addOnRow
);
}
else
{
InnerKernelWithBias
(
mc
,
n
,
alpha
,
local_A
,
packedB_int8
,
beta
,
local_C
,
&
C
(
i
,
0
),
ldc
,
relu
,
bias
+
i
,
addOnRow
);
}
}
}
}
}
}
else
{
}
else
{
...
@@ -447,8 +476,13 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
...
@@ -447,8 +476,13 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
InnerKernel
(
m
,
nc
,
alpha
,
packedA_int8
,
local_B
,
beta
,
local_C
,
InnerKernel
(
m
,
nc
,
alpha
,
packedA_int8
,
local_B
,
beta
,
local_C
,
&
C
(
0
,
j
),
ldc
,
relu
);
&
C
(
0
,
j
),
ldc
,
relu
);
}
else
{
}
else
{
InnerKernelWithBias
(
m
,
nc
,
alpha
,
packedA_int8
,
local_B
,
beta
,
local_C
,
if
(
addOnRow
)
{
&
C
(
0
,
j
),
ldc
,
relu
,
bias
);
InnerKernelWithBias
(
m
,
nc
,
alpha
,
packedA_int8
,
local_B
,
beta
,
local_C
,
&
C
(
0
,
j
),
ldc
,
relu
,
bias
+
j
,
addOnRow
);
}
else
{
InnerKernelWithBias
(
m
,
nc
,
alpha
,
packedA_int8
,
local_B
,
beta
,
local_C
,
&
C
(
0
,
j
),
ldc
,
relu
,
bias
,
addOnRow
);
}
}
}
}
}
}
}
...
...
src/operators/math/gemm_int8.cpp
浏览文件 @
6ce11736
...
@@ -699,7 +699,7 @@ template <>
...
@@ -699,7 +699,7 @@ template <>
void
Gemm
::
InnerKernelWithBias
(
int32_t
mc
,
int32_t
nc
,
float
alpha
,
void
Gemm
::
InnerKernelWithBias
(
int32_t
mc
,
int32_t
nc
,
float
alpha
,
const
int8_t
*
a
,
const
int8_t
*
b
,
float
beta
,
const
int8_t
*
a
,
const
int8_t
*
b
,
float
beta
,
int32_t
*
c
,
int8_t
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
c
,
int8_t
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
)
{
int32_t
*
bias
,
bool
addOnRow
)
{
#pragma omp parallel for
#pragma omp parallel for
for
(
int32_t
j
=
0
;
j
<
nc
;
j
+=
NR_INT8
)
{
for
(
int32_t
j
=
0
;
j
<
nc
;
j
+=
NR_INT8
)
{
for
(
int32_t
i
=
0
;
i
<
mc
;
i
+=
MR_INT8
)
{
for
(
int32_t
i
=
0
;
i
<
mc
;
i
+=
MR_INT8
)
{
...
@@ -716,7 +716,11 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
...
@@ -716,7 +716,11 @@ void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
WriteWithAddReluScale
(
mc
,
nc
,
c
,
C
,
ldc
,
bias
,
alpha
);
WriteWithAddReluScale
(
mc
,
nc
,
c
,
C
,
ldc
,
bias
,
alpha
);
return
;
return
;
}
else
{
}
else
{
WriteWithAddScale
(
mc
,
nc
,
c
,
C
,
ldc
,
bias
,
alpha
);
if
(
addOnRow
)
{
WriteWithAddScaleT
(
mc
,
nc
,
c
,
C
,
ldc
,
bias
,
alpha
);
}
else
{
WriteWithAddScale
(
mc
,
nc
,
c
,
C
,
ldc
,
bias
,
alpha
);
}
}
}
}
}
...
@@ -724,7 +728,7 @@ template <>
...
@@ -724,7 +728,7 @@ template <>
void
Gemm
::
InnerKernelWithBias
(
int32_t
mc
,
int32_t
nc
,
float
alpha
,
void
Gemm
::
InnerKernelWithBias
(
int32_t
mc
,
int32_t
nc
,
float
alpha
,
const
int8_t
*
a
,
const
int8_t
*
b
,
float
beta
,
const
int8_t
*
a
,
const
int8_t
*
b
,
float
beta
,
int32_t
*
c
,
int32_t
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
c
,
int32_t
*
C
,
int32_t
ldc
,
bool
relu
,
int32_t
*
bias
)
{}
int32_t
*
bias
,
bool
addOnRow
)
{}
// 8 bits int PackMatrixA_4r
// 8 bits int PackMatrixA_4r
void
Gemm
::
PackMatrixA_4r_16
(
int32_t
m
,
int32_t
k
,
int32_t
m_tail
,
void
Gemm
::
PackMatrixA_4r_16
(
int32_t
m
,
int32_t
k
,
int32_t
m_tail
,
...
@@ -1159,14 +1163,13 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
...
@@ -1159,14 +1163,13 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
#endif // __ARM_NEON
#endif // __ARM_NEON
}
}
// C = A * B + bias, scale * C
// C = A * B + bias, scale * C
, bias is added on column
void
Gemm
::
WriteWithAddScale
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int8_t
*
C
,
void
Gemm
::
WriteWithAddScale
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int8_t
*
C
,
int32_t
ldc
,
int32_t
*
bias
,
float
scale
)
{
int32_t
ldc
,
int32_t
*
bias
,
float
scale
)
{
#if __ARM_NEON
#if __ARM_NEON
#if __aarch64__
#if __aarch64__
// TODO
// TODO
#else
#else
int32_t
zero
=
0
;
int8_t
narrow
=
-
128
;
int8_t
narrow
=
-
128
;
int32_t
nc1
=
nc
>>
3
;
int32_t
nc1
=
nc
>>
3
;
int32_t
_nc1
=
nc
&
7
;
int32_t
_nc1
=
nc
&
7
;
...
@@ -1184,7 +1187,6 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
...
@@ -1184,7 +1187,6 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
"subs %[mc], %[mc], #1
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"blt end_mc_%=
\n\t
"
"blt end_mc_%=
\n\t
"
"vdup.32 q15, %[scale]
\n\t
"
"vdup.32 q15, %[scale]
\n\t
"
"vdup.32 q14, %[zero]
\n\t
"
"vdup.8 d24, %[narrow]
\n\t
"
"vdup.8 d24, %[narrow]
\n\t
"
"loop_mc_%=:
\n\t
"
"loop_mc_%=:
\n\t
"
"vld1.32 {d26[0]}, [%[bias_ptr]]!
\n\t
"
"vld1.32 {d26[0]}, [%[bias_ptr]]!
\n\t
"
...
@@ -1222,9 +1224,9 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
...
@@ -1222,9 +1224,9 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
:
:
:
[
C_ptr
]
"r"
(
C_ptr
),
[
c_ptr
]
"r"
(
c_ptr
),
[
mc
]
"r"
(
m
),
[
nc1
]
"r"
(
n
),
:
[
C_ptr
]
"r"
(
C_ptr
),
[
c_ptr
]
"r"
(
c_ptr
),
[
mc
]
"r"
(
m
),
[
nc1
]
"r"
(
n
),
[
step
]
"r"
(
step
),
[
step1
]
"r"
(
step1
),
[
bias_ptr
]
"r"
(
bias_ptr
),
[
step
]
"r"
(
step
),
[
step1
]
"r"
(
step1
),
[
bias_ptr
]
"r"
(
bias_ptr
),
[
scale
]
"r"
(
scale
),
[
zero
]
"r"
(
zero
),
[
narrow
]
"r"
(
narrow
)
[
scale
]
"r"
(
scale
),
[
narrow
]
"r"
(
narrow
)
:
"cc"
,
"memory"
,
"r5"
,
"r6"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
:
"cc"
,
"memory"
,
"r5"
,
"r6"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q12"
,
"q13"
,
"q1
4"
,
"q1
5"
);
"q7"
,
"q12"
,
"q13"
,
"q15"
);
}
}
int32_t
nc_left
;
int32_t
nc_left
;
...
@@ -1239,7 +1241,6 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
...
@@ -1239,7 +1241,6 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
nc_left
=
_nc1
;
nc_left
=
_nc1
;
asm
volatile
(
asm
volatile
(
"vdup.32 q15, %[scale]
\n\t
"
"vdup.32 q15, %[scale]
\n\t
"
"vdup.32 q14, %[zero]
\n\t
"
"vdup.8 d24, %[narrow]
\n\t
"
"vdup.8 d24, %[narrow]
\n\t
"
"vdup.32 q13, %[bias_v]
\n\t
"
"vdup.32 q13, %[bias_v]
\n\t
"
"cmp %[_nc1], #4
\n\t
"
"cmp %[_nc1], #4
\n\t
"
...
@@ -1260,7 +1261,7 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
...
@@ -1260,7 +1261,7 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
"subs %[_nc1], %[_nc1], #4
\n\t
"
"subs %[_nc1], %[_nc1], #4
\n\t
"
"beq process_over_%=
\n\t
"
"beq process_over_%=
\n\t
"
"less_four_%=:
\n\t
"
"less_four_%=:
\n\t
"
"vld1.32 {q0}, [%[c0]]
!
\n\t
"
"vld1.32 {q0}, [%[c0]]
\n\t
"
"vqadd.s32 q0, q0, q13
\n\t
"
"vqadd.s32 q0, q0, q13
\n\t
"
"vcvt.f32.s32 q1, q0
\n\t
"
"vcvt.f32.s32 q1, q0
\n\t
"
"vmul.f32 q1, q1, q15
\n\t
"
"vmul.f32 q1, q1, q15
\n\t
"
...
@@ -1277,17 +1278,138 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
...
@@ -1277,17 +1278,138 @@ void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
"process_over_%=:
\n\t
"
"process_over_%=:
\n\t
"
:
:
:
[
_nc1
]
"r"
(
nc_left
),
[
C0
]
"r"
(
C0
),
[
c0
]
"r"
(
c0
),
:
[
_nc1
]
"r"
(
nc_left
),
[
C0
]
"r"
(
C0
),
[
c0
]
"r"
(
c0
),
[
bias_v
]
"r"
(
bias_v
),
[
scale
]
"r"
(
scale
),
[
zero
]
"r"
(
zero
),
[
bias_v
]
"r"
(
bias_v
),
[
scale
]
"r"
(
scale
),
[
narrow
]
"r"
(
narrow
)
[
narrow
]
"r"
(
narrow
)
:
"cc"
,
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q12"
,
"q13"
,
"q15"
);
:
"cc"
,
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q12"
,
"q13"
,
"q14"
,
}
"q15"
);
}
#endif // __aarch64__
#endif // __ARM_NEON
}
// C = A * B + bias, scale * C, bias is added on row
void
Gemm
::
WriteWithAddScaleT
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int8_t
*
C
,
int32_t
ldc
,
int32_t
*
bias
,
float
scale
)
{
#if __ARM_NEON
#if __aarch64__
// TODO
#else
int8_t
narrow
=
-
128
;
int32_t
nc1
=
nc
>>
3
;
int32_t
_nc1
=
nc
&
7
;
int32_t
step
=
sizeof
(
int8_t
)
*
ldc
;
int32_t
step1
=
sizeof
(
int32_t
)
*
(
NC
-
(
nc1
<<
3
));
int32_t
volatile
m
=
mc
;
int32_t
volatile
n
=
nc1
;
int32_t
*
volatile
c_ptr
,
*
volatile
bias_ptr
;
int8_t
*
volatile
C_ptr
;
c_ptr
=
c
;
C_ptr
=
C
;
bias_ptr
=
bias
;
if
(
nc1
>
0
)
{
asm
volatile
(
"subs %[mc], %[mc], #1
\n\t
"
"blt end_mc_%=
\n\t
"
"vdup.32 q15, %[scale]
\n\t
"
"vdup.8 d24, %[narrow]
\n\t
"
"loop_mc_%=:
\n\t
"
"mov r4, %[bias_ptr]
\n\t
"
"mov r6, %[C_ptr]
\n\t
"
"mov r5, %[nc1]
\n\t
"
"subs r5, r5, #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"vld1.32 {q13, q14}, [r4]!
\n\t
"
"vld1.32 {q0, q1}, [%[c_ptr]]!
\n\t
"
"vqadd.s32 q0, q0, q13
\n\t
"
"vqadd.s32 q1, q1, q14
\n\t
"
"vcvt.f32.s32 q2, q0
\n\t
"
"vcvt.f32.s32 q3, q1
\n\t
"
"vmul.f32 q2, q2, q15
\n\t
"
"vmul.f32 q3, q3, q15
\n\t
"
"vcvt.s32.f32 q4, q2
\n\t
"
"vcvt.s32.f32 q5, q3
\n\t
"
"vqmovn.s32 d12, q4
\n\t
"
"vqmovn.s32 d13, q5
\n\t
"
"vqmovn.s16 d14, q6
\n\t
"
"vceq.s8 d15, d14, d24
\n\t
"
"vsub.s8 d14, d14, d15
\n\t
"
"vst1.8 {d14}, [r6]!
\n\t
"
"subs r5, r5, #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
"add %[C_ptr], %[C_ptr], %[step]
\n\t
"
"add %[c_ptr], %[c_ptr], %[step1]
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"bge loop_mc_%=
\n\t
"
"end_mc_%=:
\n\t
"
:
:
[
C_ptr
]
"r"
(
C_ptr
),
[
c_ptr
]
"r"
(
c_ptr
),
[
mc
]
"r"
(
m
),
[
nc1
]
"r"
(
n
),
[
step
]
"r"
(
step
),
[
step1
]
"r"
(
step1
),
[
bias_ptr
]
"r"
(
bias_ptr
),
[
scale
]
"r"
(
scale
),
[
narrow
]
"r"
(
narrow
)
:
"cc"
,
"memory"
,
"r4"
,
"r5"
,
"r6"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q12"
,
"q13"
,
"q15"
);
}
int32_t
nc_left
;
int32_t
*
c0
;
int8_t
*
C0
;
int32_t
*
volatile
bias0
=
bias_ptr
+
nc1
*
8
;
if
(
_nc1
!=
0
)
{
for
(
int32_t
i
=
0
;
i
<
mc
;
i
++
)
{
C0
=
C_ptr
+
nc1
*
8
+
i
*
ldc
;
c0
=
c_ptr
+
nc1
*
8
+
i
*
NC
;
nc_left
=
_nc1
;
asm
volatile
(
"vdup.32 q15, %[scale]
\n\t
"
"vdup.8 d24, %[narrow]
\n\t
"
"cmp %[_nc1], #4
\n\t
"
"blt less_four_%=
\n\t
"
"vld1.32 {q0}, [%[c0]]!
\n\t
"
"vld1.32 {q13}, [%[bias0]]!
\n\t
"
"vqadd.s32 q0, q0, q13
\n\t
"
"vcvt.f32.s32 q1, q0
\n\t
"
"vmul.f32 q1, q1, q15
\n\t
"
"vcvt.s32.f32 q2, q1
\n\t
"
"vqmovn.s32 d6, q2
\n\t
"
"vqmovn.s16 d8, q3
\n\t
"
"vceq.s8 d9, d8, d24
\n\t
"
"vsub.s8 d8, d8, d9
\n\t
"
"vst1.8 {d8[0]}, [%[C0]]!
\n\t
"
"vst1.8 {d8[1]}, [%[C0]]!
\n\t
"
"vst1.8 {d8[2]}, [%[C0]]!
\n\t
"
"vst1.8 {d8[3]}, [%[C0]]!
\n\t
"
"subs %[_nc1], %[_nc1], #4
\n\t
"
"beq process_over_%=
\n\t
"
"less_four_%=:
\n\t
"
"vld1.32 {q0}, [%[c0]]
\n\t
"
"vld1.32 {q13}, [%[bias0]]
\n\t
"
"vqadd.s32 q0, q0, q13
\n\t
"
"vcvt.f32.s32 q1, q0
\n\t
"
"vmul.f32 q1, q1, q15
\n\t
"
"vcvt.s32.f32 q2, q1
\n\t
"
"vqmovn.s32 d6, q2
\n\t
"
"vqmovn.s16 d8, q3
\n\t
"
"vceq.s8 d9, d8, d24
\n\t
"
"vsub.s8 d8, d8, d9
\n\t
"
"loop_save_%=:
\n\t
"
"vst1.8 {d8[0]}, [%[C0]]!
\n\t
"
"vext.8 d8, d8, d8, #1
\n\t
"
"subs %[_nc1], %[_nc1], #1
\n\t
"
"bgt loop_save_%=
\n\t
"
"process_over_%=:
\n\t
"
:
:
[
_nc1
]
"r"
(
nc_left
),
[
C0
]
"r"
(
C0
),
[
c0
]
"r"
(
c0
),
[
bias0
]
"r"
(
bias0
),
[
scale
]
"r"
(
scale
),
[
narrow
]
"r"
(
narrow
)
:
"cc"
,
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q12"
,
"q13"
,
"q15"
);
}
}
}
}
#endif // __aarch64__
#endif // __aarch64__
#endif // __ARM_NEON
#endif // __ARM_NEON
}
}
// C = A * B + bias, scale * relu(C)
// C = A * B + bias, scale * relu(C)
, bias is added on column
void
Gemm
::
WriteWithAddReluScale
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int8_t
*
C
,
void
Gemm
::
WriteWithAddReluScale
(
int32_t
mc
,
int32_t
nc
,
int32_t
*
c
,
int8_t
*
C
,
int32_t
ldc
,
int32_t
*
bias
,
float
scale
)
{
int32_t
ldc
,
int32_t
*
bias
,
float
scale
)
{
#if __ARM_NEON
#if __ARM_NEON
...
...
src/operators/math/math_function.h
浏览文件 @
6ce11736
...
@@ -34,7 +34,7 @@ template <typename T, typename S>
...
@@ -34,7 +34,7 @@ template <typename T, typename S>
void
matmul
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
matmul
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
T
alpha
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
T
alpha
,
framework
::
Tensor
*
matrix_out
,
T
beta
,
bool
relu
=
false
,
framework
::
Tensor
*
matrix_out
,
T
beta
,
bool
relu
=
false
,
S
*
bias
=
nullptr
);
S
*
bias
=
nullptr
,
bool
addOnRow
=
false
);
template
<
typename
T
>
template
<
typename
T
>
void
matmulWithBn
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
matmulWithBn
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
...
...
src/operators/math/math_function_int8.cpp
浏览文件 @
6ce11736
...
@@ -24,8 +24,8 @@ namespace math {
...
@@ -24,8 +24,8 @@ namespace math {
template
<
>
template
<
>
void
matmul
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
void
matmul
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
float
alpha
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
framework
::
Tensor
*
matrix_out
,
float
beta
,
bool
relu
,
int32_t
*
bias
,
int32_t
*
bias
)
{
bool
addOnRow
)
{
auto
dim_a
=
matrix_a
.
dims
();
auto
dim_a
=
matrix_a
.
dims
();
auto
dim_b
=
matrix_b
.
dims
();
auto
dim_b
=
matrix_b
.
dims
();
auto
dim_out
=
matrix_out
->
dims
();
auto
dim_out
=
matrix_out
->
dims
();
...
@@ -55,18 +55,18 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
...
@@ -55,18 +55,18 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
#ifdef _OPENMP
#ifdef _OPENMP
if
(
bias
!=
nullptr
)
{
if
(
bias
!=
nullptr
)
{
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int8_t
>
(),
N
,
relu
,
bias
);
matrix_out
->
data
<
int8_t
>
(),
N
,
relu
,
bias
,
addOnRow
);
}
else
{
}
else
{
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int32_t
>
(),
N
,
relu
,
bias
);
matrix_out
->
data
<
int32_t
>
(),
N
,
relu
,
bias
,
addOnRow
);
}
}
#else
#else
if
(
bias
!=
nullptr
)
{
if
(
bias
!=
nullptr
)
{
gemm
.
Sgemm
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
gemm
.
Sgemm
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int8_t
>
(),
N
,
relu
,
bias
);
matrix_out
->
data
<
int8_t
>
(),
N
,
relu
,
bias
,
addOnRow
);
}
else
{
}
else
{
gemm
.
Sgemm
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
gemm
.
Sgemm
(
M
,
N
,
K
,
alpha
,
a
,
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int32_t
>
(),
N
,
relu
,
bias
);
matrix_out
->
data
<
int32_t
>
(),
N
,
relu
,
bias
,
addOnRow
);
}
}
#endif
#endif
}
else
{
}
else
{
...
@@ -74,21 +74,21 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
...
@@ -74,21 +74,21 @@ void matmul(const framework::Tensor &matrix_a, bool trans_a,
if
(
bias
!=
nullptr
)
{
if
(
bias
!=
nullptr
)
{
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
int8_t
>
(),
K
,
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
int8_t
>
(),
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int8_t
>
(),
N
,
relu
,
bias
);
matrix_out
->
data
<
int8_t
>
(),
N
,
relu
,
bias
,
addOnRow
);
}
else
{
}
else
{
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
int8_t
>
(),
K
,
gemm
.
Sgemm_omp
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
int8_t
>
(),
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int32_t
>
(),
N
,
relu
,
bias
);
matrix_out
->
data
<
int32_t
>
(),
N
,
relu
,
bias
,
addOnRow
);
}
}
#else
#else
if
(
bias
!=
nullptr
)
{
if
(
bias
!=
nullptr
)
{
gemm
.
Sgemm
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
int8_t
>
(),
K
,
gemm
.
Sgemm
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
int8_t
>
(),
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int8_t
>
(),
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int8_t
>
(),
N
,
relu
,
bias
);
N
,
relu
,
bias
,
addOnRow
);
}
else
{
}
else
{
gemm
.
Sgemm
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
int8_t
>
(),
K
,
gemm
.
Sgemm
(
M
,
N
,
K
,
alpha
,
matrix_a
.
data
<
int8_t
>
(),
K
,
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int32_t
>
(),
matrix_b
.
data
<
int8_t
>
(),
N
,
beta
,
matrix_out
->
data
<
int32_t
>
(),
N
,
relu
,
bias
);
N
,
relu
,
bias
,
addOnRow
);
}
}
#endif
#endif
}
}
...
...
src/operators/op_param.h
浏览文件 @
6ce11736
...
@@ -1632,6 +1632,10 @@ class FusionFcParam : public OpParam {
...
@@ -1632,6 +1632,10 @@ class FusionFcParam : public OpParam {
x_num_col_dims_
=
GetAttr
<
int
>
(
"x_num_col_dims"
,
attrs
);
x_num_col_dims_
=
GetAttr
<
int
>
(
"x_num_col_dims"
,
attrs
);
y_num_col_dims_
=
GetAttr
<
int
>
(
"y_num_col_dims"
,
attrs
);
y_num_col_dims_
=
GetAttr
<
int
>
(
"y_num_col_dims"
,
attrs
);
axis_
=
GetAttr
<
int
>
(
"axis"
,
attrs
);
axis_
=
GetAttr
<
int
>
(
"axis"
,
attrs
);
#ifdef FUSION_FC_INT8_OP
scale_
=
InputScaleFrom
<
GType
>
(
inputs
,
scope
);
#endif
}
}
GType
*
InputX
()
const
{
return
input_x_
;
}
GType
*
InputX
()
const
{
return
input_x_
;
}
...
@@ -1655,8 +1659,16 @@ class FusionFcParam : public OpParam {
...
@@ -1655,8 +1659,16 @@ class FusionFcParam : public OpParam {
int
x_num_col_dims_
;
int
x_num_col_dims_
;
int
y_num_col_dims_
;
int
y_num_col_dims_
;
int
axis_
;
int
axis_
;
#ifdef PADDLE_MOBILE_FPGA
#ifdef FUSION_FC_INT8_OP
public:
const
RType
*
InputScale
()
const
{
return
scale_
;
}
private:
RType
*
scale_
;
#endif
#ifdef PADDLE_MOBILE_FPGA
private:
private:
fpga
::
SplitConvArgs
fpga_conv_args
;
fpga
::
SplitConvArgs
fpga_conv_args
;
...
@@ -1717,7 +1729,7 @@ class FusionConvAddReluParam : public FusionConvAddParam<DeviceType> {
...
@@ -1717,7 +1729,7 @@ class FusionConvAddReluParam : public FusionConvAddParam<DeviceType> {
typedef
typename
DtypeTensorTrait
<
DeviceType
>::
rtype
RType
;
typedef
typename
DtypeTensorTrait
<
DeviceType
>::
rtype
RType
;
const
RType
*
InputScale
()
const
{
return
scale_
;
}
const
RType
*
InputScale
()
const
{
return
scale_
;
}
pr
otected
:
pr
ivate
:
RType
*
scale_
;
RType
*
scale_
;
#endif
#endif
};
};
...
...
test/common/test_gemm_accuracy.cpp
浏览文件 @
6ce11736
...
@@ -25,7 +25,7 @@ limitations under the License. */
...
@@ -25,7 +25,7 @@ limitations under the License. */
#define c(i, j) c[(i)*ldc + (j)]
#define c(i, j) c[(i)*ldc + (j)]
#define c1(i, j) c1[(i)*ldc + (j)]
#define c1(i, j) c1[(i)*ldc + (j)]
void
print_mat
ir
x
(
int
m
,
int
n
,
int
ldc
,
float
*
c
)
{
void
print_mat
ri
x
(
int
m
,
int
n
,
int
ldc
,
float
*
c
)
{
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
std
::
cout
<<
c
(
i
,
0
);
std
::
cout
<<
c
(
i
,
0
);
for
(
int
j
=
1
;
j
<
n
;
++
j
)
{
for
(
int
j
=
1
;
j
<
n
;
++
j
)
{
...
@@ -98,18 +98,20 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
...
@@ -98,18 +98,20 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
if
(
pr
>
0
)
{
if
(
pr
>
0
)
{
std
::
cout
<<
"A:"
<<
std
::
endl
;
std
::
cout
<<
"A:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
k
,
lda
,
a
);
print_mat
ri
x
(
m
,
k
,
lda
,
a
);
std
::
cout
<<
"B:"
<<
std
::
endl
;
std
::
cout
<<
"B:"
<<
std
::
endl
;
print_mat
ir
x
(
k
,
n
,
ldb
,
b
);
print_mat
ri
x
(
k
,
n
,
ldb
,
b
);
std
::
cout
<<
"C:"
<<
std
::
endl
;
std
::
cout
<<
"C:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
n
,
ldc
,
c
);
print_mat
ri
x
(
m
,
n
,
ldc
,
c
);
std
::
cout
<<
"C1:"
<<
std
::
endl
;
std
::
cout
<<
"C1:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
n
,
ldc
,
c1
);
print_mat
ri
x
(
m
,
n
,
ldc
,
c1
);
}
}
std
::
cout
<<
"mnk="
<<
m
<<
" "
<<
n
<<
" "
<<
k
<<
" relu="
<<
relu
std
::
cout
<<
"mnk="
<<
m
<<
" "
<<
n
<<
" "
<<
k
<<
" relu="
<<
relu
<<
" eq="
<<
eq
<<
" neq="
<<
neq
<<
std
::
endl
;
<<
" eq="
<<
eq
<<
" neq="
<<
neq
<<
std
::
endl
;
PADDLE_MOBILE_ENFORCE
(
neq
==
0
,
"The execution of do_sgemm is failed!"
);
paddle_mobile
::
memory
::
Free
(
a
);
paddle_mobile
::
memory
::
Free
(
a
);
paddle_mobile
::
memory
::
Free
(
b
);
paddle_mobile
::
memory
::
Free
(
b
);
paddle_mobile
::
memory
::
Free
(
c
);
paddle_mobile
::
memory
::
Free
(
c
);
...
...
test/common/test_gemm_int8_accuracy.cpp
浏览文件 @
6ce11736
...
@@ -17,6 +17,7 @@ limitations under the License. */
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include <iostream>
#include <iostream>
#include <limits>
#include <limits>
#include <random>
#include <random>
#include <type_traits>
#include "../test_helper.h"
#include "../test_helper.h"
#include "common/log.h"
#include "common/log.h"
#include "memory/t_malloc.h"
#include "memory/t_malloc.h"
...
@@ -33,24 +34,32 @@ limitations under the License. */
...
@@ -33,24 +34,32 @@ limitations under the License. */
using
std
::
default_random_engine
;
using
std
::
default_random_engine
;
using
std
::
uniform_int_distribution
;
using
std
::
uniform_int_distribution
;
void
print_matirx
(
int
m
,
int
n
,
int
ldc
,
int32_t
*
c
)
{
template
<
typename
T
>
void
print_matrix
(
int
m
,
int
n
,
int
ldc
,
T
*
c
)
{
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
std
::
cout
<<
c
(
i
,
0
);
if
(
std
::
is_same
<
T
,
int8_t
>::
value
)
{
for
(
int
j
=
1
;
j
<
n
;
++
j
)
{
std
::
cout
.
setf
(
std
::
ios
::
left
);
std
::
cout
<<
" | "
<<
c
(
i
,
j
);
std
::
cout
.
width
(
4
);
std
::
cout
<<
static_cast
<
int32_t
>
(
c
(
i
,
0
));
}
else
{
std
::
cout
.
setf
(
std
::
ios
::
left
);
std
::
cout
.
width
(
6
);
std
::
cout
<<
c
(
i
,
0
);
}
}
std
::
cout
<<
std
::
endl
;
}
std
::
cout
<<
std
::
endl
;
}
void
print_matirx
(
int
m
,
int
n
,
int
ldc
,
int8_t
*
c
)
{
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
std
::
cout
<<
static_cast
<
int32_t
>
(
c
(
i
,
0
));
for
(
int
j
=
1
;
j
<
n
;
++
j
)
{
for
(
int
j
=
1
;
j
<
n
;
++
j
)
{
std
::
cout
<<
" | "
<<
static_cast
<
int32_t
>
(
c
(
i
,
j
));
if
(
std
::
is_same
<
T
,
int8_t
>::
value
)
{
std
::
cout
<<
" | "
;
std
::
cout
.
setf
(
std
::
ios
::
left
);
std
::
cout
.
width
(
4
);
std
::
cout
<<
static_cast
<
int32_t
>
(
c
(
i
,
j
));
}
else
{
std
::
cout
<<
" | "
;
std
::
cout
.
setf
(
std
::
ios
::
left
);
std
::
cout
.
width
(
6
);
std
::
cout
<<
c
(
i
,
j
);
}
}
}
std
::
cout
<<
std
::
endl
;
std
::
cout
<<
"
\n
"
;
}
}
std
::
cout
<<
std
::
endl
;
std
::
cout
<<
std
::
endl
;
}
}
...
@@ -138,18 +147,20 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
...
@@ -138,18 +147,20 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
if
(
pr
>
0
)
{
if
(
pr
>
0
)
{
std
::
cout
<<
"A:"
<<
std
::
endl
;
std
::
cout
<<
"A:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
k
,
lda
,
a
);
print_mat
ri
x
(
m
,
k
,
lda
,
a
);
std
::
cout
<<
"B:"
<<
std
::
endl
;
std
::
cout
<<
"B:"
<<
std
::
endl
;
print_mat
ir
x
(
k
,
n
,
ldb
,
b
);
print_mat
ri
x
(
k
,
n
,
ldb
,
b
);
std
::
cout
<<
"C:"
<<
std
::
endl
;
std
::
cout
<<
"C:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
n
,
ldc
,
c
);
print_mat
ri
x
(
m
,
n
,
ldc
,
c
);
std
::
cout
<<
"C1:"
<<
std
::
endl
;
std
::
cout
<<
"C1:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
n
,
ldc
,
c1
);
print_mat
ri
x
(
m
,
n
,
ldc
,
c1
);
}
}
std
::
cout
<<
"mnk="
<<
m
<<
" "
<<
n
<<
" "
<<
k
<<
" relu="
<<
relu
std
::
cout
<<
"mnk="
<<
m
<<
" "
<<
n
<<
" "
<<
k
<<
" relu="
<<
relu
<<
" eq="
<<
eq
<<
" neq="
<<
neq
<<
std
::
endl
;
<<
" eq="
<<
eq
<<
" neq="
<<
neq
<<
std
::
endl
;
PADDLE_MOBILE_ENFORCE
(
neq
==
0
,
"The execution of do_sgemm is failed!"
);
paddle_mobile
::
memory
::
Free
(
a
);
paddle_mobile
::
memory
::
Free
(
a
);
paddle_mobile
::
memory
::
Free
(
b
);
paddle_mobile
::
memory
::
Free
(
b
);
paddle_mobile
::
memory
::
Free
(
c
);
paddle_mobile
::
memory
::
Free
(
c
);
...
@@ -158,7 +169,8 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
...
@@ -158,7 +169,8 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
return
0
;
return
0
;
}
}
int
do_sgemm_with_bias
(
int
m
,
int
n
,
int
k
,
bool
relu
,
int
pr
)
{
int
do_sgemm_with_bias
(
int
m
,
int
n
,
int
k
,
bool
relu
,
int
pr
,
bool
addOnRow
=
false
)
{
int
lda
=
k
;
int
lda
=
k
;
int
ldb
=
n
;
int
ldb
=
n
;
int
ldc
=
n
;
int
ldc
=
n
;
...
@@ -174,8 +186,14 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
...
@@ -174,8 +186,14 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
int8_t
*
c1
=
static_cast
<
int8_t
*>
(
int8_t
*
c1
=
static_cast
<
int8_t
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
int8_t
)
*
m
*
n
));
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
int8_t
)
*
m
*
n
));
int32_t
*
bias
=
int32_t
*
bias
=
nullptr
;
static_cast
<
int32_t
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
int32_t
)
*
m
));
if
(
addOnRow
)
{
bias
=
static_cast
<
int32_t
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
int32_t
)
*
n
));
}
else
{
bias
=
static_cast
<
int32_t
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
int32_t
)
*
m
));
}
for
(
int
i
=
0
;
i
<
m
*
k
;
++
i
)
{
for
(
int
i
=
0
;
i
<
m
*
k
;
++
i
)
{
a
[
i
]
=
pixel
(
e
);
a
[
i
]
=
pixel
(
e
);
...
@@ -183,29 +201,48 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
...
@@ -183,29 +201,48 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
for
(
int
i
=
0
;
i
<
k
*
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
k
*
n
;
++
i
)
{
b
[
i
]
=
pixel
(
e
);
b
[
i
]
=
pixel
(
e
);
}
}
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
bias
[
i
]
=
static_cast
<
int32_t
>
(
pixel
(
e
));
if
(
addOnRow
)
{
}
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
bias
[
i
]
=
static_cast
<
int32_t
>
(
pixel
(
e
));
int32_t
bias_v
=
bias
[
i
];
}
for
(
int
j
=
0
;
j
<
n
;
++
j
)
{
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
int32_t
r
=
0
;
for
(
int
j
=
0
;
j
<
n
;
++
j
)
{
for
(
int
p
=
0
;
p
<
k
;
p
++
)
{
int32_t
bias_v
=
bias
[
j
];
r
+=
static_cast
<
int32_t
>
(
a
(
i
,
p
))
*
static_cast
<
int32_t
>
(
b
(
p
,
j
));
int32_t
r
=
0
;
for
(
int
p
=
0
;
p
<
k
;
p
++
)
{
r
+=
static_cast
<
int32_t
>
(
a
(
i
,
p
))
*
static_cast
<
int32_t
>
(
b
(
p
,
j
));
}
r
=
qadd_int32
(
r
,
bias_v
);
if
(
relu
)
r
=
std
::
max
(
0
,
r
);
c1
(
i
,
j
)
=
qscale_int32
(
r
,
scale
);
}
}
}
else
{
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
bias
[
i
]
=
static_cast
<
int32_t
>
(
pixel
(
e
));
}
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
int32_t
bias_v
=
bias
[
i
];
for
(
int
j
=
0
;
j
<
n
;
++
j
)
{
int32_t
r
=
0
;
for
(
int
p
=
0
;
p
<
k
;
p
++
)
{
r
+=
static_cast
<
int32_t
>
(
a
(
i
,
p
))
*
static_cast
<
int32_t
>
(
b
(
p
,
j
));
}
r
=
qadd_int32
(
r
,
bias_v
);
if
(
relu
)
r
=
std
::
max
(
0
,
r
);
c1
(
i
,
j
)
=
qscale_int32
(
r
,
scale
);
}
}
r
=
qadd_int32
(
r
,
bias_v
);
if
(
relu
)
r
=
std
::
max
(
0
,
r
);
c1
(
i
,
j
)
=
qscale_int32
(
r
,
scale
);
}
}
}
}
paddle_mobile
::
operators
::
math
::
Gemm
gemm
;
paddle_mobile
::
operators
::
math
::
Gemm
gemm
;
#ifdef _OPENMP
#ifdef _OPENMP
gemm
.
Sgemm_omp
(
m
,
n
,
k
,
scale
,
a
,
lda
,
b
,
ldb
,
static_cast
<
float
>
(
0
),
c
,
ldc
,
gemm
.
Sgemm_omp
(
m
,
n
,
k
,
scale
,
a
,
lda
,
b
,
ldb
,
static_cast
<
float
>
(
0
),
c
,
ldc
,
relu
,
bias
);
relu
,
bias
,
addOnRow
);
#else
#else
gemm
.
Sgemm
(
m
,
n
,
k
,
scale
,
a
,
lda
,
b
,
ldb
,
static_cast
<
float
>
(
0
),
c
,
ldc
,
gemm
.
Sgemm
(
m
,
n
,
k
,
scale
,
a
,
lda
,
b
,
ldb
,
static_cast
<
float
>
(
0
),
c
,
ldc
,
relu
,
bias
);
relu
,
bias
,
addOnRow
);
#endif
#endif
int
eq
=
0
;
int
eq
=
0
;
int
neq
=
0
;
int
neq
=
0
;
...
@@ -219,20 +256,27 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
...
@@ -219,20 +256,27 @@ int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr) {
if
(
pr
>
0
)
{
if
(
pr
>
0
)
{
std
::
cout
<<
"A:"
<<
std
::
endl
;
std
::
cout
<<
"A:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
k
,
lda
,
a
);
print_mat
ri
x
(
m
,
k
,
lda
,
a
);
std
::
cout
<<
"B:"
<<
std
::
endl
;
std
::
cout
<<
"B:"
<<
std
::
endl
;
print_mat
ir
x
(
k
,
n
,
ldb
,
b
);
print_mat
ri
x
(
k
,
n
,
ldb
,
b
);
std
::
cout
<<
"Bias:"
<<
std
::
endl
;
std
::
cout
<<
"Bias:"
<<
std
::
endl
;
print_matirx
(
m
,
1
,
1
,
bias
);
if
(
addOnRow
)
{
print_matrix
(
1
,
n
,
n
,
bias
);
}
else
{
print_matrix
(
m
,
1
,
1
,
bias
);
}
std
::
cout
<<
"C:"
<<
std
::
endl
;
std
::
cout
<<
"C:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
n
,
ldc
,
c
);
print_mat
ri
x
(
m
,
n
,
ldc
,
c
);
std
::
cout
<<
"C1:"
<<
std
::
endl
;
std
::
cout
<<
"C1:"
<<
std
::
endl
;
print_mat
ir
x
(
m
,
n
,
ldc
,
c1
);
print_mat
ri
x
(
m
,
n
,
ldc
,
c1
);
}
}
std
::
cout
<<
"mnk="
<<
m
<<
" "
<<
n
<<
" "
<<
k
<<
" relu="
<<
relu
std
::
cout
<<
"mnk="
<<
m
<<
" "
<<
n
<<
" "
<<
k
<<
" relu="
<<
relu
<<
" eq="
<<
eq
<<
" neq="
<<
neq
<<
std
::
endl
;
<<
" eq="
<<
eq
<<
" neq="
<<
neq
<<
std
::
endl
;
PADDLE_MOBILE_ENFORCE
(
neq
==
0
,
"The execution of do_sgemm_with_bias is failed!"
);
paddle_mobile
::
memory
::
Free
(
a
);
paddle_mobile
::
memory
::
Free
(
a
);
paddle_mobile
::
memory
::
Free
(
b
);
paddle_mobile
::
memory
::
Free
(
b
);
paddle_mobile
::
memory
::
Free
(
c
);
paddle_mobile
::
memory
::
Free
(
c
);
...
@@ -261,7 +305,7 @@ int main() {
...
@@ -261,7 +305,7 @@ int main() {
std
::
cout
<<
"
\n\n
******************************************************
\n\n
"
std
::
cout
<<
"
\n\n
******************************************************
\n\n
"
<<
std
::
endl
;
<<
std
::
endl
;
std
::
cout
<<
"Test gemm with bias:"
<<
std
::
endl
;
std
::
cout
<<
"Test gemm with bias
(bias is added on column)
:"
<<
std
::
endl
;
do_sgemm_with_bias
(
9
,
9
,
9
,
false
,
1
);
do_sgemm_with_bias
(
9
,
9
,
9
,
false
,
1
);
do_sgemm_with_bias
(
10
,
6
,
12
,
false
,
0
);
do_sgemm_with_bias
(
10
,
6
,
12
,
false
,
0
);
do_sgemm_with_bias
(
512
,
256
,
384
,
false
,
0
);
do_sgemm_with_bias
(
512
,
256
,
384
,
false
,
0
);
...
@@ -272,6 +316,19 @@ int main() {
...
@@ -272,6 +316,19 @@ int main() {
do_sgemm_with_bias
(
333
,
797
,
939
,
false
,
0
);
do_sgemm_with_bias
(
333
,
797
,
939
,
false
,
0
);
do_sgemm_with_bias
(
1024
,
1024
,
1024
,
false
,
0
);
do_sgemm_with_bias
(
1024
,
1024
,
1024
,
false
,
0
);
std
::
cout
<<
"
\n\n
******************************************************
\n\n
"
<<
std
::
endl
;
std
::
cout
<<
"Test gemm with bias(bias is added on row):"
<<
std
::
endl
;
do_sgemm_with_bias
(
9
,
9
,
9
,
false
,
1
,
true
);
do_sgemm_with_bias
(
10
,
6
,
12
,
false
,
0
,
true
);
do_sgemm_with_bias
(
512
,
256
,
384
,
false
,
0
,
true
);
do_sgemm_with_bias
(
1366
,
768
,
256
,
false
,
0
,
true
);
do_sgemm_with_bias
(
1255
,
755
,
333
,
false
,
0
,
true
);
do_sgemm_with_bias
(
599
,
1133
,
393
,
false
,
0
,
true
);
do_sgemm_with_bias
(
777
,
555
,
999
,
false
,
0
,
true
);
do_sgemm_with_bias
(
333
,
797
,
939
,
false
,
0
,
true
);
do_sgemm_with_bias
(
1024
,
1024
,
1024
,
false
,
0
,
true
);
std
::
cout
<<
"
\n\n
******************************************************
\n\n
"
std
::
cout
<<
"
\n\n
******************************************************
\n\n
"
<<
std
::
endl
;
<<
std
::
endl
;
std
::
cout
<<
"Test gemm with relu and bias:"
<<
std
::
endl
;
std
::
cout
<<
"Test gemm with relu and bias:"
<<
std
::
endl
;
...
...
test/common/test_gemm_perf.cpp
浏览文件 @
6ce11736
...
@@ -49,7 +49,8 @@ int main() {
...
@@ -49,7 +49,8 @@ int main() {
auto
bbptr_int8
=
bb_int8
.
mutable_data
<
int8_t
>
({
k
,
n
});
auto
bbptr_int8
=
bb_int8
.
mutable_data
<
int8_t
>
({
k
,
n
});
auto
ccptr_int32
=
cc_int32
.
mutable_data
<
int32_t
>
({
m
,
n
});
auto
ccptr_int32
=
cc_int32
.
mutable_data
<
int32_t
>
({
m
,
n
});
auto
ccptr_int8
=
cc_int8
.
mutable_data
<
int8_t
>
({
m
,
n
});
auto
ccptr_int8
=
cc_int8
.
mutable_data
<
int8_t
>
({
m
,
n
});
int32_t
*
bias_data
=
new
int32_t
[
m
];
int32_t
*
bias_data_col
=
new
int32_t
[
m
];
int32_t
*
bias_data_row
=
new
int32_t
[
n
];
for
(
int
i
=
0
;
i
<
m
*
k
;
++
i
)
{
for
(
int
i
=
0
;
i
<
m
*
k
;
++
i
)
{
aaptr_int8
[
i
]
=
static_cast
<
int8_t
>
(
2
);
aaptr_int8
[
i
]
=
static_cast
<
int8_t
>
(
2
);
...
@@ -62,7 +63,11 @@ int main() {
...
@@ -62,7 +63,11 @@ int main() {
}
}
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
for
(
int
i
=
0
;
i
<
m
;
++
i
)
{
bias_data
[
i
]
=
2
;
bias_data_col
[
i
]
=
2
;
}
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
bias_data_row
[
i
]
=
2
;
}
}
// float
// float
...
@@ -73,14 +78,15 @@ int main() {
...
@@ -73,14 +78,15 @@ int main() {
false
,
nullptr
);
false
,
nullptr
);
}
}
auto
time
1
=
time
();
auto
time
_start0
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matmul
<
float
>
(
paddle_mobile
::
operators
::
math
::
matmul
<
float
>
(
aa
,
false
,
bb
,
false
,
static_cast
<
float
>
(
1
),
&
cc
,
static_cast
<
float
>
(
0
),
aa
,
false
,
bb
,
false
,
static_cast
<
float
>
(
1
),
&
cc
,
static_cast
<
float
>
(
0
),
false
,
nullptr
);
false
,
nullptr
);
}
}
auto
time2
=
time
();
auto
time_end0
=
time
();
std
::
cout
<<
"float gemm cost :"
<<
time_diff
(
time1
,
time2
)
/
10
<<
"ms
\n
"
;
std
::
cout
<<
"float gemm cost :"
<<
time_diff
(
time_start0
,
time_end0
)
/
10
<<
"ms
\n
"
;
// int8_t without bias
// int8_t without bias
// warm-up 10 times
// warm-up 10 times
...
@@ -90,33 +96,69 @@ int main() {
...
@@ -90,33 +96,69 @@ int main() {
static_cast
<
float
>
(
0
));
static_cast
<
float
>
(
0
));
}
}
auto
time
3
=
time
();
auto
time
_start1
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matmul
<
float
,
int32_t
>
(
paddle_mobile
::
operators
::
math
::
matmul
<
float
,
int32_t
>
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
1
),
&
cc_int32
,
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
1
),
&
cc_int32
,
static_cast
<
float
>
(
0
));
static_cast
<
float
>
(
0
));
}
}
auto
time4
=
time
();
auto
time_end1
=
time
();
std
::
cout
<<
"int8_t gemm cost :"
<<
time_diff
(
time3
,
time4
)
/
10
<<
"ms
\n
"
;
std
::
cout
<<
"int8_t gemm cost :"
<<
time_diff
(
time_start1
,
time_end1
)
/
10
<<
"ms
\n
"
;
// int8_t with bias, column element wise add
// warm-up 10 times
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matmul
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
false
,
bias_data_col
,
false
);
}
auto
time_start2
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matmul
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
false
,
bias_data_col
,
false
);
}
auto
time_end2
=
time
();
std
::
cout
<<
"int8_t gemm_with_bias(column add) cost :"
<<
time_diff
(
time_start2
,
time_end2
)
/
10
<<
"ms
\n
"
;
// int8_t with bias, row element wise add
// warm-up 10 times
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matmul
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
false
,
bias_data_row
,
true
);
}
auto
time_start3
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matmul
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
false
,
bias_data_row
,
true
);
}
auto
time_end3
=
time
();
std
::
cout
<<
"int8_t gemm_with_bias(row add) cost :"
<<
time_diff
(
time_start3
,
time_end3
)
/
10
<<
"ms
\n
"
;
// int8_t with bias&relu
// int8_t with bias&relu
// warm-up 10 times
// warm-up 10 times
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matmul
(
paddle_mobile
::
operators
::
math
::
matmul
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
true
,
bias_data
);
static_cast
<
float
>
(
0
),
true
,
bias_data
_col
,
false
);
}
}
auto
time
5
=
time
();
auto
time
_start4
=
time
();
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
for
(
int
j
=
0
;
j
<
10
;
++
j
)
{
paddle_mobile
::
operators
::
math
::
matmul
(
paddle_mobile
::
operators
::
math
::
matmul
(
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
aa_int8
,
false
,
bb_int8
,
false
,
static_cast
<
float
>
(
0.618
),
&
cc_int8
,
static_cast
<
float
>
(
0
),
true
,
bias_data
);
static_cast
<
float
>
(
0
),
true
,
bias_data
_col
,
false
);
}
}
auto
time
6
=
time
();
auto
time
_end4
=
time
();
std
::
cout
<<
"int8_t gemm_with_bias_relu cost :"
std
::
cout
<<
"int8_t gemm_with_bias_relu cost :"
<<
time_diff
(
time
5
,
time6
)
/
10
<<
"ms
\n
"
;
<<
time_diff
(
time
_start4
,
time_end4
)
/
10
<<
"ms
\n
"
;
delete
[]
bias_data
;
delete
[]
bias_data_row
;
delete
[]
bias_data_col
;
return
0
;
return
0
;
}
}
test/operators/test_fusion_conv_add_relu_int8_op.cpp
浏览文件 @
6ce11736
...
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#include <iostream>
#include <iostream>
#ifdef FUSION_CONVADDRELU_INT8_OP
#include <limits>
#include <limits>
#include "../test_helper.h"
#include "../test_helper.h"
#include "../test_include.h"
#include "../test_include.h"
...
@@ -356,5 +356,9 @@ int main(int argc, char *argv[]) {
...
@@ -356,5 +356,9 @@ int main(int argc, char *argv[]) {
paddle_mobile
::
TestConvOp
<
int8_t
,
5
,
2
,
1
>
(
in_channels
,
in_height
,
in_width
,
paddle_mobile
::
TestConvOp
<
int8_t
,
5
,
2
,
1
>
(
in_channels
,
in_height
,
in_width
,
out_channels
);
out_channels
);
}
}
#else
int
main
()
{
std
::
cout
<<
"FUSION_CONVADDRELU_INT8_OP is not defined!"
<<
std
::
endl
;
return
0
;
}
#endif
#endif
test/operators/test_fusion_fc_op.cpp
浏览文件 @
6ce11736
...
@@ -12,147 +12,163 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,147 +12,163 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include <framework/program/program-optimize/program_optimize.h>
#include <iostream>
#include <type_traits>
#include "../test_helper.h"
#include "../test_include.h"
#include "../test_include.h"
#include "framework/operator.h"
#include "operators/fusion_fc_int8_op.h"
#include "operators/fusion_fc_op.h"
#include "operators/fusion_fc_op.h"
#define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)]
#define c(i, j) c[(i)*ldc + (j)]
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
framework
{
using
framework
::
AttributeMap
;
using
framework
::
DDim
;
using
framework
::
Scope
;
using
framework
::
make_ddim
;
int32_t
qadd_int32
(
int32_t
l
,
int32_t
r
)
{
int64_t
res
=
static_cast
<
int64_t
>
(
l
)
+
static_cast
<
int64_t
>
(
r
);
if
(
res
>
std
::
numeric_limits
<
int32_t
>::
max
())
return
std
::
numeric_limits
<
int32_t
>::
max
();
else
if
(
res
<
std
::
numeric_limits
<
int32_t
>::
min
())
return
std
::
numeric_limits
<
int32_t
>::
min
();
else
return
static_cast
<
int32_t
>
(
res
);
}
template
<
typename
Dtype
>
// round to zero
class
TestFcOp
{
float
round2zero
(
float
v
)
{
public:
float
res
;
explicit
TestFcOp
(
const
Program
<
Dtype
>
p
)
:
program_
(
p
)
{
if
(
v
>
0
)
use_optimize_
=
true
;
res
=
std
::
floor
(
v
);
if
(
use_optimize_
)
{
else
if
(
v
<
0
)
to_predict_program_
=
program_
.
optimizeProgram
;
res
=
std
::
ceil
(
v
);
}
else
{
return
res
;
to_predict_program_
=
program_
.
originProgram
;
}
}
const
std
::
vector
<
std
::
shared_ptr
<
BlockDesc
>>
blocks
=
int8_t
qscale_int32
(
int32_t
v
,
float
scale
)
{
to_predict_program_
->
Blocks
();
float
res
=
static_cast
<
float
>
(
v
)
*
scale
;
// DLOG << " **block size " << blocks.size();
res
=
round2zero
(
res
);
for
(
int
i
=
0
;
i
<
blocks
.
size
();
++
i
)
{
if
(
res
>
127
)
std
::
shared_ptr
<
BlockDesc
>
block_desc
=
blocks
[
i
];
return
static_cast
<
int8_t
>
(
127
);
std
::
vector
<
std
::
shared_ptr
<
OpDesc
>>
ops
=
block_desc
->
Ops
();
else
if
(
res
<
-
127
)
// DLOG << " ops " << ops.size();
return
static_cast
<
int8_t
>
(
-
127
);
for
(
int
j
=
0
;
j
<
ops
.
size
();
++
j
)
{
else
std
::
shared_ptr
<
OpDesc
>
op
=
ops
[
j
];
return
static_cast
<
int8_t
>
(
res
);
if
(
op
->
Type
()
==
"fc"
&&
op
->
Input
(
"X"
)[
0
]
==
"pool2d_13.tmp_0"
)
{
}
DLOG
<<
" fc attr size: "
<<
op
->
GetAttrMap
().
size
();
DLOG
<<
" inputs size: "
<<
op
->
GetInputs
().
size
();
template
<
typename
T
,
typename
S
>
DLOG
<<
" outputs size: "
<<
op
->
GetOutputs
().
size
();
int
TestFcOP
()
{
DLOG
<<
" Input X is : "
<<
op
->
Input
(
"X"
)[
0
];
int32_t
m
=
377
;
DLOG
<<
" Input Y is : "
<<
op
->
Input
(
"Y"
)[
0
];
int32_t
n
=
1363
;
DLOG
<<
" Input Y is : "
<<
op
->
Input
(
"Z"
)[
0
];
int32_t
k
=
577
;
DLOG
<<
" Output Out is : "
<<
op
->
Output
(
"Out"
)[
0
];
int32_t
lda
=
k
;
std
::
shared_ptr
<
operators
::
FusionFcOp
<
Dtype
,
float
>>
testOp
=
int32_t
ldb
=
n
;
std
::
make_shared
<
operators
::
FusionFcOp
<
Dtype
,
float
>>
(
int32_t
ldc
=
n
;
op
->
Type
(),
op
->
GetInputs
(),
op
->
GetOutputs
(),
DDim
inputA_shape
=
make_ddim
({
m
,
k
});
op
->
GetAttrMap
(),
program_
.
scope
);
DDim
inputB_shape
=
make_ddim
({
k
,
n
});
ops_of_block_
[
*
block_desc
.
get
()].
push_back
(
testOp
);
DDim
bias_shape
=
make_ddim
({
n
});
VariableNameMap
inputs
;
VariableNameMap
outputs
;
auto
scope
=
std
::
make_shared
<
Scope
>
();
inputs
[
"X"
]
=
std
::
vector
<
std
::
string
>
({
"inputA"
});
inputs
[
"Y"
]
=
std
::
vector
<
std
::
string
>
({
"inputB"
});
inputs
[
"Z"
]
=
std
::
vector
<
std
::
string
>
({
"bias"
});
inputs
[
"Scale"
]
=
std
::
vector
<
std
::
string
>
({
"scale"
});
outputs
[
"Out"
]
=
std
::
vector
<
std
::
string
>
({
"output"
});
auto
inputA_var
=
scope
.
get
()
->
Var
(
"inputA"
);
auto
inputA
=
inputA_var
->
template
GetMutable
<
framework
::
LoDTensor
>();
SetupTensor
<
T
>
(
inputA
,
inputA_shape
,
-
127
,
127
);
auto
inputB_var
=
scope
.
get
()
->
Var
(
"inputB"
);
auto
inputB
=
inputB_var
->
template
GetMutable
<
framework
::
LoDTensor
>();
SetupTensor
<
T
>
(
inputB
,
inputB_shape
,
-
127
,
127
);
auto
bias_var
=
scope
.
get
()
->
Var
(
"bias"
);
auto
bias
=
bias_var
->
template
GetMutable
<
framework
::
LoDTensor
>();
SetupTensor
<
S
>
(
bias
,
bias_shape
,
-
127
,
127
);
auto
scale_var
=
scope
.
get
()
->
Var
(
"scale"
);
auto
scale
=
scale_var
->
template
GetMutable
<
framework
::
LoDTensor
>();
scale
->
Resize
(
framework
::
make_ddim
({
1
}));
float
scale_v
=
0.000828
f
;
scale
->
mutable_data
<
float
>
()[
0
]
=
scale_v
;
auto
output_var
=
scope
.
get
()
->
Var
(
"output"
);
AttributeMap
attrs
;
attrs
[
"x_num_col_dims"
].
Set
<
int
>
(
1
);
attrs
[
"y_num_col_dims"
].
Set
<
int
>
(
1
);
attrs
[
"axis"
].
Set
<
int
>
(
1
);
operators
::
OperatorBase
<
CPU
>
*
op
=
nullptr
;
#ifdef FUSION_FC_INT8_OP
if
(
std
::
is_same
<
T
,
int8_t
>::
value
)
{
op
=
new
operators
::
FusionFcInt8Op
<
CPU
,
T
>
(
"fusion_fc_int8"
,
inputs
,
outputs
,
attrs
,
scope
);
}
else
{
op
=
new
operators
::
FusionFcOp
<
CPU
,
T
>
(
"fusion_fc"
,
inputs
,
outputs
,
attrs
,
scope
);
}
#else
op
=
new
operators
::
FusionFcOp
<
CPU
,
T
>
(
"fusion_fc"
,
inputs
,
outputs
,
attrs
,
scope
);
#endif
op
->
InferShape
();
op
->
Run
();
auto
output
=
output_var
->
template
Get
<
framework
::
LoDTensor
>();
const
T
*
output_data
=
output
->
data
<
T
>
();
// compare
T
*
c
=
static_cast
<
T
*>
(
memory
::
Alloc
(
sizeof
(
T
)
*
m
*
n
));
T
*
a
=
inputA
->
data
<
T
>
();
T
*
b
=
inputB
->
data
<
T
>
();
S
*
bias_data
=
bias
->
data
<
S
>
();
for
(
int32_t
i
=
0
;
i
<
m
;
++
i
)
{
for
(
int32_t
j
=
0
;
j
<
n
;
++
j
)
{
S
bias_v
=
bias_data
[
j
];
if
(
std
::
is_same
<
T
,
int8_t
>::
value
)
{
int32_t
r
=
0
;
for
(
int32_t
p
=
0
;
p
<
k
;
p
++
)
{
r
+=
static_cast
<
int32_t
>
(
a
(
i
,
p
))
*
static_cast
<
int32_t
>
(
b
(
p
,
j
));
}
}
r
=
qadd_int32
(
r
,
bias_v
);
c
(
i
,
j
)
=
qscale_int32
(
r
,
scale_v
);
}
else
{
T
r
=
0
;
for
(
int32_t
p
=
0
;
p
<
k
;
p
++
)
{
r
+=
a
(
i
,
p
)
*
b
(
p
,
j
);
}
r
+=
bias_v
;
c
(
i
,
j
)
=
r
;
}
}
}
}
}
}
std
::
shared_ptr
<
Tensor
>
predict
(
const
Tensor
&
t1
,
const
Tensor
&
t2
,
int32_t
eq
=
0
;
const
Tensor
&
t3
)
{
int32_t
neq
=
0
;
// feed
for
(
int32_t
i
=
0
;
i
<
m
*
n
;
++
i
)
{
auto
scope
=
program_
.
scope
;
PADDLE_MOBILE_ENFORCE
(
output_data
[
i
]
==
c
[
i
],
Variable
*
x_feed_value
=
scope
->
Var
(
"pool2d_13.tmp_0"
);
"The execution of test_fusion_fc_op is failed!"
);
auto
tensor_x
=
x_feed_value
->
GetMutable
<
LoDTensor
>
();
if
(
output_data
[
i
]
==
c
[
i
])
{
tensor_x
->
ShareDataWith
(
t1
);
++
eq
;
}
else
{
Variable
*
y_feed_value
=
scope
->
Var
(
"loss3_classifier-loc_weights"
);
++
neq
;
auto
tensor_y
=
y_feed_value
->
GetMutable
<
LoDTensor
>
();
tensor_y
->
ShareDataWith
(
t2
);
Variable
*
z_feed_value
=
scope
->
Var
(
"loss3_classifier-loc_biases"
);
auto
tensor_z
=
z_feed_value
->
GetMutable
<
LoDTensor
>
();
tensor_z
->
ShareDataWith
(
t3
);
Variable
*
con_output
=
scope
->
Var
(
"loss3_classifier-loc.tmp_1"
);
auto
*
output_tensor
=
con_output
->
GetMutable
<
LoDTensor
>
();
output_tensor
->
mutable_data
<
float
>
({
3
,
10
});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std
::
shared_ptr
<
LoDTensor
>
out_tensor
=
std
::
make_shared
<
LoDTensor
>
();
out_tensor
.
reset
(
output_tensor
);
predict
(
t1
,
t2
,
t3
,
0
);
return
out_tensor
;
}
private:
const
framework
::
Program
<
Dtype
>
program_
;
std
::
shared_ptr
<
ProgramDesc
>
to_predict_program_
;
std
::
map
<
framework
::
BlockDesc
,
std
::
vector
<
std
::
shared_ptr
<
OperatorBase
<
Dtype
>>>>
ops_of_block_
;
bool
use_optimize_
=
false
;
void
predict
(
const
Tensor
&
t1
,
const
Tensor
&
t2
,
const
Tensor
&
t3
,
int
block_id
)
{
std
::
shared_ptr
<
BlockDesc
>
to_predict_block
=
to_predict_program_
->
Block
(
block_id
);
for
(
int
j
=
0
;
j
<
ops_of_block_
[
*
to_predict_block
.
get
()].
size
();
++
j
)
{
auto
op
=
ops_of_block_
[
*
to_predict_block
.
get
()][
j
];
DLOG
<<
"op -> run()"
;
op
->
Run
();
}
}
}
}
};
std
::
cout
<<
"mnk="
<<
m
<<
" "
<<
n
<<
" "
<<
k
<<
" eq="
<<
eq
<<
" neq="
<<
neq
<<
std
::
endl
;
template
class
TestFcOp
<
CPU
>;
delete
op
;
}
// namespace framework
return
0
;
}
}
// namespace paddle_mobile
}
// namespace paddle_mobile
int
main
()
{
DLOG
<<
"----------**********----------"
;
DLOG
<<
"begin to run Fc Test"
;
paddle_mobile
::
framework
::
Loader
<
paddle_mobile
::
CPU
>
loader
;
// "../../../test/models/googlenet"
auto
program
=
loader
.
Load
(
g_googlenet
);
paddle_mobile
::
framework
::
ProgramOptimize
optimize
;
// program.originProgram->Description("origin");
auto
optimize_program
=
optimize
.
FusionOptimize
(
program
.
originProgram
);
program
.
optimizeProgram
=
optimize_program
;
if
(
optimize_program
!=
nullptr
)
{
optimize_program
->
Description
(
"optimize"
);
}
else
{
LOG
(
paddle_mobile
::
kLOG_ERROR
)
<<
"optimize_program is null"
;
}
/// input x (1,3,224,224)
paddle_mobile
::
framework
::
LoDTensor
inputx
;
SetupTensor
<
float
>
(
&
inputx
,
{
3
,
64
,
1
,
1
},
static_cast
<
float
>
(
1
),
static_cast
<
float
>
(
1
));
auto
*
inputx_ptr
=
inputx
.
data
<
float
>
();
/// input y (224,)
paddle_mobile
::
framework
::
LoDTensor
inputy
;
SetupTensor
<
float
>
(
&
inputy
,
{
64
,
10
},
static_cast
<
float
>
(
1.5
),
static_cast
<
float
>
(
1.5
));
auto
*
inputy_ptr
=
inputy
.
data
<
float
>
();
paddle_mobile
::
framework
::
LoDTensor
inputz
;
SetupTensor
<
float
>
(
&
inputz
,
{
10
},
static_cast
<
float
>
(
0
),
static_cast
<
float
>
(
1
));
auto
*
inputz_ptr
=
inputz
.
data
<
float
>
();
paddle_mobile
::
framework
::
TestFcOp
<
paddle_mobile
::
CPU
>
testFcOp
(
program
);
auto
output
=
testFcOp
.
predict
(
inputx
,
inputy
,
inputz
);
auto
*
output_ptr
=
output
->
data
<
float
>
();
for
(
int
j
=
0
;
j
<
output
->
numel
();
++
j
)
{
DLOG
<<
"value of output: "
<<
output_ptr
[
j
];
}
DLOG
<<
"1 (3,64) * 2 (64,10) = 96(3,10)"
;
DLOG
<<
"output : 96(3,10) + bias(10)"
;
int
main
()
{
paddle_mobile
::
PaddleMobile
<
paddle_mobile
::
CPU
>
paddle_mobile
;
paddle_mobile
.
SetThreadNum
(
4
);
#ifdef FUSION_FC_INT8_OP
paddle_mobile
::
TestFcOP
<
int8_t
,
int32_t
>
();
#endif
paddle_mobile
::
TestFcOP
<
float
,
float
>
();
return
0
;
return
0
;
}
}
tools/op.cmake
浏览文件 @
6ce11736
...
@@ -214,6 +214,7 @@ if(NOT FOUND_MATCH)
...
@@ -214,6 +214,7 @@ if(NOT FOUND_MATCH)
set
(
FUSION_CONVADDPRELU_OP ON
)
set
(
FUSION_CONVADDPRELU_OP ON
)
set
(
FUSION_CONVADDRELU_OP ON
)
set
(
FUSION_CONVADDRELU_OP ON
)
set
(
FUSION_CONVADDRELU_INT8_OP ON
)
set
(
FUSION_CONVADDRELU_INT8_OP ON
)
set
(
FUSION_FC_INT8_OP ON
)
set
(
FUSION_FC_OP ON
)
set
(
FUSION_FC_OP ON
)
set
(
LRN_OP ON
)
set
(
LRN_OP ON
)
set
(
MUL_OP ON
)
set
(
MUL_OP ON
)
...
@@ -322,6 +323,9 @@ endif()
...
@@ -322,6 +323,9 @@ endif()
if
(
FUSION_FC_OP
)
if
(
FUSION_FC_OP
)
add_definitions
(
-DFUSION_FC_OP
)
add_definitions
(
-DFUSION_FC_OP
)
endif
()
endif
()
if
(
FUSION_FC_INT8_OP
)
add_definitions
(
-DFUSION_FC_INT8_OP
)
endif
()
if
(
LRN_OP
)
if
(
LRN_OP
)
add_definitions
(
-DLRN_OP
)
add_definitions
(
-DLRN_OP
)
endif
()
endif
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录