Commit 5acae32b
Authored Jan 29, 2019 by zhaojiaying01
resolve conflicts and adjust gemm code style
Parents: 06eead1c, c69517d2
19 changed files with 3007 additions and 2770 deletions (+3007 −2770)
src/common/types.cpp                                                     +12    −1
src/common/types.h                                                        +5    −0
src/framework/load_ops.h                                                  +9    −0
src/operators/detection_ops.cpp                                          +81    −0
src/operators/detection_ops.h                                            +38    −0
src/operators/kernel/arm/anchor_generator_kernel.cpp                     +37    −0
src/operators/kernel/arm/proposal_kernel.cpp                             +36    −0
src/operators/kernel/arm/psroi_pool_kernel.cpp                           +36    −0
src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h       +2    −2
src/operators/kernel/central-arm-func/conv_add_arm_func.h                 +2    −1
src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h         +2    −2
src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h           +2    −1
src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h         +3    −3
src/operators/kernel/central-arm-func/conv_transpose_arm_func.h           +1    −2
src/operators/kernel/detection_kernel.h                                 +140    −0
src/operators/math/gemm.cpp                                            +2541 −2694
src/operators/math/gemm.h                                                +34   −38
src/operators/op_param.h                                                 +12   −25
tools/op.cmake                                                           +14    −1
src/common/types.cpp
@@ -106,6 +106,11 @@ const char *G_OP_TYPE_SEQUENCE_EXPAND = "sequence_expand";
 const char *G_OP_TYPE_SEQUENCE_POOL = "sequence_pool";
 const char *G_OP_TYPE_SEQUENCE_SOFTMAX = "sequence_softmax";
 const char *G_OP_TYPE_SLICE = "slice";
+const char *G_OP_TYPE_ANCHOR_GENERATOR = "anchor_generator";
+const char *G_OP_TYPE_GENERATE_PROPOSALS = "generate_proposals";
+const char *G_OP_TYPE_PSROI_POOL = "psroi_pool";
 std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
     op_input_output_key = {
@@ -197,5 +202,11 @@ std::unordered_map<
     {G_OP_TYPE_WRITE_TO_ARRAY, {{"X", "I"}, {"Out"}}},
     {G_OP_TYPE_READ_FROM_ARRAY, {{"X", "I"}, {"Out"}}},
     {G_OP_TYPE_IS_EMPTY, {{"X"}, {"Out"}}},
-    {G_OP_TYPE_INCREMENT, {{"X"}, {"Out"}}}};
+    {G_OP_TYPE_INCREMENT, {{"X"}, {"Out"}}},
+    {G_OP_TYPE_SLICE, {{"Input"}, {"Out"}}},
+    {G_OP_TYPE_ANCHOR_GENERATOR, {{"Input"}, {"Anchors", "Variances"}}},
+    {G_OP_TYPE_GENERATE_PROPOSALS,
+     {{"Scores", "BboxDeltas", "ImInfo", "Anchors", "Variances"},
+      {"RpnRois", "RpnRoiProbs"}}},
+    {G_OP_TYPE_PSROI_POOL, {{"X", "ROIs"}, {"Out"}}}};
 }  // namespace paddle_mobile
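The new entries above extend op_input_output_key, which maps an op type string to the names of its input and output slots. A minimal, self-contained sketch of how such a lookup behaves; the map literal below only copies a few entries from this diff and is not the real paddle_mobile header:

#include <cstdio>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using SlotNames = std::pair<std::vector<std::string>, std::vector<std::string>>;

int main() {
  // A few entries copied from the diff above: {op type, {inputs, outputs}}.
  std::unordered_map<std::string, SlotNames> op_input_output_key = {
      {"anchor_generator", {{"Input"}, {"Anchors", "Variances"}}},
      {"generate_proposals",
       {{"Scores", "BboxDeltas", "ImInfo", "Anchors", "Variances"},
        {"RpnRois", "RpnRoiProbs"}}},
      {"psroi_pool", {{"X", "ROIs"}, {"Out"}}}};

  const auto &slots = op_input_output_key.at("generate_proposals");
  std::printf("generate_proposals: %zu inputs, %zu outputs\n",
              slots.first.size(), slots.second.size());
  return 0;
}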
src/common/types.h
@@ -194,6 +194,11 @@ extern const char *G_OP_TYPE_SEQUENCE_EXPAND;
 extern const char *G_OP_TYPE_SEQUENCE_POOL;
 extern const char *G_OP_TYPE_SEQUENCE_SOFTMAX;
 extern const char *G_OP_TYPE_SLICE;
+extern const char *G_OP_TYPE_ANCHOR_GENERATOR;
+extern const char *G_OP_TYPE_GENERATE_PROPOSALS;
+extern const char *G_OP_TYPE_PSROI_POOL;
 extern std::unordered_map<
     std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
     op_input_output_key;
src/framework/load_ops.h
@@ -312,3 +312,12 @@ LOAD_OP1(is_empty, CPU);
 #ifdef INCREMENT_OP
 LOAD_OP1(increment, CPU);
 #endif
+#ifdef ANCHOR_GENERATOR_OP
+LOAD_OP1(anchor_generator, CPU);
+#endif
+#ifdef PROPOSAL_OP
+LOAD_OP1(generate_proposals, CPU);
+#endif
+#ifdef PSROI_POOL_OP
+LOAD_OP1(psroi_pool, CPU);
+#endif
src/operators/detection_ops.cpp
0 → 100644 (new file)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "operators/detection_ops.h"
#include <vector>

namespace paddle_mobile {
namespace operators {

#ifdef ANCHOR_GENERATOR_OP
template <typename DeviceType, typename T>
void AnchorGeneratorOp<DeviceType, T>::InferShape() const {
  const auto &input_dims = this->param_.input_->dims();
  PADDLE_MOBILE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW.");
  const auto &anchor_sizes = this->param_.anchor_sizes_;
  const auto &aspect_ratios = this->param_.aspect_ratios_;
  size_t num_anchors = aspect_ratios.size() * anchor_sizes.size();
  std::vector<int64_t> dim_vec(4);
  dim_vec[0] = input_dims[2];
  dim_vec[1] = input_dims[3];
  dim_vec[2] = num_anchors;
  dim_vec[3] = 4;
  this->param_.output_anchors_->Resize(framework::make_ddim(dim_vec));
  this->param_.output_variances_->Resize(framework::make_ddim(dim_vec));
}
#endif

#ifdef PROPOSAL_OP
template <typename DeviceType, typename T>
void ProposalOp<DeviceType, T>::InferShape() const {
  this->param_.rpn_rois_->Resize(framework::make_ddim({-1, 4}));
  this->param_.rpn_probs_->Resize(framework::make_ddim({-1, 1}));
}
#endif

#ifdef PSROI_POOL_OP
template <typename DeviceType, typename T>
void PSRoiPoolOp<DeviceType, T>::InferShape() const {
  const auto &rois_dims = this->param_.input_rois_->dims();
  const int pooled_height = this->param_.pooled_height_;
  const int pooled_width = this->param_.pooled_width_;
  const int output_channels = this->param_.output_channels_;
  auto out_dims = this->param_.input_x_->dims();
  out_dims[0] = rois_dims[0];
  out_dims[1] = output_channels;  // input_dims[1] / (pooled_height * pooled_width);
  out_dims[2] = pooled_height;
  out_dims[3] = pooled_width;
  this->param_.output_->Resize(out_dims);
}
#endif

}  // namespace operators
}  // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
#ifdef ANCHOR_GENERATOR_OP
REGISTER_OPERATOR_CPU(anchor_generator, ops::AnchorGeneratorOp);
#endif
#ifdef PROPOSAL_OP
REGISTER_OPERATOR_CPU(generate_proposals, ops::ProposalOp);
#endif
#ifdef PSROI_POOL_OP
REGISTER_OPERATOR_CPU(psroi_pool, ops::PSRoiPoolOp);
#endif
#endif
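AnchorGeneratorOp::InferShape above fixes the anchor output layout as [feature_h, feature_w, num_anchors, 4]. A minimal standalone sketch of that shape arithmetic; the main() wrapper and the example sizes below are illustrative, not part of the diff:

#include <cstdio>
#include <vector>

int main() {
  // Example NCHW input feature map: 1 x 256 x 38 x 50 (illustrative values).
  std::vector<long> input_dims = {1, 256, 38, 50};
  std::vector<float> anchor_sizes = {32.f, 64.f, 128.f, 256.f, 512.f};
  std::vector<float> aspect_ratios = {0.5f, 1.f, 2.f};

  // Same arithmetic as InferShape above: one anchor per (size, ratio) pair
  // at every spatial location of the feature map.
  size_t num_anchors = aspect_ratios.size() * anchor_sizes.size();
  long dim_vec[4] = {input_dims[2], input_dims[3],
                     static_cast<long>(num_anchors), 4};

  // Anchors and Variances share this shape: here 38 x 50 x 15 x 4.
  std::printf("[%ld, %ld, %ld, %ld]\n", dim_vec[0], dim_vec[1], dim_vec[2],
              dim_vec[3]);
  return 0;
}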
src/operators/detection_ops.h
0 → 100644 (new file)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
   (Apache License 2.0 header, identical to the one in detection_ops.cpp above.) */

#pragma once

#include <string>
#include "framework/operator.h"
#include "operators/kernel/detection_kernel.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

#ifdef ANCHOR_GENERATOR_OP
DECLARE_OPERATOR(AnchorGenerator, AnchorGeneratorParam, AnchorGeneratorKernel);
#endif

#ifdef PROPOSAL_OP
DECLARE_OPERATOR(Proposal, ProposalParam, ProposalKernel);
#endif

#ifdef PSROI_POOL_OP
DECLARE_OPERATOR(PSRoiPool, PSRoiPoolParam, PSRoiPoolKernel);
#endif

}  // namespace operators
}  // namespace paddle_mobile
src/operators/kernel/arm/anchor_generator_kernel.cpp
0 → 100644 (new file)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
   (Apache License 2.0 header, identical to the one in detection_ops.cpp above.) */

#ifdef ANCHOR_GENERATOR_OP

#include <vector>
#include "operators/kernel/detection_kernel.h"

namespace paddle_mobile {
namespace operators {

template <>
bool AnchorGeneratorKernel<CPU, float>::Init(AnchorGeneratorParam<CPU> *param) {
  return true;
}

template <>
void AnchorGeneratorKernel<CPU, float>::Compute(
    const AnchorGeneratorParam<CPU> &param) {
  // TODO(hjchen2)
}

}  // namespace operators
}  // namespace paddle_mobile

#endif  // ANCHOR_GENERATOR_OP
src/operators/kernel/arm/proposal_kernel.cpp
0 → 100644 (new file)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
   (Apache License 2.0 header, identical to the one in detection_ops.cpp above.) */

#ifdef PROPOSAL_OP

#include <vector>
#include "operators/kernel/detection_kernel.h"

namespace paddle_mobile {
namespace operators {

template <>
bool ProposalKernel<CPU, float>::Init(ProposalParam<CPU> *param) {
  return true;
}

template <>
void ProposalKernel<CPU, float>::Compute(const ProposalParam<CPU> &param) {
  // TODO(hjchen2)
}

}  // namespace operators
}  // namespace paddle_mobile

#endif  // PROPOSAL_OP
src/operators/kernel/arm/psroi_pool_kernel.cpp
0 → 100644 (new file)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
   (Apache License 2.0 header, identical to the one in detection_ops.cpp above.) */

#ifdef PSROI_POOL_OP

#include <vector>
#include "operators/kernel/detection_kernel.h"

namespace paddle_mobile {
namespace operators {

template <>
bool PSRoiPoolKernel<CPU, float>::Init(PSRoiPoolParam<CPU> *param) {
  return true;
}

template <>
void PSRoiPoolKernel<CPU, float>::Compute(const PSRoiPoolParam<CPU> &param) {
  // TODO(hjchen2)
}

}  // namespace operators
}  // namespace paddle_mobile

#endif  // PSROI_POOL_OP
src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
@@ -32,11 +32,11 @@ void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> &param) {
   Tensor filter = *param.Filter();
   Tensor bias = *param.Bias();
   Tensor bias1 = *param.Bias1();
-  int axis = param.Axis();
   Tensor *output = param.Output();
   output->mutable_data<float>();
   float *biase_data = bias.data<float>();
+  int axis = param.Axis();
   int groups = param.Groups();
   std::vector<int> strides = param.Strides();
   std::vector<int> paddings = param.Paddings();
src/operators/kernel/central-arm-func/conv_add_arm_func.h
@@ -30,10 +30,11 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
   const Tensor *input = param.Input();
   Tensor filter = *param.Filter();
   Tensor bias = *param.Bias();
-  int axis = param.Axis();
   Tensor *output = param.Output();
   output->mutable_data<float>();
+  float *biase_data = bias.data<float>();
+  int axis = param.Axis();
   int groups = param.Groups();
   std::vector<int> strides = param.Strides();
   std::vector<int> paddings = param.Paddings();
src/operators/kernel/central-arm-func/conv_add_bn_relu_arm_func.h
@@ -32,6 +32,8 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam<CPU> &param) {
   Tensor new_bias = *param.NewBias();
   Tensor new_scale = *param.NewScale();
   Tensor *output = param.Output();
+  output->mutable_data<float>();
+
   int groups = param.Groups();
   std::vector<int> strides = param.Strides();
   std::vector<int> paddings = param.Paddings();
@@ -115,8 +117,6 @@ void ConvAddBNReluBasic(const FusionConvAddBNReluParam<CPU> &param) {
 template <typename P>
 void ConvAddBNReluCompute(const FusionConvAddBNReluParam<CPU> &param) {
   Tensor Bias;
   Bias.mutable_data<float>({param.Groups()});
   if (param.Groups() == param.Input()->dims()[1] &&
       param.Input()->dims()[1] == param.Output()->dims()[1] &&
       param.Filter()->dims()[2] == param.Filter()->dims()[3] &&
src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h
@@ -31,10 +31,11 @@ void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) {
   const Tensor *input = param.Input();
   Tensor filter = *param.Filter();
   Tensor bias = *param.Bias();
-  int axis = param.Axis();
   Tensor *output = param.Output();
   output->mutable_data<float>();
+  float *biase_data = bias.data<float>();
+  int axis = param.Axis();
   int groups = param.Groups();
   std::vector<int> strides = param.Strides();
   std::vector<int> paddings = param.Paddings();
src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
@@ -30,11 +30,11 @@ void ConvBNAddReluBasic(const FusionConvBNAddReluParam<CPU> &param) {
   Tensor filter = *param.Filter();
   Tensor new_bias = *param.NewBias();
   Tensor new_scale = *param.NewScale();
-  Tensor *output = param.Output();
   Tensor *bias1 = param.Bias();
+  Tensor *output = param.Output();
+  output->mutable_data<float>();
   int groups = param.Groups();
   DLOG << "yangfei2";
   DLOG << bias1->dims();
   std::vector<int> strides = param.Strides();
   std::vector<int> paddings = param.Paddings();
   std::vector<int> dilations = param.Dilations();
src/operators/kernel/central-arm-func/conv_transpose_arm_func.h
@@ -31,6 +31,7 @@ void ConvTransposeCompute(const ConvTransposeParam<CPU> &param) {
   const Tensor *input = param.Input();
   Tensor filter = *param.Filter();
   Tensor *output = param.Output();
+  output->mutable_data<P>();
   auto strides = param.Strides();
   auto paddings = param.Paddings();
@@ -76,8 +77,6 @@ void ConvTransposeCompute(const ConvTransposeParam<CPU> &param) {
   framework::DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
   filter.Resize(filter_matrix_shape);
-  output->mutable_data<P>();
   int in_step = static_cast<int>(input->dims()[1]) / groups;
   int out_step = static_cast<int>(output->dims()[1]) / groups;
src/operators/kernel/detection_kernel.h
0 → 100644 (new file)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
   (Apache License 2.0 header, identical to the one in detection_ops.cpp above.) */

#pragma once

#include <vector>
#include "framework/operator.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

#ifdef ANCHOR_GENERATOR_OP
template <typename Dtype>
class AnchorGeneratorParam : public OpParam {
 public:
  AnchorGeneratorParam(const VariableNameMap &inputs,
                       const VariableNameMap &outputs,
                       const AttributeMap &attrs, const Scope &scope) {
    input_ = OpParam::GetVarValue<framework::Tensor>("Input", inputs, scope);
    output_anchors_ =
        OpParam::GetVarValue<framework::Tensor>("Anchors", outputs, scope);
    output_variances_ =
        OpParam::GetVarValue<framework::Tensor>("Variances", outputs, scope);
    anchor_sizes_ = OpParam::GetAttr<std::vector<float>>("anchor_sizes", attrs);
    aspect_ratios_ =
        OpParam::GetAttr<std::vector<float>>("aspect_ratios", attrs);
    variances_ = OpParam::GetAttr<std::vector<float>>("variances", attrs);
    stride_ = OpParam::GetAttr<std::vector<float>>("stride", attrs);
    offset_ = OpParam::GetAttr<float>("offset", attrs);
  }

 public:
  // input
  framework::Tensor *input_;
  // outputs
  framework::Tensor *output_anchors_;
  framework::Tensor *output_variances_;
  std::vector<float> anchor_sizes_;
  std::vector<float> aspect_ratios_;
  std::vector<float> variances_;
  std::vector<float> stride_;
  float offset_;
};

DECLARE_KERNEL(AnchorGenerator, AnchorGeneratorParam);
#endif

#ifdef PROPOSAL_OP
template <typename Dtype>
class ProposalParam : public OpParam {
 public:
  ProposalParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
                const AttributeMap &attrs, const Scope &scope) {
    scores_ = OpParam::GetVarValue<framework::Tensor>("Scores", inputs, scope);
    bbox_deltas_ =
        OpParam::GetVarValue<framework::Tensor>("BboxDeltas", inputs, scope);
    im_info_ = OpParam::GetVarValue<framework::Tensor>("ImInfo", inputs, scope);
    anchors_ =
        OpParam::GetVarValue<framework::Tensor>("Anchors", inputs, scope);
    variances_ =
        OpParam::GetVarValue<framework::Tensor>("Variances", inputs, scope);
    rpn_rois_ =
        OpParam::GetVarValue<framework::LoDTensor>("RpnRois", outputs, scope);
    rpn_probs_ = OpParam::GetVarValue<framework::LoDTensor>("RpnRoiProbs",
                                                            outputs, scope);
    pre_nms_topn_ = OpParam::GetAttr<int>("pre_nms_topN", attrs);
    post_nms_topn_ = OpParam::GetAttr<int>("post_nms_topN", attrs);
    nms_thresh_ = OpParam::GetAttr<float>("nms_thresh", attrs);
    min_size_ = OpParam::GetAttr<float>("min_size", attrs);
    eta_ = OpParam::GetAttr<float>("eta", attrs);
  }

 public:
  framework::Tensor *scores_;
  framework::Tensor *bbox_deltas_;
  framework::Tensor *im_info_;
  framework::Tensor *anchors_;
  framework::Tensor *variances_;
  framework::LoDTensor *rpn_rois_;
  framework::LoDTensor *rpn_probs_;
  int pre_nms_topn_;
  int post_nms_topn_;
  float nms_thresh_;
  float min_size_;
  float eta_;
};

DECLARE_KERNEL(Proposal, ProposalParam);
#endif

#ifdef PSROI_POOL_OP
template <typename Dtype>
class PSRoiPoolParam : public OpParam {
 public:
  PSRoiPoolParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
                 const AttributeMap &attrs, const Scope &scope) {
    input_x_ = OpParam::GetVarValue<framework::Tensor>("X", inputs, scope);
    input_rois_ =
        OpParam::GetVarValue<framework::LoDTensor>("ROIs", inputs, scope);
    output_ = OpParam::GetVarValue<framework::Tensor>("Out", outputs, scope);
    output_channels_ = OpParam::GetAttr<int>("output_channels", attrs);
    pooled_height_ = OpParam::GetAttr<int>("pooled_height", attrs);
    pooled_width_ = OpParam::GetAttr<int>("pooled_width", attrs);
    spatial_scale_ = OpParam::GetAttr<float>("spatial_scale", attrs);
  }

 public:
  framework::Tensor *input_x_;
  framework::LoDTensor *input_rois_;
  framework::Tensor *output_;
  int output_channels_;
  int pooled_height_;
  int pooled_width_;
  float spatial_scale_;
};

DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam);
#endif

}  // namespace operators
}  // namespace paddle_mobile
src/operators/math/gemm.cpp
@@ -415,6 +415,7 @@ void Gemm::PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
   }
 }

 #if __ARM_NEON
+#if __aarch64__
 void Gemm::PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
                            float *buffer) {
@@ -538,6 +539,7 @@ void Gemm::PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B,
   }
 }
+#endif  // __aarch64__
 #endif  // __ARM_NEON

 // 分块矩阵乘法
 void Gemm::InnerKernel(int mc, int nc, float alpha, const float *a,
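The 分块矩阵乘法 (blocked matrix multiplication) comment above marks the inner kernel that the rest of this diff reworks. For orientation only, a blocked GEMM inner kernel is typically organized as in the sketch below; the MR/NR/AddDot/InnerKernel names mirror identifiers visible in this file, but the loop structure and packed layouts here are assumptions, not the exact Paddle-Lite implementation:

#include <cstdio>
#include <vector>

// Illustrative tile sizes; the real values in gemm.h differ per architecture.
constexpr int MR = 4;  // rows of C produced by one micro-kernel call
constexpr int NR = 4;  // cols of C produced by one micro-kernel call

// Micro-kernel: C[MR x NR] += A_panel[MR x k] * B_panel[k x NR].
// AddDot4x4 / AddDot8x12 / AddDot6x16 in this diff play this role in NEON/asm.
void AddDotRef(int k, const float *a, const float *b, float *c, int ldc) {
  for (int p = 0; p < k; ++p) {
    for (int i = 0; i < MR; ++i) {
      for (int j = 0; j < NR; ++j) {
        c[i * ldc + j] += a[p * MR + i] * b[p * NR + j];
      }
    }
  }
}

// Inner kernel: walk the packed panels in MR x NR tiles; the WriteBasic /
// WriteWithAdd / WriteWithAddV1 helpers then handle the write-back variants.
void InnerKernelRef(int mc, int nc, int k, const float *packedA,
                    const float *packedB, float *C, int ldc) {
  for (int i = 0; i < mc; i += MR) {
    for (int j = 0; j < nc; j += NR) {
      AddDotRef(k, packedA + i * k, packedB + j * k, &C[i * ldc + j], ldc);
    }
  }
}

int main() {
  const int m = 4, n = 4, k = 2;
  // Packed layouts: k slices of MR values for A, k slices of NR values for B.
  std::vector<float> a = {1, 0, 0, 0, 0, 1, 0, 0};
  std::vector<float> b = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<float> c(m * n, 0.f);
  InnerKernelRef(m, n, k, a.data(), b.data(), c.data(), n);
  std::printf("C row 0: %g %g %g %g\n", c[0], c[1], c[2], c[3]);
  return 0;
}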
@@ -688,42 +690,7 @@ void Gemm::InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,

 #if __ARM_NEON
 #if __aarch64__
-void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
-  // init C
-  float32x4_t cv0 = vdupq_n_f32(0.0);
-  float32x4_t cv1 = vdupq_n_f32(0.0);
-  float32x4_t cv2 = vdupq_n_f32(0.0);
-  float32x4_t cv3 = vdupq_n_f32(0.0);
-  float32x4_t av;
-  float32x4_t bv;
-  float32x2_t av01;
-  float32x2_t av23;
-  for (int p = 0; p < k; p += 1) {
-    av = vld1q_f32(a);
-    bv = vld1q_f32(b);
-    av01 = vget_low_f32(av);
-    cv0 = vmlaq_lane_f32(cv0, bv, av01, 0);
-    cv1 = vmlaq_lane_f32(cv1, bv, av01, 1);
-    av23 = vget_high_f32(av);
-    cv2 = vmlaq_lane_f32(cv2, bv, av23, 0);
-    cv3 = vmlaq_lane_f32(cv3, bv, av23, 1);
-    a += MR;
-    b += NR;
-  }
-  vst1q_f32(c, cv0);
-  vst1q_f32(c + ldc, cv1);
-  vst1q_f32(c + 2 * ldc, cv2);
-  vst1q_f32(c + 3 * ldc, cv3);
-  // float32x4x4_t cv = {cv0, cv1, cv2, cv3};
-}
-
-void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
+void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
   // init C
   float32x4_t cv0 = vdupq_n_f32(0.0);
   float32x4_t cv1 = vdupq_n_f32(0.0);
@@ -733,6 +700,10 @@ void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
   float32x4_t cv5 = vdupq_n_f32(0.0);
   float32x4_t cv6 = vdupq_n_f32(0.0);
   float32x4_t cv7 = vdupq_n_f32(0.0);
+  float32x4_t cv8 = vdupq_n_f32(0.0);
+  float32x4_t cv9 = vdupq_n_f32(0.0);
+  float32x4_t cv10 = vdupq_n_f32(0.0);
+  float32x4_t cv11 = vdupq_n_f32(0.0);
   float32x4_t av;
   float32x4_t bv0;
@@ -740,23 +711,31 @@ void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
   float32x2_t av01;
   float32x2_t av23;
+  float32x2_t av45;

   for (int p = 0; p < k; p += 1) {
     av = vld1q_f32(a);
+    av01 = vget_low_f32(av);
+    av23 = vget_high_f32(av);
+    av45 = vld1_f32(a + 4);
     bv0 = vld1q_f32(b);
     bv1 = vld1q_f32(b + 4);
-    av01 = vget_low_f32(av);
     cv0 = vmlaq_lane_f32(cv0, bv0, av01, 0);
     cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0);
     cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1);
     cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1);
-    av23 = vget_high_f32(av);
     cv4 = vmlaq_lane_f32(cv4, bv0, av23, 0);
     cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0);
     cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1);
     cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1);
+    cv8 = vmlaq_lane_f32(cv8, bv0, av45, 0);
+    cv9 = vmlaq_lane_f32(cv9, bv1, av45, 0);
+    cv10 = vmlaq_lane_f32(cv10, bv0, av45, 1);
+    cv11 = vmlaq_lane_f32(cv11, bv1, av45, 1);
     a += MR;
     b += NR;
   }
@@ -769,131 +748,719 @@ void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
   vst1q_f32(c + 2 * ldc + 4, cv5);
   vst1q_f32(c + 3 * ldc, cv6);
   vst1q_f32(c + 3 * ldc + 4, cv7);
+  vst1q_f32(c + 4 * ldc, cv8);
+  vst1q_f32(c + 4 * ldc + 4, cv9);
+  vst1q_f32(c + 5 * ldc, cv10);
+  vst1q_f32(c + 5 * ldc + 4, cv11);
 }
-// 分块矩阵乘法结果回写
-// C = A * B
-void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) { ... }
-
-// C = alpha * A * B + beta * C
-void Gemm::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}
-
-// C = A * B + C
-void Gemm::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) { ... }
-
-// C = A * B + bias
-void Gemm::WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc,
-                          float *bias) { ... }
-
-// C = A * B + C, relu(C)
-void Gemm::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) { ... }
(These write-back helpers are removed from this position only; their NEON-intrinsic bodies reappear in full further down in this hunk, inside the new #if __aarch64__ section, and are spelled out there.)

+void Gemm::AddDot8x12(int k, const float *a, const float *b, float *c, int ldc) {
+  const float *a_ptr, *b_ptr;
+  a_ptr = a;
+  b_ptr = b;
+  int kc1 = k;
+  int step = 4 * ldc;
+  asm volatile(
+      "dup      v5.4s,  wzr                  \n\t"
+      "dup      v6.4s,  wzr                  \n\t"
+      "dup      v7.4s,  wzr                  \n\t"
+      "dup      v8.4s,  wzr                  \n\t"
+      "dup      v9.4s,  wzr                  \n\t"
+      "dup      v10.4s, wzr                  \n\t"
+      "dup      v11.4s, wzr                  \n\t"
+      "dup      v12.4s, wzr                  \n\t"
+      "dup      v13.4s, wzr                  \n\t"
+      "dup      v14.4s, wzr                  \n\t"
+      "dup      v15.4s, wzr                  \n\t"
+      "dup      v16.4s, wzr                  \n\t"
+      "dup      v17.4s, wzr                  \n\t"
+      "dup      v18.4s, wzr                  \n\t"
+      "dup      v19.4s, wzr                  \n\t"
+      "dup      v20.4s, wzr                  \n\t"
+      "dup      v21.4s, wzr                  \n\t"
+      "dup      v22.4s, wzr                  \n\t"
+      "dup      v23.4s, wzr                  \n\t"
+      "dup      v24.4s, wzr                  \n\t"
+      "dup      v25.4s, wzr                  \n\t"
+      "dup      v26.4s, wzr                  \n\t"
+      "dup      v27.4s, wzr                  \n\t"
+      "dup      v28.4s, wzr                  \n\t"
+
+      "subs     %[kc1], %[kc1], #1           \n\t"
+      "blt      2f                           \n\t"
+      "1:                                    \n\t"
+
+      "prfm     pldl1keep, [%[a_ptr], #32]   \n\t"
+      "prfm     pldl1keep, [%[b_ptr], #48]   \n\t"
+
+      "ld1      {v0.4s, v1.4s}, [%[a_ptr]], #32         \n\t"
+      "ld1      {v2.4s, v3.4s, v4.4s}, [%[b_ptr]], #48  \n\t"
+
+      "fmla     v5.4s,  v2.4s, v0.s[0]       \n\t"
+      "fmla     v6.4s,  v3.4s, v0.s[0]       \n\t"
+      "fmla     v7.4s,  v4.4s, v0.s[0]       \n\t"
+      "fmla     v8.4s,  v2.4s, v0.s[1]       \n\t"
+      "fmla     v9.4s,  v3.4s, v0.s[1]       \n\t"
+      "fmla     v10.4s, v4.4s, v0.s[1]       \n\t"
+      "fmla     v11.4s, v2.4s, v0.s[2]       \n\t"
+      "fmla     v12.4s, v3.4s, v0.s[2]       \n\t"
+      "fmla     v13.4s, v4.4s, v0.s[2]       \n\t"
+      "fmla     v14.4s, v2.4s, v0.s[3]       \n\t"
+      "fmla     v15.4s, v3.4s, v0.s[3]       \n\t"
+      "fmla     v16.4s, v4.4s, v0.s[3]       \n\t"
+      "fmla     v17.4s, v2.4s, v1.s[0]       \n\t"
+      "fmla     v18.4s, v3.4s, v1.s[0]       \n\t"
+      "fmla     v19.4s, v4.4s, v1.s[0]       \n\t"
+      "fmla     v20.4s, v2.4s, v1.s[1]       \n\t"
+      "fmla     v21.4s, v3.4s, v1.s[1]       \n\t"
+      "fmla     v22.4s, v4.4s, v1.s[1]       \n\t"
+      "fmla     v23.4s, v2.4s, v1.s[2]       \n\t"
+      "fmla     v24.4s, v3.4s, v1.s[2]       \n\t"
+      "fmla     v25.4s, v4.4s, v1.s[2]       \n\t"
+      "fmla     v26.4s, v2.4s, v1.s[3]       \n\t"
+      "fmla     v27.4s, v3.4s, v1.s[3]       \n\t"
+      "fmla     v28.4s, v4.4s, v1.s[3]       \n\t"
+
+      "subs     %[kc1], %[kc1], #1           \n\t"
+      "bge      1b                           \n\t"
+      "2:                                    \n\t"
+
+      "st1      {v5.4s, v6.4s, v7.4s},    [%[c]], %[step]   \n\t"
+      "st1      {v8.4s, v9.4s, v10.4s},   [%[c]], %[step]   \n\t"
+      "st1      {v11.4s, v12.4s, v13.4s}, [%[c]], %[step]   \n\t"
+      "st1      {v14.4s, v15.4s, v16.4s}, [%[c]], %[step]   \n\t"
+      "st1      {v17.4s, v18.4s, v19.4s}, [%[c]], %[step]   \n\t"
+      "st1      {v20.4s, v21.4s, v22.4s}, [%[c]], %[step]   \n\t"
+      "st1      {v23.4s, v24.4s, v25.4s}, [%[c]], %[step]   \n\t"
+      "st1      {v26.4s, v27.4s, v28.4s}, [%[c]], %[step]   \n\t"
+      :
+      : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
+        [step] "r"(step)
+      : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+        "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+        "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28");
+}
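The new AddDot8x12 kernel above (and AddDot6x16 below) keep the whole output tile in accumulator registers for the length of the k loop. A small sanity check of that register budget, using only numbers visible in the assembly; the helper itself is illustrative:

#include <cstdio>

int main() {
  // One AArch64 NEON register (v*.4s) holds 4 floats.
  const int lanes = 4;
  // 8x12 tile: 8 rows x 12 cols = 96 floats -> 24 accumulators (v5..v28 above).
  std::printf("8x12 tile needs %d vector registers\n", 8 * 12 / lanes);
  // 6x16 tile: 6 rows x 16 cols = 96 floats -> 24 accumulators (v6..v29 below).
  std::printf("6x16 tile needs %d vector registers\n", 6 * 16 / lanes);
  return 0;
}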
+void Gemm::AddDot6x16(int k, const float *a, const float *b, float *c, int ldc) {
+  const float *a_ptr, *b_ptr;
+  a_ptr = a;
+  b_ptr = b;
+  int kc1 = k;
+  int step = 4 * ldc;
+  int step1 = 4 * 6;
+  asm volatile(
+      "dup      v6.4s,  wzr                  \n\t"
+      "dup      v7.4s,  wzr                  \n\t"
+      "dup      v8.4s,  wzr                  \n\t"
+      "dup      v9.4s,  wzr                  \n\t"
+      "dup      v10.4s, wzr                  \n\t"
+      "dup      v11.4s, wzr                  \n\t"
+      "dup      v12.4s, wzr                  \n\t"
+      "dup      v13.4s, wzr                  \n\t"
+      "dup      v14.4s, wzr                  \n\t"
+      "dup      v15.4s, wzr                  \n\t"
+      "dup      v16.4s, wzr                  \n\t"
+      "dup      v17.4s, wzr                  \n\t"
+      "dup      v18.4s, wzr                  \n\t"
+      "dup      v19.4s, wzr                  \n\t"
+      "dup      v20.4s, wzr                  \n\t"
+      "dup      v21.4s, wzr                  \n\t"
+      "dup      v22.4s, wzr                  \n\t"
+      "dup      v23.4s, wzr                  \n\t"
+      "dup      v24.4s, wzr                  \n\t"
+      "dup      v25.4s, wzr                  \n\t"
+      "dup      v26.4s, wzr                  \n\t"
+      "dup      v27.4s, wzr                  \n\t"
+      "dup      v28.4s, wzr                  \n\t"
+      "dup      v29.4s, wzr                  \n\t"
+
+      "subs     %[kc1], %[kc1], #1           \n\t"
+      "blt      2f                           \n\t"
+      "1:                                    \n\t"
+
+      "prfm     pldl1keep, [%[a_ptr], #24]   \n\t"
+      "prfm     pldl1keep, [%[b_ptr], #64]   \n\t"
+
+      "ld1      {v0.4s, v1.4s}, [%[a_ptr]], %[step1]            \n\t"
+      "ld1      {v2.4s, v3.4s, v4.4s, v5.4s}, [%[b_ptr]], #64   \n\t"
+
+      "fmla     v6.4s,  v2.4s, v0.s[0]       \n\t"
+      "fmla     v7.4s,  v3.4s, v0.s[0]       \n\t"
+      "fmla     v8.4s,  v4.4s, v0.s[0]       \n\t"
+      "fmla     v9.4s,  v5.4s, v0.s[0]       \n\t"
+      "fmla     v10.4s, v2.4s, v0.s[1]       \n\t"
+      "fmla     v11.4s, v3.4s, v0.s[1]       \n\t"
+      "fmla     v12.4s, v4.4s, v0.s[1]       \n\t"
+      "fmla     v13.4s, v5.4s, v0.s[1]       \n\t"
+      "fmla     v14.4s, v2.4s, v0.s[2]       \n\t"
+      "fmla     v15.4s, v3.4s, v0.s[2]       \n\t"
+      "fmla     v16.4s, v4.4s, v0.s[2]       \n\t"
+      "fmla     v17.4s, v5.4s, v0.s[2]       \n\t"
+      "fmla     v18.4s, v2.4s, v0.s[3]       \n\t"
+      "fmla     v19.4s, v3.4s, v0.s[3]       \n\t"
+      "fmla     v20.4s, v4.4s, v0.s[3]       \n\t"
+      "fmla     v21.4s, v5.4s, v0.s[3]       \n\t"
+      "fmla     v22.4s, v2.4s, v1.s[0]       \n\t"
+      "fmla     v23.4s, v3.4s, v1.s[0]       \n\t"
+      "fmla     v24.4s, v4.4s, v1.s[0]       \n\t"
+      "fmla     v25.4s, v5.4s, v1.s[0]       \n\t"
+      "fmla     v26.4s, v2.4s, v1.s[1]       \n\t"
+      "fmla     v27.4s, v3.4s, v1.s[1]       \n\t"
+      "fmla     v28.4s, v4.4s, v1.s[1]       \n\t"
+      "fmla     v29.4s, v5.4s, v1.s[1]       \n\t"
+
+      "subs     %[kc1], %[kc1], #1           \n\t"
+      "bge      1b                           \n\t"
+      "2:                                    \n\t"
+
+      "st1      {v6.4s,  v7.4s,  v8.4s,  v9.4s},  [%[c]], %[step]   \n\t"
+      "st1      {v10.4s, v11.4s, v12.4s, v13.4s}, [%[c]], %[step]   \n\t"
+      "st1      {v14.4s, v15.4s, v16.4s, v17.4s}, [%[c]], %[step]   \n\t"
+      "st1      {v18.4s, v19.4s, v20.4s, v21.4s}, [%[c]], %[step]   \n\t"
+      "st1      {v22.4s, v23.4s, v24.4s, v25.4s}, [%[c]], %[step]   \n\t"
+      "st1      {v26.4s, v27.4s, v28.4s, v29.4s}, [%[c]], %[step]   \n\t"
+      :
+      : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
+        [step] "r"(step), [step1] "r"(step1)
+      : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+        "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
+        "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29");
+}
 #else

 void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
   const float *a_ptr, *b_ptr;
   a_ptr = a;
   b_ptr = b;
   int kc1 = k / 4;
   int kc2 = k % 4;
   int step = 4 * ldc;
   asm volatile(
       "pld        [%[a_ptr]]              \n\t"
       "pld        [%[b_ptr]]              \n\t"
       "vmov.f32   q10, #0.0               \n\t"
       "vmov.f32   q11, #0.0               \n\t"
       "vmov.f32   q12, #0.0               \n\t"
       "vmov.f32   q13, #0.0               \n\t"

       "subs       %[kc1], %[kc1], #1      \n\t"
       "blt        end_kc1_%=              \n\t"
       "loop_kc1_%=:                       \n\t"
       "pld        [%[a_ptr], #64]         \n\t"
       "pld        [%[b_ptr], #64]         \n\t"
       "vld1.32    {q0, q1}, [%[a_ptr]]!   \n\t"
       "vld1.32    {q2, q3}, [%[b_ptr]]!   \n\t"
       "vmla.f32   q10, q2, d0[0]          \n\t"
       "vmla.f32   q11, q2, d0[1]          \n\t"
       "vmla.f32   q12, q2, d1[0]          \n\t"
       "vmla.f32   q13, q2, d1[1]          \n\t"
       "vmla.f32   q10, q3, d2[0]          \n\t"
       "vmla.f32   q11, q3, d2[1]          \n\t"
       "vmla.f32   q12, q3, d3[0]          \n\t"
       "vmla.f32   q13, q3, d3[1]          \n\t"
       "vld1.32    {q4, q5}, [%[a_ptr]]!   \n\t"
       "vld1.32    {q6, q7}, [%[b_ptr]]!   \n\t"
       "vmla.f32   q10, q6, d8[0]          \n\t"
       "vmla.f32   q11, q6, d8[1]          \n\t"
       "vmla.f32   q12, q6, d9[0]          \n\t"
       "vmla.f32   q13, q6, d9[1]          \n\t"
       "vmla.f32   q10, q7, d10[0]         \n\t"
       "vmla.f32   q11, q7, d10[1]         \n\t"
       "vmla.f32   q12, q7, d11[0]         \n\t"
       "vmla.f32   q13, q7, d11[1]         \n\t"
       "subs       %[kc1], %[kc1], #1      \n\t"
       "bge        loop_kc1_%=             \n\t"
       "end_kc1_%=:                        \n\t"

       "subs       %[kc2], %[kc2], #1      \n\t"
       "blt        end_kc2_%=              \n\t"
       "loop_kc2_%=:                       \n\t"
       "vld1.32    {q0}, [%[a_ptr]]!       \n\t"
       "vld1.32    {q1}, [%[b_ptr]]!       \n\t"
       "vmla.f32   q10, q1, d0[0]          \n\t"
       "vmla.f32   q11, q1, d0[1]          \n\t"
       "vmla.f32   q12, q1, d1[0]          \n\t"
       "vmla.f32   q13, q1, d1[1]          \n\t"
       "subs       %[kc2], %[kc2], #1      \n\t"
       "bge        loop_kc2_%=             \n\t"
       "end_kc2_%=:                        \n\t"

       "mov        r5, %[c]                \n\t"
       "mov        r6, %[step]             \n\t"
       "vst1.32    {q10}, [r5], r6         \n\t"
       "vst1.32    {q11}, [r5], r6         \n\t"
       "vst1.32    {q12}, [r5], r6         \n\t"
       "vst1.32    {q13}, [r5]             \n\t"
       :
       : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
         [kc2] "r"(kc2), [step] "r"(step)
       : "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
         "q10", "q11", "q12", "q13");
 }

 void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
   /* ARMv7 NEON inline-assembly body: pld/vld loads of q0-q5 and vmla
      accumulation into q8-q15, with a k/4 main loop, a k%4 tail loop and a
      final vst1.32 store of q8-q15 through r5/r6. It is the same 4x8 kernel
      that appears unchanged elsewhere in this file and is only regrouped under
      this #else (32-bit) branch. */
 }

 void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c, int ldc) {
   /* ARMv7 NEON inline-assembly body: an 8-way unrolled k/8 main loop that
      loads {d0-d2} from a and {q2, q3} from b and accumulates into q4-q15
      (vmla.f32 qN, q2/q3, dM[x]), followed by a k%8 tail loop and a vst1.32
      store of q4-q15 through r5/r6 with step = sizeof(float) * ldc. The kernel
      itself is unchanged by this commit. */
 }

 #endif  // __aarch64__
 #endif  // __ARM_NEON
 #if __ARM_NEON
 #if __aarch64__

 // 分块矩阵乘法结果回写
 // C = A * B
 void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
   int nc1 = nc / 4;
   int _nc1 = nc % 4;

   float *c_ptr, *C_ptr;
   float32x4_t cv;
   for (int i = 0; i < mc; ++i) {
     c_ptr = c + i * NC;
     C_ptr = C + i * ldc;
     for (int j = 0; j < nc1; ++j) {
       cv = vld1q_f32(c_ptr);
       vst1q_f32(C_ptr, cv);
       c_ptr += 4;
       C_ptr += 4;
     }
     if (_nc1 != 0) {
       cv = vld1q_f32(c_ptr);
       if (_nc1 >= 1) {
         vst1q_lane_f32(C_ptr, cv, 0);
         C_ptr++;
       }
       if (_nc1 >= 2) {
         vst1q_lane_f32(C_ptr, cv, 1);
         C_ptr++;
       }
       if (_nc1 >= 3) {
         vst1q_lane_f32(C_ptr, cv, 2);
       }
     }
   }
 }

 // C = alpha * A * B + beta * C
 void Gemm::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}

 // C = A * B + C
 void Gemm::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {
   int nc1 = nc / 4;
   int _nc1 = nc % 4;

   float *c_ptr, *C_ptr;
   float32x4_t cv;
   float32x4_t cv1;
   for (int i = 0; i < mc; ++i) {
     c_ptr = c + i * NC;
     C_ptr = C + i * ldc;
     for (int j = 0; j < nc1; ++j) {
       cv = vld1q_f32(c_ptr);
       cv1 = vld1q_f32(C_ptr);
       cv = vaddq_f32(cv, cv1);
       vst1q_f32(C_ptr, cv);
       c_ptr += 4;
       C_ptr += 4;
     }
     if (_nc1 != 0) {
       cv = vld1q_f32(c_ptr);
       cv1 = vld1q_f32(C_ptr);
       cv = vaddq_f32(cv, cv1);
       if (_nc1 >= 1) {
         vst1q_lane_f32(C_ptr, cv, 0);
         C_ptr++;
       }
       if (_nc1 >= 2) {
         vst1q_lane_f32(C_ptr, cv, 1);
         C_ptr++;
       }
       if (_nc1 >= 3) {
         vst1q_lane_f32(C_ptr, cv, 2);
       }
     }
   }
 }

 // C = A * B + bias
 void Gemm::WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc,
                           float *bias) {
   int nc1 = nc / 4;
   int _nc1 = nc % 4;

   float *c_ptr, *C_ptr;
   float32x4_t cv;
   float32x4_t biasv;
   for (int i = 0; i < mc; ++i) {
     c_ptr = c + i * NC;
     C_ptr = C + i * ldc;
     biasv = vld1q_dup_f32(bias + i);
     for (int j = 0; j < nc1; ++j) {
       cv = vld1q_f32(c_ptr);
       cv = vaddq_f32(cv, biasv);
       vst1q_f32(C_ptr, cv);
       c_ptr += 4;
       C_ptr += 4;
     }
     if (_nc1 != 0) {
       cv = vld1q_f32(c_ptr);
       cv = vaddq_f32(cv, biasv);
       if (_nc1 >= 1) {
         vst1q_lane_f32(C_ptr, cv, 0);
         C_ptr++;
       }
       if (_nc1 >= 2) {
         vst1q_lane_f32(C_ptr, cv, 1);
         C_ptr++;
       }
       if (_nc1 >= 3) {
         vst1q_lane_f32(C_ptr, cv, 2);
         C_ptr++;
       }
     }
   }
 }

 // C = A * B + C, relu(C)
 void Gemm::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {
   int nc1 = nc / 4;
   int _nc1 = nc % 4;

   float *c_ptr, *C_ptr;
   float32x4_t cv;
   float32x4_t cv1;
   float32x4_t zero = vdupq_n_f32(0.0);
   for (int i = 0; i < mc; ++i) {
     c_ptr = c + i * NC;
     C_ptr = C + i * ldc;
     for (int j = 0; j < nc1; ++j) {
@@ -1188,82 +1755,8 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
   }
 }

+void Gemm::VectorKernel(int m, int n, int k, float alpha, const float *A,
+                        int lda, const float *B, int ldb, float beta, float *C,
+                        int ldc, bool relu) {}
+
 #else

-void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c, int ldc) {
-  /* ARMv7 NEON inline-assembly body removed from this position; the 32-bit
-     AddDot kernels now live in the #else branch earlier in this file. */
-}
-
 void Gemm::VectorKernel(int m, int n, int k, float alpha, const float *A,
                         int lda, const float *B, int ldb, float beta, float *C,
                         int ldc, bool relu) {
@@ -1486,10 +1979,10 @@ void Gemm::VectorKernel(int m, int n, int k, float alpha, const float *A,
   }
 }

 /*
 void Gemm::VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
-                              int lda, const float *B, int ldb, float beta, float *C,
-                              int ldc, bool relu, float *new_scale, float *new_bias) {
+                              int lda, const float *B, int ldb, float beta,
+                              float *C, int ldc, bool relu, float *new_scale,
+                              float *new_bias) {
   float *bufferC = static_cast<float *>(memory::Alloc(sizeof(float) * n));
   const float *a0, *b0, *b1, *b2, *b3;
@@ -1697,114 +2190,6 @@ void Gemm::VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
       VecWriteWithBn(n, bufferC, C, ldc, new_scale, new_bias);
     }
   }
 }
 */

-void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c, int ldc) {
-  /* ARMv7 NEON inline-assembly body removed from this position; the 32-bit
-     AddDot kernels now live in the #else branch earlier in this file. */
-}
-
 // C = A * B
 void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {
@@ -2567,162 +2952,25 @@ void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
       cv = vld1q_f32(c_ptr);
       biasv = vld1q_f32(bias_ptr);
       cv = vmlaq_n_f32(nbias, cv, scale0);
       cv = vaddq_f32(cv, biasv);
       cv = vmaxq_f32(cv, zero);
       if (_nc1 >= 1) {
         vst1q_lane_f32(C_ptr, cv, 0);
         C_ptr++;
       }
       if (_nc1 >= 2) {
         vst1q_lane_f32(C_ptr, cv, 1);
         C_ptr++;
       }
       if (_nc1 >= 3) {
         vst1q_lane_f32(C_ptr, cv, 2);
       }
     }
   }
 }

 // C = A * B
 void Gemm::VecWriteBasic(int n, float *c, float *C, int ldc) {
   int nc1 = n / 16;
   int _nc1 = n % 16;
   int nc2 = _nc1 / 4;
   int nc3 = 16 - 4 * (_nc1 % 4);

   asm volatile(
       "subs       %[nc1], %[nc1], #1      \n\t"
       "blt        end_nc1_%=              \n\t"
       "loop_nc1_%=:                       \n\t"
       "vld1.32    {q0, q1}, [%[c]]!       \n\t"
       "vst1.32    {q0, q1}, [%[C]]!       \n\t"
       "vld1.32    {q2, q3}, [%[c]]!       \n\t"
       "vst1.32    {q2, q3}, [%[C]]!       \n\t"
       "subs       %[nc1], %[nc1], #1      \n\t"
       "bge        loop_nc1_%=             \n\t"
       "end_nc1_%=:                        \n\t"

       "subs       %[nc2], %[nc2], #1      \n\t"
       "blt        end_nc2_%=              \n\t"
       "loop_nc2_%=:                       \n\t"
       "vld1.32    {q4}, [%[c]]!           \n\t"
       "vst1.32    {q4}, [%[C]]!           \n\t"
       "subs       %[nc2], %[nc2], #1      \n\t"
       "bge        loop_nc2_%=             \n\t"
       "end_nc2_%=:                        \n\t"

       "cmp        %[nc3], #16             \n\t"
       "beq        end_nc3_%=              \n\t"
       "sub        %[c], %[c], %[nc3]      \n\t"
       "sub        %[C], %[C], %[nc3]      \n\t"
       "vld1.32    {q5}, [%[c]]!           \n\t"
       "vst1.32    {q5}, [%[C]]!           \n\t"
       "end_nc3_%=:                        \n\t"
       :
       : [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3] "r"(nc3)
       : "memory", "q0", "q1", "q2", "q3", "q4", "q5");
 }

 // C = alpha * A * B + beta * C
 void Gemm::VecWriteWithAlphaBeta(int n, float *c, float *C, int ldc) {}

 // C = A * B + C
 void Gemm::VecWriteWithAdd(int n, float *c, float *C, int ldc) {
   int nc1 = n / 16;
   int _nc1 = n % 16;

   asm volatile(
       "subs       %[nc1], %[nc1], #1      \n\t"
       "blt        end_nc1_%=              \n\t"
       "loop_nc1_%=:                       \n\t"
       "vld1.32    {q0, q1}, [%[c]]!       \n\t"
       "vld1.32    {q2, q3}, [%[C]]        \n\t"
       "vadd.f32   q10, q0, q2             \n\t"
       "vadd.f32   q11, q1, q3             \n\t"
       "vst1.32    {q10, q11}, [%[C]]!     \n\t"
       "vld1.32    {q4, q5}, [%[c]]!       \n\t"
       "vld1.32    {q6, q7}, [%[C]]        \n\t"
       "vadd.f32   q12, q4, q6             \n\t"
       "vadd.f32   q13, q5, q7             \n\t"
       "vst1.32    {q12, q13}, [%[C]]!     \n\t"
       "subs       %[nc1], %[nc1], #1      \n\t"
       "bge        loop_nc1_%=             \n\t"
       "end_nc1_%=:                        \n\t"
       : [C] "+r"(C), [c] "+r"(c)
       : [nc1] "r"(nc1)
       : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
         "q12", "q13");

   if (_nc1 != 0) {
     for (int j = 0; j < _nc1; j++) {
       *C++ += *c++;
     }
   }
 }

 // C = A * B + C, relu(C)
 void Gemm::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
   int nc1 = n / 16;
   int _nc1 = n % 16;

   asm volatile(
       "vmov.f32   q14, #0.0               \n\t"
       "subs       %[nc1], %[nc1], #1      \n\t"
       "blt        end_nc1_%=              \n\t"
       "loop_nc1_%=:                       \n\t"
       "vld1.32    {q0, q1}, [%[c]]!       \n\t"
       "vld1.32    {q2, q3}, [%[C]]        \n\t"
       "vadd.f32   q10, q0, q2             \n\t"
       "vadd.f32   q11, q1, q3             \n\t"
       "vmax.f32   q10, q10, q14           \n\t"
       "vmax.f32   q11, q11, q14           \n\t"
       "vst1.32    {q10, q11}, [%[C]]!     \n\t"
       "vld1.32    {q4, q5}, [%[c]]!       \n\t"
       "vld1.32    {q6, q7}, [%[C]]        \n\t"
       "vadd.f32   q12, q4, q6             \n\t"
       "vadd.f32   q13, q5, q7             \n\t"
       "vmax.f32   q12, q12, q14           \n\t"
       "vmax.f32   q13, q13, q14           \n\t"
       "vst1.32    {q12, q13}, [%[C]]!     \n\t"
       "subs       %[nc1], %[nc1], #1      \n\t"
       "bge        loop_nc1_%=             \n\t"
       "end_nc1_%=:                        \n\t"
       : [C] "+r"(C), [c] "+r"(c)
       : [nc1] "r"(nc1)
       : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11",
         "q12", "q13");

   if (_nc1 != 0) {
     for (int j = 0; j < _nc1; j++) {
       *C += *c;
       if (*C < 0) {
         *C = 0;
       }
       C++;
       c++;
     }
   }
 }

(In the scalar tail above, the diff view also interleaves a second fragment — vaddq_f32/vmaxq_f32 on a per-row biasv with vst1q_lane_f32 stores — that belongs to the bias-plus-relu write-back variant on the other side of this hunk; the two code paths are shown separately here.)

 /*
 // C = A * B, batchnorm(C)
 void Gemm::VecWriteWithBn(int n, float *c, float *C, int ldc, float *scale,
                           float *bias) {

 // C = A * B
 void Gemm::VecWriteBasic(int n, float *c, float *C, int ldc) {
   int nc1 = n / 16;
   int _nc1 = n % 16;
   int nc2 = _nc1 / 4;
@@ -2734,18 +2982,10 @@ void Gemm::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
       "loop_nc1_%=:                       \n\t"
       "vld1.32    {q0, q1}, [%[c]]!       \n\t"
-      "vld1.32    {q2, q3}, [%[scale]]!   \n\t"
-      "vld1.32    {q10, q11}, [%[bias]]!  \n\t"
-      "vmla.f32   q10, q0, q2             \n\t"
-      "vmla.f32   q11, q1, q3             \n\t"
-      "vst1.32    {q10, q11}, [%[C]]!     \n\t"
+      "vst1.32    {q0, q1}, [%[C]]!       \n\t"
-      "vld1.32    {q4, q5}, [%[c]]!       \n\t"
-      "vld1.32    {q6, q7}, [%[scale]]!   \n\t"
-      "vld1.32    {q12, q13}, [%[bias]]!  \n\t"
-      "vmla.f32   q12, q4, q6             \n\t"
-      "vmla.f32   q13, q5, q7             \n\t"
-      "vst1.32    {q12, q13}, [%[C]]!     \n\t"
+      "vld1.32    {q2, q3}, [%[c]]!       \n\t"
+      "vst1.32    {q2, q3}, [%[C]]!       \n\t"
       "subs       %[nc1], %[nc1], #1      \n\t"
       "bge        loop_nc1_%=             \n\t"
@@ -2755,11 +2995,8 @@ void Gemm::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
"blt end_nc2_%=
\n\t
"
"loop_nc2_%=:
\n\t
"
"vld1.32 {q0}, [%[c]]! \n\t"
"vld1.32 {q1}, [%[scale]]! \n\t"
"vld1.32 {q10}, [%[bias]]! \n\t"
"vmla.f32 q10, q0, q1 \n\t"
"vst1.32 {q10}, [%[C]]! \n\t"
"vld1.32 {q4}, [%[c]]!
\n\t
"
"vst1.32 {q4}, [%[C]]!
\n\t
"
"subs %[nc2], %[nc2], #1
\n\t
"
"bge loop_nc2_%=
\n\t
"
...
...
@@ -2767,663 +3004,264 @@ void Gemm::VecWriteWithAddRelu(int n, float *c, float *C, int ldc) {
"cmp %[nc3], #16
\n\t
"
"beq end_nc3_%=
\n\t
"
"sub %[c], %[c], %[nc3]
\n\t
"
"sub %[scale], %[scale], %[nc3] \n\t"
"sub %[bias], %[bias], %[nc3] \n\t"
"sub %[C], %[C], %[nc3]
\n\t
"
"vld1.32 {q0}, [%[c]]! \n\t"
"vld1.32 {q1}, [%[scale]]! \n\t"
"vld1.32 {q10}, [%[bias]]! \n\t"
"vmla.f32 q10, q0, q1 \n\t"
"vst1.32 {q10}, [%[C]]! \n\t"
"vld1.32 {q5}, [%[c]]!
\n\t
"
"vst1.32 {q5}, [%[C]]!
\n\t
"
"end_nc3_%=:
\n\t
"
:
: [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3]
"r"(nc3), [scale] "r"(scale), [bias] "r"(bias) : "memory", "q0", "q1", "q2",
"q3", "q4", "q5", "q6", "q7", "q10", "q11", "q12", "q13");
}
:
[
C
]
"r"
(
C
),
[
c
]
"r"
(
c
),
[
nc1
]
"r"
(
nc1
),
[
nc2
]
"r"
(
nc2
),
[
nc3
]
"r"
(
nc3
)
:
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
);
}
// C = alpha * A * B + beta * C
void
Gemm
::
VecWriteWithAlphaBeta
(
int
n
,
float
*
c
,
float
*
C
,
int
ldc
)
{}
// C = A * B, batchnorm(C), relu(C)
void Gemm::VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float
*scale, float *bias) { int nc1 = n / 16; int _nc1 = n % 16; int nc2 = _nc1 /
4; int nc3 = 16 - 4 * (_nc1 % 4)
;
// C = A * B + C
void
Gemm
::
VecWriteWithAdd
(
int
n
,
float
*
c
,
float
*
C
,
int
ldc
)
{
int
nc1
=
n
/
16
;
int
_nc1
=
n
%
16
;
asm
volatile
(
"vmov.f32 q14, #0.0 \n\t"
"subs %[nc1], %[nc1], #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"vld1.32 {q0, q1}, [%[c]]!
\n\t
"
"vld1.32 {q2, q3}, [%[scale]]! \n\t"
"vld1.32 {q10, q11}, [%[bias]]! \n\t"
"vmla.f32 q10, q0, q2 \n\t"
"vmla.f32 q11, q1, q3 \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vmax.f32 q11, q11, q14 \n\t"
"vld1.32 {q2, q3}, [%[C]]
\n\t
"
"vadd.f32 q10, q0, q2
\n\t
"
"vadd.f32 q11, q1, q3
\n\t
"
"vst1.32 {q10, q11}, [%[C]]!
\n\t
"
"vld1.32 {q4, q5}, [%[c]]!
\n\t
"
"vld1.32 {q6, q7}, [%[scale]]! \n\t"
"vld1.32 {q12, q13}, [%[bias]]! \n\t"
"vmla.f32 q12, q4, q6 \n\t"
"vmla.f32 q13, q5, q7 \n\t"
"vmax.f32 q12, q12, q14 \n\t"
"vmax.f32 q13, q13, q14 \n\t"
"vld1.32 {q6, q7}, [%[C]]
\n\t
"
"vadd.f32 q12, q4, q6
\n\t
"
"vadd.f32 q13, q5, q7
\n\t
"
"vst1.32 {q12, q13}, [%[C]]!
\n\t
"
"subs %[nc1], %[nc1], #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
"subs %[nc2], %[nc2], #1 \n\t"
"blt end_nc2_%= \n\t"
"loop_nc2_%=: \n\t"
"vld1.32 {q0}, [%[c]]! \n\t"
"vld1.32 {q1}, [%[scale]]! \n\t"
"vld1.32 {q10}, [%[bias]]! \n\t"
"vmla.f32 q10, q0, q1 \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vst1.32 {q10}, [%[C]]! \n\t"
"subs %[nc2], %[nc2], #1 \n\t"
"bge loop_nc2_%= \n\t"
"end_nc2_%=: \n\t"
"cmp %[nc3], #16 \n\t"
"beq end_nc3_%= \n\t"
"sub %[c], %[c], %[nc3] \n\t"
"sub %[scale], %[scale], %[nc3] \n\t"
"sub %[bias], %[bias], %[nc3] \n\t"
"sub %[C], %[C], %[nc3] \n\t"
"vld1.32 {q0}, [%[c]]! \n\t"
"vld1.32 {q1}, [%[scale]]! \n\t"
"vld1.32 {q10}, [%[bias]]! \n\t"
"vmla.f32 q10, q0, q1 \n\t"
"vmax.f32 q10, q10, q14 \n\t"
"vst1.32 {q10}, [%[C]]! \n\t"
"end_nc3_%=: \n\t"
:
: [C] "r"(C), [c] "r"(c), [nc1] "r"(nc1), [nc2] "r"(nc2), [nc3]
"r"(nc3), [scale] "r"(scale), [bias] "r"(bias) : "memory", "q0", "q1", "q2",
"q3", "q4", "q5", "q6", "q7", "q10", "q11", "q12", "q13", "q14");
}
*/
#endif // __aarch64__
#else
void Gemm::AddDot4x4(int k, const float *a, const float *b, float *c,
                     int ldc) {
  float *c0, *c1, *c2, *c3;
  c0 = c;
  c1 = c + ldc;
  c2 = c + 2 * ldc;
  c3 = c + 3 * ldc;
  for (int p = 0; p < k; p += 1) {
    // first row
    c0[0] += a[0] * b[0];
    c0[1] += a[0] * b[1];
    c0[2] += a[0] * b[2];
    c0[3] += a[0] * b[3];

    // second row
    c1[0] += a[1] * b[0];
    c1[1] += a[1] * b[1];
    c1[2] += a[1] * b[2];
    c1[3] += a[1] * b[3];

    // third row
    c2[0] += a[2] * b[0];
    c2[1] += a[2] * b[1];
    c2[2] += a[2] * b[2];
    c2[3] += a[2] * b[3];

    // fourth row
    c3[0] += a[3] * b[0];
    c3[1] += a[3] * b[1];
    c3[2] += a[3] * b[2];
    c3[3] += a[3] * b[3];

    a += 4;
    b += 4;
  }
}
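
This scalar fallback also documents the packing contract: per k step, a carries 4 values of the packed A panel and b carries 4 values of the packed B panel. A naive reference with the same layout, useful for checking a micro-kernel against a plain triple loop (hypothetical helper, not part of this file):

// Naive reference: C(4x4) += A_panel(4 x k) * B_panel(k x 4), where both
// panels are stored k-major exactly as the packed buffers used by AddDot4x4.
void add_dot_4x4_ref(int k, const float *a, const float *b, float *c, int ldc) {
  for (int p = 0; p < k; ++p) {
    for (int i = 0; i < 4; ++i) {
      for (int j = 0; j < 4; ++j) {
        c[i * ldc + j] += a[p * 4 + i] * b[p * 4 + j];
      }
    }
  }
}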
void Gemm::AddDot4x8(int k, const float *a, const float *b, float *c,
                     int ldc) {}

void Gemm::WriteBasic(int mc, int nc, float *c, float *C, int ldc) {}

void Gemm::WriteWithAlphaBeta(int mc, int nc, float *c, float *C, int ldc) {}

void Gemm::WriteWithAdd(int mc, int nc, float *c, float *C, int ldc) {}

void Gemm::WriteWithAddV1(int mc, int nc, float *c, float *C, int ldc,
                          float *bias) {}

void Gemm::WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {}

void Gemm::WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
                              float *bias) {}

void Gemm::WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc,
                             float *p, std::string mode, float *bias,
                             float *bias1) {}

void Gemm::WriteWithBn(int mc, int nc, float *c, float *C, int ldc,
                       float *new_scale, float *new_bias) {}

void Gemm::WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc,
                           float *new_scale, float *new_bias) {}

void Gemm::WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
                              float *new_scale, float *new_bias,
                              float *bias1) {}
#endif // __ARM_NEON
// 32-bit float matrix multiplication
void Gemm::Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
                 const float *B, int ldb, float beta, float *C, int ldc,
                 bool relu, float *bias) {
  // L1 data cache is 32 KiB (per core on Cortex-A57, Cortex-A72, Cortex-A73)
  // L2 cache is 0.5~4 MiB (per Cortex-A72 cluster)
  int L1 = 32 * 1024;
  int L2 = 512 * 1024;

  KC = k;
  MC = L1 / (KC * sizeof(float));
  NC = L2 / (KC * sizeof(float));

  // make sure MC is a multiple of MR, and NC is a multiple of NR
  if (MC == 0) {
    MC = MR;
  } else {
    int mblock_num = (m + MC - 1) / MC;
    MC = (m + mblock_num - 1) / mblock_num;
    MC = (MC + MR - 1) / MR * MR;
  }
  // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
  if (NC == 0) {
    NC = NR;
  } else {
    int nblock_num = (n + NC - 1) / NC;
    NC = (n + nblock_num - 1) / nblock_num;
    NC = (NC + NR - 1) / NR * NR;
  }
  // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";

  packedA = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
  packedB = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
  packedC = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
  zero =
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);

  int mc, nc;
  for (int j = 0; j < n; j += NC) {
    nc = s_min(n - j, NC);
#if __aarch64__
    // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
    PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#else
    PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#endif
    for (int i = 0; i < m; i += MC) {
      mc = s_min(m - i, MC);
#if __aarch64__
      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
      // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#else
      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#endif
      if (bias == nullptr) {
        InnerKernelWithBias(mc, nc, alpha, packedA, packedB, beta, packedC,
                            &C(i, j), ldc, relu, nullptr);
      } else {
        InnerKernelWithBias(mc, nc, alpha, packedA, packedB, beta, packedC,
                            &C(i, j), ldc, relu, bias + i);
      }
    }
  }

  paddle_mobile::memory::Free(packedA);
  paddle_mobile::memory::Free(packedB);
  paddle_mobile::memory::Free(packedC);
  paddle_mobile::memory::Free(zero);
}
void Gemm::SgemmWithBn(int m, int n, int k, float alpha, const float *A,
                       int lda, const float *B, int ldb, float beta, float *C,
                       int ldc, bool relu, float *new_scale, float *new_bias,
                       float *bias) {
  // L1 data cache is 32 KiB (per core on Cortex-A57, Cortex-A72, Cortex-A73)
  // L2 cache is 0.5~4 MiB (per Cortex-A72 cluster)
  int L1 = 32 * 1024;
  int L2 = 512 * 1024;

  KC = k;
  MC = L1 / (KC * sizeof(float));
  NC = L2 / (KC * sizeof(float));

  // make sure MC is a multiple of MR, and NC is a multiple of NR
  if (MC == 0) {
    MC = MR;
  } else {
    int mblock_num = (m + MC - 1) / MC;
    MC = (m + mblock_num - 1) / mblock_num;
    MC = (MC + MR - 1) / MR * MR;
  }
  if (NC == 0) {
    NC = NR;
  } else {
    int nblock_num = (n + NC - 1) / NC;
    NC = (n + nblock_num - 1) / nblock_num;
    NC = (NC + NR - 1) / NR * NR;
  }

  packedA = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
  packedB = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
  packedC = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
  zero =
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);

  int mc, nc;
  for (int j = 0; j < n; j += NC) {
    nc = s_min(n - j, NC);
#if __aarch64__
    // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
    PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#else
    PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#endif
    for (int i = 0; i < m; i += MC) {
      mc = s_min(m - i, MC);
#if __aarch64__
      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
      // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#else
      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#endif
      if (bias == nullptr) {
        InnerKernelWithBn(mc, nc, alpha, packedA, packedB, beta, packedC,
                          &C(i, j), ldc, relu, new_scale + i, new_bias + i);
      } else {
        InnerKernelWithBnAdd(mc, nc, alpha, packedA, packedB, beta, packedC,
                             &C(i, j), ldc, relu, new_scale + i, new_bias + i,
                             bias + i * ldc + j);
      }
    }
  }

  paddle_mobile::memory::Free(packedA);
  paddle_mobile::memory::Free(packedB);
  paddle_mobile::memory::Free(packedC);
  paddle_mobile::memory::Free(zero);
}
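
SgemmWithBn reuses the same blocking and only changes the write-back: each output row i is scaled by new_scale[i] and shifted by new_bias[i] before the optional relu. A scalar sketch of that per-row epilogue (illustration only, not part of this file):

// Reference epilogue for the batchnorm path: for row i of an mc x nc block,
// C = scale_i * (A * B) + bias_i, optionally clamped at zero when relu is set.
inline void bn_relu_row_ref(int nc, const float *acc, float *C, float scale_i,
                            float bias_i, bool relu) {
  for (int j = 0; j < nc; ++j) {
    float v = scale_i * acc[j] + bias_i;
    C[j] = (relu && v < 0.f) ? 0.f : v;
  }
}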
void Gemm::SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
                          const float *B, int ldb, float *C, int ldc, float *p,
                          std::string mode, float *bias, float *bias1) {
  // L1 data cache is 32 KiB (per core on Cortex-A57, Cortex-A72, Cortex-A73)
  // L2 cache is 0.5~4 MiB (per Cortex-A72 cluster)
  int L1 = 32 * 1024;
  int L2 = 0.5 * 1024 * 1024;

  KC = k;
  MC = L1 / (KC * sizeof(float));
  NC = L2 / (KC * sizeof(float));

  // make sure MC is a multiple of MR, and NC is a multiple of NR
  if (MC == 0) {
    MC = MR;
  } else {
    int mblock_num = (m + MC - 1) / MC;
    MC = (m + mblock_num - 1) / mblock_num;
    MC = (MC + MR - 1) / MR * MR;
  }
  if (NC == 0) {
    NC = NR;
  } else {
    int nblock_num = (n + NC - 1) / NC;
    NC = (n + nblock_num - 1) / nblock_num;
    NC = (NC + NR - 1) / NR * NR;
  }

  packedA = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
  packedB = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
  packedC = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
  zero =
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));

  for (int l = 0; l < KC; ++l) {
    zero[l] = 0;
  }

  int mc, nc;
  for (int j = 0; j < n; j += NC) {
    nc = s_min(n - j, NC);
#if __aarch64__
    // PackMatrixB_12c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
    PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#else
    PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#endif
    for (int i = 0; i < m; i += MC) {
      mc = s_min(m - i, MC);
#if __aarch64__
      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
      // PackMatrixA_8r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#else
      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);
#endif
      if (bias1 == nullptr) {
        InnerKernelWithPRelu(mc, nc, packedA, packedB, packedC, &C(i, j), ldc,
                             p + i, mode, bias + i, nullptr);
      } else {
        InnerKernelWithPRelu(mc, nc, packedA, packedB, packedC, &C(i, j), ldc,
                             p + i, mode, bias + i, bias1 + i * ldc + j);
      }
    }
  }

  paddle_mobile::memory::Free(packedA);
  paddle_mobile::memory::Free(packedB);
  paddle_mobile::memory::Free(packedC);
  paddle_mobile::memory::Free(zero);
}
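
The PRelu variant defers the activation to InnerKernelWithPRelu, which applies a parametric ReLU with a learned slope during write-back; mode controls how the slope is broadcast across the block. The activation itself is just the following (a minimal sketch; the broadcast rules are not spelled out in this file):

// PReLU on one value: positive inputs pass through, negative inputs are
// scaled by the learned slope.
inline float prelu_ref(float x, float slope) {
  return x > 0.f ? x : slope * x;
}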
// 32-bit float matrix multiplication
void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
                     const float *B, int ldb, float beta, float *C, int ldc,
                     bool relu, float *bias) {
#ifndef __aarch64__
  if (m == 1 && bias == nullptr) {
    return VectorKernel(m, n, k, alpha, A, lda, B, ldb, beta, C, ldc, relu);
  }
#endif  // __aarch64__
#ifdef _OPENMP
  int max_threads = omp_get_max_threads();
#else
  int max_threads = 1;
#endif

  // int L1 = 64 / max_threads * 1024;
  int L = (max_threads > 2) ? 64 : 32;
  int L1 = L / max_threads * 1024;
  KC = k;
  zero =
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);

  if (m > n) {
    // block matrix A
    MC = L1 / (KC * sizeof(float));
    if (MC == 0) {
      MC = MR;
    } else {
      int mblock_num = (m + MC - 1) / MC;
      MC = (m + mblock_num - 1) / mblock_num;
      MC = (MC + MR - 1) / MR * MR;
    }
    // pad matrix B up to a multiple of NR
    NC = (n + NR - 1) / NR * NR;

#if __aarch64__
    procPackA = &Gemm::PackMatrixA_6r;
    procPackB = &Gemm::PackMatrixB_omp_16c;
    procAddDot = &Gemm::AddDot6x16;
#else
    procPackA = &Gemm::PackMatrixA_6r;
    procPackB = &Gemm::PackMatrixB_omp_8c;
    procAddDot = &Gemm::AddDot6x8;
#endif

    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
    (*this.*procPackB)(KC, n, n % NR, B, ldb, packedB);
    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
  } else {
    // block matrix B
    NC = L1 / (KC * sizeof(float));
    if (NC == 0) {
      NC = NR;
    } else {
      int nblock_num = (n + NC - 1) / NC;
      NC = (n + nblock_num - 1) / nblock_num;
      NC = (NC + NR - 1) / NR * NR;
    }
    // pad matrix A up to a multiple of MR
    MC = (m + MR - 1) / MR * MR;

#if __aarch64__
    procPackA = &Gemm::PackMatrixA_omp_6r;
    procPackB = &Gemm::PackMatrixB_16c;
    procAddDot = &Gemm::AddDot6x16;
#else
    procPackA = &Gemm::PackMatrixA_omp_6r;
    procPackB = &Gemm::PackMatrixB_8c;
    procAddDot = &Gemm::AddDot6x8;
#endif

    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
    (*this.*procPackA)(m, KC, m % MR, A, lda, packedA);
    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
  }
  packedC = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads));

  if (m > n) {
#pragma omp parallel for
    for (int i = 0; i < m; i += MC) {
#ifdef _OPENMP
      int local_threads = omp_get_thread_num();
#else
      int local_threads = 0;
#endif
      int mc;
      mc = s_min(m - i, MC);
      float *local_A = packedA + MC * KC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
      (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A);
      if (bias == nullptr) {
        InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C,
                            &C(i, 0), ldc, relu, nullptr);
      } else {
        InnerKernelWithBias(mc, n, alpha, local_A, packedB, beta, local_C,
                            &C(i, 0), ldc, relu, bias + i);
      }
    }
  } else {
#pragma omp parallel for
    for (int j = 0; j < n; j += NC) {
#ifdef _OPENMP
      int local_threads = omp_get_thread_num();
#else
      int local_threads = 0;
#endif
      int nc;
      nc = s_min(n - j, NC);
      float *local_B = packedB + KC * NC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
      (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B);
      InnerKernelWithBias(m, nc, alpha, packedA, local_B, beta, local_C,
                          &C(0, j), ldc, relu, bias);
    }
  }

  paddle_mobile::memory::Free(packedA);
  paddle_mobile::memory::Free(packedB);
  paddle_mobile::memory::Free(packedC);
  paddle_mobile::memory::Free(zero);
}
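
Sgemm_omp parallelizes over rows when the problem is tall (m > n, B is packed once) and over columns otherwise (A is packed once); each thread then works in its own slice of the shared packing buffers, so the parallel loop needs no synchronization. The per-thread slicing is plain pointer arithmetic, as in this sketch (not part of the file):

#ifdef _OPENMP
#include <omp.h>
#endif

// Each OpenMP thread uses a disjoint slice of one big packing allocation.
inline float *thread_slice(float *base, int block_elems) {
#ifdef _OPENMP
  int tid = omp_get_thread_num();
#else
  int tid = 0;
#endif
  return base + static_cast<long>(block_elems) * tid;
}
// e.g. float *local_A = thread_slice(packedA, MC * KC);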
void Gemm::SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A,
                           int lda, const float *B, int ldb, float beta,
                           float *C, int ldc, bool relu, float *new_scale,
                           float *new_bias, float *bias) {
#ifdef _OPENMP
  int max_threads = omp_get_max_threads();
#else
  int max_threads = 1;
#endif

  int L1 = 64 / max_threads * 1024;
  KC = k;
  zero =
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
  if (m > n) {
    // block matrix A
    MC = L1 / (KC * sizeof(float));
    if (MC == 0) {
      MC = MR;
    } else {
      int mblock_num = (m + MC - 1) / MC;
      MC = (m + mblock_num - 1) / mblock_num;
      MC = (MC + MR - 1) / MR * MR;
    }
    // pad matrix B up to a multiple of NR
    NC = (n + NR - 1) / NR * NR;

#if __aarch64__
    procPackA = &Gemm::PackMatrixA_6r;
    procPackB = &Gemm::PackMatrixB_omp_16c;
    procAddDot = &Gemm::AddDot6x16;
#else
    procPackA = &Gemm::PackMatrixA_6r;
    procPackB = &Gemm::PackMatrixB_omp_8c;
    procAddDot = &Gemm::AddDot6x8;
#endif

    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
    (*this.*procPackB)(KC, n, n % NR, B, ldb, packedB);
    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
  } else {
    // block matrix B
    NC = L1 / (KC * sizeof(float));
    if (NC == 0) {
      NC = NR;
    } else {
      int nblock_num = (n + NC - 1) / NC;
      NC = (n + nblock_num - 1) / nblock_num;
      NC = (NC + NR - 1) / NR * NR;
    }
    // pad matrix A up to a multiple of MR
    MC = (m + MR - 1) / MR * MR;

#if __aarch64__
    procPackA = &Gemm::PackMatrixA_omp_6r;
    procPackB = &Gemm::PackMatrixB_16c;
    procAddDot = &Gemm::AddDot6x16;
#else
    procPackA = &Gemm::PackMatrixA_omp_6r;
    procPackB = &Gemm::PackMatrixB_8c;
    procAddDot = &Gemm::AddDot6x8;
#endif

    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
    (*this.*procPackA)(m, KC, m % MR, A, lda, packedA);
    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
  }
  packedC = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads));

  if (m > n) {
#pragma omp parallel for
    for (int i = 0; i < m; i += MC) {
#ifdef _OPENMP
      int local_threads = omp_get_thread_num();
#else
      int local_threads = 0;
#endif
      int mc;
      mc = s_min(m - i, MC);
      float *local_A = packedA + MC * KC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
      (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A);
      if (bias == nullptr) {
        InnerKernelWithBn(mc, n, alpha, local_A, packedB, beta, local_C,
                          &C(i, 0), ldc, relu, new_scale + i, new_bias + i);
      } else {
        InnerKernelWithBnAdd(mc, n, alpha, local_A, packedB, beta, local_C,
                             &C(i, 0), ldc, relu, new_scale + i, new_bias + i,
                             bias + i * ldc);
      }
    }
  } else {
#pragma omp parallel for
    for (int j = 0; j < n; j += NC) {
#ifdef _OPENMP
      int local_threads = omp_get_thread_num();
#else
      int local_threads = 0;
#endif
      int nc;
      nc = s_min(n - j, NC);
      float *local_B = packedB + KC * NC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
      (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B);
      if (bias == nullptr) {
        InnerKernelWithBn(m, nc, alpha, packedA, local_B, beta, local_C,
                          &C(0, j), ldc, relu, new_scale, new_bias);
      } else {
        InnerKernelWithBnAdd(m, nc, alpha, packedA, local_B, beta, local_C,
                             &C(0, j), ldc, relu, new_scale, new_bias,
                             bias + j);
      }
    }
  }

  paddle_mobile::memory::Free(packedA);
  paddle_mobile::memory::Free(packedB);
  paddle_mobile::memory::Free(packedC);
  paddle_mobile::memory::Free(zero);
}
void Gemm::SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
                              const float *B, int ldb, float *C, int ldc,
                              float *p, std::string mode, float *bias,
                              float *bias1) {
#ifdef _OPENMP
  int max_threads = omp_get_max_threads();
#else
  int max_threads = 1;
#endif

  int L1 = 8 * 1024;
  KC = k;
  zero =
      static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
  if (m > n) {
    // block matrix A
    MC = L1 / (KC * sizeof(float));
    if (MC == 0) {
      MC = MR;
    } else {
      int mblock_num = (m + MC - 1) / MC;
      MC = (m + mblock_num - 1) / mblock_num;
      MC = (MC + MR - 1) / MR * MR;
    }
    // pad matrix B up to a multiple of NR
    NC = (n + NR - 1) / NR * NR;

#if __aarch64__
    procPackA = &Gemm::PackMatrixA_6r;
    procPackB = &Gemm::PackMatrixB_omp_16c;
    procAddDot = &Gemm::AddDot6x16;
#else
    procPackA = &Gemm::PackMatrixA_6r;
    procPackB = &Gemm::PackMatrixB_omp_8c;
    procAddDot = &Gemm::AddDot6x8;
#endif

    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
    (*this.*procPackB)(KC, n, n % NR, B, ldb, packedB);
    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
  } else {
    // block matrix B
    NC = L1 / (KC * sizeof(float));
    if (NC == 0) {
      NC = NR;
    } else {
      int nblock_num = (n + NC - 1) / NC;
      NC = (n + nblock_num - 1) / nblock_num;
      NC = (NC + NR - 1) / NR * NR;
    }
    // pad matrix A up to a multiple of MR
    MC = (m + MR - 1) / MR * MR;

#if __aarch64__
    procPackA = &Gemm::PackMatrixA_omp_6r;
    procPackB = &Gemm::PackMatrixB_16c;
    procAddDot = &Gemm::AddDot6x16;
#else
    procPackA = &Gemm::PackMatrixA_omp_6r;
    procPackB = &Gemm::PackMatrixB_8c;
    procAddDot = &Gemm::AddDot6x8;
#endif

    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
    (*this.*procPackA)(m, KC, m % MR, A, lda, packedA);
    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
  }
  packedC = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads));

  if (m > n) {
#pragma omp parallel for
    for (int i = 0; i < m; i += MC) {
#ifdef _OPENMP
      int local_threads = omp_get_thread_num();
#else
      int local_threads = 0;
#endif
      int mc;
      mc = s_min(m - i, MC);
      float *local_A = packedA + MC * KC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
      (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A);
      if (bias1 == nullptr) {
        InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc,
                             p + i, mode, bias + i, nullptr);
      } else {
        InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc,
                             p + i, mode, bias + i, bias1 + i * ldc);
      }
    }
  } else {
#pragma omp parallel for
    for (int j = 0; j < n; j += NC) {
#ifdef _OPENMP
      int local_threads = omp_get_thread_num();
#else
      int local_threads = 0;
#endif
      int nc;
      nc = s_min(n - j, NC);
      float *local_B = packedB + KC * NC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
      (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B);
      if (bias1 == nullptr) {
        InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc,
                             p, mode, bias, nullptr);
      } else {
        InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc,
                             p, mode, bias, bias1 + j);
      }
    }
  }

  paddle_mobile::memory::Free(packedA);
  paddle_mobile::memory::Free(packedB);
  paddle_mobile::memory::Free(packedC);
  paddle_mobile::memory::Free(zero);
}
void Gemm::AddDot6x8(int k, const float *a, const float *b, float *c,
                     int ldc) {
#if __ARM_NEON
#if __aarch64__
  // init C
  float32x4_t cv0 = vdupq_n_f32(0.0);
  float32x4_t cv1 = vdupq_n_f32(0.0);
  float32x4_t cv2 = vdupq_n_f32(0.0);
  float32x4_t cv3 = vdupq_n_f32(0.0);
  float32x4_t cv4 = vdupq_n_f32(0.0);
  float32x4_t cv5 = vdupq_n_f32(0.0);
  float32x4_t cv6 = vdupq_n_f32(0.0);
  float32x4_t cv7 = vdupq_n_f32(0.0);
  float32x4_t cv8 = vdupq_n_f32(0.0);
  float32x4_t cv9 = vdupq_n_f32(0.0);
  float32x4_t cv10 = vdupq_n_f32(0.0);
  float32x4_t cv11 = vdupq_n_f32(0.0);

  float32x4_t av;
  float32x4_t bv0;
  float32x4_t bv1;

  float32x2_t av01;
  float32x2_t av23;
  float32x2_t av45;

  for (int p = 0; p < k; p += 1) {
    av = vld1q_f32(a);
    av01 = vget_low_f32(av);
    av23 = vget_high_f32(av);
    av45 = vld1_f32(a + 4);
    bv0 = vld1q_f32(b);
    bv1 = vld1q_f32(b + 4);

    cv0 = vmlaq_lane_f32(cv0, bv0, av01, 0);
    cv1 = vmlaq_lane_f32(cv1, bv1, av01, 0);
    cv2 = vmlaq_lane_f32(cv2, bv0, av01, 1);
    cv3 = vmlaq_lane_f32(cv3, bv1, av01, 1);
    cv4 = vmlaq_lane_f32(cv4, bv0, av23, 0);
    cv5 = vmlaq_lane_f32(cv5, bv1, av23, 0);
    cv6 = vmlaq_lane_f32(cv6, bv0, av23, 1);
    cv7 = vmlaq_lane_f32(cv7, bv1, av23, 1);
    cv8 = vmlaq_lane_f32(cv8, bv0, av45, 0);
    cv9 = vmlaq_lane_f32(cv9, bv1, av45, 0);
    cv10 = vmlaq_lane_f32(cv10, bv0, av45, 1);
    cv11 = vmlaq_lane_f32(cv11, bv1, av45, 1);

    a += MR;
    b += NR;
  }

  vst1q_f32(c, cv0);
  vst1q_f32(c + 4, cv1);
  vst1q_f32(c + ldc, cv2);
  vst1q_f32(c + ldc + 4, cv3);
  vst1q_f32(c + 2 * ldc, cv4);
  vst1q_f32(c + 2 * ldc + 4, cv5);
  vst1q_f32(c + 3 * ldc, cv6);
  vst1q_f32(c + 3 * ldc + 4, cv7);
  vst1q_f32(c + 4 * ldc, cv8);
  vst1q_f32(c + 4 * ldc + 4, cv9);
  vst1q_f32(c + 5 * ldc, cv10);
  vst1q_f32(c + 5 * ldc + 4, cv11);
#else
  const float *a_ptr, *b_ptr;
  a_ptr = a;
  b_ptr = b;
  int kc1 = k / 8;
  int kc2 = k % 8;
  int step = sizeof(float) * ldc;

  asm volatile(
      "pld [%[a_ptr]]                 \n\t"
      "pld [%[a_ptr], #64]            \n\t"
      "pld [%[b_ptr]]                 \n\t"
      "pld [%[b_ptr], #64]            \n\t"

      "vmov.f32 q4, #0.0              \n\t"
      "vmov.f32 q5, #0.0              \n\t"
      "vmov.f32 q6, #0.0              \n\t"
      "vmov.f32 q7, #0.0              \n\t"
      "vmov.f32 q8, #0.0              \n\t"
      "vmov.f32 q9, #0.0              \n\t"
      "vmov.f32 q10, #0.0             \n\t"
      "vmov.f32 q11, #0.0             \n\t"
      "vmov.f32 q12, #0.0             \n\t"
      "vmov.f32 q13, #0.0             \n\t"
      "vmov.f32 q14, #0.0             \n\t"
      "vmov.f32 q15, #0.0             \n\t"

      "subs %[kc1], %[kc1], #1        \n\t"
      "blt 2f                         \n\t"
      "1:                             \n\t"

      "pld [%[a_ptr], #128]           \n\t"
      "pld [%[b_ptr], #128]           \n\t"

      "vld1.32 {d0-d2}, [%[a_ptr]]!   \n\t"
      "vld1.32 {q2, q3}, [%[b_ptr]]!  \n\t"
      "vmla.f32 q4, q2, d0[0]         \n\t"
      "vmla.f32 q5, q3, d0[0]         \n\t"
      "vmla.f32 q6, q2, d0[1]         \n\t"
      "vmla.f32 q7, q3, d0[1]         \n\t"
      "vmla.f32 q8, q2, d1[0]         \n\t"
      "vmla.f32 q9, q3, d1[0]         \n\t"
      "vmla.f32 q10, q2, d1[1]        \n\t"
      "vmla.f32 q11, q3, d1[1]        \n\t"
      "vmla.f32 q12, q2, d2[0]        \n\t"
      "vmla.f32 q13, q3, d2[0]        \n\t"
      "vmla.f32 q14, q2, d2[1]        \n\t"
      "vmla.f32 q15, q3, d2[1]        \n\t"

      "vld1.32 {d0-d2}, [%[a_ptr]]!   \n\t"
      "vld1.32 {q2, q3}, [%[b_ptr]]!  \n\t"
      "vmla.f32 q4, q2, d0[0]         \n\t"
      "vmla.f32 q5, q3, d0[0]         \n\t"
      "vmla.f32 q6, q2, d0[1]         \n\t"
      "vmla.f32 q7, q3, d0[1]         \n\t"
      "vmla.f32 q8, q2, d1[0]         \n\t"
      "vmla.f32 q9, q3, d1[0]         \n\t"
      "vmla.f32 q10, q2, d1[1]        \n\t"
      "vmla.f32 q11, q3, d1[1]        \n\t"
      "vmla.f32 q12, q2, d2[0]        \n\t"
      "vmla.f32 q13, q3, d2[0]        \n\t"
      "vmla.f32 q14, q2, d2[1]        \n\t"
      "vmla.f32 q15, q3, d2[1]        \n\t"

      // ... the load/vmla block above is repeated six more times (eight
      // copies per main-loop iteration), with a pair of pld prefetches
      // ahead of every second copy ...

      "subs %[kc1], %[kc1], #1        \n\t"
      "bge 1b                         \n\t"
      "2:                             \n\t"

      "subs %[kc2], %[kc2], #1        \n\t"
      "blt 4f                         \n\t"
      "3:                             \n\t"

      "vld1.32 {d0-d2}, [%[a_ptr]]!   \n\t"
      "vld1.32 {q2, q3}, [%[b_ptr]]!  \n\t"
      "vmla.f32 q4, q2, d0[0]         \n\t"
      "vmla.f32 q5, q3, d0[0]         \n\t"
      "vmla.f32 q6, q2, d0[1]         \n\t"
      "vmla.f32 q7, q3, d0[1]         \n\t"
      "vmla.f32 q8, q2, d1[0]         \n\t"
      "vmla.f32 q9, q3, d1[0]         \n\t"
      "vmla.f32 q10, q2, d1[1]        \n\t"
      "vmla.f32 q11, q3, d1[1]        \n\t"
      "vmla.f32 q12, q2, d2[0]        \n\t"
      "vmla.f32 q13, q3, d2[0]        \n\t"
      "vmla.f32 q14, q2, d2[1]        \n\t"
      "vmla.f32 q15, q3, d2[1]        \n\t"

      "subs %[kc2], %[kc2], #1        \n\t"
      "bge 3b                         \n\t"
      "4:                             \n\t"

      "mov r5, %[c]                   \n\t"
      "mov r6, %[step]                \n\t"
      "vst1.32 {q4, q5}, [r5], r6     \n\t"
      "vst1.32 {q6, q7}, [r5], r6     \n\t"
      "vst1.32 {q8, q9}, [r5], r6     \n\t"
      "vst1.32 {q10, q11}, [r5], r6   \n\t"
      "vst1.32 {q12, q13}, [r5], r6   \n\t"
      "vst1.32 {q14, q15}, [r5]       \n\t"

      :
      : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
        [kc2] "r"(kc2), [step] "r"(step)
      : "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
        "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif  // __aarch64__
#endif  // __ARM_NEON
}
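
AddDot6x8 is the 6x8 micro-kernel: every k step multiplies 6 packed A values by 8 packed B values and accumulates into a 6x8 tile of C held in registers, which is stored once at the end. Its scalar contract, written out for reference under the same packing layout (illustration only, not part of this file):

// Reference for the 6x8 micro-kernel: a advances by 6 and b by 8 per k step,
// and the finished tile overwrites c, just like the NEON stores above.
void add_dot_6x8_ref(int k, const float *a, const float *b, float *c, int ldc) {
  float acc[6][8] = {};  // accumulators start at zero
  for (int p = 0; p < k; ++p) {
    for (int i = 0; i < 6; ++i) {
      for (int j = 0; j < 8; ++j) {
        acc[i][j] += a[p * 6 + i] * b[p * 8 + j];
      }
    }
  }
  for (int i = 0; i < 6; ++i) {
    for (int j = 0; j < 8; ++j) {
      c[i * ldc + j] = acc[i][j];
    }
  }
}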
#if __aarch64__
void Gemm::AddDot8x12(int k, const float *a, const float *b, float *c,
                      int ldc) {
  const float *a_ptr, *b_ptr;
  a_ptr = a;
  b_ptr = b;
  int kc1 = k;
  int step = 4 * ldc;

  asm volatile(
      "dup v5.4s, wzr                 \n\t"
      "dup v6.4s, wzr                 \n\t"
      "dup v7.4s, wzr                 \n\t"
      "dup v8.4s, wzr                 \n\t"
      "dup v9.4s, wzr                 \n\t"
      "dup v10.4s, wzr                \n\t"
      "dup v11.4s, wzr                \n\t"
      "dup v12.4s, wzr                \n\t"
      "dup v13.4s, wzr                \n\t"
      "dup v14.4s, wzr                \n\t"
      "dup v15.4s, wzr                \n\t"
      "dup v16.4s, wzr                \n\t"
      "dup v17.4s, wzr                \n\t"
      "dup v18.4s, wzr                \n\t"
      "dup v19.4s, wzr                \n\t"
      "dup v20.4s, wzr                \n\t"
      "dup v21.4s, wzr                \n\t"
      "dup v22.4s, wzr                \n\t"
      "dup v23.4s, wzr                \n\t"
      "dup v24.4s, wzr                \n\t"
      "dup v25.4s, wzr                \n\t"
      "dup v26.4s, wzr                \n\t"
      "dup v27.4s, wzr                \n\t"
      "dup v28.4s, wzr                \n\t"

      "subs %[kc1], %[kc1], #1        \n\t"
      "blt 2f                         \n\t"
      "1:                             \n\t"

      "prfm pldl1keep, [%[a_ptr], #32]                  \n\t"
      "prfm pldl1keep, [%[b_ptr], #48]                  \n\t"

      "ld1 {v0.4s, v1.4s}, [%[a_ptr]], #32              \n\t"
      "ld1 {v2.4s, v3.4s, v4.4s}, [%[b_ptr]], #48       \n\t"

      "fmla v5.4s, v2.4s, v0.s[0]     \n\t"
      "fmla v6.4s, v3.4s, v0.s[0]     \n\t"
      "fmla v7.4s, v4.4s, v0.s[0]     \n\t"
      "fmla v8.4s, v2.4s, v0.s[1]     \n\t"
      "fmla v9.4s, v3.4s, v0.s[1]     \n\t"
      "fmla v10.4s, v4.4s, v0.s[1]    \n\t"
      "fmla v11.4s, v2.4s, v0.s[2]    \n\t"
      "fmla v12.4s, v3.4s, v0.s[2]    \n\t"
      "fmla v13.4s, v4.4s, v0.s[2]    \n\t"
      "fmla v14.4s, v2.4s, v0.s[3]    \n\t"
      "fmla v15.4s, v3.4s, v0.s[3]    \n\t"
      "fmla v16.4s, v4.4s, v0.s[3]    \n\t"

      "fmla v17.4s, v2.4s, v1.s[0]    \n\t"
      "fmla v18.4s, v3.4s, v1.s[0]    \n\t"
      "fmla v19.4s, v4.4s, v1.s[0]    \n\t"
      "fmla v20.4s, v2.4s, v1.s[1]    \n\t"
      "fmla v21.4s, v3.4s, v1.s[1]    \n\t"
      "fmla v22.4s, v4.4s, v1.s[1]    \n\t"
      "fmla v23.4s, v2.4s, v1.s[2]    \n\t"
      "fmla v24.4s, v3.4s, v1.s[2]    \n\t"
      "fmla v25.4s, v4.4s, v1.s[2]    \n\t"
      "fmla v26.4s, v2.4s, v1.s[3]    \n\t"
      "fmla v27.4s, v3.4s, v1.s[3]    \n\t"
      "fmla v28.4s, v4.4s, v1.s[3]    \n\t"

      "subs %[kc1], %[kc1], #1        \n\t"
      "bge 1b                         \n\t"
      "2:                             \n\t"

      "st1 {v5.4s, v6.4s, v7.4s}, [%[c]], %[step]       \n\t"
      "st1 {v8.4s, v9.4s, v10.4s}, [%[c]], %[step]      \n\t"
      "st1 {v11.4s, v12.4s, v13.4s}, [%[c]], %[step]    \n\t"
      "st1 {v14.4s, v15.4s, v16.4s}, [%[c]], %[step]    \n\t"
      "st1 {v17.4s, v18.4s, v19.4s}, [%[c]], %[step]    \n\t"
      "st1 {v20.4s, v21.4s, v22.4s}, [%[c]], %[step]    \n\t"
      "st1 {v23.4s, v24.4s, v25.4s}, [%[c]], %[step]    \n\t"
      "st1 {v26.4s, v27.4s, v28.4s}, [%[c]], %[step]    \n\t"
      :
      : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
        [step] "r"(step)
      : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
        "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
        "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28");
}
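
An 8x12 tile needs 8 * (12 / 4) = 24 quad-word accumulators (v5 through v28 above), which is why this kernel only exists on AArch64 with its 32 NEON registers; the spare registers stream the A and B panels. A quick register-budget check for the tile shapes used in this file (assuming 4 float lanes per vector):

// Accumulator count for an MR x NR micro-kernel with 128-bit float vectors.
constexpr int acc_regs(int mr, int nr, int lanes = 4) {
  return mr * (nr / lanes);
}

static_assert(acc_regs(8, 12) == 24, "8x12 tile uses 24 accumulators");
static_assert(acc_regs(6, 16) == 24, "6x16 tile also uses 24 accumulators");
static_assert(acc_regs(6, 8) == 12, "the armv7 6x8 tile fits in q4..q15");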
#if __aarch64__
procPackA
=
&
Gemm
::
PackMatrixA_omp_6r
;
procPackB
=
&
Gemm
::
PackMatrixB_16c
;
procAddDot
=
&
Gemm
::
AddDot6x16
;
#else
procPackA
=
&
Gemm
::
PackMatrixA_omp_6r
;
procPackB
=
&
Gemm
::
PackMatrixB_8c
;
procAddDot
=
&
Gemm
::
AddDot6x8
;
#endif
void
Gemm
::
AddDot6x16
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
const
float
*
a_ptr
,
*
b_ptr
;
a_ptr
=
a
;
b_ptr
=
b
;
int
kc1
=
k
;
int
step
=
4
*
ldc
;
int
step1
=
4
*
6
;
asm
volatile
(
packedA
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
KC
));
(
*
this
.
*
procPackA
)(
m
,
KC
,
m
%
MR
,
A
,
lda
,
packedA
);
packedB
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
*
NC
*
max_threads
));
}
packedC
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
MC
*
NC
*
max_threads
));
"dup v6.4s, wzr
\n\t
"
"dup v7.4s, wzr
\n\t
"
"dup v8.4s, wzr
\n\t
"
"dup v9.4s, wzr
\n\t
"
"dup v10.4s, wzr
\n\t
"
"dup v11.4s, wzr
\n\t
"
"dup v12.4s, wzr
\n\t
"
"dup v13.4s, wzr
\n\t
"
if
(
m
>
n
)
{
#pragma omp parallel for
for
(
int
i
=
0
;
i
<
m
;
i
+=
MC
)
{
#ifdef _OPENMP
int
local_threads
=
omp_get_thread_num
();
#else
int
local_threads
=
0
;
#endif
"dup v14.4s, wzr
\n\t
"
"dup v15.4s, wzr
\n\t
"
"dup v16.4s, wzr
\n\t
"
"dup v17.4s, wzr
\n\t
"
"dup v18.4s, wzr
\n\t
"
"dup v19.4s, wzr
\n\t
"
"dup v20.4s, wzr
\n\t
"
"dup v21.4s, wzr
\n\t
"
int
mc
;
mc
=
s_min
(
m
-
i
,
MC
);
float
*
local_A
=
packedA
+
MC
*
KC
*
local_threads
;
float
*
local_C
=
packedC
+
MC
*
NC
*
local_threads
;
(
*
this
.
*
procPackA
)(
mc
,
KC
,
mc
%
MR
,
&
A
(
i
,
0
),
lda
,
local_A
);
if
(
bias
==
nullptr
)
{
InnerKernelWithBn
(
mc
,
n
,
alpha
,
local_A
,
packedB
,
beta
,
local_C
,
&
C
(
i
,
0
),
ldc
,
relu
,
new_scale
+
i
,
new_bias
+
i
);
}
else
{
InnerKernelWithBnAdd
(
mc
,
n
,
alpha
,
local_A
,
packedB
,
beta
,
local_C
,
&
C
(
i
,
0
),
ldc
,
relu
,
new_scale
+
i
,
new_bias
+
i
,
bias
+
i
*
ldc
);
}
}
}
else
{
#pragma omp parallel for
for
(
int
j
=
0
;
j
<
n
;
j
+=
NC
)
{
#ifdef _OPENMP
int
local_threads
=
omp_get_thread_num
();
#else
int
local_threads
=
0
;
#endif
"dup v22.4s, wzr
\n\t
"
"dup v23.4s, wzr
\n\t
"
"dup v24.4s, wzr
\n\t
"
"dup v25.4s, wzr
\n\t
"
"dup v26.4s, wzr
\n\t
"
"dup v27.4s, wzr
\n\t
"
"dup v28.4s, wzr
\n\t
"
"dup v29.4s, wzr
\n\t
"
int
nc
;
nc
=
s_min
(
n
-
j
,
NC
);
float
*
local_B
=
packedB
+
KC
*
NC
*
local_threads
;
float
*
local_C
=
packedC
+
MC
*
NC
*
local_threads
;
(
*
this
.
*
procPackB
)(
KC
,
nc
,
nc
%
NR
,
&
B
(
0
,
j
),
ldb
,
local_B
);
if
(
bias
==
nullptr
)
{
InnerKernelWithBn
(
m
,
nc
,
alpha
,
packedA
,
local_B
,
beta
,
local_C
,
&
C
(
0
,
j
),
ldc
,
relu
,
new_scale
,
new_bias
);
}
else
{
InnerKernelWithBnAdd
(
m
,
nc
,
alpha
,
packedA
,
local_B
,
beta
,
local_C
,
&
C
(
0
,
j
),
ldc
,
relu
,
new_scale
,
new_bias
,
bias
+
j
);
}
}
}
"subs %[kc1], %[kc1], #1
\n\t
"
"blt 2f
\n\t
"
"1:
\n\t
"
paddle_mobile
::
memory
::
Free
(
packedA
);
paddle_mobile
::
memory
::
Free
(
packedB
);
paddle_mobile
::
memory
::
Free
(
packedC
);
paddle_mobile
::
memory
::
Free
(
zero
);
}
"prfm pldl1keep, [%[a_ptr], #24]
\n\t
"
"prfm pldl1keep, [%[b_ptr], #64]
\n\t
"
void
Gemm
::
SgemmWithPRelu_omp
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{
#ifdef _OPENMP
int
max_threads
=
omp_get_max_threads
();
#else
int
max_threads
=
1
;
#endif
"ld1 {v0.4s, v1.4s}, [%[a_ptr]], %[step1]
\n\t
"
"ld1 {v2.4s, v3.4s, v4.4s, v5.4s}, [%[b_ptr]], #64
\n\t
"
int
L1
=
8
*
1024
;
KC
=
k
;
zero
=
static_cast
<
float
*>
(
paddle_mobile
::
memory
::
Alloc
(
sizeof
(
float
)
*
KC
));
memset
(
static_cast
<
void
*>
(
zero
),
0
,
sizeof
(
float
)
*
KC
);
if
(
m
>
n
)
{
// 对 A 分块
MC
=
L1
/
(
KC
*
sizeof
(
float
));
if
(
MC
==
0
)
{
MC
=
MR
;
}
else
{
int
mblock_num
=
(
m
+
MC
-
1
)
/
MC
;
MC
=
(
m
+
mblock_num
-
1
)
/
mblock_num
;
MC
=
(
MC
+
MR
-
1
)
/
MR
*
MR
;
}
// 补齐 B
NC
=
(
n
+
NR
-
1
)
/
NR
*
NR
;
"fmla v6.4s, v2.4s, v0.s[0]
\n\t
"
"fmla v7.4s, v3.4s, v0.s[0]
\n\t
"
"fmla v8.4s, v4.4s, v0.s[0]
\n\t
"
"fmla v9.4s, v5.4s, v0.s[0]
\n\t
"
#if __aarch64__
procPackA
=
&
Gemm
::
PackMatrixA_6r
;
procPackB
=
&
Gemm
::
PackMatrixB_omp_16c
;
procAddDot
=
&
Gemm
::
AddDot6x16
;
#else
procPackA
=
&
Gemm
::
PackMatrixA_6r
;
procPackB
=
&
Gemm
::
PackMatrixB_omp_8c
;
procAddDot
=
&
Gemm
::
AddDot6x8
;
#endif
"fmla v10.4s, v2.4s, v0.s[1]
\n\t
"
"fmla v11.4s, v3.4s, v0.s[1]
\n\t
"
"fmla v12.4s, v4.4s, v0.s[1]
\n\t
"
"fmla v13.4s, v5.4s, v0.s[1]
\n\t
"
    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
    (*this.*procPackB)(KC, n, n % NR, B, ldb, packedB);
    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
  } else {
    // block matrix B
    NC = L1 / (KC * sizeof(float));
    if (NC == 0) {
      NC = NR;
    } else {
      int nblock_num = (n + NC - 1) / NC;
      NC = (n + nblock_num - 1) / nblock_num;
      NC = (NC + NR - 1) / NR * NR;
    }
    // pad matrix A
    MC = (m + MR - 1) / MR * MR;
"fmla v14.4s, v2.4s, v0.s[2]
\n\t
"
"fmla v15.4s, v3.4s, v0.s[2]
\n\t
"
"fmla v16.4s, v4.4s, v0.s[2]
\n\t
"
"fmla v17.4s, v5.4s, v0.s[2]
\n\t
"
#if __aarch64__
    procPackA = &Gemm::PackMatrixA_omp_6r;
    procPackB = &Gemm::PackMatrixB_16c;
    procAddDot = &Gemm::AddDot6x16;
#else
    procPackA = &Gemm::PackMatrixA_omp_6r;
    procPackB = &Gemm::PackMatrixB_8c;
    procAddDot = &Gemm::AddDot6x8;
#endif
"fmla v18.4s, v2.4s, v0.s[3]
\n\t
"
"fmla v19.4s, v3.4s, v0.s[3]
\n\t
"
"fmla v20.4s, v4.4s, v0.s[3]
\n\t
"
"fmla v21.4s, v5.4s, v0.s[3]
\n\t
"
    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
    (*this.*procPackA)(m, KC, m % MR, A, lda, packedA);
    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
  }
  packedC = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads));
"fmla v22.4s, v2.4s, v1.s[0]
\n\t
"
"fmla v23.4s, v3.4s, v1.s[0]
\n\t
"
"fmla v24.4s, v4.4s, v1.s[0]
\n\t
"
"fmla v25.4s, v5.4s, v1.s[0]
\n\t
"
  if (m > n) {
#pragma omp parallel for
    for (int i = 0; i < m; i += MC) {
#ifdef _OPENMP
      int local_threads = omp_get_thread_num();
#else
      int local_threads = 0;
#endif
"fmla v26.4s, v2.4s, v1.s[1]
\n\t
"
"fmla v27.4s, v3.4s, v1.s[1]
\n\t
"
"fmla v28.4s, v4.4s, v1.s[1]
\n\t
"
"fmla v29.4s, v5.4s, v1.s[1]
\n\t
"
      int mc;
      mc = s_min(m - i, MC);
      float *local_A = packedA + MC * KC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
      (*this.*procPackA)(mc, KC, mc % MR, &A(i, 0), lda, local_A);
      if (bias1 == nullptr) {
        InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc,
                             p + i, mode, bias + i, nullptr);
      } else {
        InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc,
                             p + i, mode, bias + i, bias1 + i * ldc);
      }
    }
  } else {
#pragma omp parallel for
    for (int j = 0; j < n; j += NC) {
#ifdef _OPENMP
      int local_threads = omp_get_thread_num();
#else
      int local_threads = 0;
#endif
"subs %[kc1], %[kc1], #1
\n\t
"
"bge 1b
\n\t
"
"2:
\n\t
"
      int nc;
      nc = s_min(n - j, NC);
      float *local_B = packedB + KC * NC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
      (*this.*procPackB)(KC, nc, nc % NR, &B(0, j), ldb, local_B);
      if (bias1 == nullptr) {
        InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc,
                             p, mode, bias, nullptr);
      } else {
        InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc,
                             p, mode, bias, bias1 + j);
      }
    }
  }
"st1 {v6.4s, v7.4s, v8.4s, v9.4s}, [%[c]], %[step]
\n\t
"
"st1 {v10.4s, v11.4s, v12.4s, v13.4s}, [%[c]], %[step]
\n\t
"
"st1 {v14.4s, v15.4s, v16.4s, v17.4s}, [%[c]], %[step]
\n\t
"
"st1 {v18.4s, v19.4s, v20.4s, v21.4s}, [%[c]], %[step]
\n\t
"
"st1 {v22.4s, v23.4s, v24.4s, v25.4s}, [%[c]], %[step]
\n\t
"
"st1 {v26.4s, v27.4s, v28.4s, v29.4s}, [%[c]], %[step]
\n\t
"
      :
      : [a_ptr] "r"(a_ptr), [b_ptr] "r"(b_ptr), [c] "r"(c), [kc1] "r"(kc1),
        [step] "r"(step), [step1] "r"(step1)
      : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
        "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19",
        "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29");
  paddle_mobile::memory::Free(packedA);
  paddle_mobile::memory::Free(packedB);
  paddle_mobile::memory::Free(packedC);
  paddle_mobile::memory::Free(zero);
}
#endif  // __aarch64__

}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
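Both Sgemm*_omp variants size their cache blocks the same way before the loops above: when m > n they block the A panels (MC) against an 8 KB L1 budget and round NC up to a multiple of NR, otherwise they block the B panels (NC) and round MC up to a multiple of MR. A standalone sketch of that block-size calculation, with the tile sizes passed in explicitly since the real MR/NR constants live elsewhere in gemm.h:

#include <cstdio>

// Round the L1-derived block size up to a whole number of register tiles.
int ChooseBlock(int dim, int KC, int tile, int l1_bytes) {
  int block = l1_bytes / (KC * static_cast<int>(sizeof(float)));
  if (block == 0) {
    block = tile;
  } else {
    int num_blocks = (dim + block - 1) / block;   // blocks needed to cover dim
    block = (dim + num_blocks - 1) / num_blocks;  // even the blocks out
    block = (block + tile - 1) / tile * tile;     // round up to a tile multiple
  }
  return block;
}

int main() {
  const int L1 = 8 * 1024;  // same budget as the code above
  std::printf("MC for m=512, k=128, MR=6: %d\n", ChooseBlock(512, 128, 6, L1));
  std::printf("NC for n=960, k=128, NR=8: %d\n", ChooseBlock(960, 128, 8, L1));
  return 0;
}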
src/operators/math/gemm.h  View file @ 5acae32b
... ...
@@ -46,15 +46,6 @@ namespace math {
class Gemm {
 public:
  /*
  // Pack a block of matrix A into contiguous memory (ColMajor)
  void PackMatrixA(int m, int k, int m_tail, const float *A, int lda,
                   float *buffer);
  // Pack a block of matrix B into contiguous memory (ColMajor)
  void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                   float *buffer);
  */
  typedef void (Gemm::*FnPack)(int, int, int, const float *, int, float *);
  typedef void (Gemm::*FnAddDot)(int, const float *, const float *, float *,
                                 int);
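FnPack and FnAddDot are pointers to Gemm member functions; the _omp entry points in gemm.cpp pick a packing routine and a micro-kernel once per call and then invoke them through (*this.*procPackA)(...). A small self-contained illustration of that dispatch pattern, using a hypothetical class and method names rather than the library's:

#include <cstdio>

class Dispatcher {
 public:
  typedef void (Dispatcher::*FnPack)(int rows);

  void PackRows6(int rows) { std::printf("pack %d rows in 6-row panels\n", rows); }
  void PackRows8(int rows) { std::printf("pack %d rows in 8-row panels\n", rows); }

  void Run(int rows, bool wide_kernel) {
    // Choose the member function once, the way procPackA/procPackB are chosen.
    FnPack procPack =
        wide_kernel ? &Dispatcher::PackRows8 : &Dispatcher::PackRows6;
    // Invoke it through the member-function pointer on *this.
    (*this.*procPack)(rows);
  }
};

int main() {
  Dispatcher d;
  d.Run(48, /*wide_kernel=*/true);
  return 0;
}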
... ...
@@ -62,31 +53,31 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
  FnPack procPackB;
  FnAddDot procAddDot;

  // Pack blocks of matrix A into contiguous memory (RowMajor)
  // Pack blocks of the A\B matrices into contiguous memory (RowMajor)
  void PackMatrixA_4r(int m, int k, int m_tail, const float *A, int lda,
                      float *buffer);
  void PackMatrixA_6r(int m, int k, int m_tail, const float *A, int lda,
                      float *buffer);
  void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
                      float *buffer);
  void PackMatrixA_omp_6r(int m, int k, int m_tail, const float *A, int lda,
                          float *buffer);
  void PackMatrixA_8r(int m, int k, int m_tail, const float *A, int lda,
                      float *buffer);
  void PackMatrixA_omp_8r(int m, int k, int m_tail, const float *A, int lda,
                          float *buffer);

  // Pack blocks of matrix B into contiguous memory (RowMajor)
  void PackMatrixB_8c(int k, int n, int n_tail, const float *B, int ldb,
                      float *buffer);
  void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
                       float *buffer);
  void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
                       float *buffer);
  void PackMatrixB_omp_8c(int k, int n, int n_tail, const float *B, int ldb,
                          float *buffer);
#if __aarch64__
  void PackMatrixB_12c(int k, int n, int n_tail, const float *B, int ldb,
                       float *buffer);
  void PackMatrixB_omp_12c(int k, int n, int n_tail, const float *B, int ldb,
                           float *buffer);
  void PackMatrixB_16c(int k, int n, int n_tail, const float *B, int ldb,
                       float *buffer);
  void PackMatrixB_omp_16c(int k, int n, int n_tail, const float *B, int ldb,
                           float *buffer);
#endif

  // blocked matrix multiplication
  void InnerKernel(int mc, int nc, float alpha, const float *a, const float *b,
... ...
@@ -106,22 +97,16 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
                            float *c, float *C, int ldc, float *p,
                            std::string mode, float *bias, float *bias1);
  // vector-matrix multiplication (M = 1)
  void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
                    const float *B, int ldb, float beta, float *C, int ldc,
                    bool relu);
  /*
  void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
                          int lda, const float *B, int ldb, float beta, float
                          *C, int ldc, bool relu, float *new_scale, float *new_bias);
  */
  // compute a smaller block of matrix C
  void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
  void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
#if __aarch64__
  void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
  void AddDot8x12(int k, const float *a, const float *b, float *c, int ldc);
  void AddDot6x16(int k, const float *a, const float *b, float *c, int ldc);
#else
  void AddDot4x4(int k, const float *a, const float *b, float *c, int ldc);
  void AddDot4x8(int k, const float *a, const float *b, float *c, int ldc);
  void AddDot6x8(int k, const float *a, const float *b, float *c, int ldc);
#endif
  // write back the result of blocked matrix multiplication
  // C = A * B
... ...
@@ -149,6 +134,18 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
  void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc,
                          float *new_scale, float *new_bias, float *bias1);
  // vector-matrix multiplication (M = 1)
#if __aarch64__
#else
  void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
                    const float *B, int ldb, float beta, float *C, int ldc,
                    bool relu);
  void VectorKernelWithBn(int m, int n, int k, float alpha, const float *A,
                          int lda, const float *B, int ldb, float beta,
                          float *C, int ldc, bool relu, float *new_scale,
                          float *new_bias);

  // write back the result of vector-matrix multiplication
  // C = A * B
  void VecWriteBasic(int n, float *c, float *C, int ldc);
... ...
@@ -158,14 +155,13 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
  void VecWriteWithAdd(int n, float *c, float *C, int ldc);
  // C = A * B + C, relu(C)
  void VecWriteWithAddRelu(int n, float *c, float *C, int ldc);
  /*
  // C = A * B, batchnorm(C)
  void VecWriteWithBn(int n, float *c, float *C, int ldc, float *new_scale,
                      float *new_bias);
  // C = A * B, batchnorm(C), relu(C)
  void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
                          float *new_bias);
  */
  void VecWriteWithBnRelu(int n, float *c, float *C, int ldc, float *new_scale,
                          float *new_bias);
#endif

  // 32-bit float matrix multiplication
  void Sgemm(int m, int n, int k, float alpha, const float *A, int lda,
... ...
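For reference, Sgemm computes C = alpha * A * B + beta * C over row-major buffers with leading dimensions lda/ldb/ldc (as the A(i, 0) / C(i, 0) indexing in gemm.cpp suggests). A naive triple loop like the one below, which is not part of the library, is a convenient oracle for checking the packed kernels on small shapes:

#include <vector>

// Reference: C(m x n) = alpha * A(m x k) * B(k x n) + beta * C, row-major.
void NaiveSgemm(int m, int n, int k, float alpha, const float *A, int lda,
                const float *B, int ldb, float beta, float *C, int ldc) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) {
        acc += A[i * lda + p] * B[p * ldb + j];
      }
      C[i * ldc + j] = alpha * acc + beta * C[i * ldc + j];
    }
  }
}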
src/operators/op_param.h  View file @ 5acae32b
... ...
@@ -1521,33 +1521,20 @@ class SliceParam : public OpParam {
 public:
  SliceParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
             const AttributeMap &attrs, const Scope &scope) {
    input_x_ = InputXFrom<GType>(inputs, scope);
    input_shape_ = InputShapeFrom<GType>(inputs, scope);
    out_ = OutFrom<GType>(outputs, scope);
    axis_ = GetAttr<int>("axis", attrs);
    slice_points_ = GetAttr<vector<int>>("slice_points", attrs);
    inplace_ = GetAttr<bool>("inplace", attrs);
  }

  const RType *InputX() const { return input_x_; }
  const RType *InputShape() const { return input_shape_; }
  RType *Out() const { return out_; }
  const int &Axis() const { return axis_; }
  const vector<int> &SlicePoints() const { return slice_points_; }
    input_ = InputFrom<GType>(inputs, scope);
    output_ = OutFrom<GType>(outputs, scope);
  const bool &Inplace() const { return inplace_; }
    axes_ = GetAttr<std::vector<int>>("axes", attrs);
    starts_ = GetAttr<std::vector<int>>("starts", attrs);
    ends_ = GetAttr<std::vector<int>>("ends", attrs);
  }

 private:
  RType *input_x_;
  RType *input_shape_;
  RType *out_;
  int axis_;
  vector<int> slice_points_;
  bool inplace_;

 public:
  GType *input_;
  GType *output_;
  std::vector<int> axes_;
  std::vector<int> starts_;
  std::vector<int> ends_;
};
#endif
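The rewritten SliceParam now carries the attributes used by the Paddle slice operator: for each entry of axes_, the output keeps the index range [starts_[i], ends_[i]) along that axis. A small sketch of the resulting shape arithmetic, assuming the usual clamping and negative-index handling rather than quoting the kernel itself:

#include <algorithm>
#include <vector>

// Compute the output shape of a slice given axes/starts/ends, clamping each
// range into [0, dim] and allowing negative indices counted from the end.
std::vector<int> SliceOutShape(std::vector<int> in_shape,
                               const std::vector<int> &axes,
                               const std::vector<int> &starts,
                               const std::vector<int> &ends) {
  for (size_t i = 0; i < axes.size(); ++i) {
    const int dim = in_shape[axes[i]];
    int start = starts[i] < 0 ? starts[i] + dim : starts[i];
    int end = ends[i] < 0 ? ends[i] + dim : ends[i];
    start = std::max(0, std::min(start, dim));
    end = std::max(start, std::min(end, dim));
    in_shape[axes[i]] = end - start;
  }
  return in_shape;
}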
... ...
tools/op.cmake  View file @ 5acae32b
... ...
@@ -290,6 +290,9 @@ if(NOT FOUND_MATCH)
    set(READ_FROM_ARRAY_OP ON)
    set(IS_EMPTY_OP ON)
    set(INCREMENT_OP ON)
    set(ANCHOR_GENERATOR_OP ON)
    set(PROPOSAL_OP ON)
    set(PSROI_POOL_OP ON)
endif()
# option(BATCHNORM_OP "" ON)
... ...
@@ -580,3 +583,13 @@ endif()
if (INCREMENT_OP)
    add_definitions(-DINCREMENT_OP)
endif()
if (ANCHOR_GENERATOR_OP)
    add_definitions(-DANCHOR_GENERATOR_OP)
endif()
if (PROPOSAL_OP)
    add_definitions(-DPROPOSAL_OP)
endif()
if (PSROI_POOL_OP)
    add_definitions(-DPSROI_POOL_OP)
endif()
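Each add_definitions call above only defines a preprocessor symbol; the corresponding operator sources compile their bodies only when that symbol is present. A trimmed illustration of that gating pattern with a hypothetical function, not the project's actual registration macros:

// Built only when tools/op.cmake set PROPOSAL_OP ON, i.e. -DPROPOSAL_OP.
#ifdef PROPOSAL_OP
void RunProposalDemo() {
  // ...operator-specific code lives behind the guard...
}
#endif

int main() {
#ifdef PROPOSAL_OP
  RunProposalDemo();
#endif
  return 0;
}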