PaddlePaddle / Paddle-Lite

Commit 1d475a2c, authored March 10, 2019 by hjchen2
Commit message: update
Parent: e0f97f83
Showing 49 changed files with 570 additions and 1794 deletions (+570 −1794).
Changed files:

src/framework/load_ops.h  +0 −8
src/operators/conv_op.cpp  +4 −4
src/operators/depthwise_conv_op.cpp  +4 −4
src/operators/fusion_conv_add_add_prelu_op.cpp  +0 −61
src/operators/fusion_conv_add_add_prelu_op.h  +0 −79
src/operators/fusion_conv_add_bn_op.cpp  +4 −4
src/operators/fusion_conv_add_bn_relu_op.cpp  +4 −4
src/operators/fusion_conv_add_op.cpp  +4 −4
src/operators/fusion_conv_add_prelu_op.cpp  +0 −61
src/operators/fusion_conv_add_prelu_op.h  +0 −71
src/operators/fusion_conv_add_relu_op.cpp  +4 −4
src/operators/fusion_conv_bn_add_relu_op.cpp  +4 −4
src/operators/fusion_conv_bn_op.cpp  +4 −3
src/operators/fusion_conv_bn_relu_op.cpp  +4 −4
src/operators/fusion_dwconv_bn_relu_op.cpp  +4 −4
src/operators/kernel/arm/convolution/conv_add_add_prelu_kernel.cpp  +0 −39
src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp  +5 −14
src/operators/kernel/arm/convolution/conv_add_kernel.cpp  +3 −12
src/operators/kernel/arm/convolution/conv_add_prelu_kernel.cpp  +0 −38
src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp  +4 −10
src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp  +5 −14
src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp  +5 −14
src/operators/kernel/arm/convolution/conv_kernel.cpp  +2 −6
src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp  +6 −14
src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h  +0 −128
src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h  +0 −124
src/operators/kernel/central-arm-func/conv_arm_func.cpp  +242 −0
src/operators/kernel/central-arm-func/conv_arm_func.h  +11 −366
src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h  +0 −122
src/operators/kernel/conv_add_add_prelu_kernel.h  +0 −45
src/operators/kernel/conv_add_bn_kernel.h  +0 −1
src/operators/kernel/conv_add_bn_relu_kernel.h  +0 −1
src/operators/kernel/conv_add_kernel.h  +0 −1
src/operators/kernel/conv_add_prelu_kernel.h  +0 −45
src/operators/kernel/conv_add_relu_kernel.h  +0 −1
src/operators/kernel/conv_bn_add_relu_kernel.h  +0 −1
src/operators/kernel/conv_bn_kernel.h  +0 −1
src/operators/kernel/conv_bn_relu_kernel.h  +0 −1
src/operators/kernel/dwconv_bn_relu_kernel.h  +0 −1
src/operators/kernel/lrn_kernel.h  +6 −9
src/operators/math/activation.h  +1 −1
src/operators/math/channel_wise.h  +2 −77
src/operators/math/depthwise_conv3x3.h  +0 −1
src/operators/math/depthwise_conv3x3_int8.cpp  +21 −8
src/operators/math/depthwise_conv3x3_int8_arm64.cpp  +0 −56
src/operators/math/depthwise_conv5x5.h  +0 −1
src/operators/math/gemm/pack_kernel.h  +216 −322
src/operators/math/math.h  +0 −0
src/operators/math/softmax.cpp  +1 −1
src/framework/load_ops.h

@@ -125,10 +125,6 @@ LOAD_OP1(prior_box, CPU);
 LOAD_OP2(fusion_conv_add_relu, CPU, FPGA);
 LOAD_FUSION_MATCHER(fusion_conv_add_relu);
 #endif
-#ifdef FUSION_CONVADDADDPRELU_OP
-LOAD_OP2(fusion_conv_add_add_prelu, CPU, FPGA);
-LOAD_FUSION_MATCHER(fusion_conv_add_add_prelu);
-#endif
 #ifdef FUSION_CONVADD_OP
 LOAD_OP2(fusion_conv_add, CPU, MALI_GPU);
 LOAD_FUSION_MATCHER(fusion_conv_add);
@@ -178,10 +174,6 @@ LOAD_FUSION_MATCHER(fusion_conv_add_bn);
 #ifdef DROPOUT_OP
 LOAD_OP2(dropout, CPU, FPGA);
 #endif
-#ifdef FUSION_CONVADDPRELU_OP
-LOAD_OP2(fusion_conv_add_prelu, CPU, FPGA);
-LOAD_FUSION_MATCHER(fusion_conv_add_prelu);
-#endif
 #ifdef FUSION_DWCONVBNRELU_OP
 LOAD_OP1(fusion_dwconv_bn_relu, CPU);
 LOAD_FUSION_MATCHER(fusion_dwconv_bn_relu);
src/operators/conv_op.cpp

@@ -18,7 +18,7 @@ limitations under the License. */
 #include <vector>
 #include "framework/op_proto_maker.h"
 #include "framework/op_registry.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -39,9 +39,9 @@ void ConvOp<Dtype, T>::InferShape() const {
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-                                                dilations[i], paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i], strides[i]));
   }
   framework::DDim ddim = framework::make_ddim(output_shape);
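The only functional change here is that InferShape now calls the free function ConvOutputSize declared in conv_arm_func.h instead of math::ConvOutputSize. As a quick sanity check of the arithmetic it performs, the standalone snippet below reproduces the formula defined later in this commit in conv_arm_func.cpp; the concrete sizes are illustrative only, not taken from the diff.

    #include <cstdio>

    // Same body as ConvOutputSize in conv_arm_func.cpp below.
    int ConvOutputSize(int input_size, int filter_size, int dilation, int padding,
                       int stride) {
      const int dkernel = dilation * (filter_size - 1) + 1;
      return (input_size + 2 * padding - dkernel) / stride + 1;
    }

    int main() {
      // e.g. a 224x224 spatial input, 3x3 filter, dilation 1, padding 1, stride 2
      printf("%d\n", ConvOutputSize(224, 3, 1, 1, 2));  // prints 112
      return 0;
    }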
src/operators/depthwise_conv_op.cpp

@@ -19,7 +19,7 @@ limitations under the License. */
 #include "framework/op_proto_maker.h"
 #include "framework/op_registry.h"
 #include "operators/conv_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -40,9 +40,9 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-                                                dilations[i], paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i], strides[i]));
   }
   framework::DDim ddim = framework::make_ddim(output_shape);
src/operators/fusion_conv_add_add_prelu_op.cpp  (deleted; former contents as of e0f97f83)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef FUSION_CONVADDADDPRELU_OP

#include "operators/fusion_conv_add_add_prelu_op.h"
#include "operators/math/conv_func.h"

namespace paddle_mobile {
namespace operators {

template <typename Dtype, typename T>
void FusionConvAddAddPReluOp<Dtype, T>::InferShape() const {
  auto in_dims = this->param_.Input()->dims();
  auto filter_dims = this->param_.Filter()->dims();
  const std::vector<int> &strides = this->param_.Strides();
  std::vector<int> paddings = this->param_.Paddings();
  int groups = this->param_.Groups();
  std::vector<int> dilations = this->param_.Dilations();

  PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
                         dilations.size() == paddings.size() &&
                         paddings.size() == strides.size()),
                        "ConvParam is not suitable");

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
    output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
                                                dilations[i], paddings[i],
                                                strides[i]));
  }
  framework::DDim ddim = framework::make_ddim(output_shape);
  this->param_.Output()->Resize(ddim);
}

}  // namespace operators
}  // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add_add_prelu,
                        ops::FusionConvAddAddPReluOpMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
#endif

#endif  // FUSION_CONVADDADDPRELU_OP
src/operators/fusion_conv_add_add_prelu_op.h  (deleted; former contents as of e0f97f83)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
   Licensed under the Apache License, Version 2.0. */

#ifdef FUSION_CONVADDADDPRELU_OP

#pragma once

#include <string>
#include <utility>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/conv_add_add_prelu_kernel.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

class FusionConvAddAddPReluOpMatcher : public framework::FusionOpMatcher {
 public:
  FusionConvAddAddPReluOpMatcher() {
    node_ = framework::Node(G_OP_TYPE_CONV);
    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
        std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
        std::make_shared<framework::Node>(G_OP_TYPE_PRELU);
  }

  void FolderNodes(
      framework::Node *node,
      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
    node->Folder(node_.Depth(), Type(),
                 {{G_OP_TYPE_ELEMENTWISE_ADD,
                   {{"Y", "Y"}, {"Out", "addOut"}, {"X", "addX"}}},
                  {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}},
                 removed_nodes);
  }

  std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU; }

  std::vector<std::pair<int, std::string>> NeedCheck() {
    DLOG << " conv add add prelu check add X ";
    return {{2, "Y"}, {2, "X"}};
  }
};

template <typename DeviceType, typename T>
class FusionConvAddAddPReluOp
    : public framework::OperatorWithKernel<
          DeviceType, FusionConvAddAddPReluParam<DeviceType>,
          operators::ConvAddAddPReluKernel<DeviceType, T>> {
 public:
  FusionConvAddAddPReluOp(const string &type, const VariableNameMap &inputs,
                          const VariableNameMap &outputs,
                          const framework::AttributeMap &attrs,
                          framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionConvAddAddPReluParam<DeviceType>,
            operators::ConvAddAddPReluKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  void InferShape() const override;

 protected:
};

}  // namespace operators
}  // namespace paddle_mobile

#endif
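For orientation, the matcher deleted above encodes the op chain conv → elementwise_add → elementwise_add → prelu and remaps the extra inputs (Y, addX/addOut, Alpha) when that chain is folded into a single fused op. The toy snippet below sketches only the idea of matching an ordered pattern against a node chain; it is a simplified stand-in, not the framework::FusionOpMatcher API.

    #include <cstdio>
    #include <string>
    #include <vector>

    // Returns true if the first pattern.size() ops in the chain match the pattern.
    bool MatchesPattern(const std::vector<std::string> &chain,
                        const std::vector<std::string> &pattern) {
      if (chain.size() < pattern.size()) return false;
      for (size_t i = 0; i < pattern.size(); ++i)
        if (chain[i] != pattern[i]) return false;
      return true;
    }

    int main() {
      std::vector<std::string> pattern = {"conv2d", "elementwise_add",
                                          "elementwise_add", "prelu"};
      std::vector<std::string> chain = {"conv2d", "elementwise_add",
                                        "elementwise_add", "prelu", "pool2d"};
      printf("%s\n", MatchesPattern(chain, pattern) ? "fuse" : "skip");  // fuse
      return 0;
    }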
src/operators/fusion_conv_add_bn_op.cpp

@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBN_OP
 
 #include "operators/fusion_conv_add_bn_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvAddBNOp<Dtype, T>::InferShape() const {
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-                                                dilations[i], paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i], strides[i]));
   }
   framework::DDim ddim = framework::make_ddim(output_shape);
src/operators/fusion_conv_add_bn_relu_op.cpp

@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDBNRELU_OP
 
 #include "operators/fusion_conv_add_bn_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const {
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-                                                dilations[i], paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i], strides[i]));
   }
   framework::DDim ddim = framework::make_ddim(output_shape);
src/operators/fusion_conv_add_op.cpp

@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADD_OP
 
 #include "operators/fusion_conv_add_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvAddOp<Dtype, T>::InferShape() const {
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-                                                dilations[i], paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i], strides[i]));
   }
   framework::DDim ddim = framework::make_ddim(output_shape);
src/operators/fusion_conv_add_prelu_op.cpp  (deleted; former contents as of e0f97f83)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
   Licensed under the Apache License, Version 2.0. */

#ifdef FUSION_CONVADDPRELU_OP

#include "operators/fusion_conv_add_prelu_op.h"
#include "operators/math/conv_func.h"

namespace paddle_mobile {
namespace operators {

template <typename Dtype, typename T>
void FusionConvAddPReluOp<Dtype, T>::InferShape() const {
  auto in_dims = this->param_.Input()->dims();
  auto filter_dims = this->param_.Filter()->dims();
  const std::vector<int> &strides = this->param_.Strides();
  std::vector<int> paddings = this->param_.Paddings();
  int groups = this->param_.Groups();
  std::vector<int> dilations = this->param_.Dilations();

  PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
                         dilations.size() == paddings.size() &&
                         paddings.size() == strides.size()),
                        "ConvParam is not suitable");

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  for (size_t i = 0; i < strides.size(); ++i) {
    output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
                                                dilations[i], paddings[i],
                                                strides[i]));
  }
  framework::DDim ddim = framework::make_ddim(output_shape);
  this->param_.Output()->Resize(ddim);
}

}  // namespace operators
}  // namespace paddle_mobile

namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add_prelu, ops::FusionConvAddPReluOpMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
#endif

#endif
src/operators/fusion_conv_add_prelu_op.h  (deleted; former contents as of e0f97f83)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
   Licensed under the Apache License, Version 2.0. */

#ifdef FUSION_CONVADDPRELU_OP

#pragma once

#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/conv_add_prelu_kernel.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

class FusionConvAddPReluOpMatcher : public framework::FusionOpMatcher {
 public:
  FusionConvAddPReluOpMatcher() {
    node_ = framework::Node(G_OP_TYPE_CONV);
    node_ > std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD) >
        std::make_shared<framework::Node>(G_OP_TYPE_PRELU);
  }

  void FolderNodes(
      framework::Node *node,
      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
    node->Folder(node_.Depth(), Type(),
                 {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
                  {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}},
                 removed_nodes);
  }

  std::string Type() { return G_OP_TYPE_FUSION_CONV_ADD_PRELU; }
};

template <typename DeviceType, typename T>
class FusionConvAddPReluOp
    : public framework::OperatorWithKernel<
          DeviceType, FusionConvAddPReluParam<DeviceType>,
          operators::ConvAddPReluKernel<DeviceType, T>> {
 public:
  FusionConvAddPReluOp(const string &type, const VariableNameMap &inputs,
                       const VariableNameMap &outputs,
                       const framework::AttributeMap &attrs,
                       framework::Scope *scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionConvAddPReluParam<DeviceType>,
            operators::ConvAddPReluKernel<DeviceType, T>>(type, inputs, outputs,
                                                          attrs, scope) {}

  void InferShape() const override;

 protected:
};

}  // namespace operators
}  // namespace paddle_mobile

#endif
src/operators/fusion_conv_add_relu_op.cpp

@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVADDRELU_OP
 
 #include "operators/fusion_conv_add_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvAddReluOp<Dtype, T>::InferShape() const {
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-                                                dilations[i], paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i], strides[i]));
   }
   framework::DDim ddim = framework::make_ddim(output_shape);
   this->param_.Output()->Resize(ddim);
src/operators/fusion_conv_bn_add_relu_op.cpp

@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBNADDRELU_OP
 
 #include "operators/fusion_conv_bn_add_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvBNAddReluOp<Dtype, T>::InferShape() const {
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-                                                dilations[i], paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i], strides[i]));
   }
   framework::DDim ddim = framework::make_ddim(output_shape);
src/operators/fusion_conv_bn_op.cpp

@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBN_OP
 
 #include "operators/fusion_conv_bn_op.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -35,9 +36,9 @@ void FusionConvBNOp<Dtype, T>::InferShape() const {
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-                                                dilations[i], paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i], strides[i]));
   }
   framework::DDim ddim = framework::make_ddim(output_shape);
src/operators/fusion_conv_bn_relu_op.cpp

@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBNRELU_OP
 
 #include "operators/fusion_conv_bn_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionConvBNReluOp<Dtype, T>::InferShape() const {
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-                                                dilations[i], paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i], strides[i]));
   }
   framework::DDim ddim = framework::make_ddim(output_shape);
src/operators/fusion_dwconv_bn_relu_op.cpp

@@ -15,7 +15,7 @@ limitations under the License. */
 #ifdef FUSION_DWCONVBNRELU_OP
 
 #include "operators/fusion_dwconv_bn_relu_op.h"
-#include "operators/math/conv_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -36,9 +36,9 @@ void FusionDWConvBNReluOp<Dtype, T>::InferShape() const {
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-                                                dilations[i], paddings[i], strides[i]));
+    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                          dilations[i], paddings[i], strides[i]));
   }
   framework::DDim ddim = framework::make_ddim(output_shape);
src/operators/kernel/arm/convolution/conv_add_add_prelu_kernel.cpp  (deleted; former contents as of e0f97f83)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
   Licensed under the Apache License, Version 2.0. */

#ifdef FUSION_CONVADDADDPRELU_OP

#include "operators/kernel/conv_add_add_prelu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h"

namespace paddle_mobile {
namespace operators {

template <>
bool ConvAddAddPReluKernel<CPU, float>::Init(
    FusionConvAddAddPReluParam<CPU> *param) {
  return true;
}

template <>
void ConvAddAddPReluKernel<CPU, float>::Compute(
    const FusionConvAddAddPReluParam<CPU> &param) {
  ConvAddAddPReluCompute<float>(param);
}

template class ConvAddAddPReluKernel<CPU, float>;

}  // namespace operators
}  // namespace paddle_mobile

#endif
src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp

@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cmath>
 #include "operators/kernel/arm/convolution/conv_common.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include "operators/math/channel_wise.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -62,34 +63,24 @@ void ConvAddBNReluKernel<CPU, float>::Compute(
     const FusionConvAddBNReluParam<CPU> &param) {
   switch (param.ExecMode()) {
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
-      break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
+      DepthwiseConv3x3<float, float>(param);
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
       WinogradConv3x3<8, 3>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvBNReluBasic<FusionConvAddBNReluParam<CPU>>(param);
+      GemmConv<float, float>(param);
       break;
     default:
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
   }
+  math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
+                                  param.NewBias(), param.Output());
 }
 
 template class ConvAddBNReluKernel<CPU, float>;
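The same restructuring repeats in each fused kernel below: the switch now selects only the raw convolution routine (DepthwiseConv3x3, DepthwiseConv5x5, WinogradConv3x3 or GemmConv), and the scale/bias/ReLU epilogue runs exactly once after the switch instead of being duplicated in every case. A minimal self-contained sketch of that shape, with stand-in names rather than the Paddle-Lite API:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    enum class Mode { kDepthwise3x3S1, kDepthwise3x3S2, kGemm };

    // Stand-ins for DepthwiseConv3x3 / GemmConv; they just fill the output.
    void DepthwiseStub(std::vector<float> *out) { std::fill(out->begin(), out->end(), 1.0f); }
    void GemmStub(std::vector<float> *out) { std::fill(out->begin(), out->end(), 2.0f); }

    // Stand-in for math::ScaleAddChannelWise<RELU>.
    void ApplyScaleBiasRelu(std::vector<float> *out, float scale, float bias) {
      for (float &v : *out) v = std::max(0.0f, v * scale + bias);
    }

    void Compute(Mode mode, std::vector<float> *out, float scale, float bias) {
      switch (mode) {
        case Mode::kDepthwise3x3S1:
        case Mode::kDepthwise3x3S2:  // stride-1 and stride-2 share one helper
          DepthwiseStub(out);
          break;
        case Mode::kGemm:
          GemmStub(out);
          break;
      }
      ApplyScaleBiasRelu(out, scale, bias);  // epilogue hoisted out of every case
    }

    int main() {
      std::vector<float> out(4);
      Compute(Mode::kGemm, &out, 0.5f, -0.25f);
      for (float v : out) printf("%.2f ", v);  // 0.75 0.75 0.75 0.75
      printf("\n");
      return 0;
    }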
src/operators/kernel/arm/convolution/conv_add_kernel.cpp

@@ -16,8 +16,8 @@ limitations under the License. */
 #include "operators/kernel/conv_add_kernel.h"
 #include "operators/kernel/arm/convolution/conv_common.h"
-#include "operators/kernel/central-arm-func/conv_add_arm_func.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
 #include "operators/math/channel_wise.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -32,34 +32,25 @@ template <>
 void ConvAddKernel<CPU, float>::Compute(const FusionConvAddParam<CPU> &param) {
   switch (param.ExecMode()) {
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
       math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
                                              param.Paddings(), param.Output());
-      math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(),
-                                     param.Output());
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
       math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
                                              param.Paddings(), param.Output());
-      math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(),
-                                     param.Output());
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
-      math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(),
-                                     param.Output());
       break;
     case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
       WinogradConv3x3<8, 3>(param);
-      math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(),
-                                     param.Output());
       break;
     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvAddBasic(param);
+      GemmConv<float, float>(param);
       break;
     default:
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
   }
+  math::AddChannelWise<IDENTITY>(param.Output(), param.Bias(), param.Output());
 }
 
 template class ConvAddKernel<CPU, float>;
src/operators/kernel/arm/convolution/conv_add_prelu_kernel.cpp  (deleted; former contents as of e0f97f83)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
   Licensed under the Apache License, Version 2.0. */

#ifdef FUSION_CONVADDPRELU_OP

#include "operators/kernel/conv_add_prelu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_prelu_arm_func.h"

namespace paddle_mobile {
namespace operators {

template <>
bool ConvAddPReluKernel<CPU, float>::Init(FusionConvAddPReluParam<CPU> *param) {
  return true;
}

template <>
void ConvAddPReluKernel<CPU, float>::Compute(
    const FusionConvAddPReluParam<CPU> &param) {
  ConvAddPReluCompute<float>(param);
}

template class ConvAddPReluKernel<CPU, float>;

}  // namespace operators
}  // namespace paddle_mobile

#endif
src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp

@@ -17,6 +17,7 @@ limitations under the License. */
 #include "operators/kernel/conv_add_relu_kernel.h"
 #include "operators/kernel/arm/convolution/conv_common.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include "operators/math/channel_wise.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -32,30 +33,23 @@ void ConvAddReluKernel<CPU, float>::Compute(
     const FusionConvAddReluParam<CPU> &param) {
   switch (param.ExecMode()) {
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
-      break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
+      DepthwiseConv3x3<float, float>(param);
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
-      math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
       WinogradConv3x3<8, 3>(param);
-      math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvAddReluBasic<FusionConvAddReluParam<CPU>>(param);
+      GemmConv<float, float>(param);
       break;
     default:
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
   }
+  math::AddChannelWise<RELU>(param.Output(), param.Bias(), param.Output());
 }
 
 template class ConvAddReluKernel<CPU, float>;
src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp

@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cmath>
 #include "operators/kernel/arm/convolution/conv_common.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include "operators/math/channel_wise.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -62,34 +63,24 @@ void ConvBNAddReluKernel<CPU, float>::Compute(
     const FusionConvBNAddReluParam<CPU> &param) {
   switch (param.ExecMode()) {
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
-      break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
+      DepthwiseConv3x3<float, float>(param);
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
       WinogradConv3x3<8, 3>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvBNReluBasic<FusionConvBNAddReluParam<CPU>>(param);
+      GemmConv<float, float>(param);
       break;
     default:
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
   }
+  math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
+                                  param.NewBias(), param.Output());
 }
 
 template class ConvBNAddReluKernel<CPU, float>;
src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp

@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cmath>
 #include "operators/kernel/arm/convolution/conv_common.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include "operators/math/channel_wise.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -61,34 +62,24 @@ void ConvBNReluKernel<CPU, float>::Compute(
     const FusionConvBNReluParam<CPU> &param) {
   switch (param.ExecMode()) {
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
-      break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
+      DepthwiseConv3x3<float, float>(param);
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
       WinogradConv3x3<8, 3>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvBNReluBasic<FusionConvBNReluParam<CPU>>(param);
+      GemmConv<float, float>(param);
       break;
     default:
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
   }
+  math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
+                                  param.NewBias(), param.Output());
 }
 
 template class ConvBNReluKernel<CPU, float>;
src/operators/kernel/arm/convolution/conv_kernel.cpp

@@ -32,10 +32,10 @@ bool ConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
 template <>
 void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
   switch (param.ExecMode()) {
+#ifndef __aarch64__
     case ConvParam<CPU>::EXEC_GEMM_INT8:
       GemmConv<int8_t, int32_t>(param);
       break;
-#ifndef __aarch64__
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3_INT8:
       DepthwiseConv3x3<int8_t, int32_t>(param);
       break;
@@ -44,12 +44,8 @@ void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
       break;
 #endif  // __aarch64__
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
+      DepthwiseConv3x3<float, float>(param);
       break;
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp

@@ -18,6 +18,7 @@ limitations under the License. */
 #include <cmath>
 #include "operators/kernel/arm/convolution/conv_common.h"
 #include "operators/kernel/central-arm-func/conv_arm_func.h"
+#include "operators/math/channel_wise.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -61,37 +62,28 @@ void DWConvBNReluKernel<CPU, float>::Compute(
     const FusionDWConvBNReluParam<CPU> &param) {
   switch (param.ExecMode()) {
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S1_FLOAT:
-      math::DepthwiseConv3x3S1<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
-      break;
     case ConvParam<CPU>::EXEC_DEPTHWISE3x3S2_FLOAT:
-      math::DepthwiseConv3x3S2<float, float>(*param.Input(), *param.Filter(),
-                                             param.Paddings(), param.Output());
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
+      DepthwiseConv3x3<float, float>(param);
       break;
 #ifndef __aarch64__
     case ConvParam<CPU>::EXEC_DEPTHWISE5x5_FLOAT:
       DepthwiseConv5x5<float, float>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
       break;
     case ConvParam<CPU>::EXEC_WINOGRAD3X3_FLOAT:
       WinogradConv3x3<8, 3>(param);
-      math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
-                                      param.NewBias(), param.Output());
       break;
 #endif  // __aarch64__
     case ConvParam<CPU>::EXEC_GEMM_FLOAT:
-      ConvBNReluBasic<FusionDWConvBNReluParam<CPU>>(param);
+      GemmConv<float, float>(param);
       break;
     default:
       PADDLE_MOBILE_THROW_EXCEPTION("Invalid convolution execute mode %d",
                                     param.ExecMode());
   }
+  math::ScaleAddChannelWise<RELU>(param.Output(), param.NewScale(),
+                                  param.NewBias(), param.Output());
 }
 
 template class DWConvBNReluKernel<CPU, float>;
 
 }  // namespace operators
src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h  (deleted; former contents as of e0f97f83)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
   Licensed under the Apache License, Version 2.0. */

#ifdef FUSION_CONVADDADDPRELU_OP

#pragma once

#include <string>
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

template <typename P>
void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor bias = *param.Bias();
  Tensor bias1 = *param.Bias1();
  Tensor *output = param.Output();
  output->mutable_data<float>();

  float *biase_data = bias.data<float>();

  int axis = param.Axis();
  int groups = param.Groups();
  std::vector<int> strides = param.Strides();
  std::vector<int> paddings = param.Paddings();
  std::vector<int> dilations = param.Dilations();
  Tensor aa = *param.InputAlpha();
  float *p = aa.data<float>();
  std::string mode = param.Mode();
  const int batch_size = static_cast<int>(input->dims()[0]);

  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
  size_t data_dim = filter_shape_vec.size() - 2;
  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
  col_shape_vec[0] = input->dims()[1] / groups;
  for (size_t j = 0; j < data_dim; ++j) {
    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
  }
  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
  framework::DDim col_matrix_shape =
      framework::flatten_to_2d(col_shape, data_dim + 1);

  bool is_expand = math::IsExpand(filter_shape_vec, strides, paddings, dilations);
  Tensor col;
  Tensor col_matrix;
  if (is_expand) {
    col.mutable_data<float>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }

  framework::DDim input_shape = framework::slice_ddim(
      input->dims(), 1, static_cast<int>(input->dims().size()));

  framework::DDim filter_matrix_shape = {filter.dims()[0],
                                         filter.numel() / filter.dims()[0]};
  filter.Resize(filter_matrix_shape);
  framework::DDim output_matrix_shape = {
      output->dims()[1],
      output->numel() / (output->dims()[0] * output->dims()[1])};

  // convolution operator: im2col(or vol2col) + gemm
  int in_step = static_cast<int>(input->dims()[1]) / groups;
  int out_step = static_cast<int>(output->dims()[1]) / groups;

  math::Vol2ColFunctor<CPU, float> vol2col;
  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;

  for (int i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
    Tensor bias1_batch = bias1.Slice(i, i + 1).Resize(output_matrix_shape);

    for (int g = 0; g < groups; g++) {
      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);

      if (!is_expand) {
        col.ShareDataWith(in_slice);
        col_matrix.ShareDataWith(col);
        col_matrix.Resize(col_matrix_shape);
      } else if (data_dim == 2U) {
        // im2col
        im2col(in_slice, dilations, strides,
               std::vector<int>{paddings[0], paddings[1], paddings[0],
                                paddings[1]},
               &col);
      } else if (data_dim == 3U) {
        // vol2col
        vol2col(in_slice, dilations, strides, paddings, &col);
      }

      // gemm
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
      Tensor bias1_slice = bias1_batch.Slice(g * out_step, (g + 1) * out_step);
      float *biase_data1 = bias1_slice.data<float>();
      math::MatMulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
                            p, mode, biase_data, biase_data1);
    }
  }
}

}  // namespace operators
}  // namespace paddle_mobile

#endif  // FUSION_CONVADDADDPRELU_OP
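The deleted compute function above follows the classic im2col (or vol2col) + GEMM formulation that its own comment names. As a reference for that idea, here is a tiny self-contained example on plain arrays; the sizes and values are made up for illustration and do not come from the diff.

    #include <cstdio>
    #include <vector>

    int main() {
      // 1x1x4x4 input, one 3x3 all-ones filter, stride 1, no padding -> 2x2 output.
      const int H = 4, W = 4, K = 3, OH = 2, OW = 2;
      std::vector<float> input(H * W), filter(K * K, 1.0f);
      for (int i = 0; i < H * W; ++i) input[i] = static_cast<float>(i);

      // im2col: each output position becomes one column of K*K input values.
      std::vector<float> col(K * K * OH * OW);
      for (int oh = 0; oh < OH; ++oh)
        for (int ow = 0; ow < OW; ++ow)
          for (int kh = 0; kh < K; ++kh)
            for (int kw = 0; kw < K; ++kw)
              col[(kh * K + kw) * OH * OW + oh * OW + ow] =
                  input[(oh + kh) * W + (ow + kw)];

      // GEMM: (1 x K*K) filter row times (K*K x OH*OW) column matrix.
      std::vector<float> out(OH * OW, 0.0f);
      for (int j = 0; j < OH * OW; ++j)
        for (int k = 0; k < K * K; ++k) out[j] += filter[k] * col[k * OH * OW + j];

      for (int j = 0; j < OH * OW; ++j) printf("%.0f ", out[j]);  // 45 54 81 90
      printf("\n");
      return 0;
    }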
src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h  (deleted; former contents as of e0f97f83)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
   Licensed under the Apache License, Version 2.0. */

#ifdef FUSION_CONVADDPRELU_OP

#pragma once

#include <string>
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"

namespace paddle_mobile {
namespace operators {

template <typename P>
void ConvAddPReluCompute(const FusionConvAddPReluParam<CPU> &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor bias = *param.Bias();
  Tensor *output = param.Output();
  output->mutable_data<float>();

  float *biase_data = bias.data<float>();

  int axis = param.Axis();
  int groups = param.Groups();
  std::vector<int> strides = param.Strides();
  std::vector<int> paddings = param.Paddings();
  std::vector<int> dilations = param.Dilations();
  Tensor aa = *param.InputAlpha();
  float *p = aa.data<float>();
  std::string mode = param.Mode();
  const int batch_size = static_cast<int>(input->dims()[0]);

  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
  size_t data_dim = filter_shape_vec.size() - 2;
  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
  col_shape_vec[0] = input->dims()[1] / groups;
  for (size_t j = 0; j < data_dim; ++j) {
    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
  }
  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
  framework::DDim col_matrix_shape =
      framework::flatten_to_2d(col_shape, data_dim + 1);

  bool is_expand = math::IsExpand(filter_shape_vec, strides, paddings, dilations);
  Tensor col;
  Tensor col_matrix;
  if (is_expand) {
    col.mutable_data<float>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }

  framework::DDim input_shape = framework::slice_ddim(
      input->dims(), 1, static_cast<int>(input->dims().size()));

  framework::DDim filter_matrix_shape = {filter.dims()[0],
                                         filter.numel() / filter.dims()[0]};
  filter.Resize(filter_matrix_shape);
  framework::DDim output_matrix_shape = {
      output->dims()[1],
      output->numel() / (output->dims()[0] * output->dims()[1])};

  // convolution operator: im2col(or vol2col) + gemm
  int in_step = static_cast<int>(input->dims()[1]) / groups;
  int out_step = static_cast<int>(output->dims()[1]) / groups;

  math::Vol2ColFunctor<CPU, float> vol2col;
  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;

  for (int i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);

    for (int g = 0; g < groups; g++) {
      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);

      if (!is_expand) {
        col.ShareDataWith(in_slice);
        col_matrix.ShareDataWith(col);
        col_matrix.Resize(col_matrix_shape);
      } else if (data_dim == 2U) {
        // im2col
        im2col(in_slice, dilations, strides,
               std::vector<int>{paddings[0], paddings[1], paddings[0],
                                paddings[1]},
               &col);
      } else if (data_dim == 3U) {
        // vol2col
        vol2col(in_slice, dilations, strides, paddings, &col);
      }

      // gemm
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
      math::MatMulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
                            p, mode, biase_data, nullptr);
    }
  }
}

}  // namespace operators
}  // namespace paddle_mobile

#endif  // FUSION_CONVADDPRELU_OP
src/operators/kernel/central-arm-func/conv_a
dd_arm_func.h
→
src/operators/kernel/central-arm-func/conv_a
rm_func.cpp
浏览文件 @
1d475a2c
...
...
@@ -12,38 +12,54 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#pragma once
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/depthwise_conv3x3.h"
#include "operators/math/depthwise_conv5x5.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/pad.h"
#include "operators/math/vol2col.h"
#include "operators/math/winograd/winograd_transform.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
void
ConvAddBasic
(
const
FusionConvAddParam
<
CPU
>
&
param
)
{
int
ConvOutputSize
(
int
input_size
,
int
filter_size
,
int
dilation
,
int
padding
,
int
stride
)
{
const
int
dkernel
=
dilation
*
(
filter_size
-
1
)
+
1
;
int
output_size
=
(
input_size
+
2
*
padding
-
dkernel
)
/
stride
+
1
;
return
output_size
;
}
bool
IsExpand
(
const
std
::
vector
<
int64_t
>
&
filter_dim
,
const
std
::
vector
<
int
>
&
strides
,
const
std
::
vector
<
int
>
&
paddings
,
const
std
::
vector
<
int
>
&
dilations
)
{
bool
filter_1
=
true
,
strides_1
=
true
,
padding_0
=
true
,
dilation_1
=
true
;
for
(
size_t
j
=
0
;
j
<
strides
.
size
();
++
j
)
{
filter_1
=
filter_1
&&
(
static_cast
<
int
>
(
filter_dim
[
j
+
2
])
==
1
);
strides_1
=
strides_1
&&
(
strides
[
j
]
==
1
);
padding_0
=
padding_0
&&
(
paddings
[
j
]
==
0
);
dilation_1
=
dilation_1
&&
(
dilations
[
j
]
==
1
);
}
return
!
(
filter_1
&&
strides_1
&&
padding_0
&&
dilation_1
);
}
template
<
typename
Itype
,
typename
Otype
>
void
GemmConv
(
const
ConvParam
<
CPU
>
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
bias
=
*
param
.
Bias
();
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
float
>
();
float
*
biase_data
=
bias
.
data
<
float
>
();
output
->
mutable_data
<
Otype
>
();
int
axis
=
param
.
Axis
();
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
const
std
::
vector
<
int
>
strides
=
param
.
Strides
();
const
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
const
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
...
...
@@ -57,12 +73,11 @@ void ConvAddBasic(const FusionConvAddParam<CPU> ¶m) {
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
math
::
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
bool
is_expand
=
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col
.
mutable_data
<
Itype
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
...
...
@@ -81,9 +96,10 @@ void ConvAddBasic(const FusionConvAddParam<CPU> ¶m) {
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
math
::
Vol2ColFunctor
<
CPU
,
Itype
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
Itype
>
im2col
;
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
...
...
@@ -92,8 +108,8 @@ void ConvAddBasic(const FusionConvAddParam<CPU> ¶m) {
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
col_matrix
.
ShareDataWith
(
col
)
;
// col_matrix
.ShareDataWith(in_slice);
col_matrix
=
in_slice
;
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
...
...
@@ -105,17 +121,122 @@ void ConvAddBasic(const FusionConvAddParam<CPU> ¶m) {
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
MatMul
<
float
,
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
math
::
MatMul
<
Itype
,
Otype
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
1
),
false
,
biase_data
);
static_cast
<
float
>
(
0
),
false
,
static_cast
<
Otype
*>
(
nullptr
));
}
}
}
}
// namespace operators
}
// namespace paddle_mobile
template <int tile, int kernel>
void WinogradConv3x3(const ConvParam<CPU> &param) {
  const Tensor *input = param.Input();
  const Tensor *filter = param.transformed_filter_;
  Tensor *output = param.Output();
  output->mutable_data<float>();
  int batch_size = input->dims()[0];
  int groups = param.Groups();
  const std::vector<int> &paddings = param.Paddings();

  auto winograd_pad = [&](int width, int pad) {
    int output_tile = tile - kernel + 1;
    // int tiles = (width + pad - kernel) / output_tile + 1;
    // return (tiles - 1) * output_tile + tile - width;
    int pad_width = (width + 2 * pad - kernel) / output_tile * output_tile;
    return pad_width + tile - width;
  };

  math::PadFunctor<CPU, float> pad;
  Tensor input_pad;
  framework::Tensor transformed_input;
  for (int i = 0; i < batch_size; ++i) {
    Tensor in_batch = input->Slice(i, i + 1);
    Tensor out_batch = output->Slice(i, i + 1);
    // int pad_bottom = winograd_pad(in_batch.dims()[2], paddings[0]);
    // int pad_right = winograd_pad(in_batch.dims()[3], paddings[1]);
    int pad_bottom = paddings[0];
    int pad_right = paddings[1];
    if (paddings[0] || paddings[1] || pad_bottom || pad_right) {
      framework::DDim pad_shape = in_batch.dims();
      pad_shape[2] += paddings[0] + pad_bottom;
      pad_shape[3] += paddings[1] + pad_right;
      input_pad.mutable_data<float>(pad_shape);
      pad(in_batch, paddings[0], pad_bottom, paddings[1], pad_right, &input_pad);
    } else {
      input_pad = in_batch;
    }
    // tile input and transform
    math::winograd_transform_input<tile, kernel>(input_pad, &transformed_input);
    // calculate output
    math::winograd_transform_output<tile, kernel>(transformed_input, *filter, output);
  }
}
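For reference, the only instantiation of this template below is WinogradConv3x3<8, 3>, i.e. F(6x6, 3x3). A small worked example of what the (currently commented-out) winograd_pad lambda would compute under that instantiation; the input width of 224 is an illustrative assumption, not a value from this diff:

// Sketch only: tile = 8, kernel = 3  =>  output_tile = 8 - 3 + 1 = 6
// winograd_pad(width = 224, pad = 1):
//   pad_width = (224 + 2 * 1 - 3) / 6 * 6 = 222   (integer division)
//   return 222 + 8 - 224 = 6   // extra bottom/right padding up to a whole tile
// The committed code sidesteps this and simply reuses paddings[0] / paddings[1].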
template <typename Itype, typename Otype>
void DepthwiseConv3x3(const ConvParam<CPU> &param) {
  const Tensor *input = param.Input();
  const Tensor *filter = param.Filter();
  const std::vector<int> &paddings = param.Paddings();
  const std::vector<int> &strides = param.Strides();
  const int batch_size = input->dims()[0];
  Tensor *output = param.Output();
  output->mutable_data<Otype>();

  for (int i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1);
    Tensor out_batch = output->Slice(i, i + 1);
    if (strides[0] == 1) {
      math::DepthwiseConv3x3S1<Itype, Otype>(in_batch, *filter, paddings, &out_batch);
    } else if (strides[0] == 2) {
      math::DepthwiseConv3x3S2<Itype, Otype>(in_batch, *filter, paddings, &out_batch);
    } else {
      GemmConv<Itype, Otype>(param);
    }
  }
}
template <typename Itype, typename Otype>
void DepthwiseConv5x5(const ConvParam<CPU> &param) {
  const Tensor *input = param.Input();
  const Tensor *filter = param.Filter();
  const std::vector<int> &paddings = param.Paddings();
  const std::vector<int> &strides = param.Strides();
  const int batch_size = input->dims()[0];
  Tensor *output = param.Output();
  output->mutable_data<Otype>();

  // if (strides[0] == 1) {
  //   for (int i = 0; i < batch_size; i++) {
  //     Tensor in_batch = input->Slice(i, i + 1);
  //     Tensor out_batch = output->Slice(i, i + 1);
  //     math::DepthwiseConv5x5S1<Itype, Otype>(in_batch, *filter, paddings,
  //                                            &out_batch);
  //   }
  // } else {
  GemmConv<Itype, Otype>(param);
  // }
}
template void GemmConv<float, float>(const ConvParam<CPU> &param);
template void WinogradConv3x3<8, 3>(const ConvParam<CPU> &param);
template void DepthwiseConv3x3<float, float>(const ConvParam<CPU> &param);
template void DepthwiseConv5x5<float, float>(const ConvParam<CPU> &param);

#ifndef __aarch64__
template void GemmConv<int8_t, int32_t>(const ConvParam<CPU> &param);
template void DepthwiseConv3x3<int8_t, int32_t>(const ConvParam<CPU> &param);
template void DepthwiseConv5x5<int8_t, int32_t>(const ConvParam<CPU> &param);
#endif

}  // namespace operators
}  // namespace paddle_mobile
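The entry points defined above (GemmConv, WinogradConv3x3, DepthwiseConv3x3, DepthwiseConv5x5) are selected by the conv kernels touched elsewhere in this commit; the selection logic itself is not part of this hunk. A rough, hypothetical sketch of such a dispatcher for the float path, using only the functions declared here (the conditions are illustrative assumptions, not the committed logic):

// Hypothetical float-path dispatcher, for illustration only.
// Assumes param.transformed_filter_ has already been prepared for the Winograd path.
static void DispatchConvFloat(const ConvParam<CPU> &param) {
  const auto &fdims = param.Filter()->dims();  // [oc, ic / groups, kh, kw]
  const bool depthwise = (param.Groups() == param.Input()->dims()[1]);
  const bool k3x3 = (fdims[2] == 3 && fdims[3] == 3);
  if (depthwise && k3x3) {
    DepthwiseConv3x3<float, float>(param);
  } else if (depthwise && fdims[2] == 5 && fdims[3] == 5) {
    DepthwiseConv5x5<float, float>(param);
  } else if (k3x3 && param.Strides()[0] == 1 && param.Dilations()[0] == 1) {
    WinogradConv3x3<8, 3>(param);
  } else {
    GemmConv<float, float>(param);  // generic im2col/vol2col + GEMM fallback
  }
}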
src/operators/kernel/central-arm-func/conv_arm_func.h
...
...
@@ -15,386 +15,31 @@ limitations under the License. */
#ifdef CONV_OP
#pragma once
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/depthwise_conv3x3.h"
#include "operators/math/depthwise_conv5x5.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/pad.h"
#include "operators/math/vol2col.h"
#include "operators/math/winograd/winograd_transform.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename Itype, typename Otype>
inline void GemmConv(const ConvParam<CPU> &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor *output = param.Output();
  output->mutable_data<Otype>();
  int groups = param.Groups();
  const std::vector<int> strides = param.Strides();
  const std::vector<int> paddings = param.Paddings();
  const std::vector<int> dilations = param.Dilations();

  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
  size_t data_dim = filter_shape_vec.size() - 2;
  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
  col_shape_vec[0] = input->dims()[1] / groups;
  for (size_t j = 0; j < data_dim; ++j) {
    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
  }
  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
  framework::DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1);
  bool is_expand = math::IsExpand(filter_shape_vec, strides, paddings, dilations);

  Tensor col;
  Tensor col_matrix;
  if (is_expand) {
    col.mutable_data<Itype>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }
  framework::DDim input_shape =
      framework::slice_ddim(input->dims(), 1, static_cast<int>(input->dims().size()));
  framework::DDim filter_matrix_shape = {filter.dims()[0], filter.numel() / filter.dims()[0]};
  filter.Resize(filter_matrix_shape);
  framework::DDim output_matrix_shape = {
      output->dims()[1], output->numel() / (output->dims()[0] * output->dims()[1])};
  // convolution operator: im2col(or vol2col) + gemm
  int in_step = static_cast<int>(input->dims()[1]) / groups;
  int out_step = static_cast<int>(output->dims()[1]) / groups;
  math::Vol2ColFunctor<CPU, Itype> vol2col;
  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, Itype> im2col;

  const int batch_size = static_cast<int>(input->dims()[0]);
  for (int i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
    for (int g = 0; g < groups; g++) {
      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
      if (!is_expand) {
        // col_matrix.ShareDataWith(in_slice);
        col_matrix = in_slice;
        col_matrix.Resize(col_matrix_shape);
      } else if (data_dim == 2U) {
        // im2col
        im2col(in_slice, dilations, strides,
               std::vector<int>{paddings[0], paddings[1], paddings[0], paddings[1]}, &col);
      } else if (data_dim == 3U) {
        // vol2col
        vol2col(in_slice, dilations, strides, paddings, &col);
      }
int ConvOutputSize(int input_size, int filter_size, int dilation, int padding, int stride);
      // gemm
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
bool IsExpand(const std::vector<int64_t> &filter_dim, const std::vector<int> &strides,
              const std::vector<int> &paddings, const std::vector<int> &dilations);
      math::MatMul<Itype, Otype>(filter_slice, false, col_matrix, false,
                                 static_cast<float>(1), &out_slice,
                                 static_cast<float>(0), false,
                                 static_cast<Otype *>(nullptr));
    }
  }
}
template <typename Itype, typename Otype>
void GemmConv(const ConvParam<CPU> &param);
template <int tile, int kernel>
inline void WinogradConv3x3(const ConvParam<CPU> &param) {
  const Tensor *input = param.Input();
  const Tensor *filter = param.transformed_filter_;
  Tensor *output = param.Output();
  output->mutable_data<float>();
  int batch_size = input->dims()[0];
  int groups = param.Groups();
  const std::vector<int> &paddings = param.Paddings();

  auto winograd_pad = [&](int width, int pad) {
    int output_tile = tile - kernel + 1;
    // int tiles = (width + pad - kernel) / output_tile + 1;
    // return (tiles - 1) * output_tile + tile - width;
    int pad_width = (width + 2 * pad - kernel) / output_tile * output_tile;
    return pad_width + tile - width;
  };
void WinogradConv3x3(const ConvParam<CPU> &param);

  math::PadFunctor<CPU, float> pad;
  Tensor input_pad;
  framework::Tensor transformed_input;
  for (int i = 0; i < batch_size; ++i) {
    Tensor in_batch = input->Slice(i, i + 1);
    Tensor out_batch = output->Slice(i, i + 1);
    // int pad_bottom = winograd_pad(in_batch.dims()[2], paddings[0]);
    // int pad_right = winograd_pad(in_batch.dims()[3], paddings[1]);
    int pad_bottom = paddings[0];
    int pad_right = paddings[1];
    if (paddings[0] || paddings[1] || pad_bottom || pad_right) {
      framework::DDim pad_shape = in_batch.dims();
      pad_shape[2] += paddings[0] + pad_bottom;
      pad_shape[3] += paddings[1] + pad_right;
      input_pad.mutable_data<float>(pad_shape);
      pad(in_batch, paddings[0], pad_bottom, paddings[1], pad_right, &input_pad);
    } else {
      input_pad = in_batch;
    }
    // tile input and transform
    math::winograd_transform_input<tile, kernel>(input_pad, &transformed_input);
    // calculate output
    math::winograd_transform_output<tile, kernel>(transformed_input, *filter, output);
  }
}
#ifndef __aarch64__
// int8 DepthwiseConv3x3
template <typename Itype, typename Otype>
inline void DepthwiseConv3x3(const ConvParam<CPU> &param) {
  const Tensor *input = param.Input();
  const Tensor *filter = param.Filter();
  const std::vector<int> &paddings = param.Paddings();
  const std::vector<int> &strides = param.Strides();
  const int batch_size = input->dims()[0];
  Tensor *output = param.Output();
  output->mutable_data<Otype>();

  for (int i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1);
    Tensor out_batch = output->Slice(i, i + 1);
    if (strides[0] == 1) {
      math::DepthwiseConv3x3S1<Itype, Otype>(in_batch, *filter, paddings, &out_batch);
    } else if (strides[0] == 2) {
      math::DepthwiseConv3x3S2<Itype, Otype>(in_batch, *filter, paddings, &out_batch);
    } else {
      GemmConv<Itype, Otype>(param);
    }
  }
}
#endif // __aarch64__
void DepthwiseConv3x3(const ConvParam<CPU> &param);
template <typename Itype, typename Otype>
inline void DepthwiseConv5x5(const ConvParam<CPU> &param) {
  const Tensor *input = param.Input();
  const Tensor *filter = param.Filter();
  const std::vector<int> &paddings = param.Paddings();
  const std::vector<int> &strides = param.Strides();
  const int batch_size = input->dims()[0];
  Tensor *output = param.Output();
  output->mutable_data<Otype>();

  // if (strides[0] == 1) {
  //   for (int i = 0; i < batch_size; i++) {
  //     Tensor in_batch = input->Slice(i, i + 1);
  //     Tensor out_batch = output->Slice(i, i + 1);
  //     math::DepthwiseConv5x5S1<Itype, Otype>(in_batch, *filter, paddings,
  //                                            &out_batch);
  //   }
  // } else {
  GemmConv<Itype, Otype>(param);
  // }
}
template <typename ParamType>
void ConvAddReluBasic(const ParamType &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor bias = *param.Bias();
  Tensor *output = param.Output();
  output->mutable_data<float>();

  float alpha = 1.0f;
  float beta = 1.0f;
  int32_t groups = param.Groups();
  int32_t axis = param.Axis();
  std::vector<int32_t> strides = param.Strides();
  std::vector<int32_t> paddings = param.Paddings();
  std::vector<int32_t> dilations = param.Dilations();
  const int32_t batch_size = static_cast<int32_t>(input->dims()[0]);

  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
  size_t data_dim = filter_shape_vec.size() - 2;
  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
  col_shape_vec[0] = input->dims()[1] / groups;
  for (size_t j = 0; j < data_dim; ++j) {
    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
  }
  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
  framework::DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1);
  bool is_expand = math::IsExpand(filter_shape_vec, strides, paddings, dilations);

  Tensor col;
  Tensor col_matrix;
  if (is_expand) {
    col.mutable_data<float>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }
  framework::DDim input_shape = framework::slice_ddim(
      input->dims(), 1, static_cast<int32_t>(input->dims().size()));
  framework::DDim filter_matrix_shape = {filter.dims()[0], filter.numel() / filter.dims()[0]};
  filter.Resize(filter_matrix_shape);
  framework::DDim output_matrix_shape = {
      output->dims()[1], output->numel() / (output->dims()[0] * output->dims()[1])};
  // convolution operator: im2col(or vol2col) + gemm
  int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups;
  int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups;
  float *bias_data = bias.data<float>();
  math::Vol2ColFunctor<CPU, float> vol2col;
  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;

  for (int32_t i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
    for (int32_t g = 0; g < groups; g++) {
      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
      if (!is_expand) {
        col_matrix = in_slice;
        col_matrix.Resize(col_matrix_shape);
      } else if (data_dim == 2U) {
        // im2col
        im2col(in_slice, dilations, strides,
               std::vector<int32_t>{paddings[0], paddings[1], paddings[0], paddings[1]},
               &col);
      } else if (data_dim == 3U) {
        // vol2col
        vol2col(in_slice, dilations, strides, paddings, &col);
      }
      // gemm
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
      math::MatMul<float, float>(filter_slice, false, col_matrix, false, alpha,
                                 &out_slice, beta, true, bias_data);
    }
  }
}
template <typename ParamType>
void ConvBNReluBasic(const ParamType &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor new_bias = *param.NewBias();
  Tensor new_scale = *param.NewScale();
  Tensor *output = param.Output();
  output->mutable_data<float>();

  int groups = param.Groups();
  std::vector<int> strides = param.Strides();
  std::vector<int> paddings = param.Paddings();
  std::vector<int> dilations = param.Dilations();
  const int batch_size = static_cast<int>(input->dims()[0]);

  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
  size_t data_dim = filter_shape_vec.size() - 2;
  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
  col_shape_vec[0] = input->dims()[1] / groups;
  for (size_t j = 0; j < data_dim; ++j) {
    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
  }
  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
  framework::DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1);
  bool is_expand = math::IsExpand(filter_shape_vec, strides, paddings, dilations);

  Tensor col;
  Tensor col_matrix;
  if (is_expand) {
    col.mutable_data<float>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }
  framework::DDim input_shape = framework::slice_ddim(
      input->dims(), 1, static_cast<int>(input->dims().size()));
  framework::DDim filter_matrix_shape = {filter.dims()[0], filter.numel() / filter.dims()[0]};
  filter.Resize(filter_matrix_shape);
  framework::DDim output_matrix_shape = {
      output->dims()[1], output->numel() / (output->dims()[0] * output->dims()[1])};
  // convolution operator: im2col(or vol2col) + gemm
  int in_step = static_cast<int>(input->dims()[1]) / groups;
  int out_step = static_cast<int>(output->dims()[1]) / groups;
  math::Vol2ColFunctor<CPU, float> vol2col;
  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;

  for (int i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
    for (int g = 0; g < groups; g++) {
      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
      if (!is_expand) {
        col_matrix = in_slice;
        col_matrix.Resize(col_matrix_shape);
      } else if (data_dim == 2U) {
        // im2col
        im2col(in_slice, dilations, strides,
               std::vector<int>{paddings[0], paddings[1], paddings[0], paddings[1]}, &col);
      } else if (data_dim == 3U) {
        // vol2col
        vol2col(in_slice, dilations, strides, paddings, &col);
      }
      // gemm
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
      math::MatMulWithBn(filter_slice, false, col_matrix, false,
                         static_cast<float>(1), &out_slice, static_cast<float>(0),
                         true, &new_scale, &new_bias, g);
    }
  }
}
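ConvBNReluBasic differs from ConvAddReluBasic only in the GEMM epilogue: MatMulWithBn is assumed here to fold the batch-norm scale/shift (and the fused ReLU) into each output channel instead of adding a plain bias. A scalar sketch of that epilogue, stated as an assumption rather than the real API:

// Per group g, output channel c, output position j (sketch only):
//   acc       = dot(row c of filter_slice, column j of col_matrix);   // plain GEMM
//   out(c, j) = max(new_scale[g * out_step + c] * acc + new_bias[g * out_step + c], 0.f);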
void DepthwiseConv5x5(const ConvParam<CPU> &param);

}  // namespace operators
}  // namespace paddle_mobile
...
...
src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
deleted 100644 → 0
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNADDRELU_OP
#pragma once
#include <vector>
#include "operators/math/depthwise_conv3x3.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
void ConvBNAddReluBasic(const FusionConvBNAddReluParam<CPU> &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor new_bias = *param.NewBias();
  Tensor new_scale = *param.NewScale();
  Tensor *bias1 = param.Bias();
  Tensor *output = param.Output();
  output->mutable_data<float>();

  int groups = param.Groups();
  std::vector<int> strides = param.Strides();
  std::vector<int> paddings = param.Paddings();
  std::vector<int> dilations = param.Dilations();
  const int batch_size = static_cast<int>(input->dims()[0]);

  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
  size_t data_dim = filter_shape_vec.size() - 2;
  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
  col_shape_vec[0] = input->dims()[1] / groups;
  for (size_t j = 0; j < data_dim; ++j) {
    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
  }
  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
  framework::DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1);
  bool is_expand = math::IsExpand(filter_shape_vec, strides, paddings, dilations);

  Tensor col;
  Tensor col_matrix;
  if (is_expand) {
    col.mutable_data<float>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }
  framework::DDim input_shape = framework::slice_ddim(
      input->dims(), 1, static_cast<int>(input->dims().size()));
  framework::DDim filter_matrix_shape = {filter.dims()[0], filter.numel() / filter.dims()[0]};
  filter.Resize(filter_matrix_shape);
  framework::DDim output_matrix_shape = {
      output->dims()[1], output->numel() / (output->dims()[0] * output->dims()[1])};
  // convolution operator: im2col(or vol2col) + gemm
  int in_step = static_cast<int>(input->dims()[1]) / groups;
  int out_step = static_cast<int>(output->dims()[1]) / groups;
  math::Vol2ColFunctor<CPU, float> vol2col;
  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;

  for (int i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
    Tensor bias_batch = bias1->Slice(i, i + 1).Resize(output_matrix_shape);
    for (int g = 0; g < groups; g++) {
      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
      if (!is_expand) {
        col.ShareDataWith(in_slice);
        col_matrix.ShareDataWith(col);
        col_matrix.Resize(col_matrix_shape);
      } else if (data_dim == 2U) {
        // im2col
        im2col(in_slice, dilations, strides,
               std::vector<int>{paddings[0], paddings[1], paddings[0], paddings[1]}, &col);
      } else if (data_dim == 3U) {
        // vol2col
        vol2col(in_slice, dilations, strides, paddings, &col);
      }
      // gemm
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
      Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step);
      math::MatMulWithBn(filter_slice, false, col_matrix, false,
                         static_cast<float>(1), &out_slice, static_cast<float>(1),
                         true, &new_scale, &new_bias, g, bias_data.data<float>());
    }
  }
}
}  // namespace operators
}  // namespace paddle_mobile
#endif
src/operators/kernel/conv_add_add_prelu_kernel.h
deleted 100644 → 0
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_CONVADDADDPRELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {

using framework::DDim;
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class ConvAddAddPReluKernel
    : public OpKernelBase<DeviceType, FusionConvAddAddPReluParam<DeviceType>> {
 public:
  void Compute(const FusionConvAddAddPReluParam<DeviceType> &param);
  bool Init(FusionConvAddAddPReluParam<DeviceType> *param);
};
}  // namespace operators
}  // namespace paddle_mobile
#endif
src/operators/kernel/conv_add_bn_kernel.h
...
...
@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
...
...
src/operators/kernel/conv_add_bn_relu_kernel.h
...
...
@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
...
...
src/operators/kernel/conv_add_kernel.h
...
...
@@ -23,7 +23,6 @@ limitations under the License. */
#include "common/common.h"
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/depthwise_conv3x3.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
...
...
src/operators/kernel/conv_add_prelu_kernel.h
deleted 100644 → 0
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_CONVADDPRELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {

using framework::DDim;
using framework::OpKernelBase;
template <typename DeviceType, typename T>
class ConvAddPReluKernel
    : public OpKernelBase<DeviceType, FusionConvAddPReluParam<DeviceType>> {
 public:
  void Compute(const FusionConvAddPReluParam<DeviceType> &param);
  bool Init(FusionConvAddPReluParam<DeviceType> *param);
};
}  // namespace operators
}  // namespace paddle_mobile
#endif
src/operators/kernel/conv_add_relu_kernel.h
...
...
@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
...
...
src/operators/kernel/conv_bn_add_relu_kernel.h
...
...
@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
...
...
src/operators/kernel/conv_bn_kernel.h
...
...
@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
...
...
src/operators/kernel/conv_bn_relu_kernel.h
...
...
@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
...
...
src/operators/kernel/dwconv_bn_relu_kernel.h
...
...
@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
...
...
src/operators/kernel/lrn_kernel.h
...
...
@@ -15,24 +15,21 @@ limitations under the License. */
#pragma once
#ifdef LRN_OP
#include <cmath>
#ifdef _OPENMP
#include <omp.h>
#endif
#include "framework/operator.h"
#include "operators/op_param.h"
#include <cmath>
#ifdef __ARM_NEON
#include "arm_neon.h"
#include "operators/math/math_func_neon.h"
#include <arm_neon.h>
#include "operators/math/math.h"
#endif
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {

using namespace framework;
template <typename T>
struct LRNFunctor {
  void operator()(const framework::Tensor &input, framework::Tensor *out, int N,
...
...
src/operators/math/activation.h
...
...
@@ -21,7 +21,7 @@ limitations under the License. */
#include "common/types.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#include "operators/math/math_func_neon.h"
#include "operators/math/math.h"
#endif
namespace paddle_mobile {
...
...
src/operators/math/conv_func.h → src/operators/math/channel_wise.h
...
...
@@ -14,91 +14,16 @@ limitations under the License. */
#pragma once
#include <vector>
#include "framework/tensor.h"
#include "operators/math/activation.h"
#ifdef __ARM_NEON
#include <arm_neon.h>
#endif
#include "framework/ddim.h"
#include "framework/tensor.h"
#include "operators/math/activation.h"
namespace paddle_mobile {
namespace operators {
namespace math {

using framework::DDim;
using framework::Tensor;
inline int ConvOutputSize(int input_size, int filter_size, int dilation, int padding,
                          int stride) {
  const int dkernel = dilation * (filter_size - 1) + 1;
  int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
  return output_size;
}
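A quick numeric check of the formula above (the values are chosen only for illustration):

// ConvOutputSize(224, 3, /*dilation=*/1, /*padding=*/1, /*stride=*/2)
//   dkernel     = 1 * (3 - 1) + 1 = 3
//   output_size = (224 + 2 * 1 - 3) / 2 + 1 = 111 + 1 = 112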
inline void expand_bias(Tensor &bias, int axis, const DDim &dDim) {  // NOLINT
  const auto bias_ptr = bias.data<float>();
  const DDim bias_ddim = bias.dims();
  PADDLE_MOBILE_ENFORCE(bias.dims().size() == 1, "the bias tensor's dims size != 1")
  DDim outer_ddim = paddle_mobile::framework::slice_ddim(dDim, 0, axis + 1);
  DDim inner_ddim = paddle_mobile::framework::slice_ddim(dDim, axis + 1, dDim.size());
  int outer_size = paddle_mobile::framework::product(outer_ddim);
  int inner_size = paddle_mobile::framework::product(inner_ddim);
  bias.Resize(dDim);
  auto new_ptr = bias.mutable_data<float>();
  int axis_size = dDim[axis];

#ifdef __ARM_NEON
  for (int i = 0; i < outer_size; ++i) {
    int inner_num = inner_size >> 4;
    int remain = inner_size - (inner_num << 4);
    float v_bias = bias_ptr[i * axis_size / outer_size];
    for (; inner_num > 0; inner_num--) {
      float32x4_t v_newptr1 = vdupq_n_f32(v_bias);
      float32x4_t v_newptr2 = vdupq_n_f32(v_bias);
      float32x4_t v_newptr3 = vdupq_n_f32(v_bias);
      float32x4_t v_newptr4 = vdupq_n_f32(v_bias);
      vst1q_f32(new_ptr, v_newptr1);
      new_ptr += 4;
      vst1q_f32(new_ptr, v_newptr2);
      new_ptr += 4;
      vst1q_f32(new_ptr, v_newptr3);
      new_ptr += 4;
      vst1q_f32(new_ptr, v_newptr4);
      new_ptr += 4;
    }
    for (; remain > 0; remain--) {
      *new_ptr = v_bias;
      new_ptr++;
    }
  }
#else
  for (int i = 0; i < outer_size; ++i) {
    float v_bias = bias_ptr[i * axis_size / outer_size];
    for (int j = 0; j < inner_size; ++j) {
      new_ptr[i * inner_size + j] = v_bias;
    }
  }
#endif
}
inline bool IsExpand(const std::vector<int64_t> &filter_dim,
                     const std::vector<int> &strides,
                     const std::vector<int> &paddings,
                     const std::vector<int> &dilations) {
  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
  for (size_t j = 0; j < strides.size(); ++j) {
    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
    strides_1 = strides_1 && (strides[j] == 1);
    padding_0 = padding_0 && (paddings[j] == 0);
    dilation_1 = dilation_1 && (dilations[j] == 1);
  }
  return !(filter_1 && strides_1 && padding_0 && dilation_1);
}
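IsExpand is what lets GemmConv skip the im2col copy entirely: it returns false only when the filter is 1x1 with stride 1, zero padding, and dilation 1, so the input slice can be viewed directly as the column matrix.

// IsExpand({oc, ic, 1, 1}, {1, 1}, {0, 0}, {1, 1})  -> false  (no col buffer allocated)
// IsExpand({oc, ic, 3, 3}, {1, 1}, {1, 1}, {1, 1})  -> true   (im2col/vol2col path)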
template <ActivationType Act>
void AddChannelWise(const framework::Tensor *input, const framework::Tensor *bias,
                    framework::Tensor *output) {
...
...
src/operators/math/depthwise_conv3x3.h
...
...
@@ -17,7 +17,6 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "framework/tensor.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
...
...
src/operators/math/depthwise_conv3x3_int8.cpp
...
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(__ARM_NEON__) && !defined(__aarch64__)
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#include "operators/math/depthwise_conv3x3.h"
...
...
@@ -70,7 +70,6 @@ inline void DepthwiseConv3x3NormalRow(const int8_t *input, const int8_t *filter,
DEPTHWISE_CONV_NORMAL_BORDER(0, valid_w_start)
// middle
int remain_start = valid_w_start;
#ifdef __ARM_NEON__
int output_tiles = (valid_w_end - valid_w_start) / 6;
remain_start = valid_w_start + output_tiles * 6;
int32x4_t _sum0, _sum1;
...
...
@@ -94,7 +93,6 @@ inline void DepthwiseConv3x3NormalRow(const int8_t *input, const int8_t *filter,
vst1q_s32(output_ptr + output_offset, _sum0);
vst1_s32(output_ptr + output_offset + 4, vget_low_s32(_sum1));
}
#endif  // __ARM_NEON__
for (int w = remain_start; w < valid_w_end; ++w) {
  int32_t value = 0;
  int input_start = -padding_w + w * Stride_w;
...
...
@@ -215,6 +213,8 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
output_ptr2 += valid_w_start;
output_ptr3 += valid_w_start;
}
#if __aarch64__
#else
// valid
int loop = output_w_tiles;
asm volatile(
...
...
@@ -525,6 +525,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
    : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
    : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
      "q12", "q13", "q14", "q15", "r0");
#endif // __aarch64__
// pad right
if (padding_w) {
  int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2)));
...
...
@@ -618,7 +619,9 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
output_ptr0 += valid_w_start;
output_ptr1 += valid_w_start;
}
// valid
// valid
#if __aarch64__
#else
int loop = output_w_tiles;
asm volatile(
    "cmp %[loop], #0 \n"
...
...
@@ -804,6 +807,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
    : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
    : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
      "q12", "q13", "q14", "q15", "r0");
#endif // __aarch64__
// pad right
if (padding_w) {
  int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2)));
...
...
@@ -869,7 +873,9 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
}
output_ptr0 += valid_w_start;
}
// valid
// valid
#if __aarch64__
#else
int loop = output_w_tiles;
asm volatile(
    "cmp %[loop], #0 \n"
...
...
@@ -993,6 +999,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
    : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
    : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
      "q12", "q13", "q14", "q15", "r0");
#endif // __aarch64__
// pad right
if (padding_w) {
  int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0 - 2)));
...
...
@@ -1152,7 +1159,9 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
output_ptr1 += valid_w_start;
output_ptr2 += valid_w_start;
}
// valid
// valid
#if __aarch64__
#else
int loop = output_w_tiles;
asm volatile(
    "cmp %[loop], #0 \n"
...
...
@@ -1411,6 +1420,7 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
    : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
    : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
      "q12", "q13", "q14", "q15", "r0");
#endif // __aarch64__
// pad right
if (padding_w > 0) {
  int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0)));
...
...
@@ -1490,7 +1500,9 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
input_ptr2 += valid_input_w_start;
output_ptr0 += valid_w_start;
}
// valid
// valid
#if __aarch64__
#else
int loop = output_w_tiles;
asm volatile(
    "cmp %[loop], #0 \n"
...
...
@@ -1608,6 +1620,7 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
    : [remain] "r"(output_w_remain), [ker0] "w"(_ker0), [ker1] "w"(_ker1)
    : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
      "q12", "q13", "q14", "q15", "r0");
#endif // __aarch64__
// pad right
if (padding_w > 0) {
  int16x4_t row0 = vget_low_s16(vmovl_s8(vld1_s8(input_ptr0)));
...
...
@@ -1645,4 +1658,4 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
}  // namespace operators
}  // namespace paddle_mobile
#endif
#endif  // __ARM_NEON__
src/operators/math/depthwise_conv3x3_int8_arm64.cpp
deleted 100644 → 0
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(__ARM_NEON__) && defined(__aarch64__)
#include "operators/math/depthwise_conv3x3.h"
#ifdef __ARM_NEON__
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
namespace math {
// template<>
// void DepthwiseConv3x3<int8_t, int32_t>(
// const framework::Tensor *input, const framework::Tensor *filter,
// const std::vector<int> &strides, framework::Tensor *output) {
// PADDLE_MOBILE_THROW_EXCEPTION(
// "Depthwise conv with generic strides has not been implemented.");
// }
template <>
void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
                                         const framework::Tensor &filter,
                                         const std::vector<int> &paddings,
                                         framework::Tensor *output) {
  PADDLE_MOBILE_THROW_EXCEPTION(
      "Depthwise conv3x3 with stride 1 for arm v8 has not been implemented.");
}
template <>
void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
                                         const framework::Tensor &filter,
                                         const std::vector<int> &paddings,
                                         framework::Tensor *output) {
  PADDLE_MOBILE_THROW_EXCEPTION(
      "Depthwise conv3x3 with stride 2 for arm v8 has not been implemented.");
}
}  // namespace math
}  // namespace operators
}  // namespace paddle_mobile
#endif
src/operators/math/depthwise_conv5x5.h
...
...
@@ -17,7 +17,6 @@ limitations under the License. */
#include <algorithm>
#include <vector>
#include "framework/tensor.h"
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
...
...
src/operators/math/gemm/pack_kernel.h
...
...
@@ -31,345 +31,239 @@ inline float32x4_t vandq_f32_u32(float32x4_t x, uint32x4_t mask) {
void pack_lhs_6r(const int m, const int k, const float *A, const int lda,
                 float *output, const bool unroll) {
  float *zero = new float[k];
  memset(zero, 0, k * sizeof(float));
  uint32_t mask[8] = {0, 1, 2, 3, 4, 5, 4, 5};
  int remain_k = k & 0x3;
  uint32x4_t vzero = vdupq_n_u32(0);
  uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_k));

  const int m_tail = m % 6;
  const int i_length = m - m_tail;
  for (int i = 0; i < i_length; i += 6) {
#pragma omp parallel for if (unroll)
  for (int i = 0; i < m - 5; i += 6) {
    const float *a0 = A + i * lda;
    const float *a1 = A + (i + 1) * lda;
    const float *a2 = A + (i + 2) * lda;
    const float *a3 = A + (i + 3) * lda;
    const float *a4 = A + (i + 4) * lda;
    const float *a5 = A + (i + 5) * lda;
    float *local_buffer = output + i * k;
    for (int j = 0; j < k; ++j) {
      *local_buffer++ = *a0++;
      *local_buffer++ = *a1++;
      *local_buffer++ = *a2++;
      *local_buffer++ = *a3++;
      *local_buffer++ = *a4++;
      *local_buffer++ = *a5++;
    float *out_ptr = output + i * k;

    int loops = k >> 2;
    if (loops > 0) {
#if __aarch64__
      for (int l = 0; l < loops; ++l) {
        float32x4_t _d0 = vld1q_f32(a0);
        float32x4_t _d1 = vld1q_f32(a1);
        float32x4_t _d2 = vld1q_f32(a2);
        float32x4_t _d3 = vld1q_f32(a3);
        float32x4_t _d4 = vld1q_f32(a4);
        float32x4_t _d5 = vld1q_f32(a5);

        float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
        float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
        float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
        _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0]));
        _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1]));
        _d2 = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0]));
        _d3 = vcombine_f32(vget_high_f32(_q0.val[1]), vget_high_f32(_q1.val[1]));

        vst1q_f32(out_ptr, _d0);
        vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0]));
        vst1q_f32(out_ptr + 6, _d1);
        vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1]));
        vst1q_f32(out_ptr + 12, _d2);
        vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0]));
        vst1q_f32(out_ptr + 18, _d3);
        vst1_f32(out_ptr + 22, vget_high_f32(_q3.val[1]));

        a0 += 4;
        a1 += 4;
        a2 += 4;
        a3 += 4;
        a4 += 4;
        a5 += 4;
        out_ptr += 24;
      }
#else
      asm volatile(
          "loop_4k_%=: \n"
          "vld1.32 {d0-d1}, [%[a0]]! \n"
          "vld1.32 {d2-d3}, [%[a1]]! \n"
          "vld1.32 {d4-d5}, [%[a2]]! \n"
          "vld1.32 {d6-d7}, [%[a3]]! \n"
          "vld1.32 {d8-d9}, [%[a4]]! \n"
          "vld1.32 {d10-d11}, [%[a5]]! \n"
          "vtrn.32 q0, q1 \n"
          "vtrn.32 q2, q3 \n"
          "vtrn.32 q4, q5 \n"
          "vswp.32 d1, d4 \n"
          "vswp.32 d3, d6 \n"
          "vst1.32 {q0}, [%[out]]! \n"
          "vst1.32 {d8}, [%[out]]! \n"
          "vst1.32 {q1}, [%[out]]! \n"
          "vst1.32 {d10}, [%[out]]! \n"
          "vst1.32 {q2}, [%[out]]! \n"
          "vst1.32 {d9}, [%[out]]! \n"
          "vst1.32 {q3}, [%[out]]! \n"
          "vst1.32 {d11}, [%[out]]! \n"
          "subs %[loops], #1 \n"
          "bne loop_4k_%= \n"
          : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] "+r"(a2),
            [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops)
          :
          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5");
#endif
    }

    if (remain_k > 0) {
      float32x4_t _d0 = vld1q_f32(a0);
      float32x4_t _d1 = vld1q_f32(a1);
      float32x4_t _d2 = vld1q_f32(a2);
      float32x4_t _d3 = vld1q_f32(a3);
      float32x4_t _d4 = vld1q_f32(a4);
      float32x4_t _d5 = vld1q_f32(a5);

      _d0 = vandq_f32_u32(_d0, vmask1);
      _d1 = vandq_f32_u32(_d1, vmask1);
      _d2 = vandq_f32_u32(_d2, vmask1);
      _d3 = vandq_f32_u32(_d3, vmask1);
      _d4 = vandq_f32_u32(_d4, vmask1);
      _d5 = vandq_f32_u32(_d5, vmask1);

      float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
      float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
      float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
      _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0]));
      _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1]));
      _d2 = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0]));

      switch (remain_k) {
        case 3:
          vst1q_f32(out_ptr + 12, _d2);
          vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0]));
        case 2:
          vst1q_f32(out_ptr + 6, _d1);
          vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1]));
        case 1:
          vst1q_f32(out_ptr, _d0);
          vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0]));
        default:
          break;
      }
    }
  }

  if (m_tail != 0) {
    const float *a0 = A + i_length * lda;
  int remain_m = m % 6;
  if (remain_m) {
    int remain_m_start = m - remain_m;
    const float *a0 = A + remain_m_start * lda;
    const float *a1 = a0 + lda;
    const float *a2 = a0 + 2 * lda;
    const float *a3 = a0 + 3 * lda;
    const float *a4 = a0 + 4 * lda;
    const float *a5 = a0 + 5 * lda;
    float *local_buffer = output + i_length * k;
    switch (m_tail) {
      case 1:
        a1 = zero;
      case 2:
        a2 = zero;
      case 3:
        a3 = zero;
      case 4:
        a4 = zero;
      case 5:
        a5 = zero;
        break;
      default:
        break;
    float *out_ptr = output + remain_m_start * k;

    uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_m));
    uint32x4_t vmask3 = vcltq_u32(vld1q_u32(mask + 4), vdupq_n_u32(remain_m));
    const float zerobuff[4] = {0.f, 0.f, 0.f, 0.f};

    int lk = 0;
    for (; lk < k - 3; lk += 4) {
      switch (remain_m) {
        case 1:
          a1 = zerobuff;
        case 2:
          a2 = zerobuff;
        case 3:
          a3 = zerobuff;
        case 4:
          a4 = zerobuff;
        case 5:
          a5 = zerobuff;
        default:
          break;
      }
#if __aarch64__
      float32x4_t _d0 = vld1q_f32(a0);
      float32x4_t _d1 = vld1q_f32(a1);
      float32x4_t _d2 = vld1q_f32(a2);
      float32x4_t _d3 = vld1q_f32(a3);
      float32x4_t _d4 = vld1q_f32(a4);
      float32x4_t _d5 = vld1q_f32(a5);

      float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
      float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
      float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
      _d0 = vcombine_f32(vget_low_f32(_q0.val[0]), vget_low_f32(_q1.val[0]));
      _d1 = vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1]));
      _d2 = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0]));
      _d3 = vcombine_f32(vget_high_f32(_q0.val[1]), vget_high_f32(_q1.val[1]));

      _d0 = vandq_f32_u32(_d0, vmask2);
      _d1 = vandq_f32_u32(_d1, vmask2);
      _d2 = vandq_f32_u32(_d2, vmask2);
      _d3 = vandq_f32_u32(_d3, vmask2);
      _d4 = vandq_f32_u32(_q3.val[0], vmask3);
      _d5 = vandq_f32_u32(_q3.val[1], vmask3);

      vst1q_f32(out_ptr, _d0);
      vst1_f32(out_ptr + 4, vget_low_f32(_d4));
      vst1q_f32(out_ptr + 6, _d1);
      vst1_f32(out_ptr + 10, vget_low_f32(_d5));
      vst1q_f32(out_ptr + 12, _d2);
      vst1_f32(out_ptr + 16, vget_high_f32(_d4));
      vst1q_f32(out_ptr + 18, _d3);
      vst1_f32(out_ptr + 22, vget_high_f32(_d5));
      out_ptr += 24;
#else
      asm volatile(
          "vld1.32 {d0-d1}, [%[a0]] \n"
          "vld1.32 {d2-d3}, [%[a1]] \n"
          "vld1.32 {d4-d5}, [%[a2]] \n"
          "vld1.32 {d6-d7}, [%[a3]] \n"
          "vld1.32 {d8-d9}, [%[a4]] \n"
          "vld1.32 {d10-d11}, [%[a5]] \n"
          "vtrn.32 q0, q1 \n"
          "vtrn.32 q2, q3 \n"
          "vtrn.32 q4, q5 \n"
          "vswp.32 d1, d4 \n"
          "vswp.32 d3, d6 \n"
          "vbif q0, %q[vzero], %q[vmask2] \n"
          "vbif q1, %q[vzero], %q[vmask2] \n"
          "vbif q2, %q[vzero], %q[vmask2] \n"
          "vbif q3, %q[vzero], %q[vmask2] \n"
          "vbif q4, %q[vzero], %q[vmask3] \n"
          "vbif q5, %q[vzero], %q[vmask3] \n"
          "vst1.32 {q0}, [%[out]]! \n"
          "vst1.32 {d8}, [%[out]]! \n"
          "vst1.32 {q1}, [%[out]]! \n"
          "vst1.32 {d10}, [%[out]]! \n"
          "vst1.32 {q2}, [%[out]]! \n"
          "vst1.32 {d9}, [%[out]]! \n"
          "vst1.32 {q3}, [%[out]]! \n"
          "vst1.32 {d11}, [%[out]]! \n"
          : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2] "+r"(a2),
            [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5)
          : [vmask2] "w"(vmask2), [vmask3] "w"(vmask3), [vzero] "w"(vzero)
          : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5");
#endif
    }
    for (int j = 0; j < k; ++j) {
      *local_buffer++ = *a0++;
      *local_buffer++ = *a1++;
      *local_buffer++ = *a2++;
      *local_buffer++ = *a3++;
      *local_buffer++ = *a4++;
      *local_buffer++ = *a5++;
    // remain k
    for (; lk < k; ++lk) {
      *out_ptr++ = *a0++;
      *out_ptr++ = *a1++;
      *out_ptr++ = *a2++;
      *out_ptr++ = *a3++;
      *out_ptr++ = *a4++;
      *out_ptr++ = *a5++;
    }
    delete[] zero;
  }
// uint32_t mask[8] = {0, 1, 2, 3, 4, 5, 4, 5};
// int remain_k = k & 0x3;
// uint32x4_t vzero = vdupq_n_u32(0);
// uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_k));
//
// std::cout << "m: " << m << ", k: " << k << std::endl;
// #pragma omp parallel for if (unroll)
// for (int i = 0; i < m - 5; i += 6) {
// std::cout << "i: " << i << std::endl;
// const float *a0 = A + i * lda;
// const float *a1 = A + (i + 1) * lda;
// const float *a2 = A + (i + 2) * lda;
// const float *a3 = A + (i + 3) * lda;
// const float *a4 = A + (i + 4) * lda;
// const float *a5 = A + (i + 5) * lda;
// float *out_ptr = output + i * k;
//
// int loops = k >> 2;
// if (loops > 0) {
// #if __aarch64__
// for (int l = 0; l < loops; ++l) {
// float32x4_t _d0 = vld1q_f32(a0);
// float32x4_t _d1 = vld1q_f32(a1);
// float32x4_t _d2 = vld1q_f32(a2);
// float32x4_t _d3 = vld1q_f32(a3);
// float32x4_t _d4 = vld1q_f32(a4);
// float32x4_t _d5 = vld1q_f32(a5);
//
// float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
// float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
// float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
// _d0 = vcombine_f32(vget_low_f32(_q0.val[0]),
// vget_low_f32(_q1.val[0])); _d1 =
// vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1]));
// _d2 =
// vcombine_f32(vget_high_f32(_q0.val[0]),
// vget_high_f32(_q1.val[0]));
// _d3 =
// vcombine_f32(vget_high_f32(_q0.val[1]),
// vget_high_f32(_q1.val[1]));
//
// vst1q_f32(out_ptr, _d0);
// vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0]));
// vst1q_f32(out_ptr + 6, _d1);
// vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1]));
// vst1q_f32(out_ptr + 12, _d2);
// vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0]));
// vst1q_f32(out_ptr + 18, _d3);
// vst1_f32(out_ptr + 22, vget_high_f32(_q3.val[1]));
//
// a0 += 4;
// a1 += 4;
// a2 += 4;
// a3 += 4;
// a4 += 4;
// a5 += 4;
// out_ptr += 24;
// }
// #else
// asm volatile(
// "loop_4k_%=: \n"
// "vld1.32 {d0-d1}, [%[a0]]! \n"
// "vld1.32 {d2-d3}, [%[a1]]! \n"
// "vld1.32 {d4-d5}, [%[a2]]! \n"
// "vld1.32 {d6-d7}, [%[a3]]! \n"
// "vld1.32 {d8-d9}, [%[a4]]! \n"
// "vld1.32 {d10-d11}, [%[a5]]! \n"
// "vtrn.32 q0, q1 \n"
// "vtrn.32 q2, q3 \n"
// "vtrn.32 q4, q5 \n"
// "vswp.32 d1, d4 \n"
// "vswp.32 d3, d6 \n"
//
// "vst1.32 {q0}, [%[out]]! \n"
// "vst1.32 {d8}, [%[out]]! \n"
// "vst1.32 {q1}, [%[out]]! \n"
// "vst1.32 {d10}, [%[out]]! \n"
// "vst1.32 {q2}, [%[out]]! \n"
// "vst1.32 {d9}, [%[out]]! \n"
// "vst1.32 {q3}, [%[out]]! \n"
// "vst1.32 {d11}, [%[out]]! \n"
//
// "subs %[loops], #1 \n"
// "bne loop_4k_%= \n"
// : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2]
// "+r"(a2),
// [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops)
// :
// : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5");
// #endif
// }
//
// if (remain_k > 0) {
// float32x4_t _d0 = vld1q_f32(a0);
// float32x4_t _d1 = vld1q_f32(a1);
// float32x4_t _d2 = vld1q_f32(a2);
// float32x4_t _d3 = vld1q_f32(a3);
// float32x4_t _d4 = vld1q_f32(a4);
// float32x4_t _d5 = vld1q_f32(a5);
//
// _d0 = vandq_f32_u32(_d0, vmask1);
// _d1 = vandq_f32_u32(_d1, vmask1);
// _d2 = vandq_f32_u32(_d2, vmask1);
// _d3 = vandq_f32_u32(_d3, vmask1);
// _d4 = vandq_f32_u32(_d4, vmask1);
// _d5 = vandq_f32_u32(_d5, vmask1);
//
// float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
// float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
// float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
// _d0 = vcombine_f32(vget_low_f32(_q0.val[0]),
// vget_low_f32(_q1.val[0])); _d1 =
// vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); _d2
// = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0]));
//
// switch (remain_k) {
// case 3:
// vst1q_f32(out_ptr + 12, _d2);
// vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0]));
// case 2:
// vst1q_f32(out_ptr + 6, _d1);
// vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1]));
// case 1:
// vst1q_f32(out_ptr, _d0);
// vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0]));
// default:
// break;
// }
// }
// }
//
// int remain_m = m % 6;
// if (remain_m) {
// int remain_m_start = m - remain_m;
// std::cout << "remain_m_start: " << remain_m_start << std::endl;
// const float *a0 = A + remain_m_start * lda;
// const float *a1 = a0 + lda;
// const float *a2 = a0 + 2 * lda;
// const float *a3 = a0 + 3 * lda;
// const float *a4 = a0 + 4 * lda;
// const float *a5 = a0 + 5 * lda;
// float *out_ptr = output + remain_m_start * k;
//
// uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_m));
// uint32x4_t vmask3 = vcltq_u32(vld1q_u32(mask + 4),
// vdupq_n_u32(remain_m));
//
// int loops = k >> 2;
// if (loops > 0) {
// #if __aarch64__
// for (int l = 0; l < loops; ++l) {
// float32x4_t _d0 = vld1q_f32(a0);
// float32x4_t _d1 = vld1q_f32(a1);
// float32x4_t _d2 = vld1q_f32(a2);
// float32x4_t _d3 = vld1q_f32(a3);
// float32x4_t _d4 = vld1q_f32(a4);
// float32x4_t _d5 = vld1q_f32(a5);
//
// float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
// float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
// float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
// _d0 = vcombine_f32(vget_low_f32(_q0.val[0]),
// vget_low_f32(_q1.val[0])); _d1 =
// vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1]));
// _d2 =
// vcombine_f32(vget_high_f32(_q0.val[0]),
// vget_high_f32(_q1.val[0]));
// _d3 =
// vcombine_f32(vget_high_f32(_q0.val[1]),
// vget_high_f32(_q1.val[1]));
//
// _d0 = vandq_f32_u32(_d0, vmask2);
// _d1 = vandq_f32_u32(_d1, vmask2);
// _d2 = vandq_f32_u32(_d2, vmask2);
// _d3 = vandq_f32_u32(_d3, vmask2);
// _d4 = vandq_f32_u32(_q3.val[0], vmask3);
// _d5 = vandq_f32_u32(_q3.val[1], vmask3);
//
// vst1q_f32(out_ptr, _d0);
// vst1_f32(out_ptr + 4, vget_low_f32(_d4));
// vst1q_f32(out_ptr + 6, _d1);
// vst1_f32(out_ptr + 10, vget_low_f32(_d5));
// vst1q_f32(out_ptr + 12, _d2);
// vst1_f32(out_ptr + 16, vget_high_f32(_d4));
// vst1q_f32(out_ptr + 18, _d3);
// vst1_f32(out_ptr + 22, vget_high_f32(_d5));
//
// a0 += 4;
// a1 += 4;
// a2 += 4;
// a3 += 4;
// a4 += 4;
// a5 += 4;
// out_ptr += 24;
// }
// #else
// asm volatile(
// "loop_4k_%=: \n"
// "vld1.32 {d0-d1}, [%[a0]]! \n"
// "vld1.32 {d2-d3}, [%[a1]]! \n"
// "vld1.32 {d4-d5}, [%[a2]]! \n"
// "vld1.32 {d6-d7}, [%[a3]]! \n"
// "vld1.32 {d8-d9}, [%[a4]]! \n"
// "vld1.32 {d10-d11}, [%[a5]]! \n"
// "vtrn.32 q0, q1 \n"
// "vtrn.32 q2, q3 \n"
// "vtrn.32 q4, q5 \n"
// "vswp.32 d1, d4 \n"
// "vswp.32 d3, d6 \n"
//
// "vbif q0, %q[vzero], %q[vmask2] \n"
// "vbif q1, %q[vzero], %q[vmask2] \n"
// "vbif q2, %q[vzero], %q[vmask2] \n"
// "vbif q3, %q[vzero], %q[vmask2] \n"
// "vbif q4, %q[vzero], %q[vmask3] \n"
// "vbif q5, %q[vzero], %q[vmask3] \n"
//
// "vst1.32 {q0}, [%[out]]! \n"
// "vst1.32 {d8}, [%[out]]! \n"
// "vst1.32 {q1}, [%[out]]! \n"
// "vst1.32 {d10}, [%[out]]! \n"
// "vst1.32 {q2}, [%[out]]! \n"
// "vst1.32 {d9}, [%[out]]! \n"
// "vst1.32 {q3}, [%[out]]! \n"
// "vst1.32 {d11}, [%[out]]! \n"
//
// "subs %[loops], #1 \n"
// "bne loop_4k_%= \n"
// : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2]
// "+r"(a2),
// [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops)
// : [vmask2] "w"(vmask2), [vmask3] "w"(vmask3), [vzero] "w"(vzero)
// : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5");
// #endif
// }
//
// if (remain_k > 0) {
// float32x4_t _d0 = vld1q_f32(a0);
// float32x4_t _d1 = vld1q_f32(a1);
// float32x4_t _d2 = vld1q_f32(a2);
// float32x4_t _d3 = vld1q_f32(a3);
// float32x4_t _d4 = vld1q_f32(a4);
// float32x4_t _d5 = vld1q_f32(a5);
//
// _d0 = vandq_f32_u32(_d0, vmask1);
// _d1 = vandq_f32_u32(_d1, vmask1);
// _d2 = vandq_f32_u32(_d2, vmask1);
// _d3 = vandq_f32_u32(_d3, vmask1);
// _d4 = vandq_f32_u32(_d4, vmask1);
// _d5 = vandq_f32_u32(_d5, vmask1);
//
// float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
// float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
// float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
// _d0 = vcombine_f32(vget_low_f32(_q0.val[0]),
// vget_low_f32(_q1.val[0])); _d1 =
// vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); _d2
// = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0]));
// // _d3 = vcombine_f32(vget_high_f32(_q0.val[1]),
// // vget_high_f32(_q1.val[1]));
//
// _d0 = vandq_f32_u32(_d0, vmask2);
// _d1 = vandq_f32_u32(_d1, vmask2);
// _d2 = vandq_f32_u32(_d2, vmask2);
// // _d3 = vandq_f32_u32(_d3, vmask2);
// _d4 = vandq_f32_u32(_q3.val[0], vmask3);
// _d5 = vandq_f32_u32(_q3.val[1], vmask3);
//
// switch (remain_k) {
// case 3:
// vst1q_f32(out_ptr + 12, _d2);
// vst1_f32(out_ptr + 16, vget_high_f32(_d4));
// case 2:
// vst1q_f32(out_ptr + 6, _d1);
// vst1_f32(out_ptr + 10, vget_low_f32(_d5));
// case 1:
// vst1q_f32(out_ptr, _d0);
// vst1_f32(out_ptr + 4, vget_low_f32(_d4));
// default:
// break;
// }
// }
// }
}
#if __aarch64__
...
...
src/operators/math/math_func_neon.h → src/operators/math/math.h
File moved
src/operators/math/softmax.cpp
...
...
@@ -19,7 +19,7 @@ limitations under the License. */
#include <algorithm>
#include <limits>
#include "common/types.h"
#include "operators/math/math_func_neon.h"
#include "operators/math/math.h"
namespace paddle_mobile {
namespace operators {
...
...