Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
dd575b09
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
338
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
dd575b09
编写于
3月 10, 2019
作者:
H
hjchen2
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
update
上级
42e520bb
变更
49
显示空白变更内容
内联
并排
Showing
49 changed file
with
570 addition
and
1794 deletion
+570
-1794
src/framework/load_ops.h
src/framework/load_ops.h
+0
-8
src/operators/conv_op.cpp
src/operators/conv_op.cpp
+4
-4
src/operators/depthwise_conv_op.cpp
src/operators/depthwise_conv_op.cpp
+4
-4
src/operators/fusion_conv_add_add_prelu_op.cpp
src/operators/fusion_conv_add_add_prelu_op.cpp
+0
-61
src/operators/fusion_conv_add_add_prelu_op.h
src/operators/fusion_conv_add_add_prelu_op.h
+0
-79
src/operators/fusion_conv_add_bn_op.cpp
src/operators/fusion_conv_add_bn_op.cpp
+4
-4
src/operators/fusion_conv_add_bn_relu_op.cpp
src/operators/fusion_conv_add_bn_relu_op.cpp
+4
-4
src/operators/fusion_conv_add_op.cpp
src/operators/fusion_conv_add_op.cpp
+4
-4
src/operators/fusion_conv_add_prelu_op.cpp
src/operators/fusion_conv_add_prelu_op.cpp
+0
-61
src/operators/fusion_conv_add_prelu_op.h
src/operators/fusion_conv_add_prelu_op.h
+0
-71
src/operators/fusion_conv_add_relu_op.cpp
src/operators/fusion_conv_add_relu_op.cpp
+4
-4
src/operators/fusion_conv_bn_add_relu_op.cpp
src/operators/fusion_conv_bn_add_relu_op.cpp
+4
-4
src/operators/fusion_conv_bn_op.cpp
src/operators/fusion_conv_bn_op.cpp
+4
-3
src/operators/fusion_conv_bn_relu_op.cpp
src/operators/fusion_conv_bn_relu_op.cpp
+4
-4
src/operators/fusion_dwconv_bn_relu_op.cpp
src/operators/fusion_dwconv_bn_relu_op.cpp
+4
-4
src/operators/kernel/arm/convolution/conv_add_add_prelu_kernel.cpp
...tors/kernel/arm/convolution/conv_add_add_prelu_kernel.cpp
+0
-39
src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp
...rators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp
+5
-14
src/operators/kernel/arm/convolution/conv_add_kernel.cpp
src/operators/kernel/arm/convolution/conv_add_kernel.cpp
+3
-12
src/operators/kernel/arm/convolution/conv_add_prelu_kernel.cpp
...perators/kernel/arm/convolution/conv_add_prelu_kernel.cpp
+0
-38
src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp
...operators/kernel/arm/convolution/conv_add_relu_kernel.cpp
+4
-10
src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp
...rators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp
+5
-14
src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp
src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp
+5
-14
src/operators/kernel/arm/convolution/conv_kernel.cpp
src/operators/kernel/arm/convolution/conv_kernel.cpp
+2
-6
src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp
...perators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp
+6
-14
src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
...ors/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
+0
-128
src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h
...erators/kernel/central-arm-func/conv_add_prelu_arm_func.h
+0
-124
src/operators/kernel/central-arm-func/conv_arm_func.cpp
src/operators/kernel/central-arm-func/conv_arm_func.cpp
+242
-0
src/operators/kernel/central-arm-func/conv_arm_func.h
src/operators/kernel/central-arm-func/conv_arm_func.h
+11
-366
src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
...ators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
+0
-122
src/operators/kernel/conv_add_add_prelu_kernel.h
src/operators/kernel/conv_add_add_prelu_kernel.h
+0
-45
src/operators/kernel/conv_add_bn_kernel.h
src/operators/kernel/conv_add_bn_kernel.h
+0
-1
src/operators/kernel/conv_add_bn_relu_kernel.h
src/operators/kernel/conv_add_bn_relu_kernel.h
+0
-1
src/operators/kernel/conv_add_kernel.h
src/operators/kernel/conv_add_kernel.h
+0
-1
src/operators/kernel/conv_add_prelu_kernel.h
src/operators/kernel/conv_add_prelu_kernel.h
+0
-45
src/operators/kernel/conv_add_relu_kernel.h
src/operators/kernel/conv_add_relu_kernel.h
+0
-1
src/operators/kernel/conv_bn_add_relu_kernel.h
src/operators/kernel/conv_bn_add_relu_kernel.h
+0
-1
src/operators/kernel/conv_bn_kernel.h
src/operators/kernel/conv_bn_kernel.h
+0
-1
src/operators/kernel/conv_bn_relu_kernel.h
src/operators/kernel/conv_bn_relu_kernel.h
+0
-1
src/operators/kernel/dwconv_bn_relu_kernel.h
src/operators/kernel/dwconv_bn_relu_kernel.h
+0
-1
src/operators/kernel/lrn_kernel.h
src/operators/kernel/lrn_kernel.h
+6
-9
src/operators/math/activation.h
src/operators/math/activation.h
+1
-1
src/operators/math/channel_wise.h
src/operators/math/channel_wise.h
+2
-77
src/operators/math/depthwise_conv3x3.h
src/operators/math/depthwise_conv3x3.h
+0
-1
src/operators/math/depthwise_conv3x3_int8.cpp
src/operators/math/depthwise_conv3x3_int8.cpp
+21
-8
src/operators/math/depthwise_conv3x3_int8_arm64.cpp
src/operators/math/depthwise_conv3x3_int8_arm64.cpp
+0
-56
src/operators/math/depthwise_conv5x5.h
src/operators/math/depthwise_conv5x5.h
+0
-1
src/operators/math/gemm/pack_kernel.h
src/operators/math/gemm/pack_kernel.h
+216
-322
src/operators/math/math.h
src/operators/math/math.h
+0
-0
src/operators/math/softmax.cpp
src/operators/math/softmax.cpp
+1
-1
未找到文件。
src/framework/load_ops.h
浏览文件 @
dd575b09
...
@@ -125,10 +125,6 @@ LOAD_OP1(prior_box, CPU);
...
@@ -125,10 +125,6 @@ LOAD_OP1(prior_box, CPU);
LOAD_OP2
(
fusion_conv_add_relu
,
CPU
,
FPGA
);
LOAD_OP2
(
fusion_conv_add_relu
,
CPU
,
FPGA
);
LOAD_FUSION_MATCHER
(
fusion_conv_add_relu
);
LOAD_FUSION_MATCHER
(
fusion_conv_add_relu
);
#endif
#endif
#ifdef FUSION_CONVADDADDPRELU_OP
LOAD_OP2
(
fusion_conv_add_add_prelu
,
CPU
,
FPGA
);
LOAD_FUSION_MATCHER
(
fusion_conv_add_add_prelu
);
#endif
#ifdef FUSION_CONVADD_OP
#ifdef FUSION_CONVADD_OP
LOAD_OP2
(
fusion_conv_add
,
CPU
,
MALI_GPU
);
LOAD_OP2
(
fusion_conv_add
,
CPU
,
MALI_GPU
);
LOAD_FUSION_MATCHER
(
fusion_conv_add
);
LOAD_FUSION_MATCHER
(
fusion_conv_add
);
...
@@ -178,10 +174,6 @@ LOAD_FUSION_MATCHER(fusion_conv_add_bn);
...
@@ -178,10 +174,6 @@ LOAD_FUSION_MATCHER(fusion_conv_add_bn);
#ifdef DROPOUT_OP
#ifdef DROPOUT_OP
LOAD_OP2
(
dropout
,
CPU
,
FPGA
);
LOAD_OP2
(
dropout
,
CPU
,
FPGA
);
#endif
#endif
#ifdef FUSION_CONVADDPRELU_OP
LOAD_OP2
(
fusion_conv_add_prelu
,
CPU
,
FPGA
);
LOAD_FUSION_MATCHER
(
fusion_conv_add_prelu
);
#endif
#ifdef FUSION_DWCONVBNRELU_OP
#ifdef FUSION_DWCONVBNRELU_OP
LOAD_OP1
(
fusion_dwconv_bn_relu
,
CPU
);
LOAD_OP1
(
fusion_dwconv_bn_relu
,
CPU
);
LOAD_FUSION_MATCHER
(
fusion_dwconv_bn_relu
);
LOAD_FUSION_MATCHER
(
fusion_dwconv_bn_relu
);
...
...
src/operators/conv_op.cpp
浏览文件 @
dd575b09
...
@@ -18,7 +18,7 @@ limitations under the License. */
...
@@ -18,7 +18,7 @@ limitations under the License. */
#include <vector>
#include <vector>
#include "framework/op_proto_maker.h"
#include "framework/op_proto_maker.h"
#include "framework/op_registry.h"
#include "framework/op_registry.h"
#include "operators/
math/conv
_func.h"
#include "operators/
kernel/central-arm-func/conv_arm
_func.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -39,9 +39,9 @@ void ConvOp<Dtype, T>::InferShape() const {
...
@@ -39,9 +39,9 @@ void ConvOp<Dtype, T>::InferShape() const {
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
output_shape
.
push_back
(
output_shape
.
push_back
(
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
math
::
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
dilation
s
[
i
],
dilations
[
i
],
padding
s
[
i
],
paddings
[
i
],
strides
[
i
]));
strides
[
i
]));
}
}
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
...
...
src/operators/depthwise_conv_op.cpp
浏览文件 @
dd575b09
...
@@ -19,7 +19,7 @@ limitations under the License. */
...
@@ -19,7 +19,7 @@ limitations under the License. */
#include "framework/op_proto_maker.h"
#include "framework/op_proto_maker.h"
#include "framework/op_registry.h"
#include "framework/op_registry.h"
#include "operators/conv_op.h"
#include "operators/conv_op.h"
#include "operators/
math/conv
_func.h"
#include "operators/
kernel/central-arm-func/conv_arm
_func.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -40,9 +40,9 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {
...
@@ -40,9 +40,9 @@ void DepthwiseConvOp<Dtype, T>::InferShape() const {
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
output_shape
.
push_back
(
output_shape
.
push_back
(
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
math
::
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
dilation
s
[
i
],
dilations
[
i
],
padding
s
[
i
],
paddings
[
i
],
strides
[
i
]));
strides
[
i
]));
}
}
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
...
...
src/operators/fusion_conv_add_add_prelu_op.cpp
已删除
100644 → 0
浏览文件 @
42e520bb
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#include "operators/fusion_conv_add_add_prelu_op.h"
#include "operators/math/conv_func.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
Dtype
,
typename
T
>
void
FusionConvAddAddPReluOp
<
Dtype
,
T
>::
InferShape
()
const
{
auto
in_dims
=
this
->
param_
.
Input
()
->
dims
();
auto
filter_dims
=
this
->
param_
.
Filter
()
->
dims
();
const
std
::
vector
<
int
>
&
strides
=
this
->
param_
.
Strides
();
std
::
vector
<
int
>
paddings
=
this
->
param_
.
Paddings
();
int
groups
=
this
->
param_
.
Groups
();
std
::
vector
<
int
>
dilations
=
this
->
param_
.
Dilations
();
PADDLE_MOBILE_ENFORCE
((
in_dims
.
size
()
==
filter_dims
.
size
()
&&
dilations
.
size
()
==
paddings
.
size
()
&&
paddings
.
size
()
==
strides
.
size
()),
"ConvParam is not suitable"
);
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
output_shape
.
push_back
(
math
::
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
dilations
[
i
],
paddings
[
i
],
strides
[
i
]));
}
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
this
->
param_
.
Output
()
->
Resize
(
ddim
);
}
}
// namespace operators
}
// namespace paddle_mobile
namespace
ops
=
paddle_mobile
::
operators
;
REGISTER_FUSION_MATCHER
(
fusion_conv_add_add_prelu
,
ops
::
FusionConvAddAddPReluOpMatcher
);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU
(
fusion_conv_add_add_prelu
,
ops
::
FusionConvAddAddPReluOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA
(
fusion_conv_add_add_prelu
,
ops
::
FusionConvAddAddPReluOp
);
#endif
#endif // FUSION_CONVADDADDPRELU_OP
src/operators/fusion_conv_add_add_prelu_op.h
已删除
100644 → 0
浏览文件 @
42e520bb
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#pragma once
#include <string>
#include <utility>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/conv_add_add_prelu_kernel.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
class
FusionConvAddAddPReluOpMatcher
:
public
framework
::
FusionOpMatcher
{
public:
FusionConvAddAddPReluOpMatcher
()
{
node_
=
framework
::
Node
(
G_OP_TYPE_CONV
);
node_
>
std
::
make_shared
<
framework
::
Node
>
(
G_OP_TYPE_ELEMENTWISE_ADD
)
>
std
::
make_shared
<
framework
::
Node
>
(
G_OP_TYPE_ELEMENTWISE_ADD
)
>
std
::
make_shared
<
framework
::
Node
>
(
G_OP_TYPE_PRELU
);
}
void
FolderNodes
(
framework
::
Node
*
node
,
std
::
vector
<
std
::
shared_ptr
<
framework
::
Node
>>
*
removed_nodes
)
{
node
->
Folder
(
node_
.
Depth
(),
Type
(),
{{
G_OP_TYPE_ELEMENTWISE_ADD
,
{{
"Y"
,
"Y"
},
{
"Out"
,
"addOut"
},
{
"X"
,
"addX"
}}},
{
G_OP_TYPE_PRELU
,
{{
"Alpha"
,
"Alpha"
}}}},
removed_nodes
);
}
std
::
string
Type
()
{
return
G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU
;
}
std
::
vector
<
std
::
pair
<
int
,
std
::
string
>>
NeedCheck
()
{
DLOG
<<
" conv add add prelu check add X "
;
return
{{
2
,
"Y"
},
{
2
,
"X"
}};
}
};
template
<
typename
DeviceType
,
typename
T
>
class
FusionConvAddAddPReluOp
:
public
framework
::
OperatorWithKernel
<
DeviceType
,
FusionConvAddAddPReluParam
<
DeviceType
>
,
operators
::
ConvAddAddPReluKernel
<
DeviceType
,
T
>>
{
public:
FusionConvAddAddPReluOp
(
const
string
&
type
,
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
,
framework
::
Scope
*
scope
)
:
framework
::
OperatorWithKernel
<
DeviceType
,
FusionConvAddAddPReluParam
<
DeviceType
>
,
operators
::
ConvAddAddPReluKernel
<
DeviceType
,
T
>>
(
type
,
inputs
,
outputs
,
attrs
,
scope
)
{}
void
InferShape
()
const
override
;
protected:
};
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/fusion_conv_add_bn_op.cpp
浏览文件 @
dd575b09
...
@@ -15,7 +15,7 @@ limitations under the License. */
...
@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDBN_OP
#ifdef FUSION_CONVADDBN_OP
#include "operators/fusion_conv_add_bn_op.h"
#include "operators/fusion_conv_add_bn_op.h"
#include "operators/
math/conv
_func.h"
#include "operators/
kernel/central-arm-func/conv_arm
_func.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -36,9 +36,9 @@ void FusionConvAddBNOp<Dtype, T>::InferShape() const {
...
@@ -36,9 +36,9 @@ void FusionConvAddBNOp<Dtype, T>::InferShape() const {
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
output_shape
.
push_back
(
output_shape
.
push_back
(
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
math
::
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
dilation
s
[
i
],
dilations
[
i
],
padding
s
[
i
],
paddings
[
i
],
strides
[
i
]));
strides
[
i
]));
}
}
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
...
...
src/operators/fusion_conv_add_bn_relu_op.cpp
浏览文件 @
dd575b09
...
@@ -15,7 +15,7 @@ limitations under the License. */
...
@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDBNRELU_OP
#ifdef FUSION_CONVADDBNRELU_OP
#include "operators/fusion_conv_add_bn_relu_op.h"
#include "operators/fusion_conv_add_bn_relu_op.h"
#include "operators/
math/conv
_func.h"
#include "operators/
kernel/central-arm-func/conv_arm
_func.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -36,9 +36,9 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const {
...
@@ -36,9 +36,9 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const {
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
output_shape
.
push_back
(
output_shape
.
push_back
(
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
math
::
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
dilation
s
[
i
],
dilations
[
i
],
padding
s
[
i
],
paddings
[
i
],
strides
[
i
]));
strides
[
i
]));
}
}
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
...
...
src/operators/fusion_conv_add_op.cpp
浏览文件 @
dd575b09
...
@@ -15,7 +15,7 @@ limitations under the License. */
...
@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADD_OP
#ifdef FUSION_CONVADD_OP
#include "operators/fusion_conv_add_op.h"
#include "operators/fusion_conv_add_op.h"
#include "operators/
math/conv
_func.h"
#include "operators/
kernel/central-arm-func/conv_arm
_func.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -36,9 +36,9 @@ void FusionConvAddOp<Dtype, T>::InferShape() const {
...
@@ -36,9 +36,9 @@ void FusionConvAddOp<Dtype, T>::InferShape() const {
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
output_shape
.
push_back
(
output_shape
.
push_back
(
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
math
::
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
dilation
s
[
i
],
dilations
[
i
],
padding
s
[
i
],
paddings
[
i
],
strides
[
i
]));
strides
[
i
]));
}
}
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
...
...
src/operators/fusion_conv_add_prelu_op.cpp
已删除
100644 → 0
浏览文件 @
42e520bb
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#include "operators/fusion_conv_add_prelu_op.h"
#include "operators/math/conv_func.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
Dtype
,
typename
T
>
void
FusionConvAddPReluOp
<
Dtype
,
T
>::
InferShape
()
const
{
auto
in_dims
=
this
->
param_
.
Input
()
->
dims
();
auto
filter_dims
=
this
->
param_
.
Filter
()
->
dims
();
const
std
::
vector
<
int
>
&
strides
=
this
->
param_
.
Strides
();
std
::
vector
<
int
>
paddings
=
this
->
param_
.
Paddings
();
int
groups
=
this
->
param_
.
Groups
();
std
::
vector
<
int
>
dilations
=
this
->
param_
.
Dilations
();
PADDLE_MOBILE_ENFORCE
((
in_dims
.
size
()
==
filter_dims
.
size
()
&&
dilations
.
size
()
==
paddings
.
size
()
&&
paddings
.
size
()
==
strides
.
size
()),
"ConvParam is not suitable"
);
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
output_shape
.
push_back
(
math
::
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
dilations
[
i
],
paddings
[
i
],
strides
[
i
]));
}
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
this
->
param_
.
Output
()
->
Resize
(
ddim
);
}
}
// namespace operators
}
// namespace paddle_mobile
namespace
ops
=
paddle_mobile
::
operators
;
REGISTER_FUSION_MATCHER
(
fusion_conv_add_prelu
,
ops
::
FusionConvAddPReluOpMatcher
);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU
(
fusion_conv_add_prelu
,
ops
::
FusionConvAddPReluOp
);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA
(
fusion_conv_add_prelu
,
ops
::
FusionConvAddPReluOp
);
#endif
#endif
src/operators/fusion_conv_add_prelu_op.h
已删除
100644 → 0
浏览文件 @
42e520bb
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/conv_add_prelu_kernel.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
class
FusionConvAddPReluOpMatcher
:
public
framework
::
FusionOpMatcher
{
public:
FusionConvAddPReluOpMatcher
()
{
node_
=
framework
::
Node
(
G_OP_TYPE_CONV
);
node_
>
std
::
make_shared
<
framework
::
Node
>
(
G_OP_TYPE_ELEMENTWISE_ADD
)
>
std
::
make_shared
<
framework
::
Node
>
(
G_OP_TYPE_PRELU
);
}
void
FolderNodes
(
framework
::
Node
*
node
,
std
::
vector
<
std
::
shared_ptr
<
framework
::
Node
>>
*
removed_nodes
)
{
node
->
Folder
(
node_
.
Depth
(),
Type
(),
{{
G_OP_TYPE_ELEMENTWISE_ADD
,
{{
"Y"
,
"Y"
}}},
{
G_OP_TYPE_PRELU
,
{{
"Alpha"
,
"Alpha"
}}}},
removed_nodes
);
}
std
::
string
Type
()
{
return
G_OP_TYPE_FUSION_CONV_ADD_PRELU
;
}
};
template
<
typename
DeviceType
,
typename
T
>
class
FusionConvAddPReluOp
:
public
framework
::
OperatorWithKernel
<
DeviceType
,
FusionConvAddPReluParam
<
DeviceType
>
,
operators
::
ConvAddPReluKernel
<
DeviceType
,
T
>>
{
public:
FusionConvAddPReluOp
(
const
string
&
type
,
const
VariableNameMap
&
inputs
,
const
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
,
framework
::
Scope
*
scope
)
:
framework
::
OperatorWithKernel
<
DeviceType
,
FusionConvAddPReluParam
<
DeviceType
>
,
operators
::
ConvAddPReluKernel
<
DeviceType
,
T
>>
(
type
,
inputs
,
outputs
,
attrs
,
scope
)
{}
void
InferShape
()
const
override
;
protected:
};
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/fusion_conv_add_relu_op.cpp
浏览文件 @
dd575b09
...
@@ -15,7 +15,7 @@ limitations under the License. */
...
@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDRELU_OP
#ifdef FUSION_CONVADDRELU_OP
#include "operators/fusion_conv_add_relu_op.h"
#include "operators/fusion_conv_add_relu_op.h"
#include "operators/
math/conv
_func.h"
#include "operators/
kernel/central-arm-func/conv_arm
_func.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -36,9 +36,9 @@ void FusionConvAddReluOp<Dtype, T>::InferShape() const {
...
@@ -36,9 +36,9 @@ void FusionConvAddReluOp<Dtype, T>::InferShape() const {
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
output_shape
.
push_back
(
output_shape
.
push_back
(
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
math
::
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
dilation
s
[
i
],
dilations
[
i
],
padding
s
[
i
],
paddings
[
i
],
strides
[
i
]));
strides
[
i
]));
}
}
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
this
->
param_
.
Output
()
->
Resize
(
ddim
);
this
->
param_
.
Output
()
->
Resize
(
ddim
);
...
...
src/operators/fusion_conv_bn_add_relu_op.cpp
浏览文件 @
dd575b09
...
@@ -15,7 +15,7 @@ limitations under the License. */
...
@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVBNADDRELU_OP
#ifdef FUSION_CONVBNADDRELU_OP
#include "operators/fusion_conv_bn_add_relu_op.h"
#include "operators/fusion_conv_bn_add_relu_op.h"
#include "operators/
math/conv
_func.h"
#include "operators/
kernel/central-arm-func/conv_arm
_func.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -36,9 +36,9 @@ void FusionConvBNAddReluOp<Dtype, T>::InferShape() const {
...
@@ -36,9 +36,9 @@ void FusionConvBNAddReluOp<Dtype, T>::InferShape() const {
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
output_shape
.
push_back
(
output_shape
.
push_back
(
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
math
::
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
dilation
s
[
i
],
dilations
[
i
],
padding
s
[
i
],
paddings
[
i
],
strides
[
i
]));
strides
[
i
]));
}
}
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
...
...
src/operators/fusion_conv_bn_op.cpp
浏览文件 @
dd575b09
...
@@ -15,6 +15,7 @@ limitations under the License. */
...
@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVBN_OP
#ifdef FUSION_CONVBN_OP
#include "operators/fusion_conv_bn_op.h"
#include "operators/fusion_conv_bn_op.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -35,9 +36,9 @@ void FusionConvBNOp<Dtype, T>::InferShape() const {
...
@@ -35,9 +36,9 @@ void FusionConvBNOp<Dtype, T>::InferShape() const {
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
output_shape
.
push_back
(
output_shape
.
push_back
(
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
math
::
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
dilation
s
[
i
],
dilations
[
i
],
padding
s
[
i
],
paddings
[
i
],
strides
[
i
]));
strides
[
i
]));
}
}
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
...
...
src/operators/fusion_conv_bn_relu_op.cpp
浏览文件 @
dd575b09
...
@@ -15,7 +15,7 @@ limitations under the License. */
...
@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP
#ifdef FUSION_CONVBNRELU_OP
#include "operators/fusion_conv_bn_relu_op.h"
#include "operators/fusion_conv_bn_relu_op.h"
#include "operators/
math/conv
_func.h"
#include "operators/
kernel/central-arm-func/conv_arm
_func.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -36,9 +36,9 @@ void FusionConvBNReluOp<Dtype, T>::InferShape() const {
...
@@ -36,9 +36,9 @@ void FusionConvBNReluOp<Dtype, T>::InferShape() const {
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
output_shape
.
push_back
(
output_shape
.
push_back
(
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
math
::
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
dilation
s
[
i
],
dilations
[
i
],
padding
s
[
i
],
paddings
[
i
],
strides
[
i
]));
strides
[
i
]));
}
}
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
...
...
src/operators/fusion_dwconv_bn_relu_op.cpp
浏览文件 @
dd575b09
...
@@ -15,7 +15,7 @@ limitations under the License. */
...
@@ -15,7 +15,7 @@ limitations under the License. */
#ifdef FUSION_DWCONVBNRELU_OP
#ifdef FUSION_DWCONVBNRELU_OP
#include "operators/fusion_dwconv_bn_relu_op.h"
#include "operators/fusion_dwconv_bn_relu_op.h"
#include "operators/
math/conv
_func.h"
#include "operators/
kernel/central-arm-func/conv_arm
_func.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -36,9 +36,9 @@ void FusionDWConvBNReluOp<Dtype, T>::InferShape() const {
...
@@ -36,9 +36,9 @@ void FusionDWConvBNReluOp<Dtype, T>::InferShape() const {
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
std
::
vector
<
int64_t
>
output_shape
({
in_dims
[
0
],
filter_dims
[
0
]});
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
for
(
size_t
i
=
0
;
i
<
strides
.
size
();
++
i
)
{
output_shape
.
push_back
(
output_shape
.
push_back
(
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
math
::
ConvOutputSize
(
in_dims
[
i
+
2
],
filter_dims
[
i
+
2
],
dilation
s
[
i
],
dilations
[
i
],
padding
s
[
i
],
paddings
[
i
],
strides
[
i
]));
strides
[
i
]));
}
}
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
framework
::
DDim
ddim
=
framework
::
make_ddim
(
output_shape
);
...
...
src/operators/kernel/arm/convolution/conv_add_add_prelu_kernel.cpp
已删除
100644 → 0
浏览文件 @
42e520bb
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#include "operators/kernel/conv_add_add_prelu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
ConvAddAddPReluKernel
<
CPU
,
float
>::
Init
(
FusionConvAddAddPReluParam
<
CPU
>
*
param
)
{
return
true
;
}
template
<
>
void
ConvAddAddPReluKernel
<
CPU
,
float
>::
Compute
(
const
FusionConvAddAddPReluParam
<
CPU
>
&
param
)
{
ConvAddAddPReluCompute
<
float
>
(
param
);
}
template
class
ConvAddAddPReluKernel
<
CPU
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/arm/convolution/conv_add_bn_relu_kernel.cpp
浏览文件 @
dd575b09
...
@@ -18,6 +18,7 @@ limitations under the License. */
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <cmath>
#include <cmath>
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/math/channel_wise.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -62,34 +63,24 @@ void ConvAddBNReluKernel<CPU, float>::Compute(
...
@@ -62,34 +63,24 @@ void ConvAddBNReluKernel<CPU, float>::Compute(
const
FusionConvAddBNReluParam
<
CPU
>
&
param
)
{
const
FusionConvAddBNReluParam
<
CPU
>
&
param
)
{
switch
(
param
.
ExecMode
())
{
switch
(
param
.
ExecMode
())
{
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S1_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S1_FLOAT
:
math
::
DepthwiseConv3x3S1
<
float
,
float
>
(
*
param
.
Input
(),
*
param
.
Filter
(),
param
.
Paddings
(),
param
.
Output
());
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
break
;
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S2_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S2_FLOAT
:
math
::
DepthwiseConv3x3S2
<
float
,
float
>
(
*
param
.
Input
(),
*
param
.
Filter
(),
DepthwiseConv3x3
<
float
,
float
>
(
param
);
param
.
Paddings
(),
param
.
Output
());
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE5x5_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE5x5_FLOAT
:
DepthwiseConv5x5
<
float
,
float
>
(
param
);
DepthwiseConv5x5
<
float
,
float
>
(
param
);
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_WINOGRAD3X3_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_WINOGRAD3X3_FLOAT
:
WinogradConv3x3
<
8
,
3
>
(
param
);
WinogradConv3x3
<
8
,
3
>
(
param
);
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_GEMM_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_GEMM_FLOAT
:
ConvBNReluBasic
<
FusionConvAddBNReluParam
<
CPU
>
>
(
param
);
GemmConv
<
float
,
float
>
(
param
);
break
;
break
;
default:
default:
PADDLE_MOBILE_THROW_EXCEPTION
(
"Invalid convolution execute mode %d"
,
PADDLE_MOBILE_THROW_EXCEPTION
(
"Invalid convolution execute mode %d"
,
param
.
ExecMode
());
param
.
ExecMode
());
}
}
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
}
}
template
class
ConvAddBNReluKernel
<
CPU
,
float
>;
template
class
ConvAddBNReluKernel
<
CPU
,
float
>;
...
...
src/operators/kernel/arm/convolution/conv_add_kernel.cpp
浏览文件 @
dd575b09
...
@@ -16,8 +16,8 @@ limitations under the License. */
...
@@ -16,8 +16,8 @@ limitations under the License. */
#include "operators/kernel/conv_add_kernel.h"
#include "operators/kernel/conv_add_kernel.h"
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/central-arm-func/conv_add_arm_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/math/channel_wise.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -32,34 +32,25 @@ template <>
...
@@ -32,34 +32,25 @@ template <>
void
ConvAddKernel
<
CPU
,
float
>::
Compute
(
const
FusionConvAddParam
<
CPU
>
&
param
)
{
void
ConvAddKernel
<
CPU
,
float
>::
Compute
(
const
FusionConvAddParam
<
CPU
>
&
param
)
{
switch
(
param
.
ExecMode
())
{
switch
(
param
.
ExecMode
())
{
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S1_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S1_FLOAT
:
math
::
DepthwiseConv3x3S1
<
float
,
float
>
(
*
param
.
Input
(),
*
param
.
Filter
(),
param
.
Paddings
(),
param
.
Output
());
math
::
AddChannelWise
<
IDENTITY
>
(
param
.
Output
(),
param
.
Bias
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S2_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S2_FLOAT
:
math
::
DepthwiseConv3x3S2
<
float
,
float
>
(
*
param
.
Input
(),
*
param
.
Filter
(),
math
::
DepthwiseConv3x3S2
<
float
,
float
>
(
*
param
.
Input
(),
*
param
.
Filter
(),
param
.
Paddings
(),
param
.
Output
());
param
.
Paddings
(),
param
.
Output
());
math
::
AddChannelWise
<
IDENTITY
>
(
param
.
Output
(),
param
.
Bias
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE5x5_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE5x5_FLOAT
:
DepthwiseConv5x5
<
float
,
float
>
(
param
);
DepthwiseConv5x5
<
float
,
float
>
(
param
);
math
::
AddChannelWise
<
IDENTITY
>
(
param
.
Output
(),
param
.
Bias
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_WINOGRAD3X3_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_WINOGRAD3X3_FLOAT
:
WinogradConv3x3
<
8
,
3
>
(
param
);
WinogradConv3x3
<
8
,
3
>
(
param
);
math
::
AddChannelWise
<
IDENTITY
>
(
param
.
Output
(),
param
.
Bias
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_GEMM_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_GEMM_FLOAT
:
ConvAddBasic
(
param
);
GemmConv
<
float
,
float
>
(
param
);
break
;
break
;
default:
default:
PADDLE_MOBILE_THROW_EXCEPTION
(
"Invalid convolution execute mode %d"
,
PADDLE_MOBILE_THROW_EXCEPTION
(
"Invalid convolution execute mode %d"
,
param
.
ExecMode
());
param
.
ExecMode
());
}
}
math
::
AddChannelWise
<
IDENTITY
>
(
param
.
Output
(),
param
.
Bias
(),
param
.
Output
());
}
}
template
class
ConvAddKernel
<
CPU
,
float
>;
template
class
ConvAddKernel
<
CPU
,
float
>;
...
...
src/operators/kernel/arm/convolution/conv_add_prelu_kernel.cpp
已删除
100644 → 0
浏览文件 @
42e520bb
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#include "operators/kernel/conv_add_prelu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_prelu_arm_func.h"
namespace paddle_mobile {
namespace operators {

/// Init hook of the CPU/float ConvAddPReluKernel specialization.
/// It does no preparation work and unconditionally reports success.
template <>
bool ConvAddPReluKernel<CPU, float>::Init(
    FusionConvAddPReluParam<CPU> *param) {
  return true;
}

/// Compute hook of the CPU/float ConvAddPReluKernel specialization.
/// All work is forwarded to ConvAddPReluCompute<float>.
template <>
void ConvAddPReluKernel<CPU, float>::Compute(
    const FusionConvAddPReluParam<CPU> &param) {
  ConvAddPReluCompute<float>(param);
}

// Explicit instantiation of the CPU/float kernel class.
template class ConvAddPReluKernel<CPU, float>;

}  // namespace operators
}  // namespace paddle_mobile
#endif
src/operators/kernel/arm/convolution/conv_add_relu_kernel.cpp
浏览文件 @
dd575b09
...
@@ -17,6 +17,7 @@ limitations under the License. */
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/math/channel_wise.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -32,30 +33,23 @@ void ConvAddReluKernel<CPU, float>::Compute(
...
@@ -32,30 +33,23 @@ void ConvAddReluKernel<CPU, float>::Compute(
const
FusionConvAddReluParam
<
CPU
>
&
param
)
{
const
FusionConvAddReluParam
<
CPU
>
&
param
)
{
switch
(
param
.
ExecMode
())
{
switch
(
param
.
ExecMode
())
{
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S1_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S1_FLOAT
:
math
::
DepthwiseConv3x3S1
<
float
,
float
>
(
*
param
.
Input
(),
*
param
.
Filter
(),
param
.
Paddings
(),
param
.
Output
());
math
::
AddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
Bias
(),
param
.
Output
());
break
;
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S2_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S2_FLOAT
:
math
::
DepthwiseConv3x3S2
<
float
,
float
>
(
*
param
.
Input
(),
*
param
.
Filter
(),
DepthwiseConv3x3
<
float
,
float
>
(
param
);
param
.
Paddings
(),
param
.
Output
());
math
::
AddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
Bias
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE5x5_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE5x5_FLOAT
:
DepthwiseConv5x5
<
float
,
float
>
(
param
);
DepthwiseConv5x5
<
float
,
float
>
(
param
);
math
::
AddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
Bias
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_WINOGRAD3X3_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_WINOGRAD3X3_FLOAT
:
WinogradConv3x3
<
8
,
3
>
(
param
);
WinogradConv3x3
<
8
,
3
>
(
param
);
math
::
AddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
Bias
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_GEMM_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_GEMM_FLOAT
:
ConvAddReluBasic
<
FusionConvAddReluParam
<
CPU
>
>
(
param
);
GemmConv
<
float
,
float
>
(
param
);
break
;
break
;
default:
default:
PADDLE_MOBILE_THROW_EXCEPTION
(
"Invalid convolution execute mode %d"
,
PADDLE_MOBILE_THROW_EXCEPTION
(
"Invalid convolution execute mode %d"
,
param
.
ExecMode
());
param
.
ExecMode
());
}
}
math
::
AddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
Bias
(),
param
.
Output
());
}
}
template
class
ConvAddReluKernel
<
CPU
,
float
>;
template
class
ConvAddReluKernel
<
CPU
,
float
>;
...
...
src/operators/kernel/arm/convolution/conv_bn_add_relu_kernel.cpp
浏览文件 @
dd575b09
...
@@ -18,6 +18,7 @@ limitations under the License. */
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <cmath>
#include <cmath>
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/math/channel_wise.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -62,34 +63,24 @@ void ConvBNAddReluKernel<CPU, float>::Compute(
...
@@ -62,34 +63,24 @@ void ConvBNAddReluKernel<CPU, float>::Compute(
const
FusionConvBNAddReluParam
<
CPU
>
&
param
)
{
const
FusionConvBNAddReluParam
<
CPU
>
&
param
)
{
switch
(
param
.
ExecMode
())
{
switch
(
param
.
ExecMode
())
{
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S1_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S1_FLOAT
:
math
::
DepthwiseConv3x3S1
<
float
,
float
>
(
*
param
.
Input
(),
*
param
.
Filter
(),
param
.
Paddings
(),
param
.
Output
());
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
break
;
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S2_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S2_FLOAT
:
math
::
DepthwiseConv3x3S2
<
float
,
float
>
(
*
param
.
Input
(),
*
param
.
Filter
(),
DepthwiseConv3x3
<
float
,
float
>
(
param
);
param
.
Paddings
(),
param
.
Output
());
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE5x5_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE5x5_FLOAT
:
DepthwiseConv5x5
<
float
,
float
>
(
param
);
DepthwiseConv5x5
<
float
,
float
>
(
param
);
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_WINOGRAD3X3_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_WINOGRAD3X3_FLOAT
:
WinogradConv3x3
<
8
,
3
>
(
param
);
WinogradConv3x3
<
8
,
3
>
(
param
);
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_GEMM_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_GEMM_FLOAT
:
ConvBNReluBasic
<
FusionConvBNAddReluParam
<
CPU
>
>
(
param
);
GemmConv
<
float
,
float
>
(
param
);
break
;
break
;
default:
default:
PADDLE_MOBILE_THROW_EXCEPTION
(
"Invalid convolution execute mode %d"
,
PADDLE_MOBILE_THROW_EXCEPTION
(
"Invalid convolution execute mode %d"
,
param
.
ExecMode
());
param
.
ExecMode
());
}
}
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
}
}
template
class
ConvBNAddReluKernel
<
CPU
,
float
>;
template
class
ConvBNAddReluKernel
<
CPU
,
float
>;
...
...
src/operators/kernel/arm/convolution/conv_bn_relu_kernel.cpp
浏览文件 @
dd575b09
...
@@ -18,6 +18,7 @@ limitations under the License. */
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <cmath>
#include <cmath>
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/math/channel_wise.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -61,34 +62,24 @@ void ConvBNReluKernel<CPU, float>::Compute(
...
@@ -61,34 +62,24 @@ void ConvBNReluKernel<CPU, float>::Compute(
const
FusionConvBNReluParam
<
CPU
>
&
param
)
{
const
FusionConvBNReluParam
<
CPU
>
&
param
)
{
switch
(
param
.
ExecMode
())
{
switch
(
param
.
ExecMode
())
{
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S1_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S1_FLOAT
:
math
::
DepthwiseConv3x3S1
<
float
,
float
>
(
*
param
.
Input
(),
*
param
.
Filter
(),
param
.
Paddings
(),
param
.
Output
());
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
break
;
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S2_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S2_FLOAT
:
math
::
DepthwiseConv3x3S1
<
float
,
float
>
(
*
param
.
Input
(),
*
param
.
Filter
(),
DepthwiseConv3x3
<
float
,
float
>
(
param
);
param
.
Paddings
(),
param
.
Output
());
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE5x5_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE5x5_FLOAT
:
DepthwiseConv5x5
<
float
,
float
>
(
param
);
DepthwiseConv5x5
<
float
,
float
>
(
param
);
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_WINOGRAD3X3_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_WINOGRAD3X3_FLOAT
:
WinogradConv3x3
<
8
,
3
>
(
param
);
WinogradConv3x3
<
8
,
3
>
(
param
);
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_GEMM_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_GEMM_FLOAT
:
ConvBNReluBasic
<
FusionConvBNReluParam
<
CPU
>
>
(
param
);
GemmConv
<
float
,
float
>
(
param
);
break
;
break
;
default:
default:
PADDLE_MOBILE_THROW_EXCEPTION
(
"Invalid convolution execute mode %d"
,
PADDLE_MOBILE_THROW_EXCEPTION
(
"Invalid convolution execute mode %d"
,
param
.
ExecMode
());
param
.
ExecMode
());
}
}
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
}
}
template
class
ConvBNReluKernel
<
CPU
,
float
>;
template
class
ConvBNReluKernel
<
CPU
,
float
>;
...
...
src/operators/kernel/arm/convolution/conv_kernel.cpp
浏览文件 @
dd575b09
...
@@ -32,10 +32,10 @@ bool ConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
...
@@ -32,10 +32,10 @@ bool ConvKernel<CPU, float>::Init(ConvParam<CPU> *param) {
template
<
>
template
<
>
void
ConvKernel
<
CPU
,
float
>::
Compute
(
const
ConvParam
<
CPU
>
&
param
)
{
void
ConvKernel
<
CPU
,
float
>::
Compute
(
const
ConvParam
<
CPU
>
&
param
)
{
switch
(
param
.
ExecMode
())
{
switch
(
param
.
ExecMode
())
{
#ifndef __aarch64__
case
ConvParam
<
CPU
>::
EXEC_GEMM_INT8
:
case
ConvParam
<
CPU
>::
EXEC_GEMM_INT8
:
GemmConv
<
int8_t
,
int32_t
>
(
param
);
GemmConv
<
int8_t
,
int32_t
>
(
param
);
break
;
break
;
#ifndef __aarch64__
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3_INT8
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3_INT8
:
DepthwiseConv3x3
<
int8_t
,
int32_t
>
(
param
);
DepthwiseConv3x3
<
int8_t
,
int32_t
>
(
param
);
break
;
break
;
...
@@ -44,12 +44,8 @@ void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
...
@@ -44,12 +44,8 @@ void ConvKernel<CPU, float>::Compute(const ConvParam<CPU> &param) {
break
;
break
;
#endif // __aarch64__
#endif // __aarch64__
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S1_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S1_FLOAT
:
math
::
DepthwiseConv3x3S1
<
float
,
float
>
(
*
param
.
Input
(),
*
param
.
Filter
(),
param
.
Paddings
(),
param
.
Output
());
break
;
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S2_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S2_FLOAT
:
math
::
DepthwiseConv3x3S2
<
float
,
float
>
(
*
param
.
Input
(),
*
param
.
Filter
(),
DepthwiseConv3x3
<
float
,
float
>
(
param
);
param
.
Paddings
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE5x5_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE5x5_FLOAT
:
DepthwiseConv5x5
<
float
,
float
>
(
param
);
DepthwiseConv5x5
<
float
,
float
>
(
param
);
...
...
src/operators/kernel/arm/convolution/dwconv_bn_relu_kernel.cpp
浏览文件 @
dd575b09
...
@@ -18,6 +18,7 @@ limitations under the License. */
...
@@ -18,6 +18,7 @@ limitations under the License. */
#include <cmath>
#include <cmath>
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/arm/convolution/conv_common.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#include "operators/math/channel_wise.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -61,37 +62,28 @@ void DWConvBNReluKernel<CPU, float>::Compute(
...
@@ -61,37 +62,28 @@ void DWConvBNReluKernel<CPU, float>::Compute(
const
FusionDWConvBNReluParam
<
CPU
>
&
param
)
{
const
FusionDWConvBNReluParam
<
CPU
>
&
param
)
{
switch
(
param
.
ExecMode
())
{
switch
(
param
.
ExecMode
())
{
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S1_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S1_FLOAT
:
math
::
DepthwiseConv3x3S1
<
float
,
float
>
(
*
param
.
Input
(),
*
param
.
Filter
(),
param
.
Paddings
(),
param
.
Output
());
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
break
;
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S2_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE3x3S2_FLOAT
:
math
::
DepthwiseConv3x3S2
<
float
,
float
>
(
*
param
.
Input
(),
*
param
.
Filter
(),
DepthwiseConv3x3
<
float
,
float
>
(
param
);
param
.
Paddings
(),
param
.
Output
());
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
break
;
break
;
#ifndef __aarch64__
#ifndef __aarch64__
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE5x5_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_DEPTHWISE5x5_FLOAT
:
DepthwiseConv5x5
<
float
,
float
>
(
param
);
DepthwiseConv5x5
<
float
,
float
>
(
param
);
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
break
;
break
;
case
ConvParam
<
CPU
>::
EXEC_WINOGRAD3X3_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_WINOGRAD3X3_FLOAT
:
WinogradConv3x3
<
8
,
3
>
(
param
);
WinogradConv3x3
<
8
,
3
>
(
param
);
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
break
;
break
;
#endif // __aarch64__
#endif // __aarch64__
case
ConvParam
<
CPU
>::
EXEC_GEMM_FLOAT
:
case
ConvParam
<
CPU
>::
EXEC_GEMM_FLOAT
:
ConvBNReluBasic
<
FusionDWConvBNReluParam
<
CPU
>
>
(
param
);
GemmConv
<
float
,
float
>
(
param
);
break
;
break
;
default:
default:
PADDLE_MOBILE_THROW_EXCEPTION
(
"Invalid convolution execute mode %d"
,
PADDLE_MOBILE_THROW_EXCEPTION
(
"Invalid convolution execute mode %d"
,
param
.
ExecMode
());
param
.
ExecMode
());
}
}
math
::
ScaleAddChannelWise
<
RELU
>
(
param
.
Output
(),
param
.
NewScale
(),
param
.
NewBias
(),
param
.
Output
());
}
}
template
class
DWConvBNReluKernel
<
CPU
,
float
>;
template
class
DWConvBNReluKernel
<
CPU
,
float
>;
}
// namespace operators
}
// namespace operators
...
...
src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
已删除
100644 → 0
浏览文件 @
42e520bb
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#pragma once
#include <string>
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
P
>
void
ConvAddAddPReluCompute
(
const
FusionConvAddAddPReluParam
<
CPU
>
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
bias
=
*
param
.
Bias
();
Tensor
bias1
=
*
param
.
Bias1
();
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
float
>
();
float
*
biase_data
=
bias
.
data
<
float
>
();
int
axis
=
param
.
Axis
();
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
Tensor
aa
=
*
param
.
InputAlpha
();
float
*
p
=
aa
.
data
<
float
>
();
std
::
string
mode
=
param
.
Mode
();
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
math
::
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
Tensor
bias1_batch
=
bias1
.
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
bias1_slice
=
bias1_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
float
*
biase_data1
=
bias1_slice
.
data
<
float
>
();
math
::
MatMulWithPRelu
(
filter_slice
,
false
,
col_matrix
,
false
,
&
out_slice
,
p
,
mode
,
biase_data
,
biase_data1
);
}
}
}
}
// namespace operators
}
// namespace paddle_mobile
#endif // FUSION_CONVADDADDPRELU_OP
src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h
已删除
100644 → 0
浏览文件 @
42e520bb
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#pragma once
#include <string>
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
P
>
void
ConvAddPReluCompute
(
const
FusionConvAddPReluParam
<
CPU
>
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
bias
=
*
param
.
Bias
();
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
float
>
();
float
*
biase_data
=
bias
.
data
<
float
>
();
int
axis
=
param
.
Axis
();
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
Tensor
aa
=
*
param
.
InputAlpha
();
float
*
p
=
aa
.
data
<
float
>
();
std
::
string
mode
=
param
.
Mode
();
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
math
::
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
MatMulWithPRelu
(
filter_slice
,
false
,
col_matrix
,
false
,
&
out_slice
,
p
,
mode
,
biase_data
,
nullptr
);
}
}
}
}
// namespace operators
}
// namespace paddle_mobile
#endif // FUSION_CONVADDPRELU_OP
src/operators/kernel/central-arm-func/conv_a
dd_arm_func.h
→
src/operators/kernel/central-arm-func/conv_a
rm_func.cpp
浏览文件 @
dd575b09
...
@@ -12,38 +12,54 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,38 +12,54 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#ifdef FUSION_CONVADD_OP
#include "operators/kernel/central-arm-func/conv_arm_func.h"
#pragma once
#include <vector>
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/depthwise_conv3x3.h"
#include "operators/math/depthwise_conv3x3.h"
#include "operators/math/depthwise_conv5x5.h"
#include "operators/math/im2col.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/math_function.h"
#include "operators/math/pad.h"
#include "operators/math/vol2col.h"
#include "operators/math/vol2col.h"
#include "operators/math/winograd/winograd_transform.h"
#include "operators/op_param.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
void
ConvAddBasic
(
const
FusionConvAddParam
<
CPU
>
&
param
)
{
/// Computes the output extent of a convolution along a single axis.
///
/// The dilated kernel covers dilation * (filter_size - 1) + 1 input elements;
/// the result is (input_size + 2 * padding - dilated_kernel) / stride + 1,
/// evaluated with integer division.
///
/// @param input_size   Input extent along the axis.
/// @param filter_size  Kernel extent along the axis.
/// @param dilation     Dilation factor applied to the kernel.
/// @param padding      Padding added on each side of the input.
/// @param stride       Step between consecutive kernel applications.
/// @return             Output extent along the axis.
int ConvOutputSize(int input_size, int filter_size, int dilation, int padding,
                   int stride) {
  const int dilated_kernel = 1 + (filter_size - 1) * dilation;
  const int padded_input = input_size + 2 * padding;
  return (padded_input - dilated_kernel) / stride + 1;
}
/// Reports whether any axis has a non-trivial convolution setting.
///
/// An axis is trivial when its kernel extent is 1, stride is 1, padding is 0
/// and dilation is 1. The function returns false only when every axis covered
/// by `strides` is trivial, and true otherwise.
///
/// @param filter_dim  Filter dimensions; per-axis kernel extents are read from
///                    index 2 onward.
/// @param strides     Per-axis strides; its size sets the number of axes
///                    inspected.
/// @param paddings    Per-axis paddings, indexed like `strides`.
/// @param dilations   Per-axis dilations, indexed like `strides`.
/// @return            true if at least one axis is non-trivial.
bool IsExpand(const std::vector<int64_t> &filter_dim,
              const std::vector<int> &strides, const std::vector<int> &paddings,
              const std::vector<int> &dilations) {
  const size_t axis_count = strides.size();
  for (size_t axis = 0; axis < axis_count; ++axis) {
    const bool trivial_axis =
        static_cast<int>(filter_dim[axis + 2]) == 1 && strides[axis] == 1 &&
        paddings[axis] == 0 && dilations[axis] == 1;
    if (!trivial_axis) {
      return true;
    }
  }
  return false;
}
template
<
typename
Itype
,
typename
Otype
>
void
GemmConv
(
const
ConvParam
<
CPU
>
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
bias
=
*
param
.
Bias
();
Tensor
*
output
=
param
.
Output
();
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
float
>
();
output
->
mutable_data
<
Otype
>
();
float
*
biase_data
=
bias
.
data
<
float
>
();
int
axis
=
param
.
Axis
();
int
groups
=
param
.
Groups
();
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
const
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
const
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
...
@@ -57,12 +73,11 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
...
@@ -57,12 +73,11 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
framework
::
DDim
col_matrix_shape
=
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
bool
is_expand
=
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
math
::
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col
;
Tensor
col_matrix
;
Tensor
col_matrix
;
if
(
is_expand
)
{
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col
.
mutable_data
<
Itype
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
}
...
@@ -81,9 +96,10 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
...
@@ -81,9 +96,10 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Vol2ColFunctor
<
CPU
,
Itype
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
Itype
>
im2col
;
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
...
@@ -92,8 +108,8 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
...
@@ -92,8 +108,8 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
// col_matrix
.ShareDataWith(in_slice);
col_matrix
.
ShareDataWith
(
col
)
;
col_matrix
=
in_slice
;
col_matrix
.
Resize
(
col_matrix_shape
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
}
else
if
(
data_dim
==
2U
)
{
// im2col
// im2col
...
@@ -105,17 +121,122 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
...
@@ -105,17 +121,122 @@ void ConvAddBasic(const FusionConvAddParam<CPU> &param) {
// vol2col
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
}
// gemm
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
MatMul
<
float
,
float
>
(
filter_slice
,
false
,
col_matrix
,
false
,
math
::
MatMul
<
Itype
,
Otype
>
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
1
),
false
,
biase_data
);
static_cast
<
float
>
(
0
),
false
,
static_cast
<
Otype
*>
(
nullptr
));
}
}
}
}
}
}
}
// namespace operators
template
<
int
tile
,
int
kernel
>
}
// namespace paddle_mobile
void
WinogradConv3x3
(
const
ConvParam
<
CPU
>
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
const
Tensor
*
filter
=
param
.
transformed_filter_
;
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
float
>
();
int
batch_size
=
input
->
dims
()[
0
];
int
groups
=
param
.
Groups
();
const
std
::
vector
<
int
>
&
paddings
=
param
.
Paddings
();
auto
winograd_pad
=
[
&
](
int
width
,
int
pad
)
{
int
output_tile
=
tile
-
kernel
+
1
;
// int tiles = (width + pad - kernel) / output_tile + 1;
// return (tiles - 1) * output_tile + tile - width;
int
pad_width
=
(
width
+
2
*
pad
-
kernel
)
/
output_tile
*
output_tile
;
return
pad_width
+
tile
-
width
;
};
math
::
PadFunctor
<
CPU
,
float
>
pad
;
Tensor
input_pad
;
framework
::
Tensor
transformed_input
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
);
// int pad_bottom = winograd_pad(in_batch.dims()[2], paddings[0]);
// int pad_right = winograd_pad(in_batch.dims()[3], paddings[1]);
int
pad_bottom
=
paddings
[
0
];
int
pad_right
=
paddings
[
1
];
if
(
paddings
[
0
]
||
paddings
[
1
]
||
pad_bottom
||
pad_right
)
{
framework
::
DDim
pad_shape
=
in_batch
.
dims
();
pad_shape
[
2
]
+=
paddings
[
0
]
+
pad_bottom
;
pad_shape
[
3
]
+=
paddings
[
1
]
+
pad_right
;
input_pad
.
mutable_data
<
float
>
(
pad_shape
);
pad
(
in_batch
,
paddings
[
0
],
pad_bottom
,
paddings
[
1
],
pad_right
,
&
input_pad
);
}
else
{
input_pad
=
in_batch
;
}
// tile input and transform
math
::
winograd_transform_input
<
tile
,
kernel
>
(
input_pad
,
&
transformed_input
);
// calculate output
math
::
winograd_transform_output
<
tile
,
kernel
>
(
transformed_input
,
*
filter
,
output
);
}
}
// Depthwise 3x3 convolution dispatcher.
//
// Allocates param.Output() as Otype and then selects an implementation from
// the first stride value:
//   * stride 1 -> math::DepthwiseConv3x3S1, run once per batch item
//   * stride 2 -> math::DepthwiseConv3x3S2, run once per batch item
//   * other    -> generic GemmConv fallback
//
// GemmConv takes the whole `param` and walks the batch dimension itself, so
// the fallback is issued once for the whole call.  Previously it sat inside
// the per-batch loop and repeated the full computation batch_size times.
template <typename Itype, typename Otype>
void DepthwiseConv3x3(const ConvParam<CPU> &param) {
  const Tensor *input = param.Input();
  const Tensor *filter = param.Filter();
  const std::vector<int> &paddings = param.Paddings();
  const std::vector<int> &strides = param.Strides();
  const int batch_size = input->dims()[0];
  Tensor *output = param.Output();
  output->mutable_data<Otype>();

  const int stride = strides[0];
  if (stride != 1 && stride != 2) {
    // No dedicated kernel for this stride.  The batch_size guard keeps the
    // empty-batch case a no-op, as it was when the call lived in the loop.
    if (batch_size > 0) {
      GemmConv<Itype, Otype>(param);
    }
    return;
  }

  for (int i = 0; i < batch_size; ++i) {
    Tensor in_batch = input->Slice(i, i + 1);
    Tensor out_batch = output->Slice(i, i + 1);
    if (stride == 1) {
      math::DepthwiseConv3x3S1<Itype, Otype>(in_batch, *filter, paddings,
                                             &out_batch);
    } else {
      math::DepthwiseConv3x3S2<Itype, Otype>(in_batch, *filter, paddings,
                                             &out_batch);
    }
  }
}
// Depthwise 5x5 convolution entry point.
//
// Allocates param.Output() as Otype and delegates to the generic GemmConv
// path for every stride.
//
// NOTE: a dedicated stride-1 path (per-batch math::DepthwiseConv5x5S1 on
// input/output slices, taking the filter and paddings) used to be sketched
// here in commented-out form.  It is disabled, so the locals that only it
// consumed (input, filter, paddings, strides, batch_size) are no longer
// computed.  Reintroduce them together with that path if it is re-enabled.
template <typename Itype, typename Otype>
void DepthwiseConv5x5(const ConvParam<CPU> &param) {
  Tensor *output = param.Output();
  output->mutable_data<Otype>();
  GemmConv<Itype, Otype>(param);
}
template
void
GemmConv
<
float
,
float
>(
const
ConvParam
<
CPU
>
&
param
);
template
void
WinogradConv3x3
<
8
,
3
>(
const
ConvParam
<
CPU
>
&
param
);
template
void
DepthwiseConv3x3
<
float
,
float
>(
const
ConvParam
<
CPU
>
&
param
);
template
void
DepthwiseConv5x5
<
float
,
float
>(
const
ConvParam
<
CPU
>
&
param
);
#ifndef __aarch64__
template
void
GemmConv
<
int8_t
,
int32_t
>(
const
ConvParam
<
CPU
>
&
param
);
template
void
DepthwiseConv3x3
<
int8_t
,
int32_t
>(
const
ConvParam
<
CPU
>
&
param
);
template
void
DepthwiseConv5x5
<
int8_t
,
int32_t
>(
const
ConvParam
<
CPU
>
&
param
);
#endif
#endif
}
// namespace operators
}
// namespace paddle_mobile
src/operators/kernel/central-arm-func/conv_arm_func.h
浏览文件 @
dd575b09
...
@@ -15,386 +15,31 @@ limitations under the License. */
...
@@ -15,386 +15,31 @@ limitations under the License. */
#ifdef CONV_OP
#ifdef CONV_OP
#pragma once
#pragma once
#include <vector>
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/depthwise_conv3x3.h"
#include "operators/math/depthwise_conv5x5.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/pad.h"
#include "operators/math/vol2col.h"
#include "operators/math/winograd/winograd_transform.h"
#include "operators/op_param.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
template
<
typename
Itype
,
typename
Otype
>
int
ConvOutputSize
(
int
input_size
,
int
filter_size
,
int
dilation
,
int
padding
,
inline
void
GemmConv
(
const
ConvParam
<
CPU
>
&
param
)
{
int
stride
);
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
Otype
>
();
int
groups
=
param
.
Groups
();
const
std
::
vector
<
int
>
strides
=
param
.
Strides
();
const
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
const
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
math
::
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
Itype
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
Itype
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
Itype
>
im2col
;
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
// col_matrix.ShareDataWith(in_slice);
col_matrix
=
in_slice
;
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
bool
IsExpand
(
const
std
::
vector
<
int64_t
>
&
filter_dim
,
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
const
std
::
vector
<
int
>
&
strides
,
const
std
::
vector
<
int
>
&
paddings
,
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
const
std
::
vector
<
int
>
&
dilations
);
math
::
MatMul
<
Itype
,
Otype
>
(
filter_slice
,
false
,
col_matrix
,
false
,
template
<
typename
Itype
,
typename
Otype
>
static_cast
<
float
>
(
1
),
&
out_slice
,
void
GemmConv
(
const
ConvParam
<
CPU
>
&
param
);
static_cast
<
float
>
(
0
),
false
,
static_cast
<
Otype
*>
(
nullptr
));
}
}
}
template
<
int
tile
,
int
kernel
>
template
<
int
tile
,
int
kernel
>
inline
void
WinogradConv3x3
(
const
ConvParam
<
CPU
>
&
param
)
{
void
WinogradConv3x3
(
const
ConvParam
<
CPU
>
&
param
);
const
Tensor
*
input
=
param
.
Input
();
const
Tensor
*
filter
=
param
.
transformed_filter_
;
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
float
>
();
int
batch_size
=
input
->
dims
()[
0
];
int
groups
=
param
.
Groups
();
const
std
::
vector
<
int
>
&
paddings
=
param
.
Paddings
();
auto
winograd_pad
=
[
&
](
int
width
,
int
pad
)
{
int
output_tile
=
tile
-
kernel
+
1
;
// int tiles = (width + pad - kernel) / output_tile + 1;
// return (tiles - 1) * output_tile + tile - width;
int
pad_width
=
(
width
+
2
*
pad
-
kernel
)
/
output_tile
*
output_tile
;
return
pad_width
+
tile
-
width
;
};
math
::
PadFunctor
<
CPU
,
float
>
pad
;
Tensor
input_pad
;
framework
::
Tensor
transformed_input
;
for
(
int
i
=
0
;
i
<
batch_size
;
++
i
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
);
// int pad_bottom = winograd_pad(in_batch.dims()[2], paddings[0]);
// int pad_right = winograd_pad(in_batch.dims()[3], paddings[1]);
int
pad_bottom
=
paddings
[
0
];
int
pad_right
=
paddings
[
1
];
if
(
paddings
[
0
]
||
paddings
[
1
]
||
pad_bottom
||
pad_right
)
{
framework
::
DDim
pad_shape
=
in_batch
.
dims
();
pad_shape
[
2
]
+=
paddings
[
0
]
+
pad_bottom
;
pad_shape
[
3
]
+=
paddings
[
1
]
+
pad_right
;
input_pad
.
mutable_data
<
float
>
(
pad_shape
);
pad
(
in_batch
,
paddings
[
0
],
pad_bottom
,
paddings
[
1
],
pad_right
,
&
input_pad
);
}
else
{
input_pad
=
in_batch
;
}
// tile input and transform
math
::
winograd_transform_input
<
tile
,
kernel
>
(
input_pad
,
&
transformed_input
);
// calculate output
math
::
winograd_transform_output
<
tile
,
kernel
>
(
transformed_input
,
*
filter
,
output
);
}
}
#ifndef __aarch64__
// int8 DepthwiseConv3x3
template
<
typename
Itype
,
typename
Otype
>
template
<
typename
Itype
,
typename
Otype
>
inline
void
DepthwiseConv3x3
(
const
ConvParam
<
CPU
>
&
param
)
{
void
DepthwiseConv3x3
(
const
ConvParam
<
CPU
>
&
param
);
const
Tensor
*
input
=
param
.
Input
();
const
Tensor
*
filter
=
param
.
Filter
();
const
std
::
vector
<
int
>
&
paddings
=
param
.
Paddings
();
const
std
::
vector
<
int
>
&
strides
=
param
.
Strides
();
const
int
batch_size
=
input
->
dims
()[
0
];
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
Otype
>
();
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
);
if
(
strides
[
0
]
==
1
)
{
math
::
DepthwiseConv3x3S1
<
Itype
,
Otype
>
(
in_batch
,
*
filter
,
paddings
,
&
out_batch
);
}
else
if
(
strides
[
0
]
==
2
)
{
math
::
DepthwiseConv3x3S2
<
Itype
,
Otype
>
(
in_batch
,
*
filter
,
paddings
,
&
out_batch
);
}
else
{
GemmConv
<
Itype
,
Otype
>
(
param
);
}
}
}
#endif // __aarch64__
template
<
typename
Itype
,
typename
Otype
>
template
<
typename
Itype
,
typename
Otype
>
inline
void
DepthwiseConv5x5
(
const
ConvParam
<
CPU
>
&
param
)
{
void
DepthwiseConv5x5
(
const
ConvParam
<
CPU
>
&
param
);
const
Tensor
*
input
=
param
.
Input
();
const
Tensor
*
filter
=
param
.
Filter
();
const
std
::
vector
<
int
>
&
paddings
=
param
.
Paddings
();
const
std
::
vector
<
int
>
&
strides
=
param
.
Strides
();
const
int
batch_size
=
input
->
dims
()[
0
];
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
Otype
>
();
// if (strides[0] == 1) {
// for (int i = 0; i < batch_size; i++) {
// Tensor in_batch = input->Slice(i, i + 1);
// Tensor out_batch = output->Slice(i, i + 1);
// math::DepthwiseConv5x5S1<Itype, Otype>(in_batch, *filter, paddings,
// &out_batch);
// }
// } else {
GemmConv
<
Itype
,
Otype
>
(
param
);
// }
}
// Float convolution with a fused bias term, computed as im2col (2 spatial
// dims) or vol2col (3 spatial dims) followed by one math::MatMul per
// (batch item, group).  The boolean `true` handed to MatMul presumably turns
// on the ReLU the function name refers to - confirm against math::MatMul.
//
// ParamType must provide Input(), Filter(), Bias(), Output(), Groups(),
// Strides(), Paddings() and Dilations().  The result is written into
// param.Output(), whose buffer is allocated here as float.
//
// The former unused local `axis` (and with it the requirement that ParamType
// expose Axis()) has been dropped.
template <typename ParamType>
void ConvAddReluBasic(const ParamType &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor bias = *param.Bias();
  Tensor *output = param.Output();
  output->mutable_data<float>();

  const float alpha = 1.0f;
  const float beta = 1.0f;
  const int32_t groups = param.Groups();

  const std::vector<int32_t> strides = param.Strides();
  const std::vector<int32_t> paddings = param.Paddings();
  const std::vector<int32_t> dilations = param.Dilations();

  const int32_t batch_size = static_cast<int32_t>(input->dims()[0]);

  const std::vector<int64_t> filter_shape_vec(
      framework::vectorize(filter.dims()));
  const std::vector<int64_t> output_shape_vec(
      framework::vectorize(output->dims()));

  // Column buffer shape: {in_channels / groups, kernel dims..., output dims...}.
  const size_t data_dim = filter_shape_vec.size() - 2;
  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
  col_shape_vec[0] = input->dims()[1] / groups;
  for (size_t j = 0; j < data_dim; ++j) {
    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
  }
  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
  framework::DDim col_matrix_shape =
      framework::flatten_to_2d(col_shape, data_dim + 1);

  // When no expansion is needed the input slice itself serves as the column
  // matrix, so the column buffer is only allocated in the expanding case.
  const bool is_expand =
      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
  Tensor col;
  Tensor col_matrix;
  if (is_expand) {
    col.mutable_data<float>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }

  framework::DDim input_shape = framework::slice_ddim(
      input->dims(), 1, static_cast<int32_t>(input->dims().size()));

  // View the (local copy of the) filter as a 2-D matrix with one row per
  // output channel.
  framework::DDim filter_matrix_shape = {filter.dims()[0],
                                         filter.numel() / filter.dims()[0]};
  filter.Resize(filter_matrix_shape);

  framework::DDim output_matrix_shape = {
      output->dims()[1],
      output->numel() / (output->dims()[0] * output->dims()[1])};

  // convolution operator: im2col(or vol2col) + gemm
  const int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups;
  const int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups;

  // NOTE(review): the same bias pointer is handed to MatMul for every group;
  // if MatMul indexes the bias by row of out_slice, groups > 1 would need an
  // offset of g * out_step - confirm against math::MatMul.
  float *bias_data = bias.data<float>();

  math::Vol2ColFunctor<CPU, float> vol2col;
  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;

  for (int32_t i = 0; i < batch_size; ++i) {
    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);

    for (int32_t g = 0; g < groups; ++g) {
      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);

      if (!is_expand) {
        col_matrix = in_slice;
        col_matrix.Resize(col_matrix_shape);
      } else if (data_dim == 2U) {
        // im2col
        im2col(in_slice, dilations, strides,
               std::vector<int32_t>{paddings[0], paddings[1], paddings[0],
                                    paddings[1]},
               &col);
      } else if (data_dim == 3U) {
        // vol2col
        vol2col(in_slice, dilations, strides, paddings, &col);
      }

      // gemm
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
      math::MatMul<float, float>(filter_slice, false, col_matrix, false, alpha,
                                 &out_slice, beta, true, bias_data);
    }
  }
}
template
<
typename
ParamType
>
void
ConvBNReluBasic
(
const
ParamType
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
new_bias
=
*
param
.
NewBias
();
Tensor
new_scale
=
*
param
.
NewScale
();
Tensor
*
output
=
param
.
Output
();
output
->
mutable_data
<
float
>
();
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
math
::
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col_matrix
=
in_slice
;
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
math
::
MatMulWithBn
(
filter_slice
,
false
,
col_matrix
,
false
,
static_cast
<
float
>
(
1
),
&
out_slice
,
static_cast
<
float
>
(
0
),
true
,
&
new_scale
,
&
new_bias
,
g
);
}
}
}
}
// namespace operators
}
// namespace operators
}
// namespace paddle_mobile
}
// namespace paddle_mobile
...
...
src/operators/kernel/central-arm-func/conv_bn_add_relu_arm_func.h
已删除
100644 → 0
浏览文件 @
42e520bb
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNADDRELU_OP
#pragma once
#include <vector>
#include "operators/math/depthwise_conv3x3.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
// Float convolution whose GEMM stage is math::MatMulWithBn, fed with the
// folded batch-norm tensors param.NewScale() / param.NewBias() plus a
// per-element addend taken from param.Bias().  The addend is sliced per batch
// item and per group with the same shape as the output.
//
// The input is expanded with im2col (2 spatial dims) or vol2col (3 spatial
// dims) when math::IsExpand reports that this is necessary.  The result is
// written into param.Output(), whose buffer is allocated here as float.
//
// Declared `inline` because this is a non-template function defined in a
// header: without it every translation unit including the header would emit
// its own definition and the link would fail with duplicate symbols.
inline void ConvBNAddReluBasic(const FusionConvBNAddReluParam<CPU> &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();
  Tensor new_bias = *param.NewBias();
  Tensor new_scale = *param.NewScale();
  Tensor *bias1 = param.Bias();
  Tensor *output = param.Output();
  output->mutable_data<float>();

  int groups = param.Groups();
  std::vector<int> strides = param.Strides();
  std::vector<int> paddings = param.Paddings();
  std::vector<int> dilations = param.Dilations();

  const int batch_size = static_cast<int>(input->dims()[0]);

  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));

  // Column buffer shape: {in_channels / groups, kernel dims..., output dims...}.
  size_t data_dim = filter_shape_vec.size() - 2;
  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
  col_shape_vec[0] = input->dims()[1] / groups;
  for (size_t j = 0; j < data_dim; ++j) {
    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
  }
  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
  framework::DDim col_matrix_shape =
      framework::flatten_to_2d(col_shape, data_dim + 1);

  bool is_expand =
      math::IsExpand(filter_shape_vec, strides, paddings, dilations);

  // The column buffer is only allocated when an expansion will take place.
  Tensor col;
  Tensor col_matrix;
  if (is_expand) {
    col.mutable_data<float>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }

  framework::DDim input_shape = framework::slice_ddim(
      input->dims(), 1, static_cast<int>(input->dims().size()));

  // View the local filter copy as {out_channels, everything else}.
  framework::DDim filter_matrix_shape = {filter.dims()[0],
                                         filter.numel() / filter.dims()[0]};
  filter.Resize(filter_matrix_shape);

  framework::DDim output_matrix_shape = {
      output->dims()[1],
      output->numel() / (output->dims()[0] * output->dims()[1])};

  // convolution operator: im2col(or vol2col) + gemm
  int in_step = static_cast<int>(input->dims()[1]) / groups;
  int out_step = static_cast<int>(output->dims()[1]) / groups;

  math::Vol2ColFunctor<CPU, float> vol2col;
  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;

  for (int i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
    Tensor bias_batch = bias1->Slice(i, i + 1).Resize(output_matrix_shape);

    for (int g = 0; g < groups; g++) {
      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);

      if (!is_expand) {
        // No expansion: the input slice itself is the column matrix.
        col.ShareDataWith(in_slice);
        col_matrix.ShareDataWith(col);
        col_matrix.Resize(col_matrix_shape);
      } else if (data_dim == 2U) {
        // im2col
        im2col(in_slice, dilations, strides,
               std::vector<int>{paddings[0], paddings[1], paddings[0],
                                paddings[1]},
               &col);
      } else if (data_dim == 3U) {
        // vol2col
        vol2col(in_slice, dilations, strides, paddings, &col);
      }

      // gemm
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
      Tensor bias_data = bias_batch.Slice(g * out_step, (g + 1) * out_step);
      math::MatMulWithBn(filter_slice, false, col_matrix, false,
                         static_cast<float>(1), &out_slice,
                         static_cast<float>(1), true, &new_scale, &new_bias, g,
                         bias_data.data<float>());
    }
  }
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/conv_add_add_prelu_kernel.h
已删除
100644 → 0
浏览文件 @
42e520bb
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_CONVADDADDPRELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
using
framework
::
DDim
;
using
framework
::
OpKernelBase
;
// Kernel interface for the fused conv + add + add + PReLU operator.
//
// DeviceType selects both the OpKernelBase specialisation and the parameter
// type, FusionConvAddAddPReluParam<DeviceType>.  Only declarations live in
// this class; the member definitions are not part of this header.
template <typename DeviceType, typename T>
class ConvAddAddPReluKernel
    : public OpKernelBase<DeviceType, FusionConvAddAddPReluParam<DeviceType>> {
 public:
  // Takes the parameter object by pointer and reports a bool status.
  bool Init(FusionConvAddAddPReluParam<DeviceType> *param);

  // Runs the kernel on the given (read-only) parameter object.
  void Compute(const FusionConvAddAddPReluParam<DeviceType> &param);
};
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/conv_add_bn_kernel.h
浏览文件 @
dd575b09
...
@@ -19,7 +19,6 @@ limitations under the License. */
...
@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include <vector>
#include "framework/ddim.h"
#include "framework/ddim.h"
#include "framework/operator.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/math/vol2col.h"
...
...
src/operators/kernel/conv_add_bn_relu_kernel.h
浏览文件 @
dd575b09
...
@@ -19,7 +19,6 @@ limitations under the License. */
...
@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include <vector>
#include "framework/ddim.h"
#include "framework/ddim.h"
#include "framework/operator.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/math/vol2col.h"
...
...
src/operators/kernel/conv_add_kernel.h
浏览文件 @
dd575b09
...
@@ -23,7 +23,6 @@ limitations under the License. */
...
@@ -23,7 +23,6 @@ limitations under the License. */
#include "common/common.h"
#include "common/common.h"
#include "framework/ddim.h"
#include "framework/ddim.h"
#include "framework/operator.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/depthwise_conv3x3.h"
#include "operators/math/depthwise_conv3x3.h"
#include "operators/math/im2col.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/math_function.h"
...
...
src/operators/kernel/conv_add_prelu_kernel.h
已删除
100644 → 0
浏览文件 @
42e520bb
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_CONVADDPRELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
using
framework
::
DDim
;
using
framework
::
OpKernelBase
;
/**
 * Kernel interface whose parameter bundle is
 * FusionConvAddPReluParam<DeviceType>.
 *
 * Only declarations live here; the definitions of Init() and Compute() are
 * supplied elsewhere (not visible from this header).
 *
 * @tparam DeviceType  Device tag forwarded to OpKernelBase and to the
 *                     parameter type.
 * @tparam T           Second template parameter; it is not referenced by the
 *                     declarations in this class.
 */
template <typename DeviceType, typename T>
class ConvAddPReluKernel
    : public OpKernelBase<DeviceType, FusionConvAddPReluParam<DeviceType>> {
 public:
  /// Receives a mutable pointer to the parameter bundle and reports a bool.
  bool Init(FusionConvAddPReluParam<DeviceType> *param);

  /// Receives the parameter bundle by const reference; returns nothing.
  void Compute(const FusionConvAddPReluParam<DeviceType> &param);
};
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/conv_add_relu_kernel.h
浏览文件 @
dd575b09
...
@@ -19,7 +19,6 @@ limitations under the License. */
...
@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include <vector>
#include "framework/ddim.h"
#include "framework/ddim.h"
#include "framework/operator.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/math/vol2col.h"
...
...
src/operators/kernel/conv_bn_add_relu_kernel.h
浏览文件 @
dd575b09
...
@@ -19,7 +19,6 @@ limitations under the License. */
...
@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include <vector>
#include "framework/ddim.h"
#include "framework/ddim.h"
#include "framework/operator.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/math/vol2col.h"
...
...
src/operators/kernel/conv_bn_kernel.h
浏览文件 @
dd575b09
...
@@ -19,7 +19,6 @@ limitations under the License. */
...
@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include <vector>
#include "framework/ddim.h"
#include "framework/ddim.h"
#include "framework/operator.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/math/vol2col.h"
...
...
src/operators/kernel/conv_bn_relu_kernel.h
浏览文件 @
dd575b09
...
@@ -19,7 +19,6 @@ limitations under the License. */
...
@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include <vector>
#include "framework/ddim.h"
#include "framework/ddim.h"
#include "framework/operator.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/math/vol2col.h"
...
...
src/operators/kernel/dwconv_bn_relu_kernel.h
浏览文件 @
dd575b09
...
@@ -19,7 +19,6 @@ limitations under the License. */
...
@@ -19,7 +19,6 @@ limitations under the License. */
#include <vector>
#include <vector>
#include "framework/ddim.h"
#include "framework/ddim.h"
#include "framework/operator.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/math/vol2col.h"
...
...
src/operators/kernel/lrn_kernel.h
浏览文件 @
dd575b09
...
@@ -15,24 +15,21 @@ limitations under the License. */
...
@@ -15,24 +15,21 @@ limitations under the License. */
#pragma once
#pragma once
#ifdef LRN_OP
#ifdef LRN_OP
#include <cmath>
#ifdef _OPENMP
#ifdef _OPENMP
#include <omp.h>
#include <omp.h>
#endif
#endif
#include "framework/operator.h"
#include "operators/op_param.h"
#include <cmath>
#ifdef __ARM_NEON
#ifdef __ARM_NEON
#include
"arm_neon.h"
#include
<arm_neon.h>
#include "operators/math/math
_func_neon
.h"
#include "operators/math/math.h"
#endif
#endif
#include "framework/operator.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
using
namespace
framework
;
template
<
typename
T
>
template
<
typename
T
>
struct
LRNFunctor
{
struct
LRNFunctor
{
void
operator
()(
const
framework
::
Tensor
&
input
,
framework
::
Tensor
*
out
,
int
N
,
void
operator
()(
const
framework
::
Tensor
&
input
,
framework
::
Tensor
*
out
,
int
N
,
...
...
src/operators/math/activation.h
浏览文件 @
dd575b09
...
@@ -21,7 +21,7 @@ limitations under the License. */
...
@@ -21,7 +21,7 @@ limitations under the License. */
#include "common/types.h"
#include "common/types.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#include <arm_neon.h>
#include "operators/math/math
_func_neon
.h"
#include "operators/math/math.h"
#endif
#endif
namespace
paddle_mobile
{
namespace
paddle_mobile
{
...
...
src/operators/math/c
onv_func
.h
→
src/operators/math/c
hannel_wise
.h
浏览文件 @
dd575b09
...
@@ -14,91 +14,16 @@ limitations under the License. */
...
@@ -14,91 +14,16 @@ limitations under the License. */
#pragma once
#pragma once
#include <vector>
#include "framework/tensor.h"
#include "operators/math/activation.h"
#ifdef __ARM_NEON
#ifdef __ARM_NEON
#include <arm_neon.h>
#include <arm_neon.h>
#endif
#endif
#include "framework/ddim.h"
#include "framework/tensor.h"
#include "operators/math/activation.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
using
framework
::
DDim
;
using
framework
::
Tensor
;
/**
 * Output extent of a convolution along a single axis.
 *
 * @param input_size   Extent of the input along the axis.
 * @param filter_size  Extent of the filter along the axis.
 * @param dilation     Spacing between filter taps.
 * @param padding      Padding applied to each side of the input.
 * @param stride       Step between consecutive filter positions.
 * @return (input_size + 2 * padding - dilated_filter) / stride + 1, where the
 *         division is C++ integer division (truncates toward zero).
 */
inline int ConvOutputSize(int input_size, int filter_size, int dilation,
                          int padding, int stride) {
  // Extent covered by the filter once the dilation gaps are included.
  const int dilated_filter = dilation * (filter_size - 1) + 1;
  const int padded_input = input_size + 2 * padding;
  return (padded_input - dilated_filter) / stride + 1;
}
/**
 * Resize `bias` to `dDim` and fill it, one run of `inner_size` floats at a
 * time, with values read from the tensor's original float buffer.
 *
 * `outer_size` is the product of dDim[0 .. axis] and `inner_size` the product
 * of dDim[axis + 1 ..]. For each outer index i, the value stored is
 * source[i * dDim[axis] / outer_size] (integer arithmetic).
 *
 * NOTE(review): the source pointer is obtained before Resize()/mutable_data()
 * are called on the same tensor and is read while the tensor is being
 * written. Whether that is safe depends on how Tensor manages its storage --
 * confirm against the Tensor implementation.
 *
 * @param bias  Tensor to expand in place; enforced to have exactly one dim.
 * @param axis  Axis of `dDim` used to split outer and inner extents.
 * @param dDim  Target dimensions for `bias`.
 */
inline void expand_bias(Tensor &bias, int axis, const DDim &dDim) {  // NOLINT
  const auto src = bias.data<float>();
  const DDim bias_ddim = bias.dims();
  PADDLE_MOBILE_ENFORCE(bias.dims().size() == 1,
                        "the bias tensor's dims size != 1")

  DDim leading_dims = paddle_mobile::framework::slice_ddim(dDim, 0, axis + 1);
  DDim trailing_dims =
      paddle_mobile::framework::slice_ddim(dDim, axis + 1, dDim.size());
  int outer_size = paddle_mobile::framework::product(leading_dims);
  int inner_size = paddle_mobile::framework::product(trailing_dims);

  bias.Resize(dDim);
  auto dst = bias.mutable_data<float>();
  int axis_size = dDim[axis];

#ifdef __ARM_NEON
  // Each vectorised step writes 16 floats (four 4-lane stores); whatever is
  // left of the run is written one float at a time.
  const int blocks_of_16 = inner_size >> 4;
  const int tail = inner_size - (blocks_of_16 << 4);
  for (int row = 0; row < outer_size; ++row) {
    const float value = src[row * axis_size / outer_size];
    const float32x4_t splat = vdupq_n_f32(value);
    for (int blk = 0; blk < blocks_of_16; ++blk) {
      vst1q_f32(dst, splat);
      vst1q_f32(dst + 4, splat);
      vst1q_f32(dst + 8, splat);
      vst1q_f32(dst + 12, splat);
      dst += 16;
    }
    for (int t = 0; t < tail; ++t) {
      *dst++ = value;
    }
  }
#else
  for (int row = 0; row < outer_size; ++row) {
    const float value = src[row * axis_size / outer_size];
    auto row_out = dst + row * inner_size;
    for (int col = 0; col < inner_size; ++col) {
      row_out[col] = value;
    }
  }
#endif
}
/**
 * Report whether any checked dimension deviates from the trivial
 * configuration (filter extent 1, stride 1, padding 0, dilation 1).
 *
 * One entry is inspected per element of `strides`; the matching filter extent
 * is read from `filter_dim[j + 2]` after a cast to int.
 *
 * @param filter_dim  Filter dimensions; entries from index 2 onward are used.
 * @param strides     Per-dimension strides; its size sets the loop bound.
 * @param paddings    Per-dimension paddings, indexed like `strides`.
 * @param dilations   Per-dimension dilations, indexed like `strides`.
 * @return false when every inspected dimension is trivial (including when
 *         `strides` is empty), true otherwise.
 */
inline bool IsExpand(const std::vector<int64_t> &filter_dim,
                     const std::vector<int> &strides,
                     const std::vector<int> &paddings,
                     const std::vector<int> &dilations) {
  const size_t checked_dims = strides.size();
  for (size_t d = 0; d < checked_dims; ++d) {
    const bool trivial = static_cast<int>(filter_dim[d + 2]) == 1 &&
                         strides[d] == 1 && paddings[d] == 0 &&
                         dilations[d] == 1;
    if (!trivial) {
      return true;
    }
  }
  return false;
}
template
<
ActivationType
Act
>
template
<
ActivationType
Act
>
void
AddChannelWise
(
const
framework
::
Tensor
*
input
,
void
AddChannelWise
(
const
framework
::
Tensor
*
input
,
const
framework
::
Tensor
*
bias
,
framework
::
Tensor
*
output
)
{
const
framework
::
Tensor
*
bias
,
framework
::
Tensor
*
output
)
{
...
...
src/operators/math/depthwise_conv3x3.h
浏览文件 @
dd575b09
...
@@ -17,7 +17,6 @@ limitations under the License. */
...
@@ -17,7 +17,6 @@ limitations under the License. */
#include <algorithm>
#include <algorithm>
#include <vector>
#include <vector>
#include "framework/tensor.h"
#include "framework/tensor.h"
#include "operators/math/conv_func.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
...
src/operators/math/depthwise_conv3x3_int8.cpp
浏览文件 @
dd575b09
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#if defined(__ARM_NEON__)
&& !defined(__aarch64__
)
#if defined(__ARM_NEON__)
|| defined(__ARM_NEON
)
#include <arm_neon.h>
#include <arm_neon.h>
#include "operators/math/depthwise_conv3x3.h"
#include "operators/math/depthwise_conv3x3.h"
...
@@ -70,7 +70,6 @@ inline void DepthwiseConv3x3NormalRow(const int8_t *input, const int8_t *filter,
...
@@ -70,7 +70,6 @@ inline void DepthwiseConv3x3NormalRow(const int8_t *input, const int8_t *filter,
DEPTHWISE_CONV_NORMAL_BORDER
(
0
,
valid_w_start
)
DEPTHWISE_CONV_NORMAL_BORDER
(
0
,
valid_w_start
)
// middle
// middle
int
remain_start
=
valid_w_start
;
int
remain_start
=
valid_w_start
;
#ifdef __ARM_NEON__
int
output_tiles
=
(
valid_w_end
-
valid_w_start
)
/
6
;
int
output_tiles
=
(
valid_w_end
-
valid_w_start
)
/
6
;
remain_start
=
valid_w_start
+
output_tiles
*
6
;
remain_start
=
valid_w_start
+
output_tiles
*
6
;
int32x4_t
_sum0
,
_sum1
;
int32x4_t
_sum0
,
_sum1
;
...
@@ -94,7 +93,6 @@ inline void DepthwiseConv3x3NormalRow(const int8_t *input, const int8_t *filter,
...
@@ -94,7 +93,6 @@ inline void DepthwiseConv3x3NormalRow(const int8_t *input, const int8_t *filter,
vst1q_s32
(
output_ptr
+
output_offset
,
_sum0
);
vst1q_s32
(
output_ptr
+
output_offset
,
_sum0
);
vst1_s32
(
output_ptr
+
output_offset
+
4
,
vget_low_s32
(
_sum1
));
vst1_s32
(
output_ptr
+
output_offset
+
4
,
vget_low_s32
(
_sum1
));
}
}
#endif // __ARM_NEON__
for
(
int
w
=
remain_start
;
w
<
valid_w_end
;
++
w
)
{
for
(
int
w
=
remain_start
;
w
<
valid_w_end
;
++
w
)
{
int32_t
value
=
0
;
int32_t
value
=
0
;
int
input_start
=
-
padding_w
+
w
*
Stride_w
;
int
input_start
=
-
padding_w
+
w
*
Stride_w
;
...
@@ -215,6 +213,8 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
...
@@ -215,6 +213,8 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
output_ptr2
+=
valid_w_start
;
output_ptr2
+=
valid_w_start
;
output_ptr3
+=
valid_w_start
;
output_ptr3
+=
valid_w_start
;
}
}
#if __aarch64__
#else
// valid
// valid
int
loop
=
output_w_tiles
;
int
loop
=
output_w_tiles
;
asm
volatile
(
asm
volatile
(
...
@@ -525,6 +525,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
...
@@ -525,6 +525,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
:
[
remain
]
"r"
(
output_w_remain
),
[
ker0
]
"w"
(
_ker0
),
[
ker1
]
"w"
(
_ker1
)
:
[
remain
]
"r"
(
output_w_remain
),
[
ker0
]
"w"
(
_ker0
),
[
ker1
]
"w"
(
_ker1
)
:
"cc"
,
"memory"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
,
:
"cc"
,
"memory"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
,
"q14"
,
"q15"
,
"r0"
);
"q12"
,
"q13"
,
"q14"
,
"q15"
,
"r0"
);
#endif // __aarch64__
// pad right
// pad right
if
(
padding_w
)
{
if
(
padding_w
)
{
int16x4_t
row0
=
vget_low_s16
(
vmovl_s8
(
vld1_s8
(
input_ptr0
-
2
)));
int16x4_t
row0
=
vget_low_s16
(
vmovl_s8
(
vld1_s8
(
input_ptr0
-
2
)));
...
@@ -619,6 +620,8 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
...
@@ -619,6 +620,8 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
output_ptr1
+=
valid_w_start
;
output_ptr1
+=
valid_w_start
;
}
}
// valid
// valid
#if __aarch64__
#else
int
loop
=
output_w_tiles
;
int
loop
=
output_w_tiles
;
asm
volatile
(
asm
volatile
(
"cmp %[loop], #0
\n
"
"cmp %[loop], #0
\n
"
...
@@ -804,6 +807,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
...
@@ -804,6 +807,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
:
[
remain
]
"r"
(
output_w_remain
),
[
ker0
]
"w"
(
_ker0
),
[
ker1
]
"w"
(
_ker1
)
:
[
remain
]
"r"
(
output_w_remain
),
[
ker0
]
"w"
(
_ker0
),
[
ker1
]
"w"
(
_ker1
)
:
"cc"
,
"memory"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
,
:
"cc"
,
"memory"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
,
"q14"
,
"q15"
,
"r0"
);
"q12"
,
"q13"
,
"q14"
,
"q15"
,
"r0"
);
#endif // __aarch64__
// pad right
// pad right
if
(
padding_w
)
{
if
(
padding_w
)
{
int16x4_t
row0
=
vget_low_s16
(
vmovl_s8
(
vld1_s8
(
input_ptr0
-
2
)));
int16x4_t
row0
=
vget_low_s16
(
vmovl_s8
(
vld1_s8
(
input_ptr0
-
2
)));
...
@@ -870,6 +874,8 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
...
@@ -870,6 +874,8 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
output_ptr0
+=
valid_w_start
;
output_ptr0
+=
valid_w_start
;
}
}
// valid
// valid
#if __aarch64__
#else
int
loop
=
output_w_tiles
;
int
loop
=
output_w_tiles
;
asm
volatile
(
asm
volatile
(
"cmp %[loop], #0
\n
"
"cmp %[loop], #0
\n
"
...
@@ -993,6 +999,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
...
@@ -993,6 +999,7 @@ void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
:
[
remain
]
"r"
(
output_w_remain
),
[
ker0
]
"w"
(
_ker0
),
[
ker1
]
"w"
(
_ker1
)
:
[
remain
]
"r"
(
output_w_remain
),
[
ker0
]
"w"
(
_ker0
),
[
ker1
]
"w"
(
_ker1
)
:
"cc"
,
"memory"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
,
:
"cc"
,
"memory"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
,
"q14"
,
"q15"
,
"r0"
);
"q12"
,
"q13"
,
"q14"
,
"q15"
,
"r0"
);
#endif // __aarch64__
// pad right
// pad right
if
(
padding_w
)
{
if
(
padding_w
)
{
int16x4_t
row0
=
vget_low_s16
(
vmovl_s8
(
vld1_s8
(
input_ptr0
-
2
)));
int16x4_t
row0
=
vget_low_s16
(
vmovl_s8
(
vld1_s8
(
input_ptr0
-
2
)));
...
@@ -1153,6 +1160,8 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
...
@@ -1153,6 +1160,8 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
output_ptr2
+=
valid_w_start
;
output_ptr2
+=
valid_w_start
;
}
}
// valid
// valid
#if __aarch64__
#else
int
loop
=
output_w_tiles
;
int
loop
=
output_w_tiles
;
asm
volatile
(
asm
volatile
(
"cmp %[loop], #0
\n
"
"cmp %[loop], #0
\n
"
...
@@ -1411,6 +1420,7 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
...
@@ -1411,6 +1420,7 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
:
[
remain
]
"r"
(
output_w_remain
),
[
ker0
]
"w"
(
_ker0
),
[
ker1
]
"w"
(
_ker1
)
:
[
remain
]
"r"
(
output_w_remain
),
[
ker0
]
"w"
(
_ker0
),
[
ker1
]
"w"
(
_ker1
)
:
"cc"
,
"memory"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
,
:
"cc"
,
"memory"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
,
"q14"
,
"q15"
,
"r0"
);
"q12"
,
"q13"
,
"q14"
,
"q15"
,
"r0"
);
#endif // __aarch64__
// pad right
// pad right
if
(
padding_w
>
0
)
{
if
(
padding_w
>
0
)
{
int16x4_t
row0
=
vget_low_s16
(
vmovl_s8
(
vld1_s8
(
input_ptr0
)));
int16x4_t
row0
=
vget_low_s16
(
vmovl_s8
(
vld1_s8
(
input_ptr0
)));
...
@@ -1491,6 +1501,8 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
...
@@ -1491,6 +1501,8 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
output_ptr0
+=
valid_w_start
;
output_ptr0
+=
valid_w_start
;
}
}
// valid
// valid
#if __aarch64__
#else
int
loop
=
output_w_tiles
;
int
loop
=
output_w_tiles
;
asm
volatile
(
asm
volatile
(
"cmp %[loop], #0
\n
"
"cmp %[loop], #0
\n
"
...
@@ -1608,6 +1620,7 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
...
@@ -1608,6 +1620,7 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
:
[
remain
]
"r"
(
output_w_remain
),
[
ker0
]
"w"
(
_ker0
),
[
ker1
]
"w"
(
_ker1
)
:
[
remain
]
"r"
(
output_w_remain
),
[
ker0
]
"w"
(
_ker0
),
[
ker1
]
"w"
(
_ker1
)
:
"cc"
,
"memory"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
,
:
"cc"
,
"memory"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
,
"q11"
,
"q12"
,
"q13"
,
"q14"
,
"q15"
,
"r0"
);
"q12"
,
"q13"
,
"q14"
,
"q15"
,
"r0"
);
#endif // __aarch64__
// pad right
// pad right
if
(
padding_w
>
0
)
{
if
(
padding_w
>
0
)
{
int16x4_t
row0
=
vget_low_s16
(
vmovl_s8
(
vld1_s8
(
input_ptr0
)));
int16x4_t
row0
=
vget_low_s16
(
vmovl_s8
(
vld1_s8
(
input_ptr0
)));
...
@@ -1645,4 +1658,4 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
...
@@ -1645,4 +1658,4 @@ void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
}
// namespace operators
}
// namespace operators
}
// namespace paddle_mobile
}
// namespace paddle_mobile
#endif
#endif
// __ARM_NEON__
src/operators/math/depthwise_conv3x3_int8_arm64.cpp
已删除
100644 → 0
浏览文件 @
42e520bb
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(__ARM_NEON__) && defined(__aarch64__)
#include "operators/math/depthwise_conv3x3.h"
#ifdef __ARM_NEON__
#include <arm_neon.h>
#endif
namespace
paddle_mobile
{
namespace
operators
{
namespace
math
{
// template<>
// void DepthwiseConv3x3<int8_t, int32_t>(
// const framework::Tensor *input, const framework::Tensor *filter,
// const std::vector<int> &strides, framework::Tensor *output) {
// PADDLE_MOBILE_THROW_EXCEPTION(
// "Depthwise conv with generic strides has not been implemented.");
// }
/**
 * int8 -> int32 specialization of DepthwiseConv3x3S1 for this translation
 * unit (compiled only when both __ARM_NEON__ and __aarch64__ are defined).
 *
 * It performs no computation: the body only invokes
 * PADDLE_MOBILE_THROW_EXCEPTION with a "not implemented" message, and none of
 * the arguments are read.
 */
template <>
void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
                                         const framework::Tensor &filter,
                                         const std::vector<int> &paddings,
                                         framework::Tensor *output) {
  PADDLE_MOBILE_THROW_EXCEPTION(
      "Depthwise conv3x3 with stride 1 for arm v8 has not been implemented.");
}
/**
 * int8 -> int32 specialization of DepthwiseConv3x3S2 for this translation
 * unit (compiled only when both __ARM_NEON__ and __aarch64__ are defined).
 *
 * It performs no computation: the body only invokes
 * PADDLE_MOBILE_THROW_EXCEPTION with a "not implemented" message, and none of
 * the arguments are read.
 */
template <>
void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
                                         const framework::Tensor &filter,
                                         const std::vector<int> &paddings,
                                         framework::Tensor *output) {
  PADDLE_MOBILE_THROW_EXCEPTION(
      "Depthwise conv3x3 with stride 2 for arm v8 has not been implemented.");
}
}
// namespace math
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/math/depthwise_conv5x5.h
浏览文件 @
dd575b09
...
@@ -17,7 +17,6 @@ limitations under the License. */
...
@@ -17,7 +17,6 @@ limitations under the License. */
#include <algorithm>
#include <algorithm>
#include <vector>
#include <vector>
#include "framework/tensor.h"
#include "framework/tensor.h"
#include "operators/math/conv_func.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
...
src/operators/math/gemm/pack_kernel.h
浏览文件 @
dd575b09
...
@@ -31,345 +31,239 @@ inline float32x4_t vandq_f32_u32(float32x4_t x, uint32x4_t mask) {
...
@@ -31,345 +31,239 @@ inline float32x4_t vandq_f32_u32(float32x4_t x, uint32x4_t mask) {
void
pack_lhs_6r
(
const
int
m
,
const
int
k
,
const
float
*
A
,
const
int
lda
,
void
pack_lhs_6r
(
const
int
m
,
const
int
k
,
const
float
*
A
,
const
int
lda
,
float
*
output
,
const
bool
unroll
)
{
float
*
output
,
const
bool
unroll
)
{
float
*
zero
=
new
float
[
k
];
uint32_t
mask
[
8
]
=
{
0
,
1
,
2
,
3
,
4
,
5
,
4
,
5
};
memset
(
zero
,
0
,
k
*
sizeof
(
float
));
int
remain_k
=
k
&
0x3
;
uint32x4_t
vzero
=
vdupq_n_u32
(
0
);
uint32x4_t
vmask1
=
vcltq_u32
(
vld1q_u32
(
mask
),
vdupq_n_u32
(
remain_k
));
const
int
m_tail
=
m
%
6
;
#pragma omp parallel for if (unroll)
const
int
i_length
=
m
-
m_tail
;
for
(
int
i
=
0
;
i
<
m
-
5
;
i
+=
6
)
{
for
(
int
i
=
0
;
i
<
i_length
;
i
+=
6
)
{
const
float
*
a0
=
A
+
i
*
lda
;
const
float
*
a0
=
A
+
i
*
lda
;
const
float
*
a1
=
A
+
(
i
+
1
)
*
lda
;
const
float
*
a1
=
A
+
(
i
+
1
)
*
lda
;
const
float
*
a2
=
A
+
(
i
+
2
)
*
lda
;
const
float
*
a2
=
A
+
(
i
+
2
)
*
lda
;
const
float
*
a3
=
A
+
(
i
+
3
)
*
lda
;
const
float
*
a3
=
A
+
(
i
+
3
)
*
lda
;
const
float
*
a4
=
A
+
(
i
+
4
)
*
lda
;
const
float
*
a4
=
A
+
(
i
+
4
)
*
lda
;
const
float
*
a5
=
A
+
(
i
+
5
)
*
lda
;
const
float
*
a5
=
A
+
(
i
+
5
)
*
lda
;
float
*
local_buffer
=
output
+
i
*
k
;
float
*
out_ptr
=
output
+
i
*
k
;
for
(
int
j
=
0
;
j
<
k
;
++
j
)
{
*
local_buffer
++
=
*
a0
++
;
int
loops
=
k
>>
2
;
*
local_buffer
++
=
*
a1
++
;
if
(
loops
>
0
)
{
*
local_buffer
++
=
*
a2
++
;
#if __aarch64__
*
local_buffer
++
=
*
a3
++
;
for
(
int
l
=
0
;
l
<
loops
;
++
l
)
{
*
local_buffer
++
=
*
a4
++
;
float32x4_t
_d0
=
vld1q_f32
(
a0
);
*
local_buffer
++
=
*
a5
++
;
float32x4_t
_d1
=
vld1q_f32
(
a1
);
}
float32x4_t
_d2
=
vld1q_f32
(
a2
);
}
float32x4_t
_d3
=
vld1q_f32
(
a3
);
if
(
m_tail
!=
0
)
{
float32x4_t
_d4
=
vld1q_f32
(
a4
);
const
float
*
a0
=
A
+
i_length
*
lda
;
float32x4_t
_d5
=
vld1q_f32
(
a5
);
float32x4x2_t
_q0
=
vtrnq_f32
(
_d0
,
_d1
);
float32x4x2_t
_q1
=
vtrnq_f32
(
_d2
,
_d3
);
float32x4x2_t
_q3
=
vtrnq_f32
(
_d4
,
_d5
);
_d0
=
vcombine_f32
(
vget_low_f32
(
_q0
.
val
[
0
]),
vget_low_f32
(
_q1
.
val
[
0
]));
_d1
=
vcombine_f32
(
vget_low_f32
(
_q0
.
val
[
1
]),
vget_low_f32
(
_q1
.
val
[
1
]));
_d2
=
vcombine_f32
(
vget_high_f32
(
_q0
.
val
[
0
]),
vget_high_f32
(
_q1
.
val
[
0
]));
_d3
=
vcombine_f32
(
vget_high_f32
(
_q0
.
val
[
1
]),
vget_high_f32
(
_q1
.
val
[
1
]));
vst1q_f32
(
out_ptr
,
_d0
);
vst1_f32
(
out_ptr
+
4
,
vget_low_f32
(
_q3
.
val
[
0
]));
vst1q_f32
(
out_ptr
+
6
,
_d1
);
vst1_f32
(
out_ptr
+
10
,
vget_low_f32
(
_q3
.
val
[
1
]));
vst1q_f32
(
out_ptr
+
12
,
_d2
);
vst1_f32
(
out_ptr
+
16
,
vget_high_f32
(
_q3
.
val
[
0
]));
vst1q_f32
(
out_ptr
+
18
,
_d3
);
vst1_f32
(
out_ptr
+
22
,
vget_high_f32
(
_q3
.
val
[
1
]));
a0
+=
4
;
a1
+=
4
;
a2
+=
4
;
a3
+=
4
;
a4
+=
4
;
a5
+=
4
;
out_ptr
+=
24
;
}
#else
asm
volatile
(
"loop_4k_%=:
\n
"
"vld1.32 {d0-d1}, [%[a0]]!
\n
"
"vld1.32 {d2-d3}, [%[a1]]!
\n
"
"vld1.32 {d4-d5}, [%[a2]]!
\n
"
"vld1.32 {d6-d7}, [%[a3]]!
\n
"
"vld1.32 {d8-d9}, [%[a4]]!
\n
"
"vld1.32 {d10-d11}, [%[a5]]!
\n
"
"vtrn.32 q0, q1
\n
"
"vtrn.32 q2, q3
\n
"
"vtrn.32 q4, q5
\n
"
"vswp.32 d1, d4
\n
"
"vswp.32 d3, d6
\n
"
"vst1.32 {q0}, [%[out]]!
\n
"
"vst1.32 {d8}, [%[out]]!
\n
"
"vst1.32 {q1}, [%[out]]!
\n
"
"vst1.32 {d10}, [%[out]]!
\n
"
"vst1.32 {q2}, [%[out]]!
\n
"
"vst1.32 {d9}, [%[out]]!
\n
"
"vst1.32 {q3}, [%[out]]!
\n
"
"vst1.32 {d11}, [%[out]]!
\n
"
"subs %[loops], #1
\n
"
"bne loop_4k_%=
\n
"
:
[
out
]
"+r"
(
out_ptr
),
[
a0
]
"+r"
(
a0
),
[
a1
]
"+r"
(
a1
),
[
a2
]
"+r"
(
a2
),
[
a3
]
"+r"
(
a3
),
[
a4
]
"+r"
(
a4
),
[
a5
]
"+r"
(
a5
),
[
loops
]
"+r"
(
loops
)
:
:
"cc"
,
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
);
#endif
}
if
(
remain_k
>
0
)
{
float32x4_t
_d0
=
vld1q_f32
(
a0
);
float32x4_t
_d1
=
vld1q_f32
(
a1
);
float32x4_t
_d2
=
vld1q_f32
(
a2
);
float32x4_t
_d3
=
vld1q_f32
(
a3
);
float32x4_t
_d4
=
vld1q_f32
(
a4
);
float32x4_t
_d5
=
vld1q_f32
(
a5
);
_d0
=
vandq_f32_u32
(
_d0
,
vmask1
);
_d1
=
vandq_f32_u32
(
_d1
,
vmask1
);
_d2
=
vandq_f32_u32
(
_d2
,
vmask1
);
_d3
=
vandq_f32_u32
(
_d3
,
vmask1
);
_d4
=
vandq_f32_u32
(
_d4
,
vmask1
);
_d5
=
vandq_f32_u32
(
_d5
,
vmask1
);
float32x4x2_t
_q0
=
vtrnq_f32
(
_d0
,
_d1
);
float32x4x2_t
_q1
=
vtrnq_f32
(
_d2
,
_d3
);
float32x4x2_t
_q3
=
vtrnq_f32
(
_d4
,
_d5
);
_d0
=
vcombine_f32
(
vget_low_f32
(
_q0
.
val
[
0
]),
vget_low_f32
(
_q1
.
val
[
0
]));
_d1
=
vcombine_f32
(
vget_low_f32
(
_q0
.
val
[
1
]),
vget_low_f32
(
_q1
.
val
[
1
]));
_d2
=
vcombine_f32
(
vget_high_f32
(
_q0
.
val
[
0
]),
vget_high_f32
(
_q1
.
val
[
0
]));
switch
(
remain_k
)
{
case
3
:
vst1q_f32
(
out_ptr
+
12
,
_d2
);
vst1_f32
(
out_ptr
+
16
,
vget_high_f32
(
_q3
.
val
[
0
]));
case
2
:
vst1q_f32
(
out_ptr
+
6
,
_d1
);
vst1_f32
(
out_ptr
+
10
,
vget_low_f32
(
_q3
.
val
[
1
]));
case
1
:
vst1q_f32
(
out_ptr
,
_d0
);
vst1_f32
(
out_ptr
+
4
,
vget_low_f32
(
_q3
.
val
[
0
]));
default:
break
;
}
}
}
int
remain_m
=
m
%
6
;
if
(
remain_m
)
{
int
remain_m_start
=
m
-
remain_m
;
const
float
*
a0
=
A
+
remain_m_start
*
lda
;
const
float
*
a1
=
a0
+
lda
;
const
float
*
a1
=
a0
+
lda
;
const
float
*
a2
=
a0
+
2
*
lda
;
const
float
*
a2
=
a0
+
2
*
lda
;
const
float
*
a3
=
a0
+
3
*
lda
;
const
float
*
a3
=
a0
+
3
*
lda
;
const
float
*
a4
=
a0
+
4
*
lda
;
const
float
*
a4
=
a0
+
4
*
lda
;
const
float
*
a5
=
a0
+
5
*
lda
;
const
float
*
a5
=
a0
+
5
*
lda
;
float
*
local_buffer
=
output
+
i_length
*
k
;
float
*
out_ptr
=
output
+
remain_m_start
*
k
;
switch
(
m_tail
)
{
uint32x4_t
vmask2
=
vcltq_u32
(
vld1q_u32
(
mask
),
vdupq_n_u32
(
remain_m
));
uint32x4_t
vmask3
=
vcltq_u32
(
vld1q_u32
(
mask
+
4
),
vdupq_n_u32
(
remain_m
));
const
float
zerobuff
[
4
]
=
{
0.
f
,
0.
f
,
0.
f
,
0.
f
};
int
lk
=
0
;
for
(;
lk
<
k
-
3
;
lk
+=
4
)
{
switch
(
remain_m
)
{
case
1
:
case
1
:
a1
=
zero
;
a1
=
zerobuff
;
case
2
:
case
2
:
a2
=
zero
;
a2
=
zerobuff
;
case
3
:
case
3
:
a3
=
zero
;
a3
=
zerobuff
;
case
4
:
case
4
:
a4
=
zero
;
a4
=
zerobuff
;
case
5
:
case
5
:
a5
=
zero
;
a5
=
zerobuff
;
break
;
default:
default:
break
;
break
;
}
}
for
(
int
j
=
0
;
j
<
k
;
++
j
)
{
#if __aarch64__
*
local_buffer
++
=
*
a0
++
;
float32x4_t
_d0
=
vld1q_f32
(
a0
);
*
local_buffer
++
=
*
a1
++
;
float32x4_t
_d1
=
vld1q_f32
(
a1
);
*
local_buffer
++
=
*
a2
++
;
float32x4_t
_d2
=
vld1q_f32
(
a2
);
*
local_buffer
++
=
*
a3
++
;
float32x4_t
_d3
=
vld1q_f32
(
a3
);
*
local_buffer
++
=
*
a4
++
;
float32x4_t
_d4
=
vld1q_f32
(
a4
);
*
local_buffer
++
=
*
a5
++
;
float32x4_t
_d5
=
vld1q_f32
(
a5
);
}
delete
[]
zero
;
float32x4x2_t
_q0
=
vtrnq_f32
(
_d0
,
_d1
);
}
float32x4x2_t
_q1
=
vtrnq_f32
(
_d2
,
_d3
);
float32x4x2_t
_q3
=
vtrnq_f32
(
_d4
,
_d5
);
// uint32_t mask[8] = {0, 1, 2, 3, 4, 5, 4, 5};
_d0
=
vcombine_f32
(
vget_low_f32
(
_q0
.
val
[
0
]),
vget_low_f32
(
_q1
.
val
[
0
]));
// int remain_k = k & 0x3;
_d1
=
vcombine_f32
(
vget_low_f32
(
_q0
.
val
[
1
]),
vget_low_f32
(
_q1
.
val
[
1
]));
// uint32x4_t vzero = vdupq_n_u32(0);
_d2
=
vcombine_f32
(
vget_high_f32
(
_q0
.
val
[
0
]),
vget_high_f32
(
_q1
.
val
[
0
]));
// uint32x4_t vmask1 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_k));
_d3
=
vcombine_f32
(
vget_high_f32
(
_q0
.
val
[
1
]),
vget_high_f32
(
_q1
.
val
[
1
]));
//
// std::cout << "m: " << m << ", k: " << k << std::endl;
_d0
=
vandq_f32_u32
(
_d0
,
vmask2
);
// #pragma omp parallel for if (unroll)
_d1
=
vandq_f32_u32
(
_d1
,
vmask2
);
// for (int i = 0; i < m - 5; i += 6) {
_d2
=
vandq_f32_u32
(
_d2
,
vmask2
);
// std::cout << "i: " << i << std::endl;
_d3
=
vandq_f32_u32
(
_d3
,
vmask2
);
// const float *a0 = A + i * lda;
_d4
=
vandq_f32_u32
(
_q3
.
val
[
0
],
vmask3
);
// const float *a1 = A + (i + 1) * lda;
_d5
=
vandq_f32_u32
(
_q3
.
val
[
1
],
vmask3
);
// const float *a2 = A + (i + 2) * lda;
// const float *a3 = A + (i + 3) * lda;
vst1q_f32
(
out_ptr
,
_d0
);
// const float *a4 = A + (i + 4) * lda;
vst1_f32
(
out_ptr
+
4
,
vget_low_f32
(
_d4
));
// const float *a5 = A + (i + 5) * lda;
vst1q_f32
(
out_ptr
+
6
,
_d1
);
// float *out_ptr = output + i * k;
vst1_f32
(
out_ptr
+
10
,
vget_low_f32
(
_d5
));
//
vst1q_f32
(
out_ptr
+
12
,
_d2
);
// int loops = k >> 2;
vst1_f32
(
out_ptr
+
16
,
vget_high_f32
(
_d4
));
// if (loops > 0) {
vst1q_f32
(
out_ptr
+
18
,
_d3
);
// #if __aarch64__
vst1_f32
(
out_ptr
+
22
,
vget_high_f32
(
_d5
));
// for (int l = 0; l < loops; ++l) {
// float32x4_t _d0 = vld1q_f32(a0);
out_ptr
+=
24
;
// float32x4_t _d1 = vld1q_f32(a1);
#else
// float32x4_t _d2 = vld1q_f32(a2);
asm
volatile
(
// float32x4_t _d3 = vld1q_f32(a3);
"vld1.32 {d0-d1}, [%[a0]]
\n
"
// float32x4_t _d4 = vld1q_f32(a4);
"vld1.32 {d2-d3}, [%[a1]]
\n
"
// float32x4_t _d5 = vld1q_f32(a5);
"vld1.32 {d4-d5}, [%[a2]]
\n
"
//
"vld1.32 {d6-d7}, [%[a3]]
\n
"
// float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
"vld1.32 {d8-d9}, [%[a4]]
\n
"
// float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
"vld1.32 {d10-d11}, [%[a5]]
\n
"
// float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
"vtrn.32 q0, q1
\n
"
// _d0 = vcombine_f32(vget_low_f32(_q0.val[0]),
"vtrn.32 q2, q3
\n
"
// vget_low_f32(_q1.val[0])); _d1 =
"vtrn.32 q4, q5
\n
"
// vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1]));
"vswp.32 d1, d4
\n
"
// _d2 =
"vswp.32 d3, d6
\n
"
// vcombine_f32(vget_high_f32(_q0.val[0]),
// vget_high_f32(_q1.val[0]));
"vbif q0, %q[vzero], %q[vmask2]
\n
"
// _d3 =
"vbif q1, %q[vzero], %q[vmask2]
\n
"
// vcombine_f32(vget_high_f32(_q0.val[1]),
"vbif q2, %q[vzero], %q[vmask2]
\n
"
// vget_high_f32(_q1.val[1]));
"vbif q3, %q[vzero], %q[vmask2]
\n
"
//
"vbif q4, %q[vzero], %q[vmask3]
\n
"
// vst1q_f32(out_ptr, _d0);
"vbif q5, %q[vzero], %q[vmask3]
\n
"
// vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0]));
// vst1q_f32(out_ptr + 6, _d1);
"vst1.32 {q0}, [%[out]]!
\n
"
// vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1]));
"vst1.32 {d8}, [%[out]]!
\n
"
// vst1q_f32(out_ptr + 12, _d2);
"vst1.32 {q1}, [%[out]]!
\n
"
// vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0]));
"vst1.32 {d10}, [%[out]]!
\n
"
// vst1q_f32(out_ptr + 18, _d3);
"vst1.32 {q2}, [%[out]]!
\n
"
// vst1_f32(out_ptr + 22, vget_high_f32(_q3.val[1]));
"vst1.32 {d9}, [%[out]]!
\n
"
//
"vst1.32 {q3}, [%[out]]!
\n
"
// a0 += 4;
"vst1.32 {d11}, [%[out]]!
\n
"
// a1 += 4;
:
[
out
]
"+r"
(
out_ptr
),
[
a0
]
"+r"
(
a0
),
[
a1
]
"+r"
(
a1
),
[
a2
]
"+r"
(
a2
),
// a2 += 4;
[
a3
]
"+r"
(
a3
),
[
a4
]
"+r"
(
a4
),
[
a5
]
"+r"
(
a5
)
// a3 += 4;
:
[
vmask2
]
"w"
(
vmask2
),
[
vmask3
]
"w"
(
vmask3
),
[
vzero
]
"w"
(
vzero
)
// a4 += 4;
:
"cc"
,
"memory"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
);
// a5 += 4;
#endif
// out_ptr += 24;
}
// }
// remain k
// #else
for
(;
lk
<
k
;
++
lk
)
{
// asm volatile(
*
out_ptr
++
=
*
a0
++
;
// "loop_4k_%=: \n"
*
out_ptr
++
=
*
a1
++
;
// "vld1.32 {d0-d1}, [%[a0]]! \n"
*
out_ptr
++
=
*
a2
++
;
// "vld1.32 {d2-d3}, [%[a1]]! \n"
*
out_ptr
++
=
*
a3
++
;
// "vld1.32 {d4-d5}, [%[a2]]! \n"
*
out_ptr
++
=
*
a4
++
;
// "vld1.32 {d6-d7}, [%[a3]]! \n"
*
out_ptr
++
=
*
a5
++
;
// "vld1.32 {d8-d9}, [%[a4]]! \n"
}
// "vld1.32 {d10-d11}, [%[a5]]! \n"
}
// "vtrn.32 q0, q1 \n"
// "vtrn.32 q2, q3 \n"
// "vtrn.32 q4, q5 \n"
// "vswp.32 d1, d4 \n"
// "vswp.32 d3, d6 \n"
//
// "vst1.32 {q0}, [%[out]]! \n"
// "vst1.32 {d8}, [%[out]]! \n"
// "vst1.32 {q1}, [%[out]]! \n"
// "vst1.32 {d10}, [%[out]]! \n"
// "vst1.32 {q2}, [%[out]]! \n"
// "vst1.32 {d9}, [%[out]]! \n"
// "vst1.32 {q3}, [%[out]]! \n"
// "vst1.32 {d11}, [%[out]]! \n"
//
// "subs %[loops], #1 \n"
// "bne loop_4k_%= \n"
// : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2]
// "+r"(a2),
// [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops)
// :
// : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5");
// #endif
// }
//
// if (remain_k > 0) {
// float32x4_t _d0 = vld1q_f32(a0);
// float32x4_t _d1 = vld1q_f32(a1);
// float32x4_t _d2 = vld1q_f32(a2);
// float32x4_t _d3 = vld1q_f32(a3);
// float32x4_t _d4 = vld1q_f32(a4);
// float32x4_t _d5 = vld1q_f32(a5);
//
// _d0 = vandq_f32_u32(_d0, vmask1);
// _d1 = vandq_f32_u32(_d1, vmask1);
// _d2 = vandq_f32_u32(_d2, vmask1);
// _d3 = vandq_f32_u32(_d3, vmask1);
// _d4 = vandq_f32_u32(_d4, vmask1);
// _d5 = vandq_f32_u32(_d5, vmask1);
//
// float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
// float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
// float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
// _d0 = vcombine_f32(vget_low_f32(_q0.val[0]),
// vget_low_f32(_q1.val[0])); _d1 =
// vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); _d2
// = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0]));
//
// switch (remain_k) {
// case 3:
// vst1q_f32(out_ptr + 12, _d2);
// vst1_f32(out_ptr + 16, vget_high_f32(_q3.val[0]));
// case 2:
// vst1q_f32(out_ptr + 6, _d1);
// vst1_f32(out_ptr + 10, vget_low_f32(_q3.val[1]));
// case 1:
// vst1q_f32(out_ptr, _d0);
// vst1_f32(out_ptr + 4, vget_low_f32(_q3.val[0]));
// default:
// break;
// }
// }
// }
//
// int remain_m = m % 6;
// if (remain_m) {
// int remain_m_start = m - remain_m;
// std::cout << "remain_m_start: " << remain_m_start << std::endl;
// const float *a0 = A + remain_m_start * lda;
// const float *a1 = a0 + lda;
// const float *a2 = a0 + 2 * lda;
// const float *a3 = a0 + 3 * lda;
// const float *a4 = a0 + 4 * lda;
// const float *a5 = a0 + 5 * lda;
// float *out_ptr = output + remain_m_start * k;
//
// uint32x4_t vmask2 = vcltq_u32(vld1q_u32(mask), vdupq_n_u32(remain_m));
// uint32x4_t vmask3 = vcltq_u32(vld1q_u32(mask + 4),
// vdupq_n_u32(remain_m));
//
// int loops = k >> 2;
// if (loops > 0) {
// #if __aarch64__
// for (int l = 0; l < loops; ++l) {
// float32x4_t _d0 = vld1q_f32(a0);
// float32x4_t _d1 = vld1q_f32(a1);
// float32x4_t _d2 = vld1q_f32(a2);
// float32x4_t _d3 = vld1q_f32(a3);
// float32x4_t _d4 = vld1q_f32(a4);
// float32x4_t _d5 = vld1q_f32(a5);
//
// float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
// float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
// float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
// _d0 = vcombine_f32(vget_low_f32(_q0.val[0]),
// vget_low_f32(_q1.val[0])); _d1 =
// vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1]));
// _d2 =
// vcombine_f32(vget_high_f32(_q0.val[0]),
// vget_high_f32(_q1.val[0]));
// _d3 =
// vcombine_f32(vget_high_f32(_q0.val[1]),
// vget_high_f32(_q1.val[1]));
//
// _d0 = vandq_f32_u32(_d0, vmask2);
// _d1 = vandq_f32_u32(_d1, vmask2);
// _d2 = vandq_f32_u32(_d2, vmask2);
// _d3 = vandq_f32_u32(_d3, vmask2);
// _d4 = vandq_f32_u32(_q3.val[0], vmask3);
// _d5 = vandq_f32_u32(_q3.val[1], vmask3);
//
// vst1q_f32(out_ptr, _d0);
// vst1_f32(out_ptr + 4, vget_low_f32(_d4));
// vst1q_f32(out_ptr + 6, _d1);
// vst1_f32(out_ptr + 10, vget_low_f32(_d5));
// vst1q_f32(out_ptr + 12, _d2);
// vst1_f32(out_ptr + 16, vget_high_f32(_d4));
// vst1q_f32(out_ptr + 18, _d3);
// vst1_f32(out_ptr + 22, vget_high_f32(_d5));
//
// a0 += 4;
// a1 += 4;
// a2 += 4;
// a3 += 4;
// a4 += 4;
// a5 += 4;
// out_ptr += 24;
// }
// #else
// asm volatile(
// "loop_4k_%=: \n"
// "vld1.32 {d0-d1}, [%[a0]]! \n"
// "vld1.32 {d2-d3}, [%[a1]]! \n"
// "vld1.32 {d4-d5}, [%[a2]]! \n"
// "vld1.32 {d6-d7}, [%[a3]]! \n"
// "vld1.32 {d8-d9}, [%[a4]]! \n"
// "vld1.32 {d10-d11}, [%[a5]]! \n"
// "vtrn.32 q0, q1 \n"
// "vtrn.32 q2, q3 \n"
// "vtrn.32 q4, q5 \n"
// "vswp.32 d1, d4 \n"
// "vswp.32 d3, d6 \n"
//
// "vbif q0, %q[vzero], %q[vmask2] \n"
// "vbif q1, %q[vzero], %q[vmask2] \n"
// "vbif q2, %q[vzero], %q[vmask2] \n"
// "vbif q3, %q[vzero], %q[vmask2] \n"
// "vbif q4, %q[vzero], %q[vmask3] \n"
// "vbif q5, %q[vzero], %q[vmask3] \n"
//
// "vst1.32 {q0}, [%[out]]! \n"
// "vst1.32 {d8}, [%[out]]! \n"
// "vst1.32 {q1}, [%[out]]! \n"
// "vst1.32 {d10}, [%[out]]! \n"
// "vst1.32 {q2}, [%[out]]! \n"
// "vst1.32 {d9}, [%[out]]! \n"
// "vst1.32 {q3}, [%[out]]! \n"
// "vst1.32 {d11}, [%[out]]! \n"
//
// "subs %[loops], #1 \n"
// "bne loop_4k_%= \n"
// : [out] "+r"(out_ptr), [a0] "+r"(a0), [a1] "+r"(a1), [a2]
// "+r"(a2),
// [a3] "+r"(a3), [a4] "+r"(a4), [a5] "+r"(a5), [loops] "+r"(loops)
// : [vmask2] "w"(vmask2), [vmask3] "w"(vmask3), [vzero] "w"(vzero)
// : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5");
// #endif
// }
//
// if (remain_k > 0) {
// float32x4_t _d0 = vld1q_f32(a0);
// float32x4_t _d1 = vld1q_f32(a1);
// float32x4_t _d2 = vld1q_f32(a2);
// float32x4_t _d3 = vld1q_f32(a3);
// float32x4_t _d4 = vld1q_f32(a4);
// float32x4_t _d5 = vld1q_f32(a5);
//
// _d0 = vandq_f32_u32(_d0, vmask1);
// _d1 = vandq_f32_u32(_d1, vmask1);
// _d2 = vandq_f32_u32(_d2, vmask1);
// _d3 = vandq_f32_u32(_d3, vmask1);
// _d4 = vandq_f32_u32(_d4, vmask1);
// _d5 = vandq_f32_u32(_d5, vmask1);
//
// float32x4x2_t _q0 = vtrnq_f32(_d0, _d1);
// float32x4x2_t _q1 = vtrnq_f32(_d2, _d3);
// float32x4x2_t _q3 = vtrnq_f32(_d4, _d5);
// _d0 = vcombine_f32(vget_low_f32(_q0.val[0]),
// vget_low_f32(_q1.val[0])); _d1 =
// vcombine_f32(vget_low_f32(_q0.val[1]), vget_low_f32(_q1.val[1])); _d2
// = vcombine_f32(vget_high_f32(_q0.val[0]), vget_high_f32(_q1.val[0]));
// // _d3 = vcombine_f32(vget_high_f32(_q0.val[1]),
// // vget_high_f32(_q1.val[1]));
//
// _d0 = vandq_f32_u32(_d0, vmask2);
// _d1 = vandq_f32_u32(_d1, vmask2);
// _d2 = vandq_f32_u32(_d2, vmask2);
// // _d3 = vandq_f32_u32(_d3, vmask2);
// _d4 = vandq_f32_u32(_q3.val[0], vmask3);
// _d5 = vandq_f32_u32(_q3.val[1], vmask3);
//
// switch (remain_k) {
// case 3:
// vst1q_f32(out_ptr + 12, _d2);
// vst1_f32(out_ptr + 16, vget_high_f32(_d4));
// case 2:
// vst1q_f32(out_ptr + 6, _d1);
// vst1_f32(out_ptr + 10, vget_low_f32(_d5));
// case 1:
// vst1q_f32(out_ptr, _d0);
// vst1_f32(out_ptr + 4, vget_low_f32(_d4));
// default:
// break;
// }
// }
// }
}
}
#if __aarch64__
#if __aarch64__
...
...
src/operators/math/math
_func_neon
.h
→
src/operators/math/math.h
浏览文件 @
dd575b09
文件已移动
src/operators/math/softmax.cpp
浏览文件 @
dd575b09
...
@@ -19,7 +19,7 @@ limitations under the License. */
...
@@ -19,7 +19,7 @@ limitations under the License. */
#include <algorithm>
#include <algorithm>
#include <limits>
#include <limits>
#include "common/types.h"
#include "common/types.h"
#include "operators/math/math
_func_neon
.h"
#include "operators/math/math.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录