Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
b59e8f1c
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
331
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
b59e8f1c
编写于
8月 24, 2018
作者:
xiebaiyuan
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'upstream/develop' into develop
上级
1b334875
1cce6c25
变更
23
隐藏空白更改
内联
并排
Showing
23 changed file
with
1671 addition
and
15 deletion
+1671
-15
src/common/types.cpp
src/common/types.cpp
+4
-0
src/common/types.h
src/common/types.h
+2
-0
src/framework/operator.h
src/framework/operator.h
+2
-0
src/framework/program/program-optimize/node.cpp
src/framework/program/program-optimize/node.cpp
+107
-4
src/framework/program/program-optimize/node.h
src/framework/program/program-optimize/node.h
+8
-0
src/framework/program/program-optimize/program_optimize.cpp
src/framework/program/program-optimize/program_optimize.cpp
+46
-6
src/operators/fusion_conv_add_add_prelu.cpp
src/operators/fusion_conv_add_add_prelu.cpp
+60
-0
src/operators/fusion_conv_add_add_prelu_op.h
src/operators/fusion_conv_add_add_prelu_op.h
+112
-0
src/operators/fusion_conv_add_prelu_op.cpp
src/operators/fusion_conv_add_prelu_op.cpp
+60
-0
src/operators/fusion_conv_add_prelu_op.h
src/operators/fusion_conv_add_prelu_op.h
+107
-0
src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp
src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp
+39
-0
src/operators/kernel/arm/conv_add_prelu_kernel.cpp
src/operators/kernel/arm/conv_add_prelu_kernel.cpp
+38
-0
src/operators/kernel/arm/prelu_kernel.cpp
src/operators/kernel/arm/prelu_kernel.cpp
+57
-5
src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
...ors/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
+140
-0
src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h
...erators/kernel/central-arm-func/conv_add_prelu_arm_func.h
+130
-0
src/operators/kernel/conv_add_add_prelu_kernel.h
src/operators/kernel/conv_add_add_prelu_kernel.h
+45
-0
src/operators/kernel/conv_add_prelu_kernel.h
src/operators/kernel/conv_add_prelu_kernel.h
+45
-0
src/operators/math/gemm.cpp
src/operators/math/gemm.cpp
+423
-0
src/operators/math/gemm.h
src/operators/math/gemm.h
+16
-0
src/operators/math/math_function.cpp
src/operators/math/math_function.cpp
+31
-0
src/operators/math/math_function.h
src/operators/math/math_function.h
+5
-0
src/operators/op_param.h
src/operators/op_param.h
+186
-0
tools/op.cmake
tools/op.cmake
+8
-0
未找到文件。
src/common/types.cpp
浏览文件 @
b59e8f1c
...
@@ -23,6 +23,8 @@ const char *G_OP_TYPE_BOX_CODER = "box_coder";
...
@@ -23,6 +23,8 @@ const char *G_OP_TYPE_BOX_CODER = "box_coder";
const
char
*
G_OP_TYPE_CONCAT
=
"concat"
;
const
char
*
G_OP_TYPE_CONCAT
=
"concat"
;
const
char
*
G_OP_TYPE_ELEMENTWISE_ADD
=
"elementwise_add"
;
const
char
*
G_OP_TYPE_ELEMENTWISE_ADD
=
"elementwise_add"
;
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_RELU
=
"fusion_conv_add_relu"
;
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_RELU
=
"fusion_conv_add_relu"
;
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_PRELU
=
"fusion_conv_add_prelu"
;
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU
=
"fusion_conv_add_add_prelu"
;
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_BN_RELU
=
"fusion_conv_add_bn_relu"
;
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_BN_RELU
=
"fusion_conv_add_bn_relu"
;
const
char
*
G_OP_TYPE_FUSION_DWCONV_BN_RELU
=
"fusion_dwconv_bn_relu"
;
const
char
*
G_OP_TYPE_FUSION_DWCONV_BN_RELU
=
"fusion_dwconv_bn_relu"
;
const
char
*
G_OP_TYPE_FUSION_CONV_BN_RELU
=
"fusion_conv_bn_relu"
;
const
char
*
G_OP_TYPE_FUSION_CONV_BN_RELU
=
"fusion_conv_bn_relu"
;
...
@@ -83,6 +85,8 @@ std::unordered_map<
...
@@ -83,6 +85,8 @@ std::unordered_map<
{
G_OP_TYPE_RESHAPE
,
{{
"X"
},
{
"Out"
}}},
{
G_OP_TYPE_RESHAPE
,
{{
"X"
},
{
"Out"
}}},
{
G_OP_TYPE_DEPTHWISE_CONV
,
{{
"Input"
},
{
"Output"
}}},
{
G_OP_TYPE_DEPTHWISE_CONV
,
{{
"Input"
},
{
"Output"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_RELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_RELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_PRELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU
,
{{
"Input"
},
{
"Out"
}}},
{
G_OP_TYPE_IM2SEQUENCE
,
{{
"X"
},
{
"Out"
}}},
{
G_OP_TYPE_IM2SEQUENCE
,
{{
"X"
},
{
"Out"
}}},
{
G_OP_TYPE_DROPOUT
,
{{
"X"
},
{
"Out"
}}},
{
G_OP_TYPE_DROPOUT
,
{{
"X"
},
{
"Out"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_BN
,
{{
"Input"
},
{
"Y"
}}},
{
G_OP_TYPE_FUSION_CONV_ADD_BN
,
{{
"Input"
},
{
"Y"
}}},
...
...
src/common/types.h
浏览文件 @
b59e8f1c
...
@@ -85,6 +85,8 @@ extern const char *G_OP_TYPE_BOX_CODER;
...
@@ -85,6 +85,8 @@ extern const char *G_OP_TYPE_BOX_CODER;
extern
const
char
*
G_OP_TYPE_CONCAT
;
extern
const
char
*
G_OP_TYPE_CONCAT
;
extern
const
char
*
G_OP_TYPE_ELEMENTWISE_ADD
;
extern
const
char
*
G_OP_TYPE_ELEMENTWISE_ADD
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_RELU
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_RELU
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_PRELU
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU
;
extern
const
char
*
G_OP_TYPE_FC
;
extern
const
char
*
G_OP_TYPE_FC
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_BN_RELU
;
extern
const
char
*
G_OP_TYPE_FUSION_CONV_ADD_BN_RELU
;
...
...
src/framework/operator.h
浏览文件 @
b59e8f1c
...
@@ -183,6 +183,8 @@ class FusionOpMatcher {
...
@@ -183,6 +183,8 @@ class FusionOpMatcher {
std
::
string
BeginType
()
{
return
node_
.
Type
();
}
std
::
string
BeginType
()
{
return
node_
.
Type
();
}
// Extra checks a matcher can request before its pattern is folded.
// Each entry pairs an int with an input key name; the base implementation
// requests nothing and returns an empty list.
// NOTE(review): the int looks like a depth inside the matched sub-graph and
// the string an input key of the node at that depth -- confirm against
// ProgramOptimize::FusionOptimize.
virtual std::vector<std::pair<int, std::string>> NeedCheck() {
  return std::vector<std::pair<int, std::string>>();
}
// virtual bool Fusion();
// virtual bool Fusion();
protected:
protected:
Node
node_
;
Node
node_
;
...
...
src/framework/program/program-optimize/node.cpp
浏览文件 @
b59e8f1c
...
@@ -14,15 +14,33 @@ limitations under the License. */
...
@@ -14,15 +14,33 @@ limitations under the License. */
#include "framework/program/program-optimize/node.h"
#include "framework/program/program-optimize/node.h"
#include <algorithm>
#include <algorithm>
#include <map>
#include <memory>
#include "framework/operator.h"
#include "framework/operator.h"
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
framework
{
namespace
framework
{
// Returns the nodes that sit exactly `index` hops below this node when
// following outputs_ (this node itself is hop 0).
std::vector<Node *> Node::operator[](int index) {
  std::vector<Node *> located;
  this->GetNodesWithLocation(index, /*now_index=*/0, &located);
  return located;
}
// Depth-first walk over outputs_ that appends to `nodes` every node located
// exactly `index` hops below the node the walk started from.
//   index      hop count to collect
//   now_index  hop count of this node relative to the start of the walk
//   nodes      result list; a node is appended once per path that reaches it
void Node::GetNodesWithLocation(int index, int now_index,
                                std::vector<Node *> *nodes) {
  if (index == now_index) {
    nodes->push_back(this);
    // Every descendant is deeper than `index`, so none of them can match.
    return;
  }
  if (now_index > index) {
    // Already past the requested depth (e.g. a negative index): the hop count
    // only grows from here, so there is nothing left to find.
    return;
  }
  for (const auto &output : this->outputs_) {
    output->GetNodesWithLocation(index, now_index + 1, nodes);
  }
}
Node
&
Node
::
operator
>
(
std
::
shared_ptr
<
Node
>
node
)
{
Node
&
Node
::
operator
>
(
std
::
shared_ptr
<
Node
>
node
)
{
outputs_
.
push_back
(
node
);
outputs_
.
push_back
(
node
);
std
::
shared_ptr
<
Node
>
this_node
;
node
->
inputs_
.
push_back
(
this
);
node
->
inputs_
.
push_back
(
this
);
return
*
node
;
return
*
node
;
}
}
...
@@ -31,19 +49,70 @@ bool Node::operator==(const Node &in) {
...
@@ -31,19 +49,70 @@ bool Node::operator==(const Node &in) {
if
(
in
.
type_
==
this
->
type_
)
{
if
(
in
.
type_
==
this
->
type_
)
{
if
(
this
->
outputs_
.
size
()
==
in
.
outputs_
.
size
())
{
if
(
this
->
outputs_
.
size
()
==
in
.
outputs_
.
size
())
{
for
(
int
i
=
0
;
i
<
outputs_
.
size
();
++
i
)
{
for
(
int
i
=
0
;
i
<
outputs_
.
size
();
++
i
)
{
if
(
!
(
*
outputs_
[
i
]
==
*
in
.
outputs_
[
i
]))
{
if
(
!
(
this
->
outputs_
[
i
]
->
MedianEqual
(
*
in
.
outputs_
[
i
])))
{
return
false
;
}
}
}
else
{
return
false
;
}
}
else
{
return
false
;
}
return
true
;
}
bool
Node
::
MedianEqual
(
const
Node
&
in
)
{
if
(
in
.
type_
==
this
->
type_
)
{
if
(
this
->
outputs_
.
size
()
==
in
.
outputs_
.
size
())
{
// if (this->inputs_.size() != in.inputs_.size()) {
// DLOG << " == - this input size: " << this->inputs_.size();
// DLOG << " == - ptr of this " << this;
// DLOG << " == - in input size: " << in.inputs_.size();
// DLOG << " == - input size not equal ";
// return false;
// } else {
// for (int i = 0; i < this->inputs_.size(); ++i) {
// if (this->inputs_[i]->type_ != in.inputs_[i]->type_) {
// DLOG << " == - input type not equal ";
// return false;
// }
// }
// }
for
(
int
i
=
0
;
i
<
outputs_
.
size
();
++
i
)
{
if
(
!
((
*
outputs_
[
i
]).
MedianEqual
(
*
in
.
outputs_
[
i
])))
{
return
false
;
return
false
;
}
}
}
}
}
else
{
}
else
{
// DLOG << " == - output size not equal ";
return
false
;
return
false
;
}
}
}
else
{
}
else
{
// DLOG << " == - median type is not equal ";
return
false
;
return
false
;
}
}
return
true
;
return
true
;
}
}
std
::
map
<
std
::
string
,
Node
*>
Node
::
Relationship
()
{
std
::
map
<
std
::
string
,
Node
*>
map
;
RelationshipPrivate
(
&
map
);
return
map
;
}
// Recursive worker for Relationship(): records this node under every output
// variable name found in op_desc_->outputs_, then descends into outputs_.
//   map  accumulator shared by the whole walk; a name that appears on several
//        nodes ends up mapped to the node visited last.
void Node::RelationshipPrivate(std::map<std::string, Node *> *map) {
  // Iterate by const reference: copying each entry (a key plus its list of
  // names) and each name on every step is pure overhead.
  for (const auto &output : op_desc_->outputs_) {
    for (const auto &output_key : output.second) {
      (*map)[output_key] = this;
    }
  }
  for (const auto &output : this->outputs_) {
    output->RelationshipPrivate(map);
  }
}
std
::
shared_ptr
<
Node
>
Node
::
To
(
int
size
)
{
std
::
shared_ptr
<
Node
>
Node
::
To
(
int
size
)
{
std
::
shared_ptr
<
Node
>
node
=
std
::
make_shared
<
Node
>
();
std
::
shared_ptr
<
Node
>
node
=
std
::
make_shared
<
Node
>
();
this
->
To
(
size
-
1
,
node
);
this
->
To
(
size
-
1
,
node
);
...
@@ -51,7 +120,9 @@ std::shared_ptr<Node> Node::To(int size) {
...
@@ -51,7 +120,9 @@ std::shared_ptr<Node> Node::To(int size) {
}
}
void
Node
::
To
(
int
index
,
std
::
shared_ptr
<
Node
>
node
)
{
void
Node
::
To
(
int
index
,
std
::
shared_ptr
<
Node
>
node
)
{
node
->
op_desc_
=
this
->
op_desc_
;
node
->
type_
=
this
->
type_
;
node
->
type_
=
this
->
type_
;
node
->
inputs_
=
this
->
inputs_
;
if
(
index
!=
0
)
{
if
(
index
!=
0
)
{
}
else
{
}
else
{
return
;
return
;
...
@@ -100,8 +171,19 @@ void Node::Folder(
...
@@ -100,8 +171,19 @@ void Node::Folder(
if
(
change
->
find
(
this
->
type_
)
!=
change
->
end
())
{
if
(
change
->
find
(
this
->
type_
)
!=
change
->
end
())
{
auto
change_pairs
=
(
*
change
)[
this
->
type_
];
auto
change_pairs
=
(
*
change
)[
this
->
type_
];
for
(
const
auto
&
change_pair
:
change_pairs
)
{
for
(
const
auto
&
change_pair
:
change_pairs
)
{
op_desc
->
GetInputs
()[
change_pair
.
second
]
=
std
::
map
<
std
::
string
,
int
>
f
;
this
->
op_desc_
->
GetInputs
()[
change_pair
.
first
];
if
(
this
->
op_desc_
->
GetInputs
().
find
(
change_pair
.
first
)
!=
this
->
op_desc_
->
GetInputs
().
end
())
{
if
(
op_desc
->
GetInputs
().
find
(
change_pair
.
second
)
!=
op_desc
->
GetInputs
().
end
())
{
for
(
auto
value
:
this
->
op_desc_
->
GetInputs
()[
change_pair
.
first
])
{
op_desc
->
GetInputs
()[
change_pair
.
second
].
push_back
(
value
);
}
}
else
{
op_desc
->
GetInputs
()[
change_pair
.
second
]
=
this
->
op_desc_
->
GetInputs
()[
change_pair
.
first
];
}
}
}
}
}
}
...
@@ -110,7 +192,28 @@ void Node::Folder(
...
@@ -110,7 +192,28 @@ void Node::Folder(
}
}
if
(
index
>
0
)
{
if
(
index
>
0
)
{
--
index
;
--
index
;
for
(
auto
output
:
outputs_
)
{
for
(
auto
output
:
outputs_
)
{
if
(
change
->
find
(
this
->
type_
)
!=
change
->
end
())
{
auto
change_pairs
=
(
*
change
)[
this
->
type_
];
for
(
const
auto
&
change_pair
:
change_pairs
)
{
std
::
map
<
std
::
string
,
int
>
f
;
if
(
this
->
op_desc_
->
GetOutputs
().
find
(
change_pair
.
first
)
!=
this
->
op_desc_
->
GetOutputs
().
end
())
{
if
(
op_desc
->
GetInputs
().
find
(
change_pair
.
second
)
!=
op_desc
->
GetInputs
().
end
())
{
for
(
auto
value
:
this
->
op_desc_
->
GetOutputs
()[
change_pair
.
first
])
{
op_desc
->
GetInputs
()[
change_pair
.
second
].
push_back
(
value
);
}
}
else
{
op_desc
->
GetInputs
()[
change_pair
.
second
]
=
this
->
op_desc_
->
GetOutputs
()[
change_pair
.
first
];
}
}
}
}
removed_nodes
->
push_back
(
output
);
removed_nodes
->
push_back
(
output
);
output
->
Folder
(
op_desc
,
outputs
,
index
,
change
,
begin_node
,
output
->
Folder
(
op_desc
,
outputs
,
index
,
change
,
begin_node
,
removed_nodes
);
removed_nodes
);
...
...
src/framework/program/program-optimize/node.h
浏览文件 @
b59e8f1c
...
@@ -35,6 +35,7 @@ class Node {
...
@@ -35,6 +35,7 @@ class Node {
:
op_desc_
(
op_desc
),
type_
(
op_desc
->
Type
())
{}
:
op_desc_
(
op_desc
),
type_
(
op_desc
->
Type
())
{}
Node
&
operator
>
(
std
::
shared_ptr
<
Node
>
node
);
Node
&
operator
>
(
std
::
shared_ptr
<
Node
>
node
);
bool
operator
==
(
const
Node
&
in
);
bool
operator
==
(
const
Node
&
in
);
bool
MedianEqual
(
const
Node
&
in
);
#ifdef PADDLE_MOBILE_DEBUG
#ifdef PADDLE_MOBILE_DEBUG
std
::
string
ToString
()
const
;
std
::
string
ToString
()
const
;
...
@@ -50,7 +51,14 @@ class Node {
...
@@ -50,7 +51,14 @@ class Node {
std
::
shared_ptr
<
framework
::
OpDesc
>
OpDescOfNode
()
{
return
op_desc_
;
}
std
::
shared_ptr
<
framework
::
OpDesc
>
OpDescOfNode
()
{
return
op_desc_
;
}
std
::
string
Type
()
{
return
type_
;
}
std
::
string
Type
()
{
return
type_
;
}
std
::
vector
<
Node
*>
operator
[](
int
index
);
std
::
map
<
std
::
string
,
Node
*>
Relationship
();
private:
private:
void
RelationshipPrivate
(
std
::
map
<
std
::
string
,
Node
*>
*
map
);
void
GetNodesWithLocation
(
int
index
,
int
now_index
,
std
::
vector
<
Node
*>
*
nodes
);
void
To
(
int
index
,
std
::
shared_ptr
<
Node
>
);
void
To
(
int
index
,
std
::
shared_ptr
<
Node
>
);
void
Folder
(
void
Folder
(
std
::
shared_ptr
<
framework
::
OpDesc
>
op_desc
,
std
::
shared_ptr
<
framework
::
OpDesc
>
op_desc
,
...
...
src/framework/program/program-optimize/program_optimize.cpp
浏览文件 @
b59e8f1c
...
@@ -29,9 +29,15 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
...
@@ -29,9 +29,15 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
for
(
int
i
=
0
;
i
<
optimize_program
->
Blocks
().
size
();
++
i
)
{
for
(
int
i
=
0
;
i
<
optimize_program
->
Blocks
().
size
();
++
i
)
{
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
Node
>>
output_nodes
;
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
Node
>>
output_nodes
;
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
std
::
shared_ptr
<
Node
>>>
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
std
::
pair
<
std
::
shared_ptr
<
Node
>
,
std
::
unordered_map
<
std
::
string
,
std
::
shared_ptr
<
Node
>>>>>
type_map
;
type_map
;
std
::
unordered_map
<
std
::
string
,
bool
>
output_has
;
std
::
vector
<
std
::
shared_ptr
<
Node
>>
nodes
;
std
::
vector
<
std
::
shared_ptr
<
Node
>>
nodes
;
std
::
shared_ptr
<
Node
>
begin_node
;
std
::
shared_ptr
<
Node
>
begin_node
;
...
@@ -50,7 +56,7 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
...
@@ -50,7 +56,7 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
nodes
.
push_back
(
node
);
nodes
.
push_back
(
node
);
//
//
type_map
[
op
->
Type
()].
push_back
(
node
);
type_map
[
op
->
Type
()].
push_back
(
{
node
,
output_nodes
}
);
if
(
j
==
0
)
{
if
(
j
==
0
)
{
begin_node
=
node
;
begin_node
=
node
;
...
@@ -69,6 +75,7 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
...
@@ -69,6 +75,7 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
}
}
auto
output_keys
=
op_input_output_key
.
at
(
op_type
).
second
;
auto
output_keys
=
op_input_output_key
.
at
(
op_type
).
second
;
for
(
auto
output_key
:
output_keys
)
{
for
(
auto
output_key
:
output_keys
)
{
auto
op_outputs
=
op
->
Output
(
output_key
);
auto
op_outputs
=
op
->
Output
(
output_key
);
for
(
int
k
=
0
;
k
<
op_outputs
.
size
();
++
k
)
{
for
(
int
k
=
0
;
k
<
op_outputs
.
size
();
++
k
)
{
...
@@ -83,11 +90,42 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
...
@@ -83,11 +90,42 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
auto
match_vector
=
type_map
[
matcher
->
BeginType
()];
auto
match_vector
=
type_map
[
matcher
->
BeginType
()];
for
(
auto
&
match_node
:
match_vector
)
{
for
(
auto
&
match_node_pair
:
match_vector
)
{
auto
match_node
=
match_node_pair
.
first
;
auto
node_has
=
match_node_pair
.
second
;
auto
depth
=
matcher
->
BeginNode
().
Depth
();
auto
depth
=
matcher
->
BeginNode
().
Depth
();
auto
sub_node
=
match_node
->
To
(
depth
);
auto
sub_node
=
match_node
->
To
(
depth
);
// DLOG << " sub node: " << *sub_node;
// DLOG << " sub node: " << *sub_node;
if
(
*
sub_node
==
matcher
->
BeginNode
())
{
if
(
*
sub_node
==
matcher
->
BeginNode
())
{
bool
can_folder
=
true
;
auto
relationship_map
=
sub_node
->
Relationship
();
for
(
auto
to_check
:
matcher
->
NeedCheck
())
{
// if (node_has)
auto
nodes
=
(
*
sub_node
)[
to_check
.
first
];
for
(
auto
node
:
nodes
)
{
auto
inputs_to_check
=
node
->
OpDescOfNode
()
->
Input
(
to_check
.
second
);
for
(
auto
input_to_check
:
inputs_to_check
)
{
if
(
node_has
.
find
(
input_to_check
)
==
node_has
.
end
())
{
if
(
relationship_map
.
find
(
input_to_check
)
==
relationship_map
.
end
())
{
can_folder
=
false
;
}
else
{
}
}
}
}
}
if
(
!
can_folder
)
{
continue
;
}
// DLOG << " match success " << " fusion node: \n" <<
// DLOG << " match success " << " fusion node: \n" <<
// matcher->BeginNode() << "\nsub node: \n" << *sub_node;
// matcher->BeginNode() << "\nsub node: \n" << *sub_node;
// DLOG << "match node\n"<< *match_node;
// DLOG << "match node\n"<< *match_node;
...
@@ -95,11 +133,13 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
...
@@ -95,11 +133,13 @@ std::shared_ptr<ProgramDesc> ProgramOptimize::FusionOptimize(
std
::
vector
<
std
::
shared_ptr
<
Node
>>
removed_nodes
;
std
::
vector
<
std
::
shared_ptr
<
Node
>>
removed_nodes
;
matcher
->
FolderNodes
(
match_node
.
get
(),
&
removed_nodes
);
matcher
->
FolderNodes
(
match_node
.
get
(),
&
removed_nodes
);
for
(
int
j
=
0
;
j
<
removed_nodes
.
size
();
++
j
)
{
for
(
int
k
=
removed_nodes
.
size
()
-
1
;
k
>=
0
;
--
k
)
{
auto
removed_node
=
removed_nodes
[
j
];
auto
removed_node
=
removed_nodes
[
k
];
auto
removed_ite
=
auto
removed_ite
=
std
::
find
(
nodes
.
begin
(),
nodes
.
end
(),
removed_node
);
std
::
find
(
nodes
.
begin
(),
nodes
.
end
(),
removed_node
);
nodes
.
erase
(
removed_ite
);
if
(
removed_ite
!=
nodes
.
end
())
{
nodes
.
erase
(
removed_ite
);
}
}
}
}
}
}
}
...
...
src/operators/fusion_conv_add_add_prelu.cpp
0 → 100644
浏览文件 @
b59e8f1c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#include "fusion_conv_add_add_prelu_op.h"
#include "operators/math/conv_func.h"
namespace
paddle_mobile
{
namespace
operators
{
// Computes the output dims of the fused conv + add + add + prelu op and
// resizes the output tensor accordingly.
// The resulting shape is {in_dims[0], filter_dims[0]} followed by one
// math::ConvOutputSize(...) value per entry of the strides vector
// (presumably batch, output channels, then the spatial sizes -- TODO confirm).
template <typename Dtype, typename T>
void FusionConvAddAddPReluOp<Dtype, T>::InferShape() const {
  auto in_dims = this->param_.Input()->dims();
  auto filter_dims = this->param_.Filter()->dims();
  // Bound by const reference: the vectors are only read here.
  const std::vector<int> &strides = this->param_.Strides();
  const std::vector<int> &paddings = this->param_.Paddings();
  const std::vector<int> &dilations = this->param_.Dilations();

  PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
                         dilations.size() == paddings.size() &&
                         paddings.size() == strides.size()),
                        "ConvParam is not suitable");

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  output_shape.reserve(output_shape.size() + strides.size());
  for (size_t i = 0; i < strides.size(); ++i) {
    // Dims 0 and 1 are already in output_shape, so the conv dims start at 2.
    output_shape.push_back(
        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
                             paddings[i], strides[i]));
  }

  framework::DDim ddim = framework::make_ddim(output_shape);
  this->param_.Output()->Resize(ddim);
}
}
// namespace operators
}
// namespace paddle_mobile
// Short alias used by the registration macros below.
namespace ops = paddle_mobile::operators;

// Register the fused op for each backend that is compiled in.
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
#endif
// No Mali GPU registration for this op.
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
#endif
#endif
src/operators/fusion_conv_add_add_prelu_op.h
0 → 100644
浏览文件 @
b59e8f1c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/conv_add_add_prelu_kernel.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
// Fusion matcher for the chain conv -> elementwise_add -> elementwise_add ->
// prelu, which is folded into a single fusion_conv_add_add_prelu op.
class FusionConvAddAddPReluOpMatcher : public framework::FusionOpMatcher {
 public:
  // Builds the pattern: node_ is the conv, followed by two adds and a prelu.
  FusionConvAddAddPReluOpMatcher() {
    node_ = framework::Node(G_OP_TYPE_CONV);

    auto first_add =
        std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD);
    auto second_add =
        std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD);
    auto prelu = std::make_shared<framework::Node>(G_OP_TYPE_PRELU);

    // operator> links its right-hand node as an output of the left-hand one.
    node_ > first_add;
    *first_add > second_add;
    *second_add > prelu;
  }

  // Folds the matched sub-graph that starts at `node` into one fused op.
  // The table lists {from, to} key pairs per folded op type.
  // NOTE(review): presumably Node::Folder copies the folded op's `from` key
  // onto the fused op's `to` input key -- confirm against Node::Folder.
  //   node           first node of the matched chain
  //   removed_nodes  receives the nodes that were folded away
  void FolderNodes(
      framework::Node *node,
      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
    node->Folder(node_.Depth(), Type(),
                 {{G_OP_TYPE_ELEMENTWISE_ADD,
                   {{"Y", "Y"}, {"Out", "addOut"}, {"X", "addX"}}},
                  {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}},
                 removed_nodes);
  }

  // Type name of the fused op this matcher produces.
  std::string Type() {
    return std::string(G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU);
  }

  // Requests a check of inputs "Y" and "X" at position 2 of the pattern
  // (the second elementwise_add, counting the conv as position 0).
  std::vector<std::pair<int, std::string>> NeedCheck() {
    DLOG << " conv add add prelu check add X ";
    std::vector<std::pair<int, std::string>> checks;
    checks.emplace_back(2, "Y");
    checks.emplace_back(2, "X");
    return checks;
  }
};
// Operator for the fused conv + add + add + prelu op. It only wires
// FusionConvAddAddPReluParam and ConvAddAddPReluKernel into
// framework::OperatorWithKernel; InferShape() is defined out of line.
template <typename DeviceType, typename T>
class FusionConvAddAddPReluOp
    : public framework::OperatorWithKernel<
          DeviceType, FusionConvAddAddPReluParam<DeviceType>,
          operators::ConvAddAddPReluKernel<DeviceType, T>> {
 public:
  // Forwards all arguments unchanged to the OperatorWithKernel base.
  FusionConvAddAddPReluOp(const string &type, const VariableNameMap &inputs,
                          const VariableNameMap &outputs,
                          const framework::AttributeMap &attrs,
                          std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionConvAddAddPReluParam<DeviceType>,
            operators::ConvAddAddPReluKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  // Also expose the base class constructors.
  using framework::OperatorWithKernel<
      DeviceType, FusionConvAddAddPReluParam<DeviceType>,
      operators::ConvAddAddPReluKernel<DeviceType, T>>::OperatorWithKernel;

  // Derives the output dims from the conv parameters.
  void InferShape() const override;
};
// Registers the matcher when the CPU or FPGA backend is built.
// CONV_ADD_ADD_PRELU_REGISTER keeps the registrar from being defined twice
// when both backends are enabled.
#if defined(PADDLE_MOBILE_CPU) || defined(PADDLE_MOBILE_FPGA)
#ifndef CONV_ADD_ADD_PRELU_REGISTER
#define CONV_ADD_ADD_PRELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_add_prelu_registrar(
    new FusionConvAddAddPReluOpMatcher());
#endif
#endif
// No Mali GPU registration for this matcher.
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
}
// namespace operators
}
// namespace paddle_mobile
// Reference the op registration for each backend that is compiled in.
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_add_prelu);
#endif
// Nothing to reference for Mali GPU.
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_add_add_prelu);
#endif
#endif
src/operators/fusion_conv_add_prelu_op.cpp
0 → 100644
浏览文件 @
b59e8f1c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#include "fusion_conv_add_prelu_op.h"
#include "operators/math/conv_func.h"
namespace
paddle_mobile
{
namespace
operators
{
// Computes the output dims of the fused conv + add + prelu op and resizes the
// output tensor accordingly.
// The resulting shape is {in_dims[0], filter_dims[0]} followed by one
// math::ConvOutputSize(...) value per entry of the strides vector
// (presumably batch, output channels, then the spatial sizes -- TODO confirm).
template <typename Dtype, typename T>
void FusionConvAddPReluOp<Dtype, T>::InferShape() const {
  auto in_dims = this->param_.Input()->dims();
  auto filter_dims = this->param_.Filter()->dims();
  // Bound by const reference: the vectors are only read here.
  const std::vector<int> &strides = this->param_.Strides();
  const std::vector<int> &paddings = this->param_.Paddings();
  const std::vector<int> &dilations = this->param_.Dilations();

  PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
                         dilations.size() == paddings.size() &&
                         paddings.size() == strides.size()),
                        "ConvParam is not suitable");

  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
  output_shape.reserve(output_shape.size() + strides.size());
  for (size_t i = 0; i < strides.size(); ++i) {
    // Dims 0 and 1 are already in output_shape, so the conv dims start at 2.
    output_shape.push_back(
        math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
                             paddings[i], strides[i]));
  }

  framework::DDim ddim = framework::make_ddim(output_shape);
  this->param_.Output()->Resize(ddim);
}
}
// namespace operators
}
// namespace paddle_mobile
// Short alias used by the registration macros below.
namespace ops = paddle_mobile::operators;

// Register the fused op for each backend that is compiled in.
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
#endif
// No Mali GPU registration for this op.
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
#endif
#endif
src/operators/fusion_conv_add_prelu_op.h
0 → 100644
浏览文件 @
b59e8f1c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/conv_add_prelu_kernel.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
// Fusion matcher for the chain conv -> elementwise_add -> prelu, which is
// folded into a single fusion_conv_add_prelu op.
class FusionConvAddPReluOpMatcher : public framework::FusionOpMatcher {
 public:
  // Builds the pattern: node_ is the conv, followed by an add and a prelu.
  FusionConvAddPReluOpMatcher() {
    node_ = framework::Node(G_OP_TYPE_CONV);

    auto add = std::make_shared<framework::Node>(G_OP_TYPE_ELEMENTWISE_ADD);
    auto prelu = std::make_shared<framework::Node>(G_OP_TYPE_PRELU);

    // operator> links its right-hand node as an output of the left-hand one.
    node_ > add;
    *add > prelu;
  }

  // Folds the matched sub-graph that starts at `node` into one fused op.
  // The table lists {from, to} key pairs per folded op type.
  // NOTE(review): presumably Node::Folder copies the folded op's `from` key
  // onto the fused op's `to` input key -- confirm against Node::Folder.
  //   node           first node of the matched chain
  //   removed_nodes  receives the nodes that were folded away
  void FolderNodes(
      framework::Node *node,
      std::vector<std::shared_ptr<framework::Node>> *removed_nodes) {
    node->Folder(node_.Depth(), Type(),
                 {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}},
                  {G_OP_TYPE_PRELU, {{"Alpha", "Alpha"}}}},
                 removed_nodes);
  }

  // Type name of the fused op this matcher produces.
  std::string Type() { return std::string(G_OP_TYPE_FUSION_CONV_ADD_PRELU); }
};
// Operator for the fused conv + add + prelu op. It only wires
// FusionConvAddPReluParam and ConvAddPReluKernel into
// framework::OperatorWithKernel; InferShape() is defined out of line.
template <typename DeviceType, typename T>
class FusionConvAddPReluOp
    : public framework::OperatorWithKernel<
          DeviceType, FusionConvAddPReluParam<DeviceType>,
          operators::ConvAddPReluKernel<DeviceType, T>> {
 public:
  // Forwards all arguments unchanged to the OperatorWithKernel base.
  FusionConvAddPReluOp(const string &type, const VariableNameMap &inputs,
                       const VariableNameMap &outputs,
                       const framework::AttributeMap &attrs,
                       std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<
            DeviceType, FusionConvAddPReluParam<DeviceType>,
            operators::ConvAddPReluKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}

  // Also expose the base class constructors.
  using framework::OperatorWithKernel<
      DeviceType, FusionConvAddPReluParam<DeviceType>,
      operators::ConvAddPReluKernel<DeviceType, T>>::OperatorWithKernel;

  // Derives the output dims from the conv parameters.
  void InferShape() const override;
};
// Registers the matcher when the CPU or FPGA backend is built.
// CONV_ADD_PRELU_REGISTER keeps the registrar from being defined twice when
// both backends are enabled.
#if defined(PADDLE_MOBILE_CPU) || defined(PADDLE_MOBILE_FPGA)
#ifndef CONV_ADD_PRELU_REGISTER
#define CONV_ADD_PRELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_prelu_registrar(
    new FusionConvAddPReluOpMatcher());
#endif
#endif
// No Mali GPU registration for this matcher.
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
}
// namespace operators
}
// namespace paddle_mobile
// Reference the op registration for each backend that is compiled in.
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_prelu);
#endif
// Nothing to reference for Mali GPU.
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_add_prelu);
#endif
#endif
src/operators/kernel/arm/conv_add_add_prelu_kernel.cpp
0 → 100644
浏览文件 @
b59e8f1c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#include "operators/kernel/conv_add_add_prelu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h"
namespace
paddle_mobile
{
namespace
operators
{
// CPU/float kernel setup: there is nothing to prepare, so `param` is left
// untouched and success is always reported.
template <>
bool ConvAddAddPReluKernel<CPU, float>::Init(
    FusionConvAddAddPReluParam<CPU> *param) {
  return true;
}

// CPU/float kernel body: hands the work to ConvAddAddPReluCompute<float>.
template <>
void ConvAddAddPReluKernel<CPU, float>::Compute(
    const FusionConvAddAddPReluParam<CPU> &param) const {
  ConvAddAddPReluCompute<float>(param);
}

// Explicit instantiation of the CPU/float kernel.
template class ConvAddAddPReluKernel<CPU, float>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/arm/conv_add_prelu_kernel.cpp
0 → 100644
浏览文件 @
b59e8f1c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#include "operators/kernel/conv_add_prelu_kernel.h"
#include "operators/kernel/central-arm-func/conv_add_prelu_arm_func.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
>
bool
ConvAddPReluKernel
<
CPU
,
float
>::
Init
(
FusionConvAddPReluParam
<
CPU
>
*
param
)
{
return
true
;
}
template
<
>
void
ConvAddPReluKernel
<
CPU
,
float
>::
Compute
(
const
FusionConvAddPReluParam
<
CPU
>
&
param
)
const
{
ConvAddPReluCompute
<
float
>
(
param
);
}
// Explicit instantiation of the kernel class for the CPU / float combination
// specialised above.
template
class
ConvAddPReluKernel
<
CPU
,
float
>;
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/arm/prelu_kernel.cpp
浏览文件 @
b59e8f1c
...
@@ -16,6 +16,9 @@ limitations under the License. */
...
@@ -16,6 +16,9 @@ limitations under the License. */
#include "operators/kernel/prelu_kernel.h"
#include "operators/kernel/prelu_kernel.h"
#include <operators/math/transform.h>
#include <operators/math/transform.h>
#if __ARM_NEON
#include <arm_neon.h>
#endif
namespace
paddle_mobile
{
namespace
paddle_mobile
{
namespace
operators
{
namespace
operators
{
...
@@ -37,32 +40,81 @@ void PReluKernel<CPU, float>::Compute(const PReluParam<CPU> ¶m) const {
...
@@ -37,32 +40,81 @@ void PReluKernel<CPU, float>::Compute(const PReluParam<CPU> ¶m) const {
auto
*
alpha
=
param
.
InputAlpha
();
auto
*
alpha
=
param
.
InputAlpha
();
auto
*
out
=
param
.
Out
();
auto
*
out
=
param
.
Out
();
std
::
string
mode
=
param
.
Mode
();
std
::
string
mode
=
param
.
Mode
();
const
auto
*
x_ptr
=
x
->
data
<
float
>
();
auto
*
x_ptr
=
x
->
data
<
float
>
();
auto
*
o_ptr
=
out
->
mutable_data
<
float
>
();
auto
*
o_ptr
=
out
->
mutable_data
<
float
>
();
const
auto
*
alpha_ptr
=
alpha
->
data
<
float
>
();
auto
*
alpha_ptr
=
alpha
->
data
<
float
>
();
int
numel
=
x
->
numel
();
int
numel
=
x
->
numel
();
auto
dim
=
x
->
dims
();
auto
dim
=
x
->
dims
();
int
k
=
dim
[
0
]
*
dim
[
1
];
int
n
=
dim
[
2
]
*
dim
[
3
];
int
index
=
0
;
int
index
=
0
;
int
i
=
0
;
int
i
=
0
;
int
temp
=
0
;
int
temp
=
0
;
#if __ARM_NEON
#pragma omp parallel for
for
(
int
i
=
0
;
i
<
k
;
i
++
)
{
float32x4_t
zero
=
vdupq_n_f32
(
0.0
);
float32x4_t
cv
;
float32x4_t
cv1
;
float32x4_t
cv2
;
float32x4_t
pv
;
for
(
int
j
=
0
;
(
j
+
3
)
<
n
;
j
+=
4
)
{
const
float
*
in
=
x_ptr
+
i
*
n
+
j
;
float
*
out
=
o_ptr
+
i
*
n
+
j
;
cv
=
vld1q_f32
(
in
);
cv1
=
vmaxq_f32
(
cv
,
zero
);
cv2
=
vminq_f32
(
cv
,
zero
);
if
(
mode
==
"channel"
)
{
cv2
=
vmulq_n_f32
(
cv2
,
alpha_ptr
[
i
]);
}
else
if
(
mode
==
"element"
)
{
pv
=
vld1q_f32
(
alpha_ptr
+
i
*
n
+
j
);
cv2
=
vmulq_f32
(
cv2
,
pv
);
}
else
{
cv2
=
vmulq_n_f32
(
cv2
,
alpha_ptr
[
0
]);
}
cv
=
vaddq_f32
(
cv1
,
cv2
);
vst1q_f32
(
out
,
cv
);
}
int
j
;
for
(
j
=
0
;
(
j
+
3
)
<
n
;
j
+=
4
)
{
}
for
(
int
m
=
j
;
m
<
n
;
m
++
)
{
if
(
mode
==
"channel"
)
{
o_ptr
[
i
*
n
+
m
]
=
x_ptr
[
i
*
n
+
m
]
>
0
?
x_ptr
[
i
*
n
+
m
]
:
alpha_ptr
[
i
]
*
x_ptr
[
i
*
n
+
m
];
}
else
if
(
mode
==
"element"
)
{
o_ptr
[
i
*
n
+
m
]
=
x_ptr
[
i
*
n
+
m
]
>
0
?
x_ptr
[
i
*
n
+
m
]
:
alpha_ptr
[
i
*
n
+
m
]
*
x_ptr
[
i
*
n
+
m
];
}
else
{
o_ptr
[
i
*
n
+
m
]
=
x_ptr
[
i
*
n
+
m
]
>
0
?
x_ptr
[
i
*
n
+
m
]
:
alpha_ptr
[
0
]
*
x_ptr
[
i
*
n
+
m
];
}
}
}
#else
if
(
mode
==
"channel"
)
{
if
(
mode
==
"channel"
)
{
temp
=
numel
/
(
dim
[
0
]
*
dim
[
1
]);
temp
=
numel
/
(
dim
[
0
]
*
dim
[
1
]);
#pragma omp parallel for
#pragma omp parallel for
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
index
=
(
i
/
temp
)
%
dim
[
1
];
index
=
(
i
/
temp
)
%
dim
[
1
];
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
index
]
*
x_ptr
[
i
];
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
index
]
*
x_ptr
[
i
];
}
}
}
else
if
(
mode
==
"element"
)
{
}
else
if
(
mode
==
"element"
)
{
#pragma omp parallel for
#pragma omp parallel for
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
i
]
*
x_ptr
[
i
];
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
i
]
*
x_ptr
[
i
];
}
}
}
else
{
}
else
{
#pragma omp parallel for
#pragma omp parallel for
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
for
(
i
=
0
;
i
<
numel
;
i
++
)
{
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
0
]
*
x_ptr
[
i
];
o_ptr
[
i
]
=
x_ptr
[
i
]
>
0
?
x_ptr
[
i
]
:
alpha_ptr
[
0
]
*
x_ptr
[
i
];
}
}
}
}
#endif
}
}
}
// namespace operators
}
// namespace operators
}
// namespace paddle_mobile
}
// namespace paddle_mobile
...
...
src/operators/kernel/central-arm-func/conv_add_add_prelu_arm_func.h
0 → 100644
浏览文件 @
b59e8f1c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP
#pragma once
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
// Fused convolution + bias add + element-wise add + PReLU.
//
// For every batch item and every group the input slice is lowered to a column
// matrix (im2col for 2-D filters, vol2col for 3-D filters, or used as-is when
// math::IsExpand reports that no expansion is needed). The product
// filter_slice * col_matrix is then produced by math::matmulWithPRelu, which
// also receives the bias pointer, the matching slice of Bias1 and the PReLU
// alpha pointer / mode.
//
// The bias pointer (and, for mode "channel", the alpha pointer) is advanced by
// g * out_step for group g so that each group sees the entries belonging to
// its own output rows. Previously the unshifted pointers were passed for every
// group, which made groups > 1 reuse group 0's values. For groups == 1 the
// offset is zero and nothing changes.
// NOTE(review): this relies on matmulWithPRelu indexing bias/alpha by row of
// filter_slice (SgemmWithPRelu does: `bias + i`, `p + i`) -- confirm. Alpha is
// left unshifted for the other modes because its layout there cannot be
// derived from this function.
//
// param: operator parameters; Input/Filter/Bias/Bias1/InputAlpha are read and
//        Output is written through the pointers it returns.
template <typename P>
void ConvAddAddPReluCompute(const FusionConvAddAddPReluParam<CPU> &param) {
  const Tensor *input = param.Input();
  Tensor filter = *param.Filter();  // local copy; it is resized below
  Tensor bias = *param.Bias();
  Tensor bias1 = *param.Bias1();
  Tensor *output = param.Output();
  float *biase_data = bias.data<float>();

  int groups = param.Groups();
  std::vector<int> strides = param.Strides();
  std::vector<int> paddings = param.Paddings();
  std::vector<int> dilations = param.Dilations();

  Tensor aa = *param.InputAlpha();
  float *p = aa.data<float>();
  std::string mode = param.Mode();
  const bool alpha_per_channel = (mode == "channel");

  const int batch_size = static_cast<int>(input->dims()[0]);

  // Column buffer shape: {in_channels / groups, filter spatial dims...,
  // output spatial dims...}.
  std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
  std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
  size_t data_dim = filter_shape_vec.size() - 2;
  std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
  col_shape_vec[0] = input->dims()[1] / groups;
  for (size_t j = 0; j < data_dim; ++j) {
    col_shape_vec[j + 1] = filter_shape_vec[j + 2];
    col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
  }
  framework::DDim col_shape(framework::make_ddim(col_shape_vec));
  framework::DDim col_matrix_shape =
      framework::flatten_to_2d(col_shape, data_dim + 1);

  bool is_expand =
      math::IsExpand(filter_shape_vec, strides, paddings, dilations);
  Tensor col;
  Tensor col_matrix;
  if (is_expand) {
    // A dedicated buffer is only needed when the input has to be expanded;
    // col_matrix is a 2-D view onto the same storage.
    col.mutable_data<float>(col_shape);
    col_matrix.ShareDataWith(col);
    col_matrix.Resize(col_matrix_shape);
  }

  framework::DDim input_shape = framework::slice_ddim(
      input->dims(), 1, static_cast<int>(input->dims().size()));

  framework::DDim filter_matrix_shape = {filter.dims()[0],
                                         filter.numel() / filter.dims()[0]};
  filter.Resize(filter_matrix_shape);
  framework::DDim output_matrix_shape = {
      output->dims()[1],
      output->numel() / (output->dims()[0] * output->dims()[1])};

  // convolution operator: im2col(or vol2col) + gemm
  int in_step = static_cast<int>(input->dims()[1]) / groups;
  int out_step = static_cast<int>(output->dims()[1]) / groups;

  math::Vol2ColFunctor<CPU, float> vol2col;
  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;

  for (int i = 0; i < batch_size; i++) {
    Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
    Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
    Tensor bias1_batch = bias1.Slice(i, i + 1).Resize(output_matrix_shape);

    for (int g = 0; g < groups; g++) {
      Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);

      if (!is_expand) {
        col.ShareDataWith(in_slice);
        col_matrix.ShareDataWith(col);
        col_matrix.Resize(col_matrix_shape);
      } else if (data_dim == 2U) {
        // im2col
        im2col(in_slice, dilations, strides,
               std::vector<int>{paddings[0], paddings[1], paddings[0],
                                paddings[1]},
               &col);
      } else if (data_dim == 3U) {
        // vol2col
        vol2col(in_slice, dilations, strides, paddings, &col);
      }

      // gemm
      const int row_offset = g * out_step;
      Tensor out_slice = out_batch.Slice(row_offset, row_offset + out_step);
      Tensor filter_slice = filter.Slice(row_offset, row_offset + out_step);
      Tensor bias1_slice =
          bias1_batch.Slice(row_offset, row_offset + out_step);
      float *biase_data1 = bias1_slice.data<float>();

      // Per-row operands must start at this group's first output row.
      float *group_bias = biase_data + row_offset;
      float *group_alpha = alpha_per_channel ? p + row_offset : p;

      math::matmulWithPRelu(filter_slice, false, col_matrix, false, &out_slice,
                            group_alpha, mode, group_bias, biase_data1);
    }
  }
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/central-arm-func/conv_add_prelu_arm_func.h
0 → 100644
浏览文件 @
b59e8f1c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP
#pragma once
#include <vector>
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
template
<
typename
P
>
void
ConvAddPReluCompute
(
const
FusionConvAddPReluParam
<
CPU
>
&
param
)
{
const
Tensor
*
input
=
param
.
Input
();
Tensor
filter
=
*
param
.
Filter
();
Tensor
bias
=
*
param
.
Bias
();
// DLOG<<"yangfei";
// DLOG<<bias.dims();
int
axis
=
param
.
Axis
();
Tensor
*
output
=
param
.
Output
();
float
*
biase_data
=
bias
.
data
<
float
>
();
int
groups
=
param
.
Groups
();
std
::
vector
<
int
>
strides
=
param
.
Strides
();
std
::
vector
<
int
>
paddings
=
param
.
Paddings
();
std
::
vector
<
int
>
dilations
=
param
.
Dilations
();
Tensor
aa
=
*
param
.
InputAlpha
();
float
*
p
=
aa
.
data
<
float
>
();
std
::
string
mode
=
param
.
Mode
();
const
int
batch_size
=
static_cast
<
int
>
(
input
->
dims
()[
0
]);
std
::
vector
<
int64_t
>
filter_shape_vec
(
framework
::
vectorize
(
filter
.
dims
()));
std
::
vector
<
int64_t
>
output_shape_vec
(
framework
::
vectorize
(
output
->
dims
()));
size_t
data_dim
=
filter_shape_vec
.
size
()
-
2
;
std
::
vector
<
int64_t
>
col_shape_vec
(
1
+
2
*
data_dim
);
col_shape_vec
[
0
]
=
input
->
dims
()[
1
]
/
groups
;
for
(
size_t
j
=
0
;
j
<
data_dim
;
++
j
)
{
col_shape_vec
[
j
+
1
]
=
filter_shape_vec
[
j
+
2
];
col_shape_vec
[
j
+
1
+
data_dim
]
=
output_shape_vec
[
j
+
2
];
}
framework
::
DDim
col_shape
(
framework
::
make_ddim
(
col_shape_vec
));
framework
::
DDim
col_matrix_shape
=
framework
::
flatten_to_2d
(
col_shape
,
data_dim
+
1
);
bool
is_expand
=
math
::
IsExpand
(
filter_shape_vec
,
strides
,
paddings
,
dilations
);
Tensor
col
;
Tensor
col_matrix
;
if
(
is_expand
)
{
col
.
mutable_data
<
float
>
(
col_shape
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
framework
::
DDim
input_shape
=
framework
::
slice_ddim
(
input
->
dims
(),
1
,
static_cast
<
int
>
(
input
->
dims
().
size
()));
framework
::
DDim
filter_matrix_shape
=
{
filter
.
dims
()[
0
],
filter
.
numel
()
/
filter
.
dims
()[
0
]};
filter
.
Resize
(
filter_matrix_shape
);
framework
::
DDim
output_matrix_shape
=
{
output
->
dims
()[
1
],
output
->
numel
()
/
(
output
->
dims
()[
0
]
*
output
->
dims
()[
1
])};
// convolution operator: im2col(or vol2col) + gemm
int
in_step
=
static_cast
<
int
>
(
input
->
dims
()[
1
])
/
groups
;
int
out_step
=
static_cast
<
int
>
(
output
->
dims
()[
1
])
/
groups
;
math
::
Vol2ColFunctor
<
CPU
,
float
>
vol2col
;
math
::
Im2ColFunctor
<
math
::
ColFormat
::
kCFO
,
CPU
,
float
>
im2col
;
for
(
int
i
=
0
;
i
<
batch_size
;
i
++
)
{
Tensor
in_batch
=
input
->
Slice
(
i
,
i
+
1
).
Resize
(
input_shape
);
Tensor
out_batch
=
output
->
Slice
(
i
,
i
+
1
).
Resize
(
output_matrix_shape
);
for
(
int
g
=
0
;
g
<
groups
;
g
++
)
{
Tensor
in_slice
=
in_batch
.
Slice
(
g
*
in_step
,
(
g
+
1
)
*
in_step
);
if
(
!
is_expand
)
{
col
.
ShareDataWith
(
in_slice
);
col_matrix
.
ShareDataWith
(
col
);
col_matrix
.
Resize
(
col_matrix_shape
);
}
else
if
(
data_dim
==
2U
)
{
// im2col
im2col
(
in_slice
,
dilations
,
strides
,
std
::
vector
<
int
>
{
paddings
[
0
],
paddings
[
1
],
paddings
[
0
],
paddings
[
1
]},
&
col
);
}
else
if
(
data_dim
==
3U
)
{
// vol2col
vol2col
(
in_slice
,
dilations
,
strides
,
paddings
,
&
col
);
}
// gemm
Tensor
out_slice
=
out_batch
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
Tensor
filter_slice
=
filter
.
Slice
(
g
*
out_step
,
(
g
+
1
)
*
out_step
);
// math::matmul<float>(filter_slice, false, col_matrix,
// false,
// static_cast<float>(1),
// &out_slice,
// static_cast<float>(1), true,
// biase_data);
math
::
matmulWithPRelu
(
filter_slice
,
false
,
col_matrix
,
false
,
&
out_slice
,
p
,
mode
,
biase_data
,
nullptr
);
}
}
}
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/conv_add_add_prelu_kernel.h
0 → 100644
浏览文件 @
b59e8f1c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_CONVADDADDPRELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
using
framework
::
DDim
;
using
framework
::
OpKernelBase
;
// Kernel interface for the fused conv + add + add + prelu operator.
// Only declarations live here; the member functions are specialised per
// device/type in the kernel sources (e.g. the ARM CPU / float version).
template <typename DeviceType, typename T>
class ConvAddAddPReluKernel
    : public OpKernelBase<DeviceType,
                          FusionConvAddAddPReluParam<DeviceType>> {
 public:
  // Executes the fused operator using the tensors and attributes in `param`.
  void Compute(const FusionConvAddAddPReluParam<DeviceType> &param) const;

  // Setup hook taking the mutable parameter bundle; returns a bool status.
  bool Init(FusionConvAddAddPReluParam<DeviceType> *param);
};
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/kernel/conv_add_prelu_kernel.h
0 → 100644
浏览文件 @
b59e8f1c
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef FUSION_CONVADDPRELU_OP
#include <vector>
#include "framework/ddim.h"
#include "framework/operator.h"
#include "operators/math/conv_func.h"
#include "operators/math/im2col.h"
#include "operators/math/math_function.h"
#include "operators/math/vol2col.h"
#include "operators/op_param.h"
namespace
paddle_mobile
{
namespace
operators
{
using
framework
::
DDim
;
using
framework
::
OpKernelBase
;
// Kernel interface for the fused conv + add + prelu operator.
// Only declarations live here; the member functions are specialised per
// device/type in the kernel sources (e.g. the ARM CPU / float version).
template <typename DeviceType, typename T>
class ConvAddPReluKernel
    : public OpKernelBase<DeviceType, FusionConvAddPReluParam<DeviceType>> {
 public:
  // Executes the fused operator using the tensors and attributes in `param`.
  void Compute(const FusionConvAddPReluParam<DeviceType> &param) const;

  // Setup hook taking the mutable parameter bundle; returns a bool status.
  bool Init(FusionConvAddPReluParam<DeviceType> *param);
};
}
// namespace operators
}
// namespace paddle_mobile
#endif
src/operators/math/gemm.cpp
浏览文件 @
b59e8f1c
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "operators/math/gemm.h"
#include "operators/math/gemm.h"
#include <string>
#include "common/log.h"
#include "common/log.h"
#include "memory/t_malloc.h"
#include "memory/t_malloc.h"
#if __ARM_NEON
#if __ARM_NEON
...
@@ -707,6 +708,25 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
...
@@ -707,6 +708,25 @@ void InnerKernelWithBn(int mc, int nc, float alpha, const float *a,
}
}
}
}
// Multiplies one packed mc x KC panel of A by one packed KC x nc panel of B
// into the scratch buffer `c` (row stride NC), one MR x NR micro-tile at a
// time, then lets WriteWithAddPRelu add the biases, apply PReLU and store the
// block into `C` (row stride ldc).
//
// mc, nc      : rows / columns of the block
// a, b        : packed panels of A and B
// c           : scratch buffer receiving the raw product
// C, ldc      : destination block and its row stride
// p, mode     : PReLU alpha pointer and mode, forwarded unchanged
// bias, bias1 : bias pointers, forwarded unchanged (bias1 may be nullptr)
void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
                          float *c, float *C, int ldc, float *p,
                          std::string mode, float *bias, float *bias1) {
#pragma omp parallel for
  for (int col = 0; col < nc; col += NR) {
    const float *b_panel = b + col * KC;
    for (int row = 0; row < mc; row += MR) {
      const float *a_panel = a + row * KC;
      float *c_tile = c + row * NC + col;
#if __aarch64__
      AddDot6x16(KC, a_panel, b_panel, c_tile, NC);
#else
      AddDot6x8(KC, a_panel, b_panel, c_tile, NC);
#endif
    }
  }

  WriteWithAddPRelu(mc, nc, c, C, ldc, p, mode, bias, bias1);
}
#if __ARM_NEON
#if __ARM_NEON
#if __aarch64__
#if __aarch64__
...
@@ -988,6 +1008,82 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
...
@@ -988,6 +1008,82 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
}
}
}
}
// C = A * B + C,prelu(C)
void
WriteWithAddPRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{
int
nc1
=
nc
/
4
;
int
_nc1
=
nc
%
4
;
float
*
c_ptr
,
*
C_ptr
;
float32x4_t
cv
;
float32x4_t
cv1
;
float32x4_t
biasv
;
float32x4_t
biasv1
;
float32x4_t
zero
=
vdupq_n_f32
(
0.0
);
float32x4_t
pv
;
float
*
ptr
=
p
;
for
(
int
i
=
0
;
i
<
mc
;
++
i
)
{
c_ptr
=
c
+
i
*
NC
;
C_ptr
=
C
+
i
*
ldc
;
biasv
=
vld1q_dup_f32
(
bias
+
i
);
if
(
bias1
==
nullptr
)
{
biasv1
=
zero
;
}
else
{
biasv1
=
vld1q_dup_f32
(
bias1
+
i
);
}
for
(
int
j
=
0
;
j
<
nc1
;
++
j
)
{
cv
=
vld1q_f32
(
c_ptr
);
cv
=
vaddq_f32
(
cv
,
biasv
);
cv
=
vaddq_f32
(
cv
,
biasv1
);
cv
=
vmaxq_f32
(
cv
,
zero
);
cv1
=
vminq_f32
(
cv
,
zero
);
if
(
mode
==
"channel"
)
{
cv1
=
vmulq_n_f32
(
cv1
,
ptr
[
i
]);
}
else
if
(
mode
==
"element"
)
{
pv
=
vld1q_f32
(
ptr
);
cv1
=
vmulq_f32
(
cv1
,
pv
);
ptr
=
ptr
+
4
;
}
else
{
cv1
=
vmulq_n_f32
(
cv1
,
ptr
[
0
]);
}
cv
=
vaddq_f32
(
cv
,
cv1
);
vst1q_f32
(
C_ptr
,
cv
);
c_ptr
+=
4
;
C_ptr
+=
4
;
}
if
(
_nc1
!=
0
)
{
cv
=
vld1q_f32
(
c_ptr
);
cv
=
vaddq_f32
(
cv
,
biasv
);
cv
=
vaddq_f32
(
cv
,
biasv1
);
cv
=
vmaxq_f32
(
cv
,
zero
);
cv1
=
vminq_f32
(
cv
,
zero
);
if
(
mode
==
"channel"
)
{
cv1
=
vmulq_n_f32
(
cv1
,
ptr
[
i
]);
}
else
if
(
mode
==
"element"
)
{
pv
=
vld1q_f32
(
ptr
);
cv1
=
vmulq_f32
(
cv1
,
pv
);
ptr
=
ptr
+
4
;
}
else
{
cv1
=
vmulq_n_f32
(
cv1
,
ptr
[
0
]);
}
cv
=
vaddq_f32
(
cv
,
cv1
);
if
(
_nc1
>=
1
)
{
vst1q_lane_f32
(
C_ptr
,
cv
,
0
);
C_ptr
++
;
}
if
(
_nc1
>=
2
)
{
vst1q_lane_f32
(
C_ptr
,
cv
,
1
);
C_ptr
++
;
}
if
(
_nc1
>=
3
)
{
vst1q_lane_f32
(
C_ptr
,
cv
,
2
);
C_ptr
++
;
}
}
}
}
// C = A * B, batchnorm(C)
// C = A * B, batchnorm(C)
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
)
{
float
*
new_bias
)
{
...
@@ -1971,6 +2067,145 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
...
@@ -1971,6 +2067,145 @@ void WriteWithAddReluV1(int mc, int nc, float *c, float *C, int ldc,
}
}
}
}
void
WriteWithAddPRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
)
{
if
(
nc
<
4
)
{
if
(
bias1
==
nullptr
)
{
for
(
int
i
=
0
;
i
<
mc
;
++
i
)
{
for
(
int
j
=
0
;
j
<
nc
;
++
j
)
{
float
r
=
*
c
+
*
bias
;
if
(
r
<
0
)
{
r
=
*
p
;
}
c
++
;
}
bias
++
;
p
++
;
}
}
else
{
for
(
int
i
=
0
;
i
<
mc
;
++
i
)
{
for
(
int
j
=
0
;
j
<
nc
;
++
j
)
{
float
r
=
*
c
+
*
bias
;
r
+=
*
bias1
;
if
(
r
<
0
)
{
r
*=
*
p
;
}
c
++
;
bias1
++
;
}
bias
++
;
p
++
;
}
}
return
;
}
int
nc1
=
nc
/
8
;
int
step
=
4
*
(
ldc
-
nc
);
int
step1
=
4
*
(
NC
-
nc
);
if
(
bias1
==
nullptr
)
{
asm
volatile
(
"vmov.f32 q14, #0.0
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"blt end_mc_%=
\n\t
"
"loop_mc_%=:
\n\t
"
"mov r5, %[nc1]
\n\t
"
"vld1.32 {d0}, [%[bias]]
\n\t
"
"vld1.32 {d1}, [%[p]]
\n\t
"
"vdup.32 q1, d0[0]
\n\t
"
"vdup.32 q2, d1[0]
\n\t
"
"subs r5, r5, #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"pld [%[c], #32]
\n\t
"
"vld1.32 {q3, q4}, [%[c]]!
\n\t
"
"vadd.f32 q3, q3, q1
\n\t
"
"vadd.f32 q4, q4, q1
\n\t
"
"vmax.f32 q5, q3, q14
\n\t
"
"vmin.f32 q7, q3, q14
\n\t
"
"vmax.f32 q6, q4, q14
\n\t
"
"vmin.f32 q8, q4, q14
\n\t
"
"vmla.f32 q5, q7, q2
\n\t
"
"vmla.f32 q6, q8, q2
\n\t
"
"vst1.32 {q5, q6}, [%[C]]!
\n\t
"
"subs r5, r5, #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
"add %[p], %[p], #4
\n\t
"
"add %[bias], %[bias], #4
\n\t
"
"add %[c], %[c], %[step1]
\n\t
"
"add %[C], %[C], %[step]
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"bge loop_mc_%=
\n\t
"
"end_mc_%=:
\n\t
"
:
:
[
C
]
"r"
(
C
),
[
c
]
"r"
(
c
),
[
mc
]
"r"
(
mc
),
[
nc1
]
"r"
(
nc1
),
[
step
]
"r"
(
step
),
[
step1
]
"r"
(
step1
),
[
p
]
"r"
(
p
),
[
bias
]
"r"
(
bias
),
[
bias1
]
"r"
(
bias1
)
:
"memory"
,
"r5"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
);
}
else
{
asm
volatile
(
"vmov.f32 q14, #0.0
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"blt end_mc_%=
\n\t
"
"loop_mc_%=:
\n\t
"
"mov r5, %[nc1]
\n\t
"
"vld1.32 {d0}, [%[bias]]
\n\t
"
"vld1.32 {d1}, [%[p]]
\n\t
"
"vdup.32 q1, d0[0]
\n\t
"
"vdup.32 q2, d1[0]
\n\t
"
"subs r5, r5, #1
\n\t
"
"blt end_nc1_%=
\n\t
"
"loop_nc1_%=:
\n\t
"
"pld [%[c], #32]
\n\t
"
"pld [%[bias1], #32]
\n\t
"
"vld1.32 {q3, q4}, [%[c]]!
\n\t
"
"vld1.32 {q9, q10}, [%[bias1]]!
\n\t
"
"vadd.f32 q3, q3, q1
\n\t
"
"vadd.f32 q4, q4, q1
\n\t
"
"vadd.f32 q3, q3, q9
\n\t
"
"vadd.f32 q4, q4, q10
\n\t
"
"vmax.f32 q5, q3, q14
\n\t
"
"vmin.f32 q7, q3, q14
\n\t
"
"vmax.f32 q6, q4, q14
\n\t
"
"vmin.f32 q8, q4, q14
\n\t
"
"vmla.f32 q5, q7, q2
\n\t
"
"vmla.f32 q6, q8, q2
\n\t
"
"vst1.32 {q5, q6}, [%[C]]!
\n\t
"
"subs r5, r5, #1
\n\t
"
"bge loop_nc1_%=
\n\t
"
"end_nc1_%=:
\n\t
"
"add %[p], %[p], #4
\n\t
"
"add %[bias], %[bias], #4
\n\t
"
"add %[c], %[c], %[step1]
\n\t
"
"add %[C], %[C], %[step]
\n\t
"
"subs %[mc], %[mc], #1
\n\t
"
"bge loop_mc_%=
\n\t
"
"end_mc_%=:
\n\t
"
:
:
[
C
]
"r"
(
C
),
[
c
]
"r"
(
c
),
[
mc
]
"r"
(
mc
),
[
nc1
]
"r"
(
nc1
),
[
step
]
"r"
(
step
),
[
step1
]
"r"
(
step1
),
[
p
]
"r"
(
p
),
[
bias
]
"r"
(
bias
),
[
bias1
]
"r"
(
bias1
)
:
"memory"
,
"r5"
,
"q0"
,
"q1"
,
"q2"
,
"q3"
,
"q4"
,
"q5"
,
"q6"
,
"q7"
,
"q8"
,
"q9"
,
"q10"
);
}
}
// C = A * B, batchnorm(C)
// C = A * B, batchnorm(C)
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
scale
,
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
scale
,
float
*
bias
)
{
float
*
bias
)
{
...
@@ -2513,6 +2748,9 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {}
...
@@ -2513,6 +2748,9 @@ void WriteWithAddRelu(int mc, int nc, float *c, float *C, int ldc) {}
void
WriteWithAddReluV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
void
WriteWithAddReluV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
)
{}
float
*
bias
)
{}
// Empty placeholder with the same signature as the NEON implementations:
// it accepts the arguments and leaves every buffer untouched.
// NOTE(review): presumably compiled only when NEON is unavailable, like the
// neighbouring empty Write* stubs -- confirm against the enclosing #if.
void WriteWithAddPRelu(int mc, int nc, float *c, float *C, int ldc, float *p,
                       std::string mode, float *bias, float *bias1) {}
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
void
WriteWithBn
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
new_scale
,
float
*
new_bias
)
{}
float
*
new_bias
)
{}
...
@@ -2642,6 +2880,74 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
...
@@ -2642,6 +2880,74 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile
::
memory
::
Free
(
zero
);
paddle_mobile
::
memory
::
Free
(
zero
);
}
}
// Single-threaded blocked SGEMM with a fused bias + PReLU epilogue:
// C = prelu(A * B + bias + bias1).
//
// A is m x k (row stride lda), B is k x n (row stride ldb), C is m x n (row
// stride ldc). The matrices are processed in MC x NC blocks: each block of B
// and A is packed, multiplied by InnerKernelWithPRelu into packedC and written
// to C with the biases and PReLU applied.
//
// p, mode : PReLU alpha pointer and mode; `p + i` is passed for the block
//           starting at row i.
// bias    : `bias + i` is passed for the block starting at row i.
// bias1   : optional (may be nullptr); offset like C, i.e. bias1 + i*ldc + j.
//
// The block sizes are derived from fixed cache budgets. When k is large
// enough that the budget divided by KC * sizeof(float) is zero (k > 8192 for
// MC), the previous code divided by that zero; the block size now falls back
// to a single micro-tile (MR / NR) instead.
void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
                    const float *B, int ldb, float *C, int ldc, float *p,
                    std::string mode, float *bias, float *bias1) {
  // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
  // L2 cache is 0.5~4 Mib (Contex-A72 cluster)
  int L1 = 32 * 1024;
  int L2 = 512 * 1024;

  KC = k;
  MC = L1 / (KC * sizeof(float));
  NC = L2 / (KC * sizeof(float));

  // Balance the blocks and make MC a multiple of MR and NC a multiple of NR.
  if (MC == 0) {
    MC = MR;
  } else {
    int mblock_num = (m + MC - 1) / MC;
    MC = (m + mblock_num - 1) / mblock_num;
    MC = (MC + MR - 1) / MR * MR;
  }
  if (NC == 0) {
    NC = NR;
  } else {
    int nblock_num = (n + NC - 1) / NC;
    NC = (n + nblock_num - 1) / nblock_num;
    NC = (NC + NR - 1) / NR * NR;
  }

  packedA = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
  packedB = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
  packedC = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC));
  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));

  for (int l = 0; l < KC; ++l) {
    zero[l] = 0;
  }

  for (int j = 0; j < n; j += NC) {
    const int nc = s_min(n - j, NC);
#if __aarch64__
    PackMatrixB_16c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#else
    PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB);
#endif
    for (int i = 0; i < m; i += MC) {
      const int mc = s_min(m - i, MC);
      // The same A packing routine is used on aarch64 and armv7.
      PackMatrixA_6r(mc, KC, mc % MR, &A(i, 0), lda, packedA);

      // bias1 is optional; when present it is offset like C(i, j).
      float *block_bias1 = (bias1 == nullptr) ? nullptr : bias1 + i * ldc + j;
      InnerKernelWithPRelu(mc, nc, packedA, packedB, packedC, &C(i, j), ldc,
                           p + i, mode, bias + i, block_bias1);
    }
  }

  paddle_mobile::memory::Free(packedA);
  paddle_mobile::memory::Free(packedB);
  paddle_mobile::memory::Free(packedC);
  paddle_mobile::memory::Free(zero);
}
// 32位 float 矩阵乘法
// 32位 float 矩阵乘法
void
Sgemm_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
void
Sgemm_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
...
@@ -2857,6 +3163,123 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
...
@@ -2857,6 +3163,123 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
paddle_mobile
::
memory
::
Free
(
zero
);
paddle_mobile
::
memory
::
Free
(
zero
);
}
}
// OpenMP variant of SgemmWithPRelu: C = prelu(A * B + bias + bias1).
//
// A is m x k (row stride lda), B is k x n (row stride ldb), C is m x n (row
// stride ldc). The larger dimension is split into blocks that are processed
// in a parallel loop; the other operand is packed once up front:
//  * m > n : B is packed once, A is packed per row block (per-thread buffer);
//  * else  : A is packed once, B is packed per column block (per-thread
//            buffer).
// Each thread also gets its own MC x NC slice of packedC.
//
// p, mode : PReLU alpha pointer and mode; `p + i` is passed for the row block
//           starting at i.
// bias    : `bias + i` is passed for the row block starting at i.
// bias1   : optional (may be nullptr); offset like C.
//
// Fixes over the previous version:
//  * When L1 / (KC * sizeof(float)) is zero (k > 8192) the block count was
//    computed with a division by zero; the block size now falls back to a
//    single micro-tile (MR / NR).
//  * The one-off packing calls passed the padded sizes (NC, NC % NR) and
//    (MC, MC % MR), i.e. a zero tail, so the pack routine read rows/columns
//    beyond the real matrix. They now pass the real size and remainder, as
//    every other pack call in this file does.
//
// NOTE(review): InnerKernelWithPRelu contains its own `omp parallel for`, so
// the regions below are nested; the effect depends on the OpenMP nesting
// configuration.
void SgemmWithPRelu_omp(int m, int n, int k, const float *A, int lda,
                        const float *B, int ldb, float *C, int ldc, float *p,
                        std::string mode, float *bias, float *bias1) {
#ifdef _OPENMP
  int max_threads = omp_get_max_threads();
#else
  int max_threads = 1;
#endif

  int L1 = 32 * 1024;
  KC = k;
  if (m > n) {
    // Split A into row blocks.
    MC = L1 / (KC * sizeof(float));
    if (MC == 0) {
      MC = MR;
    } else {
      int mblock_num = (m + MC - 1) / MC;
      MC = (m + mblock_num - 1) / mblock_num;
      MC = (MC + MR - 1) / MR * MR;
    }
    // Pad B to a multiple of NR columns.
    NC = (n + NR - 1) / NR * NR;

#if __aarch64__
    procPackA = PackMatrixA_6r;
    procPackB = PackMatrixB_omp_16c;
    procAddDot = AddDot6x16;
#else
    procPackA = PackMatrixA_6r;
    procPackB = PackMatrixB_omp_8c;
    procAddDot = AddDot6x8;
#endif

    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC));
    // Pack the n real columns; the remainder n % NR is the tail to pad.
    procPackB(KC, n, n % NR, B, ldb, packedB);
    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC * max_threads));
  } else {
    // Split B into column blocks.
    NC = L1 / (KC * sizeof(float));
    if (NC == 0) {
      NC = NR;
    } else {
      int nblock_num = (n + NC - 1) / NC;
      NC = (n + nblock_num - 1) / nblock_num;
      NC = (NC + NR - 1) / NR * NR;
    }
    // Pad A to a multiple of MR rows.
    MC = (m + MR - 1) / MR * MR;

#if __aarch64__
    procPackA = PackMatrixA_omp_6r;
    procPackB = PackMatrixB_16c;
    procAddDot = AddDot6x16;
#else
    procPackA = PackMatrixA_omp_6r;
    procPackB = PackMatrixB_8c;
    procAddDot = AddDot6x8;
#endif

    packedA = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * MC * KC));
    // Pack the m real rows; the remainder m % MR is the tail to pad.
    procPackA(m, KC, m % MR, A, lda, packedA);
    packedB = static_cast<float *>(
        paddle_mobile::memory::Alloc(sizeof(float) * KC * NC * max_threads));
  }
  zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
  memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
  packedC = static_cast<float *>(
      paddle_mobile::memory::Alloc(sizeof(float) * MC * NC * max_threads));

  if (m > n) {
#pragma omp parallel for
    for (int i = 0; i < m; i += MC) {
#ifdef _OPENMP
      int local_threads = omp_get_thread_num();
#else
      int local_threads = 0;
#endif

      int mc = s_min(m - i, MC);
      // Per-thread slices of the shared packing / scratch buffers.
      float *local_A = packedA + MC * KC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
      procPackA(mc, KC, mc % MR, &A(i, 0), lda, local_A);

      float *block_bias1 = (bias1 == nullptr) ? nullptr : bias1 + i * ldc;
      InnerKernelWithPRelu(mc, n, local_A, packedB, local_C, &C(i, 0), ldc,
                           p + i, mode, bias + i, block_bias1);
    }
  } else {
#pragma omp parallel for
    for (int j = 0; j < n; j += NC) {
#ifdef _OPENMP
      int local_threads = omp_get_thread_num();
#else
      int local_threads = 0;
#endif

      int nc = s_min(n - j, NC);
      // Per-thread slices of the shared packing / scratch buffers.
      float *local_B = packedB + KC * NC * local_threads;
      float *local_C = packedC + MC * NC * local_threads;
      procPackB(KC, nc, nc % NR, &B(0, j), ldb, local_B);

      float *block_bias1 = (bias1 == nullptr) ? nullptr : bias1 + j;
      InnerKernelWithPRelu(m, nc, packedA, local_B, local_C, &C(0, j), ldc, p,
                           mode, bias, block_bias1);
    }
  }

  paddle_mobile::memory::Free(packedA);
  paddle_mobile::memory::Free(packedB);
  paddle_mobile::memory::Free(packedC);
  paddle_mobile::memory::Free(zero);
}
void
AddDot6x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
void
AddDot6x8
(
int
k
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
int
ldc
)
{
#if __ARM_NEON
#if __ARM_NEON
#if __aarch64__
#if __aarch64__
...
...
src/operators/math/gemm.h
浏览文件 @
b59e8f1c
...
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
...
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#pragma once
#pragma once
#include <string>
#include "common/log.h"
// 矩阵取值运算宏,假设矩阵按行存储
// 矩阵取值运算宏,假设矩阵按行存储
#define A(i, j) A[(i)*lda + (j)]
#define A(i, j) A[(i)*lda + (j)]
...
@@ -79,6 +81,9 @@ void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
...
@@ -79,6 +81,9 @@ void InnerKernelWithBias(int mc, int nc, float alpha, const float *a,
void
InnerKernelWithBn
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
void
InnerKernelWithBn
(
int
mc
,
int
nc
,
float
alpha
,
const
float
*
a
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
const
float
*
b
,
float
beta
,
float
*
c
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
);
bool
relu
,
float
*
new_scale
,
float
*
new_bias
);
// Inner GEMM kernel for the PReLU-fused path; the definition is not part of
// this header.
// NOTE(review): judging from the SgemmWithPRelu_omp call sites, `a` / `b` are
// the packed A / B panels, `c` is a scratch buffer, `C` points at the
// destination tile (row stride `ldc`), `p` and `bias` arrive already offset to
// the tile's first row, and `bias1` is optional (nullptr when absent) --
// confirm against the definition.
void
InnerKernelWithPRelu
(
int
mc
,
int
nc
,
const
float
*
a
,
const
float
*
b
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
/*
/*
// 向量矩阵乘法 (M = 1)
// 向量矩阵乘法 (M = 1)
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
void VectorKernel(int m, int n, int k, float alpha, const float *A, int lda,
...
@@ -108,6 +113,9 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
...
@@ -108,6 +113,9 @@ void WriteWithAdd(int mc, int nc, float *c, float *C, int ldc);
void
WriteWithAddV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
);
void
WriteWithAddV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
);
// C = A * B + C, relu(C)
// C = A * B + C, relu(C)
void
WriteWithAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
void
WriteWithAddRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
);
// C = A * B + C, prelu(C)
// NOTE(review): the signature also takes `bias` and `bias1`, which the formula
// above does not mention; presumably they are added before PReLU is applied
// with slope(s) `p` selected by `mode` -- confirm against the definition.
void
WriteWithAddPRelu
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
// C = A * B + bias ,relu(C)
// C = A * B + bias ,relu(C)
void
WriteWithAddReluV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
void
WriteWithAddReluV1
(
int
mc
,
int
nc
,
float
*
c
,
float
*
C
,
int
ldc
,
float
*
bias
);
float
*
bias
);
...
@@ -146,6 +154,10 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
...
@@ -146,6 +154,10 @@ void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
);
bool
relu
,
float
*
new_scale
,
float
*
new_bias
);
// Single-precision GEMM for the PReLU-fused path (non-OpenMP entry point; it
// is the one math::matmulWithPRelu calls when _OPENMP is not defined).
// m, n, k are the GEMM dimensions and lda / ldb / ldc the leading dimensions
// used by the A(i, j) / B(i, j) / C(i, j) row-major accessor macros.
// NOTE(review): `p`, `mode`, `bias` and `bias1` presumably have the same
// meaning as in SgemmWithPRelu_omp -- confirm against the definition.
void
SgemmWithPRelu
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
// 32位 float 矩阵乘法(openmp 多线程版本)
// 32位 float 矩阵乘法(openmp 多线程版本)
void
Sgemm_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
void
Sgemm_omp
(
int
m
,
int
n
,
int
k
,
float
alpha
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
...
@@ -156,6 +168,10 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
...
@@ -156,6 +168,10 @@ void SgemmWithBn_omp(int m, int n, int k, float alpha, const float *A, int lda,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
const
float
*
B
,
int
ldb
,
float
beta
,
float
*
C
,
int
ldc
,
bool
relu
,
float
*
new_scale
,
float
*
new_bias
);
bool
relu
,
float
*
new_scale
,
float
*
new_bias
);
// OpenMP entry point of the PReLU-fused GEMM (the one math::matmulWithPRelu
// calls when _OPENMP is defined).
// The definition hands work to InnerKernelWithPRelu in row blocks or column
// blocks. For a block starting at row i it passes `p + i` and `bias + i`, and
// for `bias1` it passes `bias1 + i * ldc` (row blocks) or `bias1 + j` (column
// blocks), i.e. `bias1` is offset exactly like C. `bias1` may be nullptr, in
// which case nullptr is forwarded to the inner kernel.
void
SgemmWithPRelu_omp
(
int
m
,
int
n
,
int
k
,
const
float
*
A
,
int
lda
,
const
float
*
B
,
int
ldb
,
float
*
C
,
int
ldc
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
}
// namespace math
}
// namespace math
}
// namespace operators
}
// namespace operators
}
// namespace paddle_mobile
}
// namespace paddle_mobile
src/operators/math/math_function.cpp
浏览文件 @
b59e8f1c
...
@@ -87,6 +87,37 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
...
@@ -87,6 +87,37 @@ void matmulWithBn<float>(const framework::Tensor &matrix_a, bool trans_a,
new_bias
->
data
<
float
>
()
+
group
);
new_bias
->
data
<
float
>
()
+
group
);
#endif
#endif
}
}
// Multiplies matrix_a by matrix_b into matrix_out through the PReLU-fused
// GEMM (SgemmWithPRelu_omp when built with OpenMP, SgemmWithPRelu otherwise).
//
//   matrix_a, matrix_b  input tensors, read as float data
//   trans_a             only selects which dimension of matrix_a is used as K
//                       (dim 1 when false, dim 0 when true); the data itself
//                       is not transposed here
//   trans_b             currently ignored
//   matrix_out          destination tensor; its dims supply M (dim 0) and
//                       N (dim 1)
//   p, mode, bias,      forwarded untouched to the GEMM routine
//   bias1
//
// The leading dimensions handed to the GEMM are K for A and N for both B and
// the output.
// NOTE(review): no shape or device-placement validation is performed (the
// original enforce checks were commented out) -- callers must pass 2-D
// tensors with matching shapes.
void matmulWithPRelu(const framework::Tensor &matrix_a, bool trans_a,
                     const framework::Tensor &matrix_b, bool trans_b,
                     framework::Tensor *matrix_out, float *p, std::string mode,
                     float *bias, float *bias1) {
  auto dim_a = matrix_a.dims();
  auto dim_out = matrix_out->dims();

  int M = dim_out[0];
  int N = dim_out[1];
  int K = (!trans_a) ? dim_a[1] : dim_a[0];

#ifdef _OPENMP
  SgemmWithPRelu_omp(M, N, K, matrix_a.data<float>(), K,
                     matrix_b.data<float>(), N, matrix_out->data<float>(), N,
                     p, mode, bias, bias1);
#else
  SgemmWithPRelu(M, N, K, matrix_a.data<float>(), K, matrix_b.data<float>(), N,
                 matrix_out->data<float>(), N, p, mode, bias, bias1);
#endif
}
}
// namespace math
}
// namespace math
}
// namespace operators
}
// namespace operators
...
...
src/operators/math/math_function.h
浏览文件 @
b59e8f1c
...
@@ -33,6 +33,11 @@ void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
...
@@ -33,6 +33,11 @@ void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
framework
::
Tensor
*
matrix_out
,
T
beta
,
bool
relu
,
framework
::
Tensor
*
matrix_out
,
T
beta
,
bool
relu
,
framework
::
Tensor
*
new_scale
,
framework
::
Tensor
*
new_bias
,
framework
::
Tensor
*
new_scale
,
framework
::
Tensor
*
new_bias
,
int
group
);
int
group
);
// Matrix multiply through the PReLU-fused GEMM: writes the product of
// matrix_a and matrix_b into matrix_out, forwarding `p`, `mode`, `bias` and
// `bias1` to the GEMM routine.
// NOTE(review): the current definition ignores `trans_b` and uses `trans_a`
// only to pick K from matrix_a's dims -- verify before passing transposed
// operands.
void
matmulWithPRelu
(
const
framework
::
Tensor
&
matrix_a
,
bool
trans_a
,
const
framework
::
Tensor
&
matrix_b
,
bool
trans_b
,
framework
::
Tensor
*
matrix_out
,
float
*
p
,
std
::
string
mode
,
float
*
bias
,
float
*
bias1
);
}
// namespace math
}
// namespace math
}
// namespace operators
}
// namespace operators
}
// namespace paddle_mobile
}
// namespace paddle_mobile
src/operators/op_param.h
浏览文件 @
b59e8f1c
...
@@ -87,12 +87,21 @@ class OpParam {
...
@@ -87,12 +87,21 @@ class OpParam {
static
T
*
InputXFrom
(
const
VariableNameMap
&
inputs
,
const
Scope
&
scope
)
{
static
T
*
InputXFrom
(
const
VariableNameMap
&
inputs
,
const
Scope
&
scope
)
{
return
GetVarValue
<
T
>
(
"X"
,
inputs
,
scope
);
return
GetVarValue
<
T
>
(
"X"
,
inputs
,
scope
);
}
}
// Fetches a variable registered under the "addX" input key by delegating to
// GetVarValue1 (the "1" variant of the lookup, as opposed to GetVarValue).
template
<
typename
T
>
static
T
*
InputXFrom1
(
const
VariableNameMap
&
inputs
,
const
Scope
&
scope
)
{
return
GetVarValue1
<
T
>
(
"addX"
,
inputs
,
scope
);
}
template
<
typename
T
>
template
<
typename
T
>
static
T
*
InputYFrom
(
const
VariableNameMap
&
inputs
,
const
Scope
&
scope
)
{
static
T
*
InputYFrom
(
const
VariableNameMap
&
inputs
,
const
Scope
&
scope
)
{
return
GetVarValue
<
T
>
(
"Y"
,
inputs
,
scope
);
return
GetVarValue
<
T
>
(
"Y"
,
inputs
,
scope
);
}
}
// Fetches a variable registered under the "Y" input key by delegating to
// GetVarValue1 (the "1" variant of the lookup, as opposed to GetVarValue,
// which InputYFrom uses for the same key).
template
<
typename
T
>
static
T
*
InputYFrom1
(
const
VariableNameMap
&
inputs
,
const
Scope
&
scope
)
{
return
GetVarValue1
<
T
>
(
"Y"
,
inputs
,
scope
);
}
template
<
typename
T
>
template
<
typename
T
>
static
T
*
InputZFrom
(
const
VariableNameMap
&
inputs
,
const
Scope
&
scope
)
{
static
T
*
InputZFrom
(
const
VariableNameMap
&
inputs
,
const
Scope
&
scope
)
{
return
GetVarValue
<
T
>
(
"Z"
,
inputs
,
scope
);
return
GetVarValue
<
T
>
(
"Z"
,
inputs
,
scope
);
...
@@ -222,6 +231,26 @@ class OpParam {
...
@@ -222,6 +231,26 @@ class OpParam {
}
}
}
}
// Returns the variable name stored at position `index` in the name list
// registered under `key` in `var_map`.
// The list is bound by const reference so that only the requested name is
// copied, not the whole list.
// NOTE(review): neither `key` nor `index` is validated here; `key` must be
// present in `var_map` and `index` must be within the list's bounds.
static std::string getkey(const string &key, const VariableNameMap &var_map,
                          int index) {
  const auto &var_vec = var_map.at(key);
  return var_vec[index];
}
template
<
typename
T
>
static
T
*
GetVarValue1
(
const
string
&
key
,
const
VariableNameMap
&
var_map
,
const
Scope
&
scope
)
{
PADDLE_MOBILE_ENFORCE
(
var_map
.
count
(
key
)
>
0
,
"%s is not contained in var_map"
,
key
.
c_str
())
auto
var_vec
=
var_map
.
at
(
key
);
if
(
!
var_vec
.
empty
())
{
auto
var
=
scope
.
FindVar
(
var_vec
[
1
]);
return
var
->
GetMutable
<
T
>
();
}
else
{
return
nullptr
;
}
}
template
<
typename
T
>
template
<
typename
T
>
static
vector
<
T
*>
GetMultiVarValue
(
const
string
&
key
,
static
vector
<
T
*>
GetMultiVarValue
(
const
string
&
key
,
const
VariableNameMap
&
var_map
,
const
VariableNameMap
&
var_map
,
...
@@ -1184,6 +1213,163 @@ class FusionConvAddReluParam : public FusionConvAddParam<DeviceType> {
...
@@ -1184,6 +1213,163 @@ class FusionConvAddReluParam : public FusionConvAddParam<DeviceType> {
};
};
#endif
#endif
#ifdef FUSION_CONVADDPRELU_OP
// Parameter bundle for the fused conv + add + PReLU operator.
//
// The constructor resolves every tensor and attribute the kernel needs once,
// up front: the PReLU alpha tensor and "mode" attribute, the bias tensor
// (read through InputYFrom) with its "axis" attribute, the convolution
// filter / input / output tensors, and the "strides", "paddings",
// "dilations" and "groups" attributes. The accessors below expose them.
template <typename DeviceType>
class FusionConvAddPReluParam : public OpParam {
  typedef typename DtypeTensorTrait<DeviceType>::gtype GType;
  typedef typename DtypeTensorTrait<DeviceType>::rtype RType;

 public:
  FusionConvAddPReluParam(const VariableNameMap &inputs,
                          const VariableNameMap &outputs,
                          const AttributeMap &attrs, const Scope &scope) {
    // PReLU parameters.
    alpha_ = InputAlphaFrom<GType>(inputs, scope);
    mode_ = GetAttr<std::string>("mode", attrs);
    // Elementwise-add parameters.
    bias_ = InputYFrom<GType>(inputs, scope);
    axis_ = GetAttr<int>("axis", attrs);
    // Convolution parameters.
    filter_ = FilterFrom<GType>(inputs, scope);
    input_ = InputFrom<GType>(inputs, scope);
    output_ = OutFrom<GType>(outputs, scope);
    strides_ = GetAttr<vector<int>>("strides", attrs);
    paddings_ = GetAttr<vector<int>>("paddings", attrs);
    dilations_ = GetAttr<vector<int>>("dilations", attrs);
    groups = GetAttr<int>("groups", attrs);
  }

  // PReLU alpha tensor.
  const RType *InputAlpha() const { return alpha_; }
  // Value of the "mode" attribute.
  const std::string &Mode() const { return mode_; }
  // Bias tensor of the add stage.
  RType *Bias() const { return bias_; }
  // Value of the "axis" attribute.
  const int &Axis() const { return axis_; }

  const RType *Input() const { return input_; }

  // The FPGA build hands out a mutable filter; every other build exposes it
  // read-only.
#ifdef PADDLE_MOBILE_FPGA
  RType *Filter() const { return filter_; }
#else
  const RType *Filter() const { return filter_; }
#endif

  RType *Output() const { return output_; }

  const vector<int> &Strides() const { return strides_; }

  const vector<int> &Paddings() const { return paddings_; }

  const vector<int> &Dilations() const { return dilations_; }

  const int &Groups() const { return groups; }

 protected:
  RType *bias_;
  int axis_;
  RType *input_;
  RType *output_;
  RType *filter_;
  vector<int> strides_;
  vector<int> paddings_;
  vector<int> dilations_;
  int groups;
  RType *alpha_;
  std::string mode_;
#ifdef PADDLE_MOBILE_FPGA

 private:
  fpga::ConvArgs fpga_conv_args;

 public:
  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
#endif
};
#endif
#ifdef FUSION_CONVADDADDPRELU_OP
// Parameter bundle for the fused conv + add + add + PReLU operator.
//
// On top of what the conv + add + PReLU variant resolves (alpha / "mode",
// bias / "axis", filter, input, output, "strides", "paddings", "dilations",
// "groups"), this class resolves `bias1_`, the operand of the second add.
// Which input supplies `bias1_` is decided in the constructor by comparing
// the variable names recorded under the "addOut", "addX" and "Y" input keys.
template <typename DeviceType>
class FusionConvAddAddPReluParam : public OpParam {
  typedef typename DtypeTensorTrait<DeviceType>::gtype GType;
  typedef typename DtypeTensorTrait<DeviceType>::rtype RType;

 public:
  FusionConvAddAddPReluParam(const VariableNameMap &inputs,
                             const VariableNameMap &outputs,
                             const AttributeMap &attrs, const Scope &scope) {
    // Default choice for the second addend: the "1" lookup under "Y".
    bias1_ = InputYFrom1<GType>(inputs, scope);
    // PReLU parameters.
    alpha_ = InputAlphaFrom<GType>(inputs, scope);
    mode_ = GetAttr<std::string>("mode", attrs);
    // First add.
    bias_ = InputYFrom<GType>(inputs, scope);
    axis_ = GetAttr<int>("axis", attrs);
    // Convolution parameters.
    filter_ = FilterFrom<GType>(inputs, scope);
    input_ = InputFrom<GType>(inputs, scope);
    output_ = OutFrom<GType>(outputs, scope);
    strides_ = GetAttr<vector<int>>("strides", attrs);
    paddings_ = GetAttr<vector<int>>("paddings", attrs);
    dilations_ = GetAttr<vector<int>>("dilations", attrs);
    groups = GetAttr<int>("groups", attrs);

    keyOutput_ = getkey("addOut", inputs, 0);
    keyX1_ = getkey("addX", inputs, 1);
    keyY1_ = getkey("Y", inputs, 1);
    // The default above already covers the case where the second "addX" name
    // equals the "addOut" name. Only when instead the second "Y" name equals
    // the "addOut" name is the addend taken from "addX".
    // NOTE(review): presumably the name matching "addOut" is the first add's
    // own output, so the *other* operand is the real second addend -- confirm
    // against the fusion pass that fills these keys.
    if (keyX1_ != keyOutput_ && keyY1_ == keyOutput_) {
      bias1_ = InputXFrom1<GType>(inputs, scope);
    }
  }

  // PReLU alpha tensor.
  const RType *InputAlpha() const { return alpha_; }
  // Value of the "mode" attribute.
  const std::string &Mode() const { return mode_; }
  // Operand of the second add (see the constructor for how it is selected).
  const RType *Bias1() const { return bias1_; }
  // Bias tensor of the first add.
  RType *Bias() const { return bias_; }
  // Value of the "axis" attribute.
  const int &Axis() const { return axis_; }

  const RType *Input() const { return input_; }

  // The FPGA build hands out a mutable filter; every other build exposes it
  // read-only.
#ifdef PADDLE_MOBILE_FPGA
  RType *Filter() const { return filter_; }
#else
  const RType *Filter() const { return filter_; }
#endif

  RType *Output() const { return output_; }

  const vector<int> &Strides() const { return strides_; }

  const vector<int> &Paddings() const { return paddings_; }

  const vector<int> &Dilations() const { return dilations_; }

  const int &Groups() const { return groups; }

 protected:
  RType *bias_;
  int axis_;
  RType *input_;
  RType *output_;
  RType *filter_;
  vector<int> strides_;
  vector<int> paddings_;
  vector<int> dilations_;
  int groups;
  RType *alpha_;
  std::string mode_;
  RType *bias1_;
  std::string keyOutput_;
  std::string keyX1_;
  std::string keyY1_;
#ifdef PADDLE_MOBILE_FPGA

 private:
  fpga::ConvArgs fpga_conv_args;

 public:
  const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; }
  void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; }
#endif
};
#endif
#ifdef FUSION_CONVADDBNRELU_OP
#ifdef FUSION_CONVADDBNRELU_OP
template
<
typename
Dtype
>
template
<
typename
Dtype
>
class
FusionConvAddBNReluParam
:
public
OpParam
{
class
FusionConvAddBNReluParam
:
public
OpParam
{
...
...
tools/op.cmake
浏览文件 @
b59e8f1c
...
@@ -124,6 +124,7 @@ if(NOT FOUND_MATCH)
...
@@ -124,6 +124,7 @@ if(NOT FOUND_MATCH)
set
(
DEPTHWISECONV_OP ON
)
set
(
DEPTHWISECONV_OP ON
)
set
(
ELEMENTWISEADD_OP ON
)
set
(
ELEMENTWISEADD_OP ON
)
set
(
FUSION_CONVADD_OP ON
)
set
(
FUSION_CONVADD_OP ON
)
set
(
FUSION_CONVADDPRELU_OP ON
)
set
(
FUSION_CONVADDRELU_OP ON
)
set
(
FUSION_CONVADDRELU_OP ON
)
set
(
FUSION_FC_OP ON
)
set
(
FUSION_FC_OP ON
)
set
(
LRN_OP ON
)
set
(
LRN_OP ON
)
...
@@ -137,6 +138,7 @@ if(NOT FOUND_MATCH)
...
@@ -137,6 +138,7 @@ if(NOT FOUND_MATCH)
set
(
SOFTMAX_OP ON
)
set
(
SOFTMAX_OP ON
)
set
(
TRANSPOSE_OP ON
)
set
(
TRANSPOSE_OP ON
)
set
(
FUSION_CONVADDBNRELU_OP ON
)
set
(
FUSION_CONVADDBNRELU_OP ON
)
set
(
FUSION_CONVADDADDPRELU_OP ON
)
set
(
FUSION_DWCONVBNRELU_OP ON
)
set
(
FUSION_DWCONVBNRELU_OP ON
)
set
(
FUSION_CONVBNRELU_OP ON
)
set
(
FUSION_CONVBNRELU_OP ON
)
set
(
PRELU_OP ON
)
set
(
PRELU_OP ON
)
...
@@ -192,6 +194,12 @@ endif()
...
@@ -192,6 +194,12 @@ endif()
if
(
FUSION_CONVADDRELU_OP
)
if
(
FUSION_CONVADDRELU_OP
)
add_definitions
(
-DFUSION_CONVADDRELU_OP
)
add_definitions
(
-DFUSION_CONVADDRELU_OP
)
endif
()
endif
()
# Pass the conv+add+prelu switch to the compiler as a -D define; op_param.h
# guards FusionConvAddPReluParam with #ifdef FUSION_CONVADDPRELU_OP.
if
(
FUSION_CONVADDPRELU_OP
)
add_definitions
(
-DFUSION_CONVADDPRELU_OP
)
endif
()
# Same for the conv+add+add+prelu switch, which guards
# FusionConvAddAddPReluParam in op_param.h.
if
(
FUSION_CONVADDADDPRELU_OP
)
add_definitions
(
-DFUSION_CONVADDADDPRELU_OP
)
endif
()
if
(
FUSION_FC_OP
)
if
(
FUSION_FC_OP
)
add_definitions
(
-DFUSION_FC_OP
)
add_definitions
(
-DFUSION_FC_OP
)
endif
()
endif
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录