Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
22708640
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2297
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
22708640
编写于
4月 26, 2020
作者:
W
wangchaochaohu
提交者:
GitHub
4月 26, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Fusion group optimize for cuda codegen(#23940)
上级
94dfb7d7
变更
8
显示空白变更内容
内联
并排
Showing
8 changed file
with
136 addition
and
64 deletion
+136
-64
paddle/fluid/framework/ir/fusion_group/code_generator.cc
paddle/fluid/framework/ir/fusion_group/code_generator.cc
+50
-38
paddle/fluid/framework/ir/fusion_group/code_generator.h
paddle/fluid/framework/ir/fusion_group/code_generator.h
+4
-0
paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc
.../fluid/framework/ir/fusion_group/code_generator_helper.cc
+0
-2
paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
...e/fluid/framework/ir/fusion_group/code_generator_helper.h
+8
-2
paddle/fluid/framework/ir/fusion_group/cuda_resources.h
paddle/fluid/framework/ir/fusion_group/cuda_resources.h
+16
-0
paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
+20
-4
paddle/fluid/framework/ir/fusion_group/operation.cc
paddle/fluid/framework/ir/fusion_group/operation.cc
+1
-0
paddle/fluid/framework/ir/fusion_group/subgraph.h
paddle/fluid/framework/ir/fusion_group/subgraph.h
+37
-18
未找到文件。
paddle/fluid/framework/ir/fusion_group/code_generator.cc
浏览文件 @
22708640
...
...
@@ -71,6 +71,8 @@ static bool HasInput(Node* n, std::string name) {
std
::
vector
<
OperationExpression
>
CodeGenerator
::
ConvertToExpressions
(
SubGraph
*
subgraph
)
{
std
::
unordered_map
<
std
::
string
,
int
>
var_ids
=
EncodeVarNodes
(
subgraph
);
std
::
vector
<
Node
*>
intermediate_out_nodes
=
subgraph
->
GetIntermediateOutVarNodes
();
std
::
vector
<
OperationExpression
>
expressions
;
for
(
auto
*
node
:
subgraph
->
SortedNodes
())
{
if
(
node
&&
node
->
IsOp
()
&&
node
->
Op
())
{
...
...
@@ -81,7 +83,8 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
// - X, Y in forward operations
// - X, Y, Out, out@GRAD in backward operations
std
::
vector
<
int
>
input_ids
;
auto
operation
=
OperationMap
::
Instance
().
Get
(
op
->
Type
());
std
::
string
op_name
=
op
->
Type
();
auto
operation
=
OperationMap
::
Instance
().
Get
(
op_name
);
std
::
vector
<
std
::
string
>
input_names
=
operation
.
input_names
;
for
(
auto
&
name
:
input_names
)
{
...
...
@@ -105,6 +108,7 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
std
::
vector
<
int
>
output_ids
;
std
::
vector
<
std
::
string
>
output_names
=
OperationMap
::
Instance
().
Get
(
op
->
Type
()).
output_names
;
std
::
unordered_map
<
int
,
bool
>
intermediate_state
;
for
(
auto
&
name
:
output_names
)
{
PADDLE_ENFORCE_NE
(
...
...
@@ -112,12 +116,21 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
platform
::
errors
::
InvalidArgument
(
"Output(%s) of operation %s is not set."
,
name
,
op
->
Type
()));
output_ids
.
push_back
(
var_ids
[
op
->
Output
(
name
)[
0
]]);
bool
enable_intermediate
=
false
;
for
(
auto
*
n
:
intermediate_out_nodes
)
{
if
(
n
->
Name
()
==
op
->
Output
(
name
)[
0
])
{
enable_intermediate
=
true
;
break
;
}
}
intermediate_state
[
var_ids
[
op
->
Output
(
name
)[
0
]]]
=
enable_intermediate
;
}
std
::
string
lhs_type
=
ExtractDataType
(
node
->
outputs
);
std
::
string
rhs_type
=
ExtractDataType
(
node
->
inputs
);
auto
expression
=
OperationExpression
(
node
->
Name
(),
input_ids
,
output_ids
,
rhs_type
,
lhs_type
);
auto
expression
=
OperationExpression
(
node
->
Name
(),
input_ids
,
output_ids
,
rhs_type
,
lhs_type
,
intermediate_state
);
expression
.
SetAttr
(
attr
);
expressions
.
push_back
(
expression
);
}
...
...
@@ -133,13 +146,17 @@ std::string CodeGenerator::Generate(
// TODO(liuyiqun): Check whether all expressions are elementwise operations.
std
::
set
<
int
>
input_ids
=
std
::
move
(
DistilInputIds
(
expressions
));
std
::
set
<
int
>
output_ids
=
std
::
move
(
DistilOutputIds
(
expressions
));
std
::
set
<
int
>
intermediate_ids
=
std
::
move
(
DistilIntermediateIds
(
expressions
));
std
::
unordered_map
<
int
,
std
::
string
>
dtypes
=
std
::
move
(
DistilDtypes
(
expressions
));
TemplateVariable
template_var
;
template_var
.
Add
(
"func_name"
,
func_name
);
template_var
.
Add
(
"parameters"
,
EmitParameters
(
input_ids
,
output_ids
,
dtypes
));
template_var
.
Add
(
"parameters"
,
EmitParameters
(
input_ids
,
output_ids
,
intermediate_ids
,
dtypes
));
template_var
.
Add
(
"compute_body"
,
EmitComputeBody
(
expressions
,
input_ids
,
output_ids
,
dtypes
));
EmitComputeBody
(
expressions
,
input_ids
,
output_ids
,
intermediate_ids
,
dtypes
));
std
::
set
<
std
::
string
>
all_dtype
;
for
(
const
auto
&
type
:
dtypes
)
{
...
...
@@ -185,6 +202,19 @@ std::set<int> CodeGenerator::DistilOutputIds(
return
output_ids
;
}
std
::
set
<
int
>
CodeGenerator
::
DistilIntermediateIds
(
const
std
::
vector
<
OperationExpression
>&
expressions
)
{
std
::
set
<
int
>
intermediate_ids
;
// Use std::set to remove the reptead id and get a ordered list.
for
(
size_t
i
=
0
;
i
<
expressions
.
size
();
i
++
)
{
for
(
auto
id
:
expressions
[
i
].
GetOutputIds
())
{
auto
intermediate_state
=
expressions
[
i
].
GetIntermediateState
();
if
(
intermediate_state
[
id
])
intermediate_ids
.
insert
(
id
);
}
}
return
intermediate_ids
;
}
std
::
unordered_map
<
int
,
std
::
string
>
CodeGenerator
::
DistilDtypes
(
const
std
::
vector
<
OperationExpression
>&
expressions
)
{
std
::
unordered_map
<
int
,
std
::
string
>
dtypes
;
...
...
@@ -218,6 +248,7 @@ std::unordered_map<int, std::string> CodeGenerator::DistilDtypes(
// we get the parameter list code for the expression information
std
::
string
CodeGenerator
::
EmitParameters
(
const
std
::
set
<
int
>&
input_ids
,
const
std
::
set
<
int
>&
output_ids
,
const
std
::
set
<
int
>&
intermediate_ids
,
const
std
::
unordered_map
<
int
,
std
::
string
>&
dtypes
)
const
{
std
::
stringstream
ret
;
ret
<<
"int N, "
;
...
...
@@ -226,25 +257,28 @@ std::string CodeGenerator::EmitParameters(
// from the input list.
for
(
auto
id
:
input_ids
)
{
if
(
output_ids
.
find
(
id
)
==
output_ids
.
end
())
{
ret
<<
dtypes
.
at
(
id
)
<<
"* "
<<
ArgName
(
id
)
<<
", "
;
ret
<<
"const "
<<
dtypes
.
at
(
id
)
<<
"* __restrict__ "
<<
ArgName
(
id
)
<<
", "
;
}
}
size_t
index
=
0
;
for
(
auto
id
:
output_ids
)
{
if
(
intermediate_ids
.
find
(
id
)
==
intermediate_ids
.
end
())
{
ret
<<
dtypes
.
at
(
id
)
<<
"* "
<<
ArgName
(
id
);
if
(
index
!=
output_ids
.
size
()
-
1
)
{
ret
<<
", "
;
}
}
index
++
;
}
return
ret
.
str
();
}
std
::
string
CodeGenerator
::
EmitComputeBody
(
const
std
::
vector
<
OperationExpression
>&
expressions
,
const
std
::
set
<
int
>&
input_ids
,
const
std
::
set
<
int
>&
output_ids
,
const
std
::
set
<
int
>&
intermediate_ids
,
const
std
::
unordered_map
<
int
,
std
::
string
>&
dtypes
)
const
{
std
::
ostringstream
compute
;
std
::
unordered_set
<
int
>
used
;
...
...
@@ -258,15 +292,18 @@ std::string CodeGenerator::EmitComputeBody(
for
(
auto
id
:
input_ids
)
{
if
(
output_ids
.
find
(
id
)
==
output_ids
.
end
()
&&
used
.
find
(
id
)
!=
used
.
end
())
{
load
<<
dtypes
.
at
(
id
)
<<
" "
<<
TmpName
(
id
)
<<
" = "
<<
VarName
(
id
)
load
<<
dtypes
.
at
(
id
)
<<
" "
<<
TmpName
(
id
)
<<
" = "
<<
"__ldg(&"
<<
VarName
(
id
)
<<
")"
<<
";"
;
}
}
// Store temporal variables to memory.
std
::
ostringstream
store
;
for
(
auto
id
:
output_ids
)
{
if
(
intermediate_ids
.
find
(
id
)
==
intermediate_ids
.
end
())
{
store
<<
VarName
(
id
)
<<
" = "
<<
TmpName
(
id
)
<<
";"
;
}
}
return
load
.
str
()
+
compute
.
str
()
+
store
.
str
();
}
...
...
@@ -285,32 +322,7 @@ std::unordered_map<std::string, int> CodeGenerator::EncodeVarNodes(
var_ids
[
in
->
Name
()]
=
id
++
;
}
}
// Numbering internal vars.
for
(
auto
*
node
:
subgraph
->
SortedNodes
())
{
if
(
node
&&
node
->
IsVar
()
&&
node
->
Var
())
{
bool
is_found
=
false
;
for
(
auto
*
in
:
input_var_nodes
)
{
if
(
node
==
in
)
{
is_found
=
true
;
break
;
}
}
if
(
is_found
)
{
continue
;
}
for
(
auto
*
out
:
output_var_nodes
)
{
if
(
node
==
out
)
{
is_found
=
true
;
break
;
}
}
PADDLE_ENFORCE_EQ
(
is_found
,
true
,
platform
::
errors
::
Unimplemented
(
"Subgraph with internal var nodes (%s) is not supported yet."
,
node
->
Name
()));
}
}
// Encoding output vars.
for
(
auto
*
out
:
output_var_nodes
)
{
VLOG
(
3
)
<<
"Ecoding output names:"
<<
out
->
Name
()
<<
", id:"
<<
id
;
...
...
paddle/fluid/framework/ir/fusion_group/code_generator.h
浏览文件 @
22708640
...
...
@@ -43,17 +43,21 @@ class CodeGenerator {
const
std
::
vector
<
OperationExpression
>&
expressions
);
std
::
set
<
int
>
DistilOutputIds
(
const
std
::
vector
<
OperationExpression
>&
expressions
);
std
::
set
<
int
>
DistilIntermediateIds
(
const
std
::
vector
<
OperationExpression
>&
expressions
);
std
::
unordered_map
<
int
,
std
::
string
>
DistilDtypes
(
const
std
::
vector
<
OperationExpression
>&
expressions
);
// we get the parameter list code for the expression information
std
::
string
EmitParameters
(
const
std
::
set
<
int
>&
input_ids
,
const
std
::
set
<
int
>&
output_ids
,
const
std
::
set
<
int
>&
intermediate_ids
,
const
std
::
unordered_map
<
int
,
std
::
string
>&
dtypes
)
const
;
std
::
string
EmitComputeBody
(
const
std
::
vector
<
OperationExpression
>&
expressions
,
const
std
::
set
<
int
>&
input_ids
,
const
std
::
set
<
int
>&
output_ids
,
const
std
::
set
<
int
>&
intermediate_ids
,
const
std
::
unordered_map
<
int
,
std
::
string
>&
dtypes
)
const
;
// Encode all var nodes in the subgraph with an unique number.
...
...
paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc
浏览文件 @
22708640
...
...
@@ -149,8 +149,6 @@ std::string OperationExpression::GetRHS(std::unordered_set<int>* used,
"Expected %d-th input id > 0 for operation < %s "
">. Received %d."
,
index
,
op_type_
,
input_ids_
[
index
]));
// TODO(wangchaochaohu): Here fp16 convert to float to do comupte, we
// need to add general fp16 compute later.
var_name
=
TmpName
(
input_ids_
[
index
]);
rhs
.
replace
(
pos
,
length
+
3
,
var_name
);
used
->
insert
(
input_ids_
[
index
]);
...
...
paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
浏览文件 @
22708640
...
...
@@ -46,14 +46,19 @@ class OperationExpression {
public:
explicit
OperationExpression
(
std
::
string
op_type
,
std
::
vector
<
int
>
input_ids
,
std
::
vector
<
int
>
output_ids
,
std
::
string
rhs_type
,
std
::
string
lhs_type
)
std
::
string
rhs_type
,
std
::
string
lhs_type
,
std
::
unordered_map
<
int
,
bool
>
intermediate_state
)
:
op_type_
(
op_type
),
input_ids_
(
input_ids
),
output_ids_
(
output_ids
),
rhs_type_
(
rhs_type
),
lhs_type_
(
lhs_type
)
{}
lhs_type_
(
lhs_type
),
intermediate_state_
(
intermediate_state
)
{}
std
::
string
GetOpType
()
const
{
return
op_type_
;
}
std
::
unordered_map
<
int
,
bool
>
GetIntermediateState
()
const
{
return
intermediate_state_
;
}
std
::
vector
<
int
>
GetInputIds
()
const
{
return
input_ids_
;
}
std
::
vector
<
int
>
GetOutputIds
()
const
{
return
output_ids_
;
}
std
::
string
GetRHSType
()
const
{
return
rhs_type_
;
}
...
...
@@ -78,6 +83,7 @@ class OperationExpression {
AttributeMap
attr_
;
std
::
string
rhs_type_
;
std
::
string
lhs_type_
;
std
::
unordered_map
<
int
,
bool
>
intermediate_state_
;
};
class
TemplateVariable
{
...
...
paddle/fluid/framework/ir/fusion_group/cuda_resources.h
浏览文件 @
22708640
...
...
@@ -269,6 +269,22 @@ __CUDA_FP16_DECL__ __half hsqrt(const __half a) {
__APPROX_FCAST(sqrt);
}
#if defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__))
#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
#define __LDG_PTR "l"
#else
#define __LDG_PTR "r"
#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
__CUDA_FP16_DECL__ __half __ldg(const __half *ptr)
{
__half ret;
asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
return ret;
}
#undef __LDG_PTR
#endif /*defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__))*/
__device__ inline __half Exp(const __half x) { return hexp(x); }
__device__ inline __half Log(const __half x) { return hlog(x); }
__device__ inline __half Sqrt(const __half x) { return hsqrt(x); }
...
...
paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
浏览文件 @
22708640
...
...
@@ -48,13 +48,18 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const {
int
num_subgraphs
=
0
;
size_t
min_subgraph_size
=
2
;
bool
save_intermediate_out
=
tru
e
;
bool
save_intermediate_out
=
fals
e
;
for
(
auto
&
vec
:
subgraphs
)
{
fusion_group
::
SubGraph
subgraph
(
type
,
""
,
save_intermediate_out
,
std
::
unordered_set
<
Node
*>
(
vec
.
begin
(),
vec
.
end
()));
VLOG
(
3
)
<<
"subgraph: {
\n
"
<<
DebugString
(
subgraph
.
SortedNodes
())
<<
"}
\n
"
;
// In elementwise fused kernel, memory is the bound of execution,
// here we remove the output id to use less memory and less time.
if
(
subgraph
.
RemoveIntermediateOut
())
{
subgraph
.
DetectIntermediateOutWithGraph
(
graph
);
}
if
(
subgraph
.
IsValid
(
min_subgraph_size
))
{
subgraph
.
SetFuncName
(
"FusedElementwise"
+
std
::
to_string
(
index
++
));
if
(
GenerateCode
(
&
subgraph
))
{
...
...
@@ -106,6 +111,8 @@ void FusionGroupPass::InsertFusionGroupOp(
subgraph
->
GetInputVarNodes
();
const
std
::
vector
<
Node
*>&
output_vars_of_subgraph
=
subgraph
->
GetOutputVarNodes
();
const
std
::
vector
<
Node
*>
intermediate_vars_of_subgraph
=
subgraph
->
GetIntermediateOutVarNodes
();
std
::
unordered_set
<
Node
*>
external_nodes
;
OpDesc
op_desc
;
...
...
@@ -122,9 +129,18 @@ void FusionGroupPass::InsertFusionGroupOp(
std
::
vector
<
std
::
string
>
output_names
;
std
::
vector
<
std
::
string
>
outs_data_types
;
std
::
vector
<
Node
*>
output_var_without_intermediate
;
for
(
auto
*
n
:
output_vars_of_subgraph
)
{
auto
it_input
=
find
(
input_vars_of_subgraph
.
begin
(),
input_vars_of_subgraph
.
end
(),
n
);
auto
it_intermediate
=
find
(
intermediate_vars_of_subgraph
.
begin
(),
intermediate_vars_of_subgraph
.
end
(),
n
);
if
(
it_intermediate
==
intermediate_vars_of_subgraph
.
end
()
&&
it_input
==
input_vars_of_subgraph
.
end
())
{
output_names
.
push_back
(
n
->
Name
());
outs_data_types
.
push_back
(
DataTypeToString
(
n
->
Var
()
->
GetDataType
()));
output_var_without_intermediate
.
push_back
(
n
);
}
external_nodes
.
insert
(
n
);
}
...
...
@@ -141,7 +157,7 @@ void FusionGroupPass::InsertFusionGroupOp(
IR_NODE_LINK_TO
(
in
,
fusion_group_node
);
}
for
(
auto
*
out
:
output_var
s_of_subgraph
)
{
for
(
auto
*
out
:
output_var
_without_intermediate
)
{
IR_NODE_LINK_TO
(
fusion_group_node
,
out
);
}
...
...
paddle/fluid/framework/ir/fusion_group/operation.cc
浏览文件 @
22708640
...
...
@@ -54,6 +54,7 @@ void OperationMap::Insert(int type, int num_operands, std::string op_type,
std
::
string
grad_op_type
=
op_type
+
"_grad"
;
// grad_inputs = inputs + outputs + grad of outputs
std
::
vector
<
std
::
string
>
grad_input_names
=
input_names
;
for
(
auto
name
:
output_names
)
{
grad_input_names
.
push_back
(
name
);
}
...
...
paddle/fluid/framework/ir/fusion_group/subgraph.h
浏览文件 @
22708640
...
...
@@ -19,6 +19,8 @@ limitations under the License. */
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/ir/fusion_group/operation.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/ir/subgraph_detector.h"
...
...
@@ -64,6 +66,7 @@ class SubGraph {
}
int
GetType
()
const
{
return
type_
;
}
bool
RemoveIntermediateOut
()
{
return
!
save_intermediate_out_
;
}
void
SetFuncName
(
std
::
string
func_name
)
{
func_name_
=
func_name
;
}
std
::
string
GetFuncName
()
const
{
return
func_name_
;
}
...
...
@@ -133,30 +136,45 @@ class SubGraph {
}
}
}
if
(
save_intermediate_out_
)
{
return
output_vars_all
;
}
std
::
vector
<
Node
*>
output_vars_outside
;
for
(
auto
*
n
:
output_vars_all
)
{
// If one of the var_node's outputs is the input of some operator
// outside the subgraph, it is considered the output var node of the
// subgraph.
bool
is_found
=
true
;
if
(
n
->
outputs
.
size
()
==
0U
)
{
is_found
=
false
;
std
::
vector
<
Node
*>
GetIntermediateOutVarNodes
()
{
return
intermediate_out_nodes_
;
}
for
(
auto
*
out
:
n
->
outputs
)
{
if
(
!
Has
(
out
))
{
is_found
=
false
;
void
DetectIntermediateOutWithGraph
(
Graph
*
graph
)
{
auto
graph_nodes
=
graph
->
Nodes
();
for
(
auto
*
n
:
SortedNodes
())
{
bool
enable_remove
=
true
;
if
(
n
&&
n
->
IsVar
()
&&
n
->
Var
())
{
bool
leaf_graph
=
true
;
for
(
auto
*
node
:
graph_nodes
)
{
if
(
node
->
IsOp
())
{
auto
inputs
=
node
->
inputs
;
for
(
auto
*
in
:
inputs
)
{
if
(
in
==
n
)
{
if
(
!
Has
(
node
))
enable_remove
=
false
;
leaf_graph
=
false
;
}
}
if
(
!
is_found
)
{
output_vars_outside
.
push_back
(
n
);
}
if
(
!
enable_remove
)
{
break
;
}
}
if
(
leaf_graph
)
enable_remove
=
false
;
}
else
{
enable_remove
=
false
;
}
if
(
enable_remove
)
{
intermediate_out_nodes_
.
push_back
(
n
);
}
}
return
output_vars_outside
;
}
private:
...
...
@@ -218,6 +236,7 @@ class SubGraph {
bool
save_intermediate_out_
{
true
};
std
::
unordered_set
<
Node
*>
nodes_set_
;
std
::
vector
<
Node
*>
intermediate_out_nodes_
{};
bool
is_sorted_
{
false
};
std
::
vector
<
Node
*>
sorted_nodes_
;
};
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录