BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)

Commit 22708640 (unverified)
Fusion group optimize for cuda codegen (#23940)
Authored by wangchaochaohu on Apr 26, 2020; committed via GitHub on Apr 26, 2020.
Parent: 94dfb7d7

Showing 8 changed files with 136 additions and 64 deletions (+136 / -64):

- paddle/fluid/framework/ir/fusion_group/code_generator.cc (+50 / -38)
- paddle/fluid/framework/ir/fusion_group/code_generator.h (+4 / -0)
- paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc (+0 / -2)
- paddle/fluid/framework/ir/fusion_group/code_generator_helper.h (+8 / -2)
- paddle/fluid/framework/ir/fusion_group/cuda_resources.h (+16 / -0)
- paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc (+20 / -4)
- paddle/fluid/framework/ir/fusion_group/operation.cc (+1 / -0)
- paddle/fluid/framework/ir/fusion_group/subgraph.h (+37 / -18)
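Taken together, the changes target memory traffic in the generated elementwise kernels: read-only inputs are now declared `const T* __restrict__` and loaded through `__ldg()` (with an inline-PTX `__ldg` overload added for `__half`, which has no built-in one), and output variables consumed only inside the fused subgraph are detected as "intermediate outs", kept in registers, and dropped from the kernel's parameter and store lists. Below is a hand-written sketch of the resulting kernel shape, not the pass's verbatim output: the `argN`/`tmpN` names follow the `ArgName()`/`TmpName()` convention visible in the diff, but the ids, dtype, and grid-stride loop skeleton are assumptions.

```cpp
// Fused y = relu(a * b): the product is consumed only inside the subgraph,
// so it stays in a register and never appears as a kernel argument.
extern "C" __global__ void FusedElementwise0(int N,
                                             const float* __restrict__ arg0,
                                             const float* __restrict__ arg1,
                                             float* arg3) {
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N;
       idx += gridDim.x * blockDim.x) {
    float tmp0 = __ldg(&arg0[idx]);  // inputs go through the read-only cache
    float tmp1 = __ldg(&arg1[idx]);
    float tmp2 = tmp0 * tmp1;        // intermediate out: register only
    float tmp3 = tmp2 > 0.0f ? tmp2 : 0.0f;
    arg3[idx] = tmp3;                // only non-intermediate outputs stored
  }
}
```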
paddle/fluid/framework/ir/fusion_group/code_generator.cc (+50 / -38)

```diff
@@ -71,6 +71,8 @@ static bool HasInput(Node* n, std::string name) {
 std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
     SubGraph* subgraph) {
   std::unordered_map<std::string, int> var_ids = EncodeVarNodes(subgraph);
+  std::vector<Node*> intermediate_out_nodes =
+      subgraph->GetIntermediateOutVarNodes();
   std::vector<OperationExpression> expressions;
   for (auto* node : subgraph->SortedNodes()) {
     if (node && node->IsOp() && node->Op()) {
@@ -81,7 +83,8 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
       // - X, Y in forward operations
       // - X, Y, Out, out@GRAD in backward operations
       std::vector<int> input_ids;
-      auto operation = OperationMap::Instance().Get(op->Type());
+      std::string op_name = op->Type();
+      auto operation = OperationMap::Instance().Get(op_name);
       std::vector<std::string> input_names = operation.input_names;
       for (auto& name : input_names) {
@@ -105,6 +108,7 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
       std::vector<int> output_ids;
       std::vector<std::string> output_names =
           OperationMap::Instance().Get(op->Type()).output_names;
+      std::unordered_map<int, bool> intermediate_state;
       for (auto& name : output_names) {
         PADDLE_ENFORCE_NE(
@@ -112,12 +116,21 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
             platform::errors::InvalidArgument(
                 "Output(%s) of operation %s is not set.", name, op->Type()));
         output_ids.push_back(var_ids[op->Output(name)[0]]);
+        bool enable_intermediate = false;
+        for (auto* n : intermediate_out_nodes) {
+          if (n->Name() == op->Output(name)[0]) {
+            enable_intermediate = true;
+            break;
+          }
+        }
+        intermediate_state[var_ids[op->Output(name)[0]]] =
+            enable_intermediate;
       }
       std::string lhs_type = ExtractDataType(node->outputs);
       std::string rhs_type = ExtractDataType(node->inputs);
-      auto expression = OperationExpression(node->Name(), input_ids,
-                                            output_ids, rhs_type, lhs_type);
+      auto expression = OperationExpression(node->Name(), input_ids,
+                                            output_ids, rhs_type, lhs_type,
+                                            intermediate_state);
       expression.SetAttr(attr);
       expressions.push_back(expression);
     }
@@ -133,13 +146,17 @@ std::string CodeGenerator::Generate(
   // TODO(liuyiqun): Check whether all expressions are elementwise operations.
   std::set<int> input_ids = std::move(DistilInputIds(expressions));
   std::set<int> output_ids = std::move(DistilOutputIds(expressions));
+  std::set<int> intermediate_ids =
+      std::move(DistilIntermediateIds(expressions));
   std::unordered_map<int, std::string> dtypes =
       std::move(DistilDtypes(expressions));
   TemplateVariable template_var;
   template_var.Add("func_name", func_name);
-  template_var.Add("parameters", EmitParameters(input_ids, output_ids, dtypes));
+  template_var.Add("parameters", EmitParameters(input_ids, output_ids,
+                                                intermediate_ids, dtypes));
   template_var.Add("compute_body",
-                   EmitComputeBody(expressions, input_ids, output_ids, dtypes));
+                   EmitComputeBody(expressions, input_ids, output_ids,
+                                   intermediate_ids, dtypes));
   std::set<std::string> all_dtype;
   for (const auto& type : dtypes) {
@@ -185,6 +202,19 @@ std::set<int> CodeGenerator::DistilOutputIds(
   return output_ids;
 }
 
+std::set<int> CodeGenerator::DistilIntermediateIds(
+    const std::vector<OperationExpression>& expressions) {
+  std::set<int> intermediate_ids;
+  // Use std::set to remove the reptead id and get a ordered list.
+  for (size_t i = 0; i < expressions.size(); i++) {
+    for (auto id : expressions[i].GetOutputIds()) {
+      auto intermediate_state = expressions[i].GetIntermediateState();
+      if (intermediate_state[id]) intermediate_ids.insert(id);
+    }
+  }
+  return intermediate_ids;
+}
+
 std::unordered_map<int, std::string> CodeGenerator::DistilDtypes(
     const std::vector<OperationExpression>& expressions) {
   std::unordered_map<int, std::string> dtypes;
@@ -218,6 +248,7 @@ std::unordered_map<int, std::string> CodeGenerator::DistilDtypes(
 // we get the parameter list code for the expression information
 std::string CodeGenerator::EmitParameters(
     const std::set<int>& input_ids, const std::set<int>& output_ids,
+    const std::set<int>& intermediate_ids,
     const std::unordered_map<int, std::string>& dtypes) const {
   std::stringstream ret;
   ret << "int N, ";
@@ -226,25 +257,28 @@ std::string CodeGenerator::EmitParameters(
   // from the input list.
   for (auto id : input_ids) {
     if (output_ids.find(id) == output_ids.end()) {
-      ret << dtypes.at(id) << "* " << ArgName(id) << ", ";
+      ret << "const " << dtypes.at(id) << "* __restrict__ " << ArgName(id)
+          << ", ";
     }
   }
 
   size_t index = 0;
   for (auto id : output_ids) {
-    ret << dtypes.at(id) << "* " << ArgName(id);
-    if (index != output_ids.size() - 1) {
-      ret << ", ";
+    if (intermediate_ids.find(id) == intermediate_ids.end()) {
+      ret << dtypes.at(id) << "* " << ArgName(id);
+      if (index != output_ids.size() - 1) {
+        ret << ", ";
+      }
     }
     index++;
   }
   return ret.str();
 }
 
 std::string CodeGenerator::EmitComputeBody(
     const std::vector<OperationExpression>& expressions,
     const std::set<int>& input_ids, const std::set<int>& output_ids,
+    const std::set<int>& intermediate_ids,
     const std::unordered_map<int, std::string>& dtypes) const {
   std::ostringstream compute;
   std::unordered_set<int> used;
@@ -258,15 +292,18 @@ std::string CodeGenerator::EmitComputeBody(
   for (auto id : input_ids) {
     if (output_ids.find(id) == output_ids.end() &&
         used.find(id) != used.end()) {
-      load << dtypes.at(id) << " " << TmpName(id) << " = " << VarName(id)
-           << ";";
+      load << dtypes.at(id) << " " << TmpName(id) << " = "
+           << "__ldg(&" << VarName(id) << ")" << ";";
     }
   }
   // Store temporal variables to memory.
   std::ostringstream store;
   for (auto id : output_ids) {
+    if (intermediate_ids.find(id) == intermediate_ids.end()) {
       store << VarName(id) << " = " << TmpName(id) << ";";
+    }
   }
   return load.str() + compute.str() + store.str();
 }
@@ -285,32 +322,7 @@ std::unordered_map<std::string, int> CodeGenerator::EncodeVarNodes(
       var_ids[in->Name()] = id++;
     }
   }
-  // Numbering internal vars.
-  for (auto* node : subgraph->SortedNodes()) {
-    if (node && node->IsVar() && node->Var()) {
-      bool is_found = false;
-      for (auto* in : input_var_nodes) {
-        if (node == in) {
-          is_found = true;
-          break;
-        }
-      }
-      if (is_found) {
-        continue;
-      }
-      for (auto* out : output_var_nodes) {
-        if (node == out) {
-          is_found = true;
-          break;
-        }
-      }
-      PADDLE_ENFORCE_EQ(
-          is_found, true,
-          platform::errors::Unimplemented(
-              "Subgraph with internal var nodes (%s) is not supported yet.",
-              node->Name()));
-    }
-  }
   // Encoding output vars.
   for (auto* out : output_var_nodes) {
     VLOG(3) << "Ecoding output names:" << out->Name() << ", id:" << id;
```
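For intuition about the `EmitParameters` change: assuming `ArgName(id)` expands to `"arg" + id` (the convention the emitted loads above rely on), then for illustrative input ids {0, 1}, output ids {2, 3}, and dtype `float`, with id 2 detected as intermediate, the emitted parameter list changes from

```cpp
int N, float* arg0, float* arg1, float* arg2, float* arg3
```

to

```cpp
int N, const float* __restrict__ arg0, const float* __restrict__ arg1, float* arg3
```

so the intermediate `arg2` no longer occupies a global-memory buffer, and the `const ... __restrict__` qualifiers tell the compiler the input buffers are read-only and non-aliased.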
paddle/fluid/framework/ir/fusion_group/code_generator.h (+4 / -0)

```diff
@@ -43,17 +43,21 @@ class CodeGenerator {
       const std::vector<OperationExpression>& expressions);
   std::set<int> DistilOutputIds(
       const std::vector<OperationExpression>& expressions);
+  std::set<int> DistilIntermediateIds(
+      const std::vector<OperationExpression>& expressions);
   std::unordered_map<int, std::string> DistilDtypes(
       const std::vector<OperationExpression>& expressions);
 
   // we get the parameter list code for the expression information
   std::string EmitParameters(
       const std::set<int>& input_ids, const std::set<int>& output_ids,
+      const std::set<int>& intermediate_ids,
       const std::unordered_map<int, std::string>& dtypes) const;
 
   std::string EmitComputeBody(
       const std::vector<OperationExpression>& expressions,
       const std::set<int>& input_ids, const std::set<int>& output_ids,
+      const std::set<int>& intermediate_ids,
       const std::unordered_map<int, std::string>& dtypes) const;
 
   // Encode all var nodes in the subgraph with an unique number.
```
paddle/fluid/framework/ir/fusion_group/code_generator_helper.cc (+0 / -2)

```diff
@@ -149,8 +149,6 @@ std::string OperationExpression::GetRHS(std::unordered_set<int>* used,
             "Expected %d-th input id > 0 for operation < %s "
             ">. Received %d.",
             index, op_type_, input_ids_[index]));
-      // TODO(wangchaochaohu): Here fp16 convert to float to do comupte, we
-      // need to add general fp16 compute later.
       var_name = TmpName(input_ids_[index]);
       rhs.replace(pos, length + 3, var_name);
       used->insert(input_ids_[index]);
```
paddle/fluid/framework/ir/fusion_group/code_generator_helper.h (+8 / -2)

```diff
@@ -46,14 +46,19 @@ class OperationExpression {
  public:
   explicit OperationExpression(std::string op_type, std::vector<int> input_ids,
                                std::vector<int> output_ids,
-                               std::string rhs_type, std::string lhs_type)
+                               std::string rhs_type, std::string lhs_type,
+                               std::unordered_map<int, bool> intermediate_state)
       : op_type_(op_type),
         input_ids_(input_ids),
         output_ids_(output_ids),
         rhs_type_(rhs_type),
-        lhs_type_(lhs_type) {}
+        lhs_type_(lhs_type),
+        intermediate_state_(intermediate_state) {}
 
   std::string GetOpType() const { return op_type_; }
+  std::unordered_map<int, bool> GetIntermediateState() const {
+    return intermediate_state_;
+  }
   std::vector<int> GetInputIds() const { return input_ids_; }
   std::vector<int> GetOutputIds() const { return output_ids_; }
   std::string GetRHSType() const { return rhs_type_; }
@@ -78,6 +83,7 @@ class OperationExpression {
   AttributeMap attr_;
   std::string rhs_type_;
   std::string lhs_type_;
+  std::unordered_map<int, bool> intermediate_state_;
 };
 
 class TemplateVariable {
```
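A minimal, hypothetical construction of the extended `OperationExpression` (the op type, ids, and dtypes are illustrative, not taken from the commit):

```cpp
// Output id 2 is flagged as an intermediate out, so the code generator will
// keep tmp2 in a register instead of emitting a kernel argument and a store.
std::unordered_map<int, bool> intermediate_state = {{2, true}};
OperationExpression expr("relu", /*input_ids=*/{1}, /*output_ids=*/{2},
                         /*rhs_type=*/"float", /*lhs_type=*/"float",
                         intermediate_state);
```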
paddle/fluid/framework/ir/fusion_group/cuda_resources.h (+16 / -0)

```diff
@@ -269,6 +269,22 @@ __CUDA_FP16_DECL__ __half hsqrt(const __half a) {
   __APPROX_FCAST(sqrt);
 }
+#if defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__))
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __LDG_PTR "l"
+#else
+#define __LDG_PTR "r"
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+__CUDA_FP16_DECL__ __half __ldg(const __half *ptr)
+{
+  __half ret;
+  asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
+  return ret;
+}
+#undef __LDG_PTR
+#endif /*defined(__cplusplus) && (__CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__))*/
 __device__ inline __half Exp(const __half x) { return hexp(x); }
 __device__ inline __half Log(const __half x) { return hlog(x); }
 __device__ inline __half Sqrt(const __half x) { return hsqrt(x); }
```
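`__ldg()` loads through the read-only data cache (PTX `ld.global.nc`). CUDA ships built-in overloads for most scalar types, but not for `__half`, hence the inline-PTX definition added above for the generated fp16 kernels. A standalone sketch of the same idea for `float`, where the built-in overload exists (kernel and names are illustrative):

```cpp
// Read-only input loaded via __ldg so it can be served from the read-only
// (texture) data cache; for float this compiles to ld.global.nc.f32.
__global__ void Scale(int n, const float* __restrict__ x, float* y) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    y[i] = 2.0f * __ldg(&x[i]);
  }
}
```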
paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc (+20 / -4)

```diff
@@ -48,13 +48,18 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const {
   int num_subgraphs = 0;
   size_t min_subgraph_size = 2;
-  bool save_intermediate_out = true;
+  bool save_intermediate_out = false;
   for (auto& vec : subgraphs) {
     fusion_group::SubGraph subgraph(
         type, "", save_intermediate_out,
         std::unordered_set<Node*>(vec.begin(), vec.end()));
     VLOG(3) << "subgraph: {\n" << DebugString(subgraph.SortedNodes()) << "}\n";
+    // In elementwise fused kernel, memory is the bound of execution,
+    // here we remove the output id to use less memory and less time.
+    if (subgraph.RemoveIntermediateOut()) {
+      subgraph.DetectIntermediateOutWithGraph(graph);
+    }
     if (subgraph.IsValid(min_subgraph_size)) {
       subgraph.SetFuncName("FusedElementwise" + std::to_string(index++));
       if (GenerateCode(&subgraph)) {
@@ -106,6 +111,8 @@ void FusionGroupPass::InsertFusionGroupOp(
       subgraph->GetInputVarNodes();
   const std::vector<Node*>& output_vars_of_subgraph =
       subgraph->GetOutputVarNodes();
+  const std::vector<Node*> intermediate_vars_of_subgraph =
+      subgraph->GetIntermediateOutVarNodes();
   std::unordered_set<Node*> external_nodes;
 
   OpDesc op_desc;
@@ -122,9 +129,18 @@ void FusionGroupPass::InsertFusionGroupOp(
   std::vector<std::string> output_names;
   std::vector<std::string> outs_data_types;
+  std::vector<Node*> output_var_without_intermediate;
   for (auto* n : output_vars_of_subgraph) {
+    auto it_input = find(input_vars_of_subgraph.begin(),
+                         input_vars_of_subgraph.end(), n);
+    auto it_intermediate = find(intermediate_vars_of_subgraph.begin(),
+                                intermediate_vars_of_subgraph.end(), n);
+    if (it_intermediate == intermediate_vars_of_subgraph.end() &&
+        it_input == input_vars_of_subgraph.end()) {
       output_names.push_back(n->Name());
       outs_data_types.push_back(DataTypeToString(n->Var()->GetDataType()));
+      output_var_without_intermediate.push_back(n);
+    }
     external_nodes.insert(n);
   }
@@ -141,7 +157,7 @@ void FusionGroupPass::InsertFusionGroupOp(
     IR_NODE_LINK_TO(in, fusion_group_node);
   }
-  for (auto* out : output_vars_of_subgraph) {
+  for (auto* out : output_var_without_intermediate) {
     IR_NODE_LINK_TO(fusion_group_node, out);
   }
```
paddle/fluid/framework/ir/fusion_group/operation.cc (+1 / -0)

```diff
@@ -54,6 +54,7 @@ void OperationMap::Insert(int type, int num_operands, std::string op_type,
   std::string grad_op_type = op_type + "_grad";
   // grad_inputs = inputs + outputs + grad of outputs
   std::vector<std::string> grad_input_names = input_names;
   for (auto name : output_names) {
     grad_input_names.push_back(name);
   }
```
paddle/fluid/framework/ir/fusion_group/subgraph.h (+37 / -18)

```diff
@@ -19,6 +19,8 @@ limitations under the License. */
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/ir/fusion_group/operation.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_traits.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/ir/subgraph_detector.h"
@@ -64,6 +66,7 @@ class SubGraph {
   }
   int GetType() const { return type_; }
+  bool RemoveIntermediateOut() { return !save_intermediate_out_; }
   void SetFuncName(std::string func_name) { func_name_ = func_name; }
   std::string GetFuncName() const { return func_name_; }
@@ -133,30 +136,45 @@ class SubGraph {
       }
     }
   }
-    if (save_intermediate_out_) {
-      return output_vars_all;
-    }
-
-    std::vector<Node*> output_vars_outside;
-    for (auto* n : output_vars_all) {
-      // If one of the var_node's outputs is the input of some operator
-      // outside the subgraph, it is considered the output var node of the
-      // subgraph.
-      bool is_found = true;
-      if (n->outputs.size() == 0U) {
-        is_found = false;
-      }
-      for (auto* out : n->outputs) {
-        if (!Has(out)) {
-          is_found = false;
-        }
-      }
-      if (!is_found) {
-        output_vars_outside.push_back(n);
-      }
-    }
-    return output_vars_outside;
+    return output_vars_all;
   }
 
+  std::vector<Node*> GetIntermediateOutVarNodes() {
+    return intermediate_out_nodes_;
+  }
+
+  void DetectIntermediateOutWithGraph(Graph* graph) {
+    auto graph_nodes = graph->Nodes();
+    for (auto* n : SortedNodes()) {
+      bool enable_remove = true;
+      if (n && n->IsVar() && n->Var()) {
+        bool leaf_graph = true;
+        for (auto* node : graph_nodes) {
+          if (node->IsOp()) {
+            auto inputs = node->inputs;
+            for (auto* in : inputs) {
+              if (in == n) {
+                if (!Has(node)) enable_remove = false;
+                leaf_graph = false;
+              }
+            }
+          }
+          if (!enable_remove) {
+            break;
+          }
+        }
+        if (leaf_graph) enable_remove = false;
+      } else {
+        enable_remove = false;
+      }
+      if (enable_remove) {
+        intermediate_out_nodes_.push_back(n);
+      }
+    }
+  }
+
  private:
@@ -218,6 +236,7 @@ class SubGraph {
   bool save_intermediate_out_{true};
   std::unordered_set<Node*> nodes_set_;
+  std::vector<Node*> intermediate_out_nodes_{};
   bool is_sorted_{false};
   std::vector<Node*> sorted_nodes_;
 };
```
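Restated over a toy structure (`ToyNode` is hypothetical, not Paddle's `ir::Node`), the rule `DetectIntermediateOutWithGraph` implements is: a var node may be dropped from global memory iff it has at least one consumer (a leaf of the whole graph must still be written out) and every op that consumes it lies inside the subgraph.

```cpp
#include <unordered_set>
#include <vector>

struct ToyNode {
  std::vector<ToyNode*> consumers;  // ops that read this var
};

bool IsIntermediateOut(const ToyNode* var,
                       const std::unordered_set<const ToyNode*>& subgraph_ops) {
  if (var->consumers.empty()) return false;  // graph leaf: must be stored
  for (const ToyNode* op : var->consumers) {
    if (subgraph_ops.count(op) == 0) return false;  // read outside: keep it
  }
  return true;  // only read inside the fused kernel: a register suffices
}
```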