Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
22bbd547
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
22bbd547
编写于
2月 21, 2020
作者:
Y
Yiqun Liu
提交者:
GitHub
2月 21, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add the support of fp16 in fusion_group (#22239)
上级
d97475d5
变更
16
隐藏空白更改
内联
并排
Showing
16 changed file
with
570 addition
and
194 deletion
+570
-194
paddle/fluid/framework/ir/fusion_group/code_generator.cc
paddle/fluid/framework/ir/fusion_group/code_generator.cc
+28
-7
paddle/fluid/framework/ir/fusion_group/code_generator.h
paddle/fluid/framework/ir/fusion_group/code_generator.h
+2
-2
paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
...e/fluid/framework/ir/fusion_group/code_generator_helper.h
+0
-25
paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
.../fluid/framework/ir/fusion_group/code_generator_tester.cc
+157
-124
paddle/fluid/framework/ir/fusion_group/cuda_resources.h
paddle/fluid/framework/ir/fusion_group/cuda_resources.h
+82
-0
paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
+19
-18
paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h
paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h
+1
-1
paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc
...uid/framework/ir/fusion_group/fusion_group_pass_tester.cc
+10
-0
paddle/fluid/framework/ir/fusion_group/operation.cc
paddle/fluid/framework/ir/fusion_group/operation.cc
+19
-4
paddle/fluid/framework/ir/fusion_group/subgraph.h
paddle/fluid/framework/ir/fusion_group/subgraph.h
+45
-1
paddle/fluid/framework/ir/pass_tester_helper.h
paddle/fluid/framework/ir/pass_tester_helper.h
+6
-3
paddle/fluid/operators/fused/fusion_group_op.cu.cc
paddle/fluid/operators/fused/fusion_group_op.cu.cc
+4
-3
paddle/fluid/platform/device_code.cc
paddle/fluid/platform/device_code.cc
+52
-3
paddle/fluid/platform/device_code.h
paddle/fluid/platform/device_code.h
+2
-2
python/paddle/fluid/tests/unittests/ir/CMakeLists.txt
python/paddle/fluid/tests/unittests/ir/CMakeLists.txt
+1
-1
python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py
...dle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py
+142
-0
未找到文件。
paddle/fluid/framework/ir/fusion_group/code_generator.cc
浏览文件 @
22bbd547
...
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include <sstream>
#include <unordered_set>
#include "paddle/fluid/framework/ir/fusion_group/code_generator_helper.h"
#include "paddle/fluid/framework/ir/fusion_group/cuda_resources.h"
#include "paddle/fluid/framework/ir/fusion_group/operation.h"
namespace
paddle
{
...
...
@@ -27,13 +28,14 @@ CodeGenerator::CodeGenerator() {
// Only support elementwise operations now.
code_templates_
.
resize
(
1
);
CodeTemplate
elementwise_t
(
elementwise_cuda_template
);
CodeTemplate
elementwise_t
(
cuda_kernel_template_1d
);
code_templates_
[
0
]
=
elementwise_t
;
}
std
::
string
CodeGenerator
::
Generate
(
SubGraph
*
subgraph
)
{
std
::
vector
<
OperationExpression
>
expressions
=
ConvertToExpressions
(
subgraph
);
return
Generate
(
subgraph
->
GetFuncName
(),
expressions
);
return
Generate
(
subgraph
->
GetFuncName
(),
subgraph
->
GetDataType
(),
expressions
);
}
static
bool
HasInput
(
Node
*
n
,
std
::
string
name
)
{
...
...
@@ -100,9 +102,9 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
// In order to get the right result of expression, we need to calculate and
// store the expression as suffix Expressions using vector.
std
::
string
CodeGenerator
::
Generate
(
std
::
string
func_name
,
std
::
vector
<
OperationExpression
>
expressions
)
{
std
::
string
func_name
,
std
::
string
dtype
,
const
std
::
vector
<
OperationExpression
>&
expressions
)
{
// TODO(liuyiqun): Check whether all expressions are elementwise operations.
std
::
string
dtype
=
"float"
;
std
::
set
<
int
>
input_ids
=
DistilInputIds
(
expressions
);
std
::
set
<
int
>
output_ids
=
DistilOutputIds
(
expressions
);
...
...
@@ -111,6 +113,15 @@ std::string CodeGenerator::Generate(
template_var
.
Add
(
"parameters"
,
EmitParameters
(
input_ids
,
output_ids
,
dtype
));
template_var
.
Add
(
"compute_body"
,
EmitComputeBody
(
expressions
,
input_ids
,
output_ids
,
dtype
));
std
::
string
predefined_cuda_functions
;
if
(
dtype
==
"float"
)
{
predefined_cuda_functions
=
predefined_cuda_functions_fp32
;
}
else
if
(
dtype
==
"double"
)
{
predefined_cuda_functions
=
predefined_cuda_functions_fp64
;
}
else
if
(
dtype
==
"float16"
)
{
predefined_cuda_functions
=
predefined_cuda_functions_fp16
;
}
return
predefined_cuda_functions
+
code_templates_
[
0
].
Format
(
template_var
);
}
...
...
@@ -173,9 +184,10 @@ std::string CodeGenerator::EmitComputeBody(
std
::
string
dtype
)
{
std
::
ostringstream
compute
;
std
::
unordered_set
<
int
>
used
;
std
::
string
compute_dtype
=
(
dtype
==
"float16"
)
?
"float"
:
dtype
;
for
(
size_t
i
=
0
;
i
<
expressions
.
size
();
i
++
)
{
VLOG
(
3
)
<<
DebugString
(
expressions
[
i
]);
compute
<<
expressions
[
i
].
GetExpression
(
dtype
,
&
used
);
compute
<<
expressions
[
i
].
GetExpression
(
compute_
dtype
,
&
used
);
}
// Load input to temporal variables.
...
...
@@ -183,14 +195,23 @@ std::string CodeGenerator::EmitComputeBody(
for
(
auto
id
:
input_ids
)
{
if
(
output_ids
.
find
(
id
)
==
output_ids
.
end
()
&&
used
.
find
(
id
)
!=
used
.
end
())
{
load
<<
dtype
<<
" "
<<
TmpName
(
id
)
<<
" = "
<<
ArgName
(
id
)
<<
"[idx];"
;
if
(
dtype
==
"float16"
)
{
load
<<
"float "
<<
TmpName
(
id
)
<<
" = __half2float("
<<
ArgName
(
id
)
<<
"[idx]);"
;
}
else
{
load
<<
dtype
<<
" "
<<
TmpName
(
id
)
<<
" = "
<<
ArgName
(
id
)
<<
"[idx];"
;
}
}
}
// Store temporal variables to memory.
std
::
ostringstream
store
;
for
(
auto
id
:
output_ids
)
{
store
<<
ArgName
(
id
)
<<
"[idx] = "
<<
TmpName
(
id
)
<<
";"
;
if
(
dtype
==
"float16"
)
{
store
<<
ArgName
(
id
)
<<
"[idx] = __float2half("
<<
TmpName
(
id
)
<<
");"
;
}
else
{
store
<<
ArgName
(
id
)
<<
"[idx] = "
<<
TmpName
(
id
)
<<
";"
;
}
}
return
load
.
str
()
+
compute
.
str
()
+
store
.
str
();
...
...
paddle/fluid/framework/ir/fusion_group/code_generator.h
浏览文件 @
22bbd547
...
...
@@ -30,8 +30,8 @@ class CodeGenerator {
public:
CodeGenerator
();
std
::
string
Generate
(
std
::
string
func_name
,
std
::
vector
<
OperationExpression
>
expressions
);
std
::
string
Generate
(
std
::
string
func_name
,
std
::
string
dtype
,
const
std
::
vector
<
OperationExpression
>&
expressions
);
std
::
string
Generate
(
SubGraph
*
subgraph
);
...
...
paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
浏览文件 @
22bbd547
...
...
@@ -149,31 +149,6 @@ class CodeTemplate {
std
::
string
template_str_
;
};
static
const
char
predefined_cuda_functions
[]
=
R"(
__device__ float real_exp(float x) { return ::expf(x); }
__device__ double real_exp(double x) { return ::exp(x); }
__device__ float real_log(float x) { return ::logf(x); }
__device__ double real_log(double x) { return ::log(x); }
__device__ float real_min(float x, float y) { return ::fminf(x, y); }
__device__ double real_min(double x, double y) { return ::fmin(x, y); }
__device__ float real_max(float x, float y) { return ::fmaxf(x, y); }
__device__ double real_max(double x, double y) { return ::fmax(x, y); }
)"
;
static
const
char
elementwise_cuda_template
[]
=
R"(
extern "C" __global__ void $func_name($parameters) {
for(int idx = blockIdx.x * blockDim.x + threadIdx.x;
idx < N;
idx += gridDim.x * blockDim.x) {
$compute_body
}
}
)"
;
static
std
::
string
DebugString
(
const
OperationExpression
&
expr
)
{
std
::
stringstream
ret
;
ret
<<
"Op("
<<
expr
.
GetOpType
()
<<
"), inputs:{"
;
...
...
paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
浏览文件 @
22bbd547
...
...
@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/platform/device_code.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/init.h"
#ifdef PADDLE_WITH_CUDA
...
...
@@ -88,7 +89,8 @@ inline float elementwise_mul_grad_dy(float x, float y, float out, float dout) {
void
CheckOutput
(
const
std
::
vector
<
OperationExpression
>&
expressions
,
const
std
::
vector
<
LoDTensor
>
cpu_tensors
,
const
std
::
vector
<
int
>
input_ids_of_subgraph
,
const
std
::
vector
<
int
>
output_ids_of_subgraph
,
int
i
)
{
const
std
::
vector
<
int
>
output_ids_of_subgraph
,
int
i
,
float
eps
)
{
std
::
vector
<
float
>
var
(
cpu_tensors
.
size
());
for
(
auto
id
:
input_ids_of_subgraph
)
{
if
(
id
>=
0
)
{
...
...
@@ -138,7 +140,12 @@ void CheckOutput(const std::vector<OperationExpression>& expressions,
for
(
auto
id
:
output_ids_of_subgraph
)
{
float
actual
=
cpu_tensors
[
id
].
data
<
float
>
()[
i
];
float
expect
=
var
[
id
];
EXPECT_LT
(
fabs
(
actual
-
expect
),
1.E-05
);
if
(
fabs
(
actual
-
expect
)
>
eps
)
{
LOG
(
INFO
)
<<
"Precision check failed from i = "
<<
id
<<
", expect: "
<<
expect
<<
", actual: "
<<
actual
;
EXPECT_LT
(
fabs
(
actual
-
expect
),
eps
);
break
;
}
}
}
...
...
@@ -162,33 +169,49 @@ void SetupRandomCPUTensor(LoDTensor* tensor) {
namespace
fusion_group
=
paddle
::
framework
::
ir
::
fusion_group
;
template
<
typename
T
>
void
TestMainImpl
(
std
::
string
func_name
,
std
::
string
code_str
,
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
cpu_tensors
,
int
n
,
std
::
vector
<
int
>
input_ids
,
std
::
vector
<
int
>
output_ids
)
{
bool
is_float16
=
std
::
type_index
(
typeid
(
T
))
==
std
::
type_index
(
typeid
(
paddle
::
platform
::
float16
));
paddle
::
framework
::
InitDevices
(
false
,
{
0
});
paddle
::
platform
::
CUDAPlace
place
=
paddle
::
platform
::
CUDAPlace
(
0
);
paddle
::
platform
::
CUDADeviceCode
device_code
(
place
,
func_name
,
code_str
);
device_code
.
Compile
();
device_code
.
Compile
(
is_float16
);
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
gpu_tensors
(
cpu_tensors
.
size
());
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
tmp_cpu_tensors
(
cpu_tensors
.
size
());
std
::
vector
<
float
*>
gpu_ptrs
(
gpu_tensors
.
size
());
std
::
vector
<
T
*>
gpu_ptrs
(
gpu_tensors
.
size
());
std
::
vector
<
void
*>
args
;
args
.
push_back
(
&
n
);
for
(
auto
id
:
input_ids
)
{
if
(
id
>=
0
)
{
gpu_ptrs
[
id
]
=
gpu_tensors
[
id
].
mutable_data
<
float
>
(
cpu_tensors
[
id
].
dims
(),
place
);
gpu_tensors
[
id
].
mutable_data
<
T
>
(
cpu_tensors
[
id
].
dims
(),
place
);
fusion_group
::
SetupRandomCPUTensor
<
float
>
(
&
cpu_tensors
[
id
]);
TensorCopySync
(
cpu_tensors
[
id
],
place
,
&
gpu_tensors
[
id
]);
if
(
is_float16
)
{
paddle
::
platform
::
float16
*
tmp_cpu_ptr
=
tmp_cpu_tensors
[
id
].
mutable_data
<
paddle
::
platform
::
float16
>
(
cpu_tensors
[
id
].
dims
(),
paddle
::
platform
::
CPUPlace
());
const
float
*
cpu_ptr
=
cpu_tensors
[
id
].
data
<
float
>
();
for
(
int64_t
i
=
0
;
i
<
cpu_tensors
[
id
].
numel
();
++
i
)
{
tmp_cpu_ptr
[
i
]
=
paddle
::
platform
::
float16
(
cpu_ptr
[
i
]);
}
TensorCopySync
(
tmp_cpu_tensors
[
id
],
place
,
&
gpu_tensors
[
id
]);
}
else
{
TensorCopySync
(
cpu_tensors
[
id
],
place
,
&
gpu_tensors
[
id
]);
}
args
.
push_back
(
&
gpu_ptrs
[
id
]);
}
}
for
(
auto
id
:
output_ids
)
{
gpu_ptrs
[
id
]
=
gpu_tensors
[
id
].
mutable_data
<
float
>
(
cpu_tensors
[
id
].
dims
(),
place
);
gpu_tensors
[
id
].
mutable_data
<
T
>
(
cpu_tensors
[
id
].
dims
(),
place
);
args
.
push_back
(
&
gpu_ptrs
[
id
]);
}
...
...
@@ -200,38 +223,93 @@ void TestMainImpl(std::string func_name, std::string code_str,
paddle
::
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
dev_ctx
->
Wait
();
// Copy the results back to CPU.
for
(
auto
id
:
output_ids
)
{
TensorCopySync
(
gpu_tensors
[
id
],
paddle
::
platform
::
CPUPlace
(),
&
cpu_tensors
[
id
]);
if
(
is_float16
)
{
paddle
::
platform
::
float16
*
tmp_cpu_ptr
=
tmp_cpu_tensors
[
id
].
mutable_data
<
paddle
::
platform
::
float16
>
(
cpu_tensors
[
id
].
dims
(),
paddle
::
platform
::
CPUPlace
());
TensorCopySync
(
gpu_tensors
[
id
],
paddle
::
platform
::
CPUPlace
(),
&
tmp_cpu_tensors
[
id
]);
float
*
cpu_ptr
=
cpu_tensors
[
id
].
mutable_data
<
float
>
(
cpu_tensors
[
id
].
dims
(),
paddle
::
platform
::
CPUPlace
());
for
(
int64_t
i
=
0
;
i
<
cpu_tensors
[
id
].
numel
();
++
i
)
{
cpu_ptr
[
i
]
=
static_cast
<
float
>
(
tmp_cpu_ptr
[
i
]);
}
}
else
{
TensorCopySync
(
gpu_tensors
[
id
],
paddle
::
platform
::
CPUPlace
(),
&
cpu_tensors
[
id
]);
}
}
}
void
TestElementwiseMain
(
std
::
string
func_name
,
std
::
string
code_str
,
std
::
vector
<
fusion_group
::
OperationExpression
>
expressions
,
std
::
vector
<
int
>
input_ids
,
std
::
vector
<
int
>
output_ids
,
std
::
string
dtype
)
{
std
::
unordered_set
<
int
>
ids
;
for
(
auto
id
:
input_ids
)
{
ids
.
insert
(
id
);
}
for
(
auto
id
:
output_ids
)
{
ids
.
insert
(
id
);
}
// Prepare CPU tensors which always hold float.
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
cpu_tensors
(
ids
.
size
());
auto
dims
=
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
256
),
static_cast
<
int64_t
>
(
1024
)});
for
(
size_t
i
=
0
;
i
<
cpu_tensors
.
size
();
++
i
)
{
cpu_tensors
[
i
].
mutable_data
<
float
>
(
dims
,
paddle
::
platform
::
CPUPlace
());
}
int
n
=
cpu_tensors
[
0
].
numel
();
if
(
dtype
==
"float16"
)
{
TestMainImpl
<
paddle
::
platform
::
float16
>
(
func_name
,
code_str
,
cpu_tensors
,
n
,
input_ids
,
output_ids
);
}
else
{
TestMainImpl
<
float
>
(
func_name
,
code_str
,
cpu_tensors
,
n
,
input_ids
,
output_ids
);
}
// Check the results
float
eps
=
(
dtype
==
"float16"
)
?
1E-2
:
1E-5
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
fusion_group
::
CheckOutput
(
expressions
,
cpu_tensors
,
input_ids
,
output_ids
,
i
,
eps
);
}
}
void
TestMain
(
std
::
string
func_name
,
std
::
vector
<
fusion_group
::
OperationExpression
>
expressions
,
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
cpu_tensors
,
int
n
,
std
::
vector
<
int
>
input_ids
,
std
::
vector
<
int
>
output_ids
)
{
std
::
vector
<
int
>
input_ids
,
std
::
vector
<
int
>
output_ids
,
std
::
string
dtype
)
{
fusion_group
::
OperationMap
::
Init
();
fusion_group
::
CodeGenerator
code_generator
;
std
::
string
code_str
=
code_generator
.
Generate
(
func_name
,
expressions
);
std
::
string
code_str
=
code_generator
.
Generate
(
func_name
,
dtype
,
expressions
);
VLOG
(
3
)
<<
code_str
;
TestMainImpl
(
func_name
,
code_str
,
cpu_tensors
,
n
,
input_ids
,
output_ids
);
LOG
(
INFO
)
<<
"dtype: "
<<
dtype
;
TestElementwiseMain
(
func_name
,
code_str
,
expressions
,
input_ids
,
output_ids
,
dtype
);
}
std
::
vector
<
fusion_group
::
OperationExpression
>
TestMain
(
fusion_group
::
SubGraph
*
subgraph
,
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
cpu_tensors
,
int
n
,
std
::
vector
<
int
>
input_ids
,
std
::
vector
<
int
>
output_ids
)
{
void
TestMain
(
fusion_group
::
SubGraph
*
subgraph
,
std
::
vector
<
int
>
input_ids
,
std
::
vector
<
int
>
output_ids
)
{
fusion_group
::
OperationMap
::
Init
();
fusion_group
::
CodeGenerator
code_generator
;
std
::
string
code_str
=
code_generator
.
Generate
(
subgraph
);
VLOG
(
3
)
<<
code_str
;
TestMainImpl
(
subgraph
->
GetFuncName
(),
code_str
,
cpu_tensors
,
n
,
input_ids
,
output_ids
);
// Need to check the accuracy according to expressions.
return
code_generator
.
ConvertToExpressions
(
subgraph
);
std
::
vector
<
fusion_group
::
OperationExpression
>
expressions
=
code_generator
.
ConvertToExpressions
(
subgraph
);
LOG
(
INFO
)
<<
"dtype: "
<<
subgraph
->
GetDataType
();
TestElementwiseMain
(
subgraph
->
GetFuncName
(),
code_str
,
expressions
,
input_ids
,
output_ids
,
subgraph
->
GetDataType
());
}
TEST
(
code_generator
,
elementwise
)
{
...
...
@@ -248,30 +326,16 @@ TEST(code_generator, elementwise) {
std
::
vector
<
fusion_group
::
OperationExpression
>
expressions
=
{
exp1
,
exp2
,
exp3
,
exp4
,
exp5
};
// Prepare CPU tensors
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
cpu_tensors
(
9
);
auto
dims
=
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
256
),
static_cast
<
int64_t
>
(
1024
)});
for
(
size_t
i
=
0
;
i
<
cpu_tensors
.
size
();
++
i
)
{
cpu_tensors
[
i
].
mutable_data
<
float
>
(
dims
,
paddle
::
platform
::
CPUPlace
());
}
// Expressions:
// Op(elementwise_mul), inputs:{0,1}, outputs:{2}
// Op(elementwise_add), inputs:{2,3}, outputs:{4}
// Op(elementwise_sub), inputs:{4,5}, outputs:{6}
// Op(relu), inputs:{6}, outputs:{7}
// Op(sigmoid), inputs:{7}, outputs:{8}
int
n
=
cpu_tensors
[
0
].
numel
();
std
::
vector
<
int
>
input_ids
=
{
0
,
1
,
3
,
5
};
std
::
vector
<
int
>
output_ids
=
{
2
,
4
,
6
,
7
,
8
};
TestMain
(
"elementwise_kernel_0"
,
expressions
,
cpu_tensors
,
n
,
input_ids
,
output_ids
);
// Check the results
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
fusion_group
::
CheckOutput
(
expressions
,
cpu_tensors
,
input_ids
,
output_ids
,
i
);
for
(
std
::
string
dtype
:
{
"float"
,
"float16"
})
{
// Expressions:
// Op(elementwise_mul), inputs:{0,1}, outputs:{2}
// Op(elementwise_add), inputs:{2,3}, outputs:{4}
// Op(elementwise_sub), inputs:{4,5}, outputs:{6}
// Op(relu), inputs:{6}, outputs:{7}
// Op(sigmoid), inputs:{7}, outputs:{8}
std
::
vector
<
int
>
input_ids
=
{
0
,
1
,
3
,
5
};
std
::
vector
<
int
>
output_ids
=
{
2
,
4
,
6
,
7
,
8
};
TestMain
(
"elementwise_kernel_0"
,
expressions
,
input_ids
,
output_ids
,
dtype
);
}
}
...
...
@@ -286,32 +350,19 @@ TEST(code_generator, elementwise_grad) {
{
4
,
5
});
std
::
vector
<
fusion_group
::
OperationExpression
>
expressions
=
{
exp1
,
exp2
};
// Prepare CPU tensors
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
cpu_tensors
(
8
);
auto
dims
=
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
256
),
static_cast
<
int64_t
>
(
1024
)});
for
(
size_t
i
=
0
;
i
<
cpu_tensors
.
size
();
++
i
)
{
cpu_tensors
[
i
].
mutable_data
<
float
>
(
dims
,
paddle
::
platform
::
CPUPlace
());
}
// Expressions:
// Op(relu_grad), inputs:{2,3,7}, outputs:{6}
// Op(elementwise_mul_grad), inputs:{0,1,2,6}, outputs:{4,5}
int
n
=
cpu_tensors
[
0
].
numel
();
std
::
vector
<
int
>
input_ids
=
{
0
,
1
,
2
,
3
,
7
};
std
::
vector
<
int
>
output_ids
=
{
4
,
5
,
6
};
TestMain
(
"elementwise_grad_kernel_0"
,
expressions
,
cpu_tensors
,
n
,
input_ids
,
output_ids
);
// Check the results
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
fusion_group
::
CheckOutput
(
expressions
,
cpu_tensors
,
input_ids
,
output_ids
,
i
);
for
(
std
::
string
dtype
:
{
"float"
,
"float16"
})
{
// Expressions:
// Op(relu_grad), inputs:{2,3,7}, outputs:{6}
// Op(elementwise_mul_grad), inputs:{0,1,2,6}, outputs:{4,5}
std
::
vector
<
int
>
input_ids
=
{
0
,
1
,
2
,
3
,
7
};
std
::
vector
<
int
>
output_ids
=
{
4
,
5
,
6
};
TestMain
(
"elementwise_grad_kernel_0"
,
expressions
,
input_ids
,
output_ids
,
dtype
);
}
}
std
::
unique_ptr
<
paddle
::
framework
::
ir
::
Graph
>
BuildGraph
(
bool
backward
=
fals
e
)
{
std
::
unique_ptr
<
paddle
::
framework
::
ir
::
Graph
>
BuildGraph
(
bool
backward
,
std
::
string
dtyp
e
)
{
// inputs operator output
// --------------------------------------------------------
// x0 sigmoid -> tmp_0
...
...
@@ -353,6 +404,14 @@ std::unique_ptr<paddle::framework::ir::Graph> BuildGraph(
std
::
unique_ptr
<
paddle
::
framework
::
ir
::
Graph
>
graph
(
new
paddle
::
framework
::
ir
::
Graph
(
layers
.
main_program
()));
auto
proto_dtype
=
(
dtype
==
"float16"
)
?
paddle
::
framework
::
proto
::
VarType
::
FP16
:
paddle
::
framework
::
proto
::
VarType
::
FP32
;
for
(
auto
*
n
:
graph
->
Nodes
())
{
if
(
n
&&
n
->
IsVar
()
&&
n
->
Var
())
{
n
->
Var
()
->
SetDataType
(
proto_dtype
);
}
}
#ifdef __clang__
return
graph
;
#else
...
...
@@ -401,66 +460,40 @@ std::unordered_set<paddle::framework::ir::Node*> DistilGradNodes(
}
TEST
(
code_generator
,
subgraph
)
{
std
::
unique_ptr
<
paddle
::
framework
::
ir
::
Graph
>
graph
=
BuildGraph
(
false
);
fusion_group
::
SubGraph
subgraph
(
0
,
"elementwise_kernel_1"
,
true
,
graph
->
Nodes
());
// Prepare CPU tensors
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
cpu_tensors
(
9
);
auto
dims
=
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
256
),
static_cast
<
int64_t
>
(
1024
)});
for
(
size_t
i
=
0
;
i
<
cpu_tensors
.
size
();
++
i
)
{
cpu_tensors
[
i
].
mutable_data
<
float
>
(
dims
,
paddle
::
platform
::
CPUPlace
());
}
// Expressions generated by code_generator (they may be different):
// Op(sigmoid), inputs:{0}, outputs:{4}
// Op(elementwise_mul), inputs:{4,1}, outputs:{7}
// Op(tanh), inputs:{2}, outputs:{5}
// Op(elementwise_mul), inputs:{3,5}, outputs:{6}
// Op(elementwise_add), inputs:{7,6}, outputs:{8}
int
n
=
cpu_tensors
[
0
].
numel
();
std
::
vector
<
int
>
input_ids
=
{
0
,
1
,
2
,
3
};
std
::
vector
<
int
>
output_ids
=
{
4
,
5
,
6
,
7
,
8
};
std
::
vector
<
fusion_group
::
OperationExpression
>
expressions
=
TestMain
(
&
subgraph
,
cpu_tensors
,
n
,
input_ids
,
output_ids
);
// Check the results
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
fusion_group
::
CheckOutput
(
expressions
,
cpu_tensors
,
input_ids
,
output_ids
,
i
);
for
(
std
::
string
dtype
:
{
"float"
,
"float16"
})
{
std
::
unique_ptr
<
paddle
::
framework
::
ir
::
Graph
>
graph
=
BuildGraph
(
false
,
dtype
);
fusion_group
::
SubGraph
subgraph
(
0
,
"elementwise_kernel_1"
,
true
,
graph
->
Nodes
());
// Expressions generated by code_generator (they may be different):
// Op(sigmoid), inputs:{0}, outputs:{4}
// Op(elementwise_mul), inputs:{4,1}, outputs:{7}
// Op(tanh), inputs:{2}, outputs:{5}
// Op(elementwise_mul), inputs:{3,5}, outputs:{6}
// Op(elementwise_add), inputs:{7,6}, outputs:{8}
std
::
vector
<
int
>
input_ids
=
{
0
,
1
,
2
,
3
};
std
::
vector
<
int
>
output_ids
=
{
4
,
5
,
6
,
7
,
8
};
TestMain
(
&
subgraph
,
input_ids
,
output_ids
);
}
}
TEST
(
code_generator
,
subgraph_grad
)
{
std
::
unique_ptr
<
paddle
::
framework
::
ir
::
Graph
>
graph
=
BuildGraph
(
true
);
fusion_group
::
SubGraph
subgraph
(
0
,
"elementwise_grad_kernel_1"
,
true
,
DistilGradNodes
(
graph
));
// Prepare CPU tensors
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
cpu_tensors
(
18
);
auto
dims
=
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
256
),
static_cast
<
int64_t
>
(
1024
)});
for
(
size_t
i
=
0
;
i
<
cpu_tensors
.
size
();
++
i
)
{
cpu_tensors
[
i
].
mutable_data
<
float
>
(
dims
,
paddle
::
platform
::
CPUPlace
());
}
// Expressions generated by code_generator (they may be different):
// Op(elementwise_add_grad), inputs:{1,2,3,0}, outputs:{11,10}
// Op(elementwise_mul_grad), inputs:{5,4,2,10}, outputs:{17,13}
// Op(elementwise_mul_grad), inputs:{7,6,1,11}, outputs:{12,15}
// Op(sigmoid_grad), inputs:{8,7,12}, outputs:{16}
// Op(tanh_grad), inputs:{9,4,13}, outputs:{14}
int
n
=
cpu_tensors
[
0
].
numel
();
std
::
vector
<
int
>
input_ids
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
};
std
::
vector
<
int
>
output_ids
=
{
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
};
std
::
vector
<
fusion_group
::
OperationExpression
>
expressions
=
TestMain
(
&
subgraph
,
cpu_tensors
,
n
,
input_ids
,
output_ids
);
// Check the results
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
fusion_group
::
CheckOutput
(
expressions
,
cpu_tensors
,
input_ids
,
output_ids
,
i
);
for
(
std
::
string
dtype
:
{
"float"
,
"float16"
})
{
std
::
unique_ptr
<
paddle
::
framework
::
ir
::
Graph
>
graph
=
BuildGraph
(
true
,
dtype
);
fusion_group
::
SubGraph
subgraph
(
0
,
"elementwise_grad_kernel_1"
,
true
,
DistilGradNodes
(
graph
));
// Expressions generated by code_generator (they may be different):
// Op(elementwise_add_grad), inputs:{1,2,3,0}, outputs:{11,10}
// Op(elementwise_mul_grad), inputs:{5,4,2,10}, outputs:{17,13}
// Op(elementwise_mul_grad), inputs:{7,6,1,11}, outputs:{12,15}
// Op(sigmoid_grad), inputs:{8,7,12}, outputs:{16}
// Op(tanh_grad), inputs:{9,4,13}, outputs:{14}
std
::
vector
<
int
>
input_ids
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
};
std
::
vector
<
int
>
output_ids
=
{
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
};
TestMain
(
&
subgraph
,
input_ids
,
output_ids
);
}
}
#endif
paddle/fluid/framework/ir/fusion_group/cuda_resources.h
0 → 100644
浏览文件 @
22bbd547
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace
paddle
{
namespace
framework
{
namespace
ir
{
namespace
fusion_group
{
static
constexpr
char
predefined_cuda_functions_fp32
[]
=
R"(
__device__ inline float real_exp(float x) { return ::expf(x); }
__device__ inline float real_log(float x) { return ::logf(x); }
)"
;
static
constexpr
char
predefined_cuda_functions_fp64
[]
=
R"(
__device__ inline double real_exp(double x) { return ::exp(x); }
__device__ inline double real_log(double x) { return ::log(x); }
)"
;
static
constexpr
char
predefined_cuda_functions_fp16
[]
=
R"(
__device__ inline float real_exp(float x) { return ::expf(x); }
__device__ inline float real_log(float x) { return ::logf(x); }
#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
struct __align__(2) __half {
__device__ __half() { }
protected:
unsigned short __x;
};
__device__ __half __float2half(const float f) {
__half val;
asm("{ cvt.rn.f16.f32 %0, %1; }\n" : "=h"(__HALF_TO_US(val)
) : "f"(f));
return val;
}
__device__ float __half2float(const __half h) {
float val;
asm("{ cvt.f32.f16 %0, %1; }\n" : "=f"(val) : "h"(__HALF_TO_CUS(h)));
return val;
}
#undef __HALF_TO_US
#undef __HALF_TO_CUS
typedef __half float16;
)"
;
static
constexpr
char
cuda_kernel_template_1d
[]
=
R"(
extern "C" __global__ void $func_name($parameters) {
for(int idx = blockIdx.x * blockDim.x + threadIdx.x;
idx < N;
idx += gridDim.x * blockDim.x) {
$compute_body
}
}
)"
;
}
// namespace fusion_group
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
浏览文件 @
22bbd547
...
...
@@ -32,8 +32,7 @@ void FusionGroupPass::ApplyImpl(ir::Graph* graph) const {
if
(
Get
<
bool
>
(
"use_gpu"
))
{
fusion_group
::
OperationMap
::
Init
();
int
num_elementwise_groups
=
DetectFusionGroup
(
graph
,
0
);
VLOG
(
3
)
<<
"Detect "
<<
num_elementwise_groups
<<
" elementwise fusion groups."
;
AddStatis
(
num_elementwise_groups
);
}
}
...
...
@@ -49,23 +48,23 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const {
size_t
min_subgraph_size
=
2
;
bool
save_intermediate_out
=
true
;
for
(
auto
&
vec
:
subgraphs
)
{
if
(
vec
.
size
()
>=
min_subgraph_size
)
{
std
::
string
func_name
=
"fused_elementwise_"
+
std
::
to_string
(
index
++
);
fusion_group
::
SubGraph
subgraph
(
type
,
func_name
,
save_intermediate_out
,
std
::
unordered_set
<
Node
*>
(
vec
.
begin
(),
vec
.
end
()));
VLOG
(
3
)
<<
"subgraph: {
\n
"
<<
DebugString
(
subgraph
.
SortedNodes
())
<<
"}
\n
"
;
GenerateCode
(
&
subgraph
);
InsertFusionGroupOp
(
graph
,
&
subgraph
)
;
num_subgraphs
++
;
fusion_group
::
SubGraph
subgraph
(
type
,
""
,
save_intermediate_out
,
std
::
unordered_set
<
Node
*>
(
vec
.
begin
(),
vec
.
end
()));
VLOG
(
3
)
<<
"subgraph: {
\n
"
<<
DebugString
(
subgraph
.
SortedNodes
())
<<
"}
\n
"
;
if
(
subgraph
.
IsValid
(
min_subgraph_size
))
{
subgraph
.
SetFuncName
(
"fused_elementwise_"
+
std
::
to_string
(
index
++
))
;
if
(
GenerateCode
(
&
subgraph
))
{
InsertFusionGroupOp
(
graph
,
&
subgraph
);
num_subgraphs
++
;
}
}
}
return
num_subgraphs
;
}
void
FusionGroupPass
::
GenerateCode
(
fusion_group
::
SubGraph
*
subgraph
)
const
{
bool
FusionGroupPass
::
GenerateCode
(
fusion_group
::
SubGraph
*
subgraph
)
const
{
fusion_group
::
CodeGenerator
code_generator
;
std
::
string
code_str
=
code_generator
.
Generate
(
subgraph
);
VLOG
(
3
)
<<
code_str
;
...
...
@@ -74,10 +73,12 @@ void FusionGroupPass::GenerateCode(fusion_group::SubGraph* subgraph) const {
platform
::
CUDAPlace
place
=
platform
::
CUDAPlace
(
0
);
std
::
unique_ptr
<
platform
::
CUDADeviceCode
>
device_code
(
new
platform
::
CUDADeviceCode
(
place
,
subgraph
->
GetFuncName
(),
code_str
));
device_code
->
Compile
();
platform
::
DeviceCodePool
&
pool
=
platform
::
DeviceCodePool
::
Init
({
place
});
pool
.
Set
(
std
::
move
(
device_code
));
bool
is_compiled
=
device_code
->
Compile
();
if
(
is_compiled
)
{
platform
::
DeviceCodePool
&
pool
=
platform
::
DeviceCodePool
::
Init
({
place
});
pool
.
Set
(
std
::
move
(
device_code
));
}
return
is_compiled
;
}
static
int
ExtractOpRole
(
fusion_group
::
SubGraph
*
subgraph
)
{
...
...
paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h
浏览文件 @
22bbd547
...
...
@@ -29,7 +29,7 @@ class FusionGroupPass : public FusePassBase {
private:
int
DetectFusionGroup
(
Graph
*
graph
,
int
type
=
0
)
const
;
void
GenerateCode
(
fusion_group
::
SubGraph
*
subgraph
)
const
;
bool
GenerateCode
(
fusion_group
::
SubGraph
*
subgraph
)
const
;
void
InsertFusionGroupOp
(
Graph
*
graph
,
fusion_group
::
SubGraph
*
subgraph
)
const
;
...
...
paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc
浏览文件 @
22bbd547
...
...
@@ -59,6 +59,11 @@ std::unique_ptr<Graph> BuildElementwiseListGraph(bool backward = false) {
}
std
::
unique_ptr
<
Graph
>
graph
(
new
Graph
(
layers
.
main_program
()));
for
(
auto
*
n
:
graph
->
Nodes
())
{
if
(
n
&&
n
->
IsVar
()
&&
n
->
Var
())
{
n
->
Var
()
->
SetDataType
(
proto
::
VarType
::
FP32
);
}
}
#ifdef __clang__
return
graph
;
#else
...
...
@@ -116,6 +121,11 @@ std::unique_ptr<Graph> BuildElementwiseTreeGraph(bool backward = false) {
}
std
::
unique_ptr
<
Graph
>
graph
(
new
Graph
(
layers
.
main_program
()));
for
(
auto
*
n
:
graph
->
Nodes
())
{
if
(
n
&&
n
->
IsVar
()
&&
n
->
Var
())
{
n
->
Var
()
->
SetDataType
(
proto
::
VarType
::
FP32
);
}
}
#ifdef __clang__
return
graph
;
#else
...
...
paddle/fluid/framework/ir/fusion_group/operation.cc
浏览文件 @
22bbd547
...
...
@@ -91,7 +91,7 @@ void OperationMap::InsertUnaryElementwiseOperations() {
// relu:
// out = f(x) = x > 0 ? x : 0
// dx = dout * (out > 0 ? 1 : 0)
insert_handler
(
"relu"
,
"
real_max(${0}, 0)
"
,
{
"${1} > 0 ? ${2} : 0"
});
insert_handler
(
"relu"
,
"
${0} > 0 ? ${0} : 0
"
,
{
"${1} > 0 ? ${2} : 0"
});
// sigmoid:
// out = f(x) = 1.0 / (1.0 + exp(-x))
// dx = dout * out * (1 - out)
...
...
@@ -133,9 +133,24 @@ void OperationMap::InsertBinaryElementwiseOperations() {
// dy = dout * x
insert_handler
(
"elementwise_mul"
,
"${0} * ${1}"
,
{
"${3} * ${1}"
,
"${3} * ${0}"
});
insert_handler
(
"elementwise_div"
,
"${0} / ${1}"
,
{});
insert_handler
(
"elementwise_min"
,
"real_min(${0}, ${1})"
,
{});
insert_handler
(
"elementwise_max"
,
"real_max(${0}, ${1})"
,
{});
// elementwise_div:
// out = x / y
// dx = dout / y
// dy = - dout * out / y
insert_handler
(
"elementwise_div"
,
"${0} / ${1}"
,
{
"${3} / ${1}"
,
"- ${3} * ${2} / ${1}"
});
// elementwise_min:
// out = x < y ? x : y
// dx = dout * (x < y)
// dy = dout * (x >= y)
insert_handler
(
"elementwise_min"
,
"${0} < ${1} ? ${0} : ${1}"
,
{
"${3} * (${0} < ${1})"
,
"${3} * (${0} >= ${1})"
});
// elementwise_max:
// out = x > y ? x : y
// dx = dout * (x > y)
// dy = dout * (x <= y)
insert_handler
(
"elementwise_max"
,
"${0} > ${1} ? ${0} : ${1}"
,
{
"${3} * (${0} > ${1})"
,
"${3} * (${0} <= ${1})"
});
}
}
// namespace fusion_group
...
...
paddle/fluid/framework/ir/fusion_group/subgraph.h
浏览文件 @
22bbd547
...
...
@@ -49,11 +49,23 @@ class SubGraph {
}
}
}
ExtractDataType
();
}
bool
IsEmpty
()
{
return
nodes_set_
.
empty
();
}
bool
IsValid
(
int
min_subgraph_size
)
{
int
num_operations
=
GetNumOperations
();
if
(
num_operations
<
min_subgraph_size
)
{
VLOG
(
2
)
<<
"There are only "
<<
num_operations
<<
" operations in the subgraph. Expected at least "
<<
min_subgraph_size
;
return
false
;
}
return
ExtractDataType
();
}
int
GetType
()
const
{
return
type_
;
}
std
::
string
GetDataType
()
const
{
return
data_type_
;
}
void
SetFuncName
(
std
::
string
func_name
)
{
func_name_
=
func_name
;
}
std
::
string
GetFuncName
()
const
{
return
func_name_
;
}
...
...
@@ -150,6 +162,37 @@ class SubGraph {
}
private:
bool
ExtractDataType
()
{
bool
is_first
=
true
;
proto
::
VarType
::
Type
data_type
=
proto
::
VarType
::
FP32
;
for
(
auto
*
n
:
nodes_set_
)
{
if
(
n
&&
n
->
IsVar
()
&&
n
->
Var
())
{
if
(
n
->
Var
()
->
GetType
()
!=
proto
::
VarType
::
LOD_TENSOR
)
{
// All var node in a subgraph should hold a LoDTensor.
return
false
;
}
if
(
is_first
)
{
data_type
=
n
->
Var
()
->
GetDataType
();
is_first
=
false
;
}
else
if
(
n
->
Var
()
->
GetDataType
()
!=
data_type
)
{
// DataType of VarDesc in a subgraph is not the same.
return
false
;
}
}
}
if
(
data_type
==
proto
::
VarType
::
FP32
)
{
data_type_
=
"float"
;
}
else
if
(
data_type
==
proto
::
VarType
::
FP64
)
{
data_type_
=
"double"
;
}
else
if
(
data_type
==
proto
::
VarType
::
FP16
)
{
data_type_
=
"float16"
;
}
else
{
VLOG
(
2
)
<<
"Only support fp32, fp64 and fp16 in fusion_group."
;
return
false
;
}
return
true
;
}
void
TopologicalSort
()
{
if
(
!
is_sorted_
)
{
std
::
unordered_map
<
Node
*
,
std
::
vector
<
Node
*>>
inputs_map
;
...
...
@@ -203,6 +246,7 @@ class SubGraph {
private:
int
type_
{
-
1
};
std
::
string
data_type_
;
std
::
string
func_name_
;
bool
save_intermediate_out_
{
true
};
...
...
paddle/fluid/framework/ir/pass_tester_helper.h
浏览文件 @
22bbd547
...
...
@@ -33,8 +33,9 @@ struct Layers {
const
ProgramDesc
&
main_program
()
{
return
program_
;
}
VarDesc
*
data
(
std
::
string
name
,
std
::
vector
<
int64_t
>
shape
=
{},
bool
is_persistable
=
false
)
{
return
lod_tensor
(
name
,
shape
,
is_persistable
);
bool
is_persistable
=
false
,
proto
::
VarType
::
Type
data_type
=
proto
::
VarType
::
FP32
)
{
return
lod_tensor
(
name
,
shape
,
is_persistable
,
data_type
);
}
VarDesc
*
conv2d
(
VarDesc
*
input
,
VarDesc
*
filter
,
VarDesc
*
bias
,
...
...
@@ -379,9 +380,11 @@ struct Layers {
private:
VarDesc
*
lod_tensor
(
std
::
string
name
,
std
::
vector
<
int64_t
>
shape
=
{},
bool
is_persistable
=
false
)
{
bool
is_persistable
=
false
,
proto
::
VarType
::
Type
data_type
=
proto
::
VarType
::
FP32
)
{
auto
*
var
=
program_
.
MutableBlock
(
0
)
->
Var
(
name
);
var
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
var
->
SetDataType
(
data_type
);
var
->
SetShape
(
shape
);
var
->
SetPersistable
(
is_persistable
);
return
var
;
...
...
paddle/fluid/operators/fused/fusion_group_op.cu.cc
浏览文件 @
22bbd547
...
...
@@ -13,10 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_group_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
fusion_group
,
ops
::
FusionGroupKernel
<
p
addle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
FusionGroupKernel
<
p
addle
::
platform
::
CUDADeviceContext
,
float
>
);
fusion_group
,
ops
::
FusionGroupKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
FusionGroupKernel
<
p
lat
::
CUDADeviceContext
,
double
>
,
ops
::
FusionGroupKernel
<
p
lat
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/platform/device_code.cc
浏览文件 @
22bbd547
...
...
@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device_code.h"
#include <sys/stat.h>
#include <algorithm>
#include <set>
#include <utility>
#include "paddle/fluid/platform/enforce.h"
DECLARE_string
(
cuda_dir
);
namespace
paddle
{
namespace
platform
{
...
...
@@ -79,6 +82,46 @@ DeviceCodePool::DeviceCodePool(const std::vector<platform::Place>& places) {
}
#ifdef PADDLE_WITH_CUDA
static
std
::
string
FindCUDAIncludePath
()
{
auto
EndWith
=
[](
std
::
string
str
,
std
::
string
substr
)
->
bool
{
size_t
pos
=
str
.
rfind
(
substr
);
return
pos
!=
std
::
string
::
npos
&&
pos
==
(
str
.
length
()
-
substr
.
length
());
};
struct
stat
st
;
std
::
string
cuda_include_path
;
if
(
!
FLAGS_cuda_dir
.
empty
())
{
cuda_include_path
=
FLAGS_cuda_dir
;
if
(
EndWith
(
cuda_include_path
,
"/"
))
{
cuda_include_path
.
erase
(
cuda_include_path
.
end
()
-
1
);
}
for
(
std
::
string
suffix
:
{
"/lib"
,
"/lib64"
})
{
if
(
EndWith
(
FLAGS_cuda_dir
,
suffix
))
{
cuda_include_path
.
erase
(
cuda_include_path
.
end
()
-
suffix
.
length
());
break
;
}
}
if
(
!
EndWith
(
cuda_include_path
,
"include"
))
{
cuda_include_path
+=
"/include"
;
}
// Whether the cuda_include_path exists on the file system.
if
(
stat
(
cuda_include_path
.
c_str
(),
&
st
)
==
0
)
{
return
cuda_include_path
;
}
}
cuda_include_path
=
"/usr/local/cuda/include"
;
if
(
stat
(
cuda_include_path
.
c_str
(),
&
st
)
==
0
)
{
return
cuda_include_path
;
}
LOG
(
WARNING
)
<<
"Cannot find CUDA include path."
<<
"Please check whether CUDA is installed in the default "
"installation path, or specify it by export "
"FLAGS_cuda_dir=xxx."
;
return
""
;
}
CUDADeviceCode
::
CUDADeviceCode
(
const
Place
&
place
,
const
std
::
string
&
name
,
const
std
::
string
&
kernel
)
{
if
(
!
is_gpu_place
(
place
))
{
...
...
@@ -91,7 +134,7 @@ CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name,
kernel_
=
kernel
;
}
bool
CUDADeviceCode
::
Compile
()
{
bool
CUDADeviceCode
::
Compile
(
bool
include_path
)
{
is_compiled_
=
false
;
if
(
!
dynload
::
HasNVRTC
()
||
!
dynload
::
HasCUDADriver
())
{
LOG
(
WARNING
)
...
...
@@ -116,8 +159,14 @@ bool CUDADeviceCode::Compile() {
int
compute_capability
=
dev_ctx
->
GetComputeCapability
();
std
::
string
compute_flag
=
"--gpu-architecture=compute_"
+
std
::
to_string
(
compute_capability
);
const
std
::
vector
<
const
char
*>
options
=
{
"--std=c++11"
,
compute_flag
.
c_str
()};
std
::
vector
<
const
char
*>
options
=
{
"--std=c++11"
,
compute_flag
.
c_str
()};
if
(
include_path
)
{
std
::
string
cuda_include_path
=
FindCUDAIncludePath
();
if
(
!
cuda_include_path
.
empty
())
{
std
::
string
include_option
=
"--include-path="
+
cuda_include_path
;
options
.
push_back
(
include_option
.
c_str
());
}
}
nvrtcResult
compile_result
=
dynload
::
nvrtcCompileProgram
(
program
,
// program
options
.
size
(),
// numOptions
...
...
paddle/fluid/platform/device_code.h
浏览文件 @
22bbd547
...
...
@@ -31,7 +31,7 @@ namespace platform {
class
DeviceCode
{
public:
virtual
~
DeviceCode
()
{}
virtual
bool
Compile
()
=
0
;
virtual
bool
Compile
(
bool
include_path
=
false
)
=
0
;
virtual
void
Launch
(
const
size_t
n
,
std
::
vector
<
void
*>*
args
)
const
=
0
;
Place
GetPlace
()
const
{
return
place_
;
}
...
...
@@ -48,7 +48,7 @@ class CUDADeviceCode : public DeviceCode {
public:
explicit
CUDADeviceCode
(
const
Place
&
place
,
const
std
::
string
&
name
,
const
std
::
string
&
kernel
);
bool
Compile
()
override
;
bool
Compile
(
bool
include_path
=
false
)
override
;
void
Launch
(
const
size_t
n
,
std
::
vector
<
void
*>*
args
)
const
override
;
void
SetNumThreads
(
int
num_threads
)
{
num_threads_
=
num_threads
;
}
...
...
python/paddle/fluid/tests/unittests/ir/CMakeLists.txt
浏览文件 @
22bbd547
...
...
@@ -2,7 +2,7 @@ file(GLOB TEST_IR_PASSES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string
(
REPLACE
".py"
""
TEST_IR_PASSES
"
${
TEST_IR_PASSES
}
"
)
if
(
NOT WITH_GPU OR WIN32 OR APPLE
)
LIST
(
REMOVE_ITEM TEST_IR_PASSES test_ir_fusion_group
)
LIST
(
REMOVE_ITEM TEST_IR_PASSES test_ir_fusion_group
_pass
)
endif
()
foreach
(
target
${
TEST_IR_PASSES
}
)
...
...
python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group.py
→
python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group
_pass
.py
浏览文件 @
22bbd547
...
...
@@ -17,92 +17,125 @@ import unittest
import
numpy
as
np
from
pass_test
import
PassTest
import
paddle.fluid
as
fluid
import
paddle.fluid.layers
as
layers
import
paddle.fluid.core
as
core
class
FusionGroupPassTest
(
PassTest
):
def
setUp
(
self
):
def
build_program
(
self
,
dtype
):
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
data1
=
fluid
.
data
(
name
=
"data1"
,
shape
=
[
32
,
128
],
dtype
=
"float32"
)
data2
=
fluid
.
data
(
name
=
"data2"
,
shape
=
[
32
,
128
],
dtype
=
"float32"
)
data3
=
fluid
.
data
(
name
=
"data3"
,
shape
=
[
32
,
128
],
dtype
=
"float32"
)
tmp_1
=
fluid
.
layers
.
elementwise_add
(
data1
,
data2
)
tmp_2
=
fluid
.
layers
.
elementwise_mul
(
data3
,
tmp_1
)
self
.
feeds
=
{
"data1"
:
np
.
random
.
random
((
32
,
128
)).
astype
(
"float32"
),
"data2"
:
np
.
random
.
random
((
32
,
128
)).
astype
(
"float32"
),
"data3"
:
np
.
random
.
random
((
32
,
128
)).
astype
(
"float32"
)
}
self
.
fetch_list
=
[
tmp_1
,
tmp_2
]
self
.
feed_vars
=
self
.
_prepare_feed_vars
([
32
,
128
],
dtype
,
2
)
self
.
feed_vars
.
append
(
fluid
.
data
(
name
=
"data2"
,
shape
=
[
128
,
128
],
dtype
=
dtype
))
# subgraph with only 1 op node
tmp_0
=
self
.
feed_vars
[
0
]
*
self
.
feed_vars
[
1
]
tmp_1
=
layers
.
mul
(
tmp_0
,
self
.
feed_vars
[
2
])
# subgraph with 2 op nodes
tmp_2
=
layers
.
relu
(
tmp_0
+
tmp_1
)
self
.
fetch_list
=
[
tmp_2
]
self
.
num_fused_ops
=
1
def
setUp
(
self
):
self
.
build_program
(
"float32"
)
self
.
feeds
=
self
.
_feed_random_data
(
self
.
feed_vars
)
self
.
pass_names
=
"fusion_group_pass"
self
.
fused_op_type
=
"fusion_group"
self
.
num_fused_ops
=
1
def
_prepare_feed_vars
(
self
,
shape
,
dtype
,
num_data
):
feed_vars
=
[]
for
i
in
range
(
num_data
):
var
=
fluid
.
data
(
name
=
(
"data"
+
str
(
i
)),
shape
=
shape
,
dtype
=
dtype
)
feed_vars
.
append
(
var
)
return
feed_vars
def
_feed_random_data
(
self
,
feed_vars
):
feeds
=
{}
for
var
in
feed_vars
:
if
var
.
type
!=
fluid
.
core
.
VarDesc
.
VarType
.
LOD_TENSOR
:
raise
TypeError
(
"Feed data of non LoDTensor is not supported."
)
shape
=
var
.
shape
if
var
.
dtype
==
fluid
.
core
.
VarDesc
.
VarType
.
FP32
:
dtype
=
"float32"
elif
var
.
dtype
==
fluid
.
core
.
VarDesc
.
VarType
.
FP64
:
dtype
=
"float64"
elif
var
.
dtype
==
fluid
.
core
.
VarDesc
.
VarType
.
FP16
:
dtype
=
"float16"
else
:
raise
ValueError
(
"Unsupported dtype %s"
%
var
.
dtype
)
feeds
[
var
.
name
]
=
np
.
random
.
random
(
shape
).
astype
(
dtype
)
return
feeds
def
test_check_output
(
self
):
use_gpu_set
=
[]
if
core
.
is_compiled_with_cuda
():
use_gpu_set
.
append
(
True
)
for
use_gpu
in
use_gpu_set
:
self
.
pass_attrs
=
{
"fusion_group_pass"
:
{
"use_gpu"
:
use_gpu
}}
place
=
fluid
.
CUDAPlace
(
0
)
if
use_gpu
else
fluid
.
CPUPlace
()
self
.
check_output_with_place
(
place
,
startup_on_cpu
=
False
)
self
.
pass_attrs
=
{
"fusion_group_pass"
:
{
"use_gpu"
:
True
}}
self
.
check_output_with_place
(
fluid
.
CUDAPlace
(
0
))
class
FusionGroupPassTest1
(
FusionGroupPassTest
):
def
setUp
(
self
):
def
build_program
(
self
,
dtype
):
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
data
=
[]
for
i
in
range
(
5
):
data
.
append
(
fluid
.
data
(
name
=
(
"data"
+
str
(
i
)),
shape
=
[
32
,
128
],
dtype
=
"float32"
))
tmp_1
=
(
fluid
.
layers
.
assign
(
data
[
0
])
*
fluid
.
layers
.
sigmoid
(
data
[
1
])
)
+
(
fluid
.
layers
.
sigmoid
(
data
[
2
])
*
fluid
.
layers
.
tanh
(
data
[
3
]))
tmp_2
=
fluid
.
layers
.
tanh
(
tmp_1
)
+
fluid
.
layers
.
sigmoid
(
data
[
4
])
self
.
feeds
=
{}
for
i
in
range
(
5
):
self
.
feeds
[
"data"
+
str
(
i
)]
=
np
.
random
.
random
(
(
32
,
128
)).
astype
(
"float32"
)
self
.
feed_vars
=
self
.
_prepare_feed_vars
([
32
,
128
],
dtype
,
5
)
tmp_0
=
layers
.
assign
(
self
.
feed_vars
[
0
])
# subgraph with 9 op nodes
tmp_1
=
tmp_0
*
layers
.
sigmoid
(
self
.
feed_vars
[
1
])
+
layers
.
sigmoid
(
self
.
feed_vars
[
2
])
*
layers
.
tanh
(
self
.
feed_vars
[
3
])
tmp_2
=
layers
.
tanh
(
tmp_1
)
+
layers
.
sigmoid
(
self
.
feed_vars
[
4
])
self
.
fetch_list
=
[
tmp_1
,
tmp_2
]
self
.
pass_names
=
"fusion_group_pass"
self
.
fused_op_type
=
"fusion_group"
self
.
num_fused_ops
=
1
class
FusionGroupPassTest2
(
FusionGroupPassTest
):
def
setUp
(
self
):
def
build_program
(
self
,
dtype
):
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
data
=
[]
for
i
in
range
(
3
):
data
.
append
(
fluid
.
data
(
name
=
(
"data"
+
str
(
i
)),
shape
=
[
32
,
128
],
dtype
=
"float32"
))
data
.
append
(
self
.
feed_vars
=
self
.
_prepare_feed_vars
([
32
,
128
],
dtype
,
3
)
self
.
feed_vars
.
append
(
fluid
.
data
(
name
=
"data3"
,
shape
=
[
128
,
32
],
dtype
=
"float32"
))
tmp_1
=
fluid
.
layers
.
relu
((
data
[
0
]
-
data
[
1
])
*
data
[
2
])
tmp_2
=
fluid
.
layers
.
sigmoid
(
data
[
3
])
tmp_3
=
fluid
.
layers
.
relu
(
tmp_2
)
tmp_4
=
fluid
.
layers
.
mul
(
tmp_1
,
tmp_3
)
self
.
feeds
=
{}
for
i
in
range
(
3
):
self
.
feeds
[
"data"
+
str
(
i
)]
=
np
.
random
.
random
(
(
32
,
128
)).
astype
(
"float32"
)
self
.
feeds
[
"data3"
]
=
np
.
random
.
random
((
128
,
32
)).
astype
(
"float32"
)
self
.
fetch_list
=
[
tmp_1
,
tmp_2
,
tmp_3
,
tmp_4
]
name
=
"data3"
,
shape
=
[
128
,
32
],
dtype
=
dtype
))
# subgraph with 3 op nodes
tmp_1
=
layers
.
relu
(
(
self
.
feed_vars
[
0
]
-
self
.
feed_vars
[
1
])
*
self
.
feed_vars
[
2
])
# subgraph with 2 op nodes
tmp_2
=
layers
.
relu
(
layers
.
sigmoid
(
self
.
feed_vars
[
3
]))
tmp_3
=
layers
.
mul
(
tmp_1
,
tmp_2
)
self
.
fetch_list
=
[
tmp_1
,
tmp_2
,
tmp_3
]
self
.
num_fused_ops
=
2
class
FusionGroupPassTestFP64
(
FusionGroupPassTest
):
def
setUp
(
self
):
self
.
build_program
(
"float64"
)
self
.
feeds
=
self
.
_feed_random_data
(
self
.
feed_vars
)
self
.
pass_names
=
"fusion_group_pass"
self
.
fused_op_type
=
"fusion_group"
self
.
num_fused_ops
=
2
class
FusionGroupPassTestFP16
(
FusionGroupPassTest
):
def
build_program
(
self
,
dtype
):
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
self
.
feed_vars
=
self
.
_prepare_feed_vars
([
32
,
128
],
dtype
,
2
)
self
.
feed_vars
.
append
(
fluid
.
data
(
name
=
"data2"
,
shape
=
[
128
,
128
],
dtype
=
dtype
))
# subgraph with only 1 op node
tmp_0
=
self
.
feed_vars
[
0
]
*
self
.
feed_vars
[
1
]
tmp_1
=
layers
.
mul
(
tmp_0
,
self
.
feed_vars
[
2
])
tmp_2
=
layers
.
cast
(
tmp_0
,
dtype
=
"float16"
)
tmp_3
=
layers
.
cast
(
tmp_1
,
dtype
=
"float16"
)
# subgraph with 2 op nodes
tmp_4
=
layers
.
relu
(
tmp_2
+
tmp_3
)
tmp_5
=
layers
.
cast
(
tmp_4
,
dtype
=
dtype
)
self
.
fetch_list
=
[
tmp_5
]
self
.
num_fused_ops
=
1
if
__name__
==
"__main__"
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录