Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
22bbd547
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
22bbd547
编写于
2月 21, 2020
作者:
Y
Yiqun Liu
提交者:
GitHub
2月 21, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Add the support of fp16 in fusion_group (#22239)
上级
d97475d5
变更
16
显示空白变更内容
内联
并排
Showing
16 changed files
with
570 additions
and
194 deletions
+570
-194
paddle/fluid/framework/ir/fusion_group/code_generator.cc
paddle/fluid/framework/ir/fusion_group/code_generator.cc
+28
-7
paddle/fluid/framework/ir/fusion_group/code_generator.h
paddle/fluid/framework/ir/fusion_group/code_generator.h
+2
-2
paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
...e/fluid/framework/ir/fusion_group/code_generator_helper.h
+0
-25
paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
.../fluid/framework/ir/fusion_group/code_generator_tester.cc
+157
-124
paddle/fluid/framework/ir/fusion_group/cuda_resources.h
paddle/fluid/framework/ir/fusion_group/cuda_resources.h
+82
-0
paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
+19
-18
paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h
paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h
+1
-1
paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc
...uid/framework/ir/fusion_group/fusion_group_pass_tester.cc
+10
-0
paddle/fluid/framework/ir/fusion_group/operation.cc
paddle/fluid/framework/ir/fusion_group/operation.cc
+19
-4
paddle/fluid/framework/ir/fusion_group/subgraph.h
paddle/fluid/framework/ir/fusion_group/subgraph.h
+45
-1
paddle/fluid/framework/ir/pass_tester_helper.h
paddle/fluid/framework/ir/pass_tester_helper.h
+6
-3
paddle/fluid/operators/fused/fusion_group_op.cu.cc
paddle/fluid/operators/fused/fusion_group_op.cu.cc
+4
-3
paddle/fluid/platform/device_code.cc
paddle/fluid/platform/device_code.cc
+52
-3
paddle/fluid/platform/device_code.h
paddle/fluid/platform/device_code.h
+2
-2
python/paddle/fluid/tests/unittests/ir/CMakeLists.txt
python/paddle/fluid/tests/unittests/ir/CMakeLists.txt
+1
-1
python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py
...dle/fluid/tests/unittests/ir/test_ir_fusion_group_pass.py
+142
-0
未找到文件。
paddle/fluid/framework/ir/fusion_group/code_generator.cc
浏览文件 @
22bbd547
...
@@ -16,6 +16,7 @@ limitations under the License. */
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include <sstream>
#include <sstream>
#include <unordered_set>
#include <unordered_set>
#include "paddle/fluid/framework/ir/fusion_group/code_generator_helper.h"
#include "paddle/fluid/framework/ir/fusion_group/code_generator_helper.h"
#include "paddle/fluid/framework/ir/fusion_group/cuda_resources.h"
#include "paddle/fluid/framework/ir/fusion_group/operation.h"
#include "paddle/fluid/framework/ir/fusion_group/operation.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -27,13 +28,14 @@ CodeGenerator::CodeGenerator() {
...
@@ -27,13 +28,14 @@ CodeGenerator::CodeGenerator() {
// Only support elementwise operations now.
// Only support elementwise operations now.
code_templates_
.
resize
(
1
);
code_templates_
.
resize
(
1
);
CodeTemplate
elementwise_t
(
elementwise_cuda_template
);
CodeTemplate
elementwise_t
(
cuda_kernel_template_1d
);
code_templates_
[
0
]
=
elementwise_t
;
code_templates_
[
0
]
=
elementwise_t
;
}
}
std
::
string
CodeGenerator
::
Generate
(
SubGraph
*
subgraph
)
{
std
::
string
CodeGenerator
::
Generate
(
SubGraph
*
subgraph
)
{
std
::
vector
<
OperationExpression
>
expressions
=
ConvertToExpressions
(
subgraph
);
std
::
vector
<
OperationExpression
>
expressions
=
ConvertToExpressions
(
subgraph
);
return
Generate
(
subgraph
->
GetFuncName
(),
expressions
);
return
Generate
(
subgraph
->
GetFuncName
(),
subgraph
->
GetDataType
(),
expressions
);
}
}
static
bool
HasInput
(
Node
*
n
,
std
::
string
name
)
{
static
bool
HasInput
(
Node
*
n
,
std
::
string
name
)
{
...
@@ -100,9 +102,9 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
...
@@ -100,9 +102,9 @@ std::vector<OperationExpression> CodeGenerator::ConvertToExpressions(
// In order to get the right result of expression, we need to calculate and
// In order to get the right result of expression, we need to calculate and
// store the expression as suffix Expressions using vector.
// store the expression as suffix Expressions using vector.
std
::
string
CodeGenerator
::
Generate
(
std
::
string
CodeGenerator
::
Generate
(
std
::
string
func_name
,
std
::
vector
<
OperationExpression
>
expressions
)
{
std
::
string
func_name
,
std
::
string
dtype
,
const
std
::
vector
<
OperationExpression
>&
expressions
)
{
// TODO(liuyiqun): Check whether all expressions are elementwise operations.
// TODO(liuyiqun): Check whether all expressions are elementwise operations.
std
::
string
dtype
=
"float"
;
std
::
set
<
int
>
input_ids
=
DistilInputIds
(
expressions
);
std
::
set
<
int
>
input_ids
=
DistilInputIds
(
expressions
);
std
::
set
<
int
>
output_ids
=
DistilOutputIds
(
expressions
);
std
::
set
<
int
>
output_ids
=
DistilOutputIds
(
expressions
);
...
@@ -111,6 +113,15 @@ std::string CodeGenerator::Generate(
...
@@ -111,6 +113,15 @@ std::string CodeGenerator::Generate(
template_var
.
Add
(
"parameters"
,
EmitParameters
(
input_ids
,
output_ids
,
dtype
));
template_var
.
Add
(
"parameters"
,
EmitParameters
(
input_ids
,
output_ids
,
dtype
));
template_var
.
Add
(
"compute_body"
,
template_var
.
Add
(
"compute_body"
,
EmitComputeBody
(
expressions
,
input_ids
,
output_ids
,
dtype
));
EmitComputeBody
(
expressions
,
input_ids
,
output_ids
,
dtype
));
std
::
string
predefined_cuda_functions
;
if
(
dtype
==
"float"
)
{
predefined_cuda_functions
=
predefined_cuda_functions_fp32
;
}
else
if
(
dtype
==
"double"
)
{
predefined_cuda_functions
=
predefined_cuda_functions_fp64
;
}
else
if
(
dtype
==
"float16"
)
{
predefined_cuda_functions
=
predefined_cuda_functions_fp16
;
}
return
predefined_cuda_functions
+
code_templates_
[
0
].
Format
(
template_var
);
return
predefined_cuda_functions
+
code_templates_
[
0
].
Format
(
template_var
);
}
}
...
@@ -173,9 +184,10 @@ std::string CodeGenerator::EmitComputeBody(
...
@@ -173,9 +184,10 @@ std::string CodeGenerator::EmitComputeBody(
std
::
string
dtype
)
{
std
::
string
dtype
)
{
std
::
ostringstream
compute
;
std
::
ostringstream
compute
;
std
::
unordered_set
<
int
>
used
;
std
::
unordered_set
<
int
>
used
;
std
::
string
compute_dtype
=
(
dtype
==
"float16"
)
?
"float"
:
dtype
;
for
(
size_t
i
=
0
;
i
<
expressions
.
size
();
i
++
)
{
for
(
size_t
i
=
0
;
i
<
expressions
.
size
();
i
++
)
{
VLOG
(
3
)
<<
DebugString
(
expressions
[
i
]);
VLOG
(
3
)
<<
DebugString
(
expressions
[
i
]);
compute
<<
expressions
[
i
].
GetExpression
(
dtype
,
&
used
);
compute
<<
expressions
[
i
].
GetExpression
(
compute_
dtype
,
&
used
);
}
}
// Load input to temporary variables.
// Load input to temporary variables.
...
@@ -183,15 +195,24 @@ std::string CodeGenerator::EmitComputeBody(
...
@@ -183,15 +195,24 @@ std::string CodeGenerator::EmitComputeBody(
for
(
auto
id
:
input_ids
)
{
for
(
auto
id
:
input_ids
)
{
if
(
output_ids
.
find
(
id
)
==
output_ids
.
end
()
&&
if
(
output_ids
.
find
(
id
)
==
output_ids
.
end
()
&&
used
.
find
(
id
)
!=
used
.
end
())
{
used
.
find
(
id
)
!=
used
.
end
())
{
if
(
dtype
==
"float16"
)
{
load
<<
"float "
<<
TmpName
(
id
)
<<
" = __half2float("
<<
ArgName
(
id
)
<<
"[idx]);"
;
}
else
{
load
<<
dtype
<<
" "
<<
TmpName
(
id
)
<<
" = "
<<
ArgName
(
id
)
<<
"[idx];"
;
load
<<
dtype
<<
" "
<<
TmpName
(
id
)
<<
" = "
<<
ArgName
(
id
)
<<
"[idx];"
;
}
}
}
}
}
// Store temporary variables to memory.
// Store temporary variables to memory.
std
::
ostringstream
store
;
std
::
ostringstream
store
;
for
(
auto
id
:
output_ids
)
{
for
(
auto
id
:
output_ids
)
{
if
(
dtype
==
"float16"
)
{
store
<<
ArgName
(
id
)
<<
"[idx] = __float2half("
<<
TmpName
(
id
)
<<
");"
;
}
else
{
store
<<
ArgName
(
id
)
<<
"[idx] = "
<<
TmpName
(
id
)
<<
";"
;
store
<<
ArgName
(
id
)
<<
"[idx] = "
<<
TmpName
(
id
)
<<
";"
;
}
}
}
return
load
.
str
()
+
compute
.
str
()
+
store
.
str
();
return
load
.
str
()
+
compute
.
str
()
+
store
.
str
();
}
}
...
...
paddle/fluid/framework/ir/fusion_group/code_generator.h
浏览文件 @
22bbd547
...
@@ -30,8 +30,8 @@ class CodeGenerator {
...
@@ -30,8 +30,8 @@ class CodeGenerator {
public:
public:
CodeGenerator
();
CodeGenerator
();
std
::
string
Generate
(
std
::
string
func_name
,
std
::
string
Generate
(
std
::
string
func_name
,
std
::
string
dtype
,
std
::
vector
<
OperationExpression
>
expressions
);
const
std
::
vector
<
OperationExpression
>&
expressions
);
std
::
string
Generate
(
SubGraph
*
subgraph
);
std
::
string
Generate
(
SubGraph
*
subgraph
);
...
...
paddle/fluid/framework/ir/fusion_group/code_generator_helper.h
浏览文件 @
22bbd547
...
@@ -149,31 +149,6 @@ class CodeTemplate {
...
@@ -149,31 +149,6 @@ class CodeTemplate {
std
::
string
template_str_
;
std
::
string
template_str_
;
};
};
static
const
char
predefined_cuda_functions
[]
=
R"(
__device__ float real_exp(float x) { return ::expf(x); }
__device__ double real_exp(double x) { return ::exp(x); }
__device__ float real_log(float x) { return ::logf(x); }
__device__ double real_log(double x) { return ::log(x); }
__device__ float real_min(float x, float y) { return ::fminf(x, y); }
__device__ double real_min(double x, double y) { return ::fmin(x, y); }
__device__ float real_max(float x, float y) { return ::fmaxf(x, y); }
__device__ double real_max(double x, double y) { return ::fmax(x, y); }
)"
;
static
const
char
elementwise_cuda_template
[]
=
R"(
extern "C" __global__ void $func_name($parameters) {
for(int idx = blockIdx.x * blockDim.x + threadIdx.x;
idx < N;
idx += gridDim.x * blockDim.x) {
$compute_body
}
}
)"
;
static
std
::
string
DebugString
(
const
OperationExpression
&
expr
)
{
static
std
::
string
DebugString
(
const
OperationExpression
&
expr
)
{
std
::
stringstream
ret
;
std
::
stringstream
ret
;
ret
<<
"Op("
<<
expr
.
GetOpType
()
<<
"), inputs:{"
;
ret
<<
"Op("
<<
expr
.
GetOpType
()
<<
"), inputs:{"
;
...
...
paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc
浏览文件 @
22bbd547
...
@@ -22,6 +22,7 @@ limitations under the License. */
...
@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/operators/math.h"
#include "paddle/fluid/platform/device_code.h"
#include "paddle/fluid/platform/device_code.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/init.h"
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
...
@@ -88,7 +89,8 @@ inline float elementwise_mul_grad_dy(float x, float y, float out, float dout) {
...
@@ -88,7 +89,8 @@ inline float elementwise_mul_grad_dy(float x, float y, float out, float dout) {
void
CheckOutput
(
const
std
::
vector
<
OperationExpression
>&
expressions
,
void
CheckOutput
(
const
std
::
vector
<
OperationExpression
>&
expressions
,
const
std
::
vector
<
LoDTensor
>
cpu_tensors
,
const
std
::
vector
<
LoDTensor
>
cpu_tensors
,
const
std
::
vector
<
int
>
input_ids_of_subgraph
,
const
std
::
vector
<
int
>
input_ids_of_subgraph
,
const
std
::
vector
<
int
>
output_ids_of_subgraph
,
int
i
)
{
const
std
::
vector
<
int
>
output_ids_of_subgraph
,
int
i
,
float
eps
)
{
std
::
vector
<
float
>
var
(
cpu_tensors
.
size
());
std
::
vector
<
float
>
var
(
cpu_tensors
.
size
());
for
(
auto
id
:
input_ids_of_subgraph
)
{
for
(
auto
id
:
input_ids_of_subgraph
)
{
if
(
id
>=
0
)
{
if
(
id
>=
0
)
{
...
@@ -138,7 +140,12 @@ void CheckOutput(const std::vector<OperationExpression>& expressions,
...
@@ -138,7 +140,12 @@ void CheckOutput(const std::vector<OperationExpression>& expressions,
for
(
auto
id
:
output_ids_of_subgraph
)
{
for
(
auto
id
:
output_ids_of_subgraph
)
{
float
actual
=
cpu_tensors
[
id
].
data
<
float
>
()[
i
];
float
actual
=
cpu_tensors
[
id
].
data
<
float
>
()[
i
];
float
expect
=
var
[
id
];
float
expect
=
var
[
id
];
EXPECT_LT
(
fabs
(
actual
-
expect
),
1.E-05
);
if
(
fabs
(
actual
-
expect
)
>
eps
)
{
LOG
(
INFO
)
<<
"Precision check failed from i = "
<<
id
<<
", expect: "
<<
expect
<<
", actual: "
<<
actual
;
EXPECT_LT
(
fabs
(
actual
-
expect
),
eps
);
break
;
}
}
}
}
}
...
@@ -162,33 +169,49 @@ void SetupRandomCPUTensor(LoDTensor* tensor) {
...
@@ -162,33 +169,49 @@ void SetupRandomCPUTensor(LoDTensor* tensor) {
namespace
fusion_group
=
paddle
::
framework
::
ir
::
fusion_group
;
namespace
fusion_group
=
paddle
::
framework
::
ir
::
fusion_group
;
template
<
typename
T
>
void
TestMainImpl
(
std
::
string
func_name
,
std
::
string
code_str
,
void
TestMainImpl
(
std
::
string
func_name
,
std
::
string
code_str
,
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
cpu_tensors
,
int
n
,
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
cpu_tensors
,
int
n
,
std
::
vector
<
int
>
input_ids
,
std
::
vector
<
int
>
output_ids
)
{
std
::
vector
<
int
>
input_ids
,
std
::
vector
<
int
>
output_ids
)
{
bool
is_float16
=
std
::
type_index
(
typeid
(
T
))
==
std
::
type_index
(
typeid
(
paddle
::
platform
::
float16
));
paddle
::
framework
::
InitDevices
(
false
,
{
0
});
paddle
::
framework
::
InitDevices
(
false
,
{
0
});
paddle
::
platform
::
CUDAPlace
place
=
paddle
::
platform
::
CUDAPlace
(
0
);
paddle
::
platform
::
CUDAPlace
place
=
paddle
::
platform
::
CUDAPlace
(
0
);
paddle
::
platform
::
CUDADeviceCode
device_code
(
place
,
func_name
,
code_str
);
paddle
::
platform
::
CUDADeviceCode
device_code
(
place
,
func_name
,
code_str
);
device_code
.
Compile
();
device_code
.
Compile
(
is_float16
);
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
gpu_tensors
(
cpu_tensors
.
size
());
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
gpu_tensors
(
cpu_tensors
.
size
());
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
tmp_cpu_tensors
(
cpu_tensors
.
size
());
std
::
vector
<
float
*>
gpu_ptrs
(
gpu_tensors
.
size
());
std
::
vector
<
T
*>
gpu_ptrs
(
gpu_tensors
.
size
());
std
::
vector
<
void
*>
args
;
std
::
vector
<
void
*>
args
;
args
.
push_back
(
&
n
);
args
.
push_back
(
&
n
);
for
(
auto
id
:
input_ids
)
{
for
(
auto
id
:
input_ids
)
{
if
(
id
>=
0
)
{
if
(
id
>=
0
)
{
gpu_ptrs
[
id
]
=
gpu_ptrs
[
id
]
=
gpu_tensors
[
id
].
mutable_data
<
float
>
(
cpu_tensors
[
id
].
dims
(),
place
);
gpu_tensors
[
id
].
mutable_data
<
T
>
(
cpu_tensors
[
id
].
dims
(),
place
);
fusion_group
::
SetupRandomCPUTensor
<
float
>
(
&
cpu_tensors
[
id
]);
fusion_group
::
SetupRandomCPUTensor
<
float
>
(
&
cpu_tensors
[
id
]);
if
(
is_float16
)
{
paddle
::
platform
::
float16
*
tmp_cpu_ptr
=
tmp_cpu_tensors
[
id
].
mutable_data
<
paddle
::
platform
::
float16
>
(
cpu_tensors
[
id
].
dims
(),
paddle
::
platform
::
CPUPlace
());
const
float
*
cpu_ptr
=
cpu_tensors
[
id
].
data
<
float
>
();
for
(
int64_t
i
=
0
;
i
<
cpu_tensors
[
id
].
numel
();
++
i
)
{
tmp_cpu_ptr
[
i
]
=
paddle
::
platform
::
float16
(
cpu_ptr
[
i
]);
}
TensorCopySync
(
tmp_cpu_tensors
[
id
],
place
,
&
gpu_tensors
[
id
]);
}
else
{
TensorCopySync
(
cpu_tensors
[
id
],
place
,
&
gpu_tensors
[
id
]);
TensorCopySync
(
cpu_tensors
[
id
],
place
,
&
gpu_tensors
[
id
]);
}
args
.
push_back
(
&
gpu_ptrs
[
id
]);
args
.
push_back
(
&
gpu_ptrs
[
id
]);
}
}
}
}
for
(
auto
id
:
output_ids
)
{
for
(
auto
id
:
output_ids
)
{
gpu_ptrs
[
id
]
=
gpu_ptrs
[
id
]
=
gpu_tensors
[
id
].
mutable_data
<
float
>
(
cpu_tensors
[
id
].
dims
(),
place
);
gpu_tensors
[
id
].
mutable_data
<
T
>
(
cpu_tensors
[
id
].
dims
(),
place
);
args
.
push_back
(
&
gpu_ptrs
[
id
]);
args
.
push_back
(
&
gpu_ptrs
[
id
]);
}
}
...
@@ -200,38 +223,93 @@ void TestMainImpl(std::string func_name, std::string code_str,
...
@@ -200,38 +223,93 @@ void TestMainImpl(std::string func_name, std::string code_str,
paddle
::
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
paddle
::
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
));
dev_ctx
->
Wait
();
dev_ctx
->
Wait
();
// Copy the results back to CPU.
for
(
auto
id
:
output_ids
)
{
for
(
auto
id
:
output_ids
)
{
if
(
is_float16
)
{
paddle
::
platform
::
float16
*
tmp_cpu_ptr
=
tmp_cpu_tensors
[
id
].
mutable_data
<
paddle
::
platform
::
float16
>
(
cpu_tensors
[
id
].
dims
(),
paddle
::
platform
::
CPUPlace
());
TensorCopySync
(
gpu_tensors
[
id
],
paddle
::
platform
::
CPUPlace
(),
&
tmp_cpu_tensors
[
id
]);
float
*
cpu_ptr
=
cpu_tensors
[
id
].
mutable_data
<
float
>
(
cpu_tensors
[
id
].
dims
(),
paddle
::
platform
::
CPUPlace
());
for
(
int64_t
i
=
0
;
i
<
cpu_tensors
[
id
].
numel
();
++
i
)
{
cpu_ptr
[
i
]
=
static_cast
<
float
>
(
tmp_cpu_ptr
[
i
]);
}
}
else
{
TensorCopySync
(
gpu_tensors
[
id
],
paddle
::
platform
::
CPUPlace
(),
TensorCopySync
(
gpu_tensors
[
id
],
paddle
::
platform
::
CPUPlace
(),
&
cpu_tensors
[
id
]);
&
cpu_tensors
[
id
]);
}
}
}
}
void
TestElementwiseMain
(
std
::
string
func_name
,
std
::
string
code_str
,
std
::
vector
<
fusion_group
::
OperationExpression
>
expressions
,
std
::
vector
<
int
>
input_ids
,
std
::
vector
<
int
>
output_ids
,
std
::
string
dtype
)
{
std
::
unordered_set
<
int
>
ids
;
for
(
auto
id
:
input_ids
)
{
ids
.
insert
(
id
);
}
for
(
auto
id
:
output_ids
)
{
ids
.
insert
(
id
);
}
// Prepare CPU tensors which always hold float.
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
cpu_tensors
(
ids
.
size
());
auto
dims
=
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
256
),
static_cast
<
int64_t
>
(
1024
)});
for
(
size_t
i
=
0
;
i
<
cpu_tensors
.
size
();
++
i
)
{
cpu_tensors
[
i
].
mutable_data
<
float
>
(
dims
,
paddle
::
platform
::
CPUPlace
());
}
int
n
=
cpu_tensors
[
0
].
numel
();
if
(
dtype
==
"float16"
)
{
TestMainImpl
<
paddle
::
platform
::
float16
>
(
func_name
,
code_str
,
cpu_tensors
,
n
,
input_ids
,
output_ids
);
}
else
{
TestMainImpl
<
float
>
(
func_name
,
code_str
,
cpu_tensors
,
n
,
input_ids
,
output_ids
);
}
// Check the results
float
eps
=
(
dtype
==
"float16"
)
?
1E-2
:
1E-5
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
fusion_group
::
CheckOutput
(
expressions
,
cpu_tensors
,
input_ids
,
output_ids
,
i
,
eps
);
}
}
}
void
TestMain
(
std
::
string
func_name
,
void
TestMain
(
std
::
string
func_name
,
std
::
vector
<
fusion_group
::
OperationExpression
>
expressions
,
std
::
vector
<
fusion_group
::
OperationExpression
>
expressions
,
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
cpu_tensors
,
int
n
,
std
::
vector
<
int
>
input_ids
,
std
::
vector
<
int
>
output_ids
,
std
::
vector
<
int
>
input_ids
,
std
::
vector
<
int
>
output_ids
)
{
std
::
string
dtype
)
{
fusion_group
::
OperationMap
::
Init
();
fusion_group
::
OperationMap
::
Init
();
fusion_group
::
CodeGenerator
code_generator
;
fusion_group
::
CodeGenerator
code_generator
;
std
::
string
code_str
=
code_generator
.
Generate
(
func_name
,
expressions
);
std
::
string
code_str
=
code_generator
.
Generate
(
func_name
,
dtype
,
expressions
);
VLOG
(
3
)
<<
code_str
;
VLOG
(
3
)
<<
code_str
;
TestMainImpl
(
func_name
,
code_str
,
cpu_tensors
,
n
,
input_ids
,
output_ids
);
LOG
(
INFO
)
<<
"dtype: "
<<
dtype
;
TestElementwiseMain
(
func_name
,
code_str
,
expressions
,
input_ids
,
output_ids
,
dtype
);
}
}
std
::
vector
<
fusion_group
::
OperationExpression
>
TestMain
(
void
TestMain
(
fusion_group
::
SubGraph
*
subgraph
,
std
::
vector
<
int
>
input_ids
,
fusion_group
::
SubGraph
*
subgraph
,
std
::
vector
<
int
>
output_ids
)
{
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
cpu_tensors
,
int
n
,
std
::
vector
<
int
>
input_ids
,
std
::
vector
<
int
>
output_ids
)
{
fusion_group
::
OperationMap
::
Init
();
fusion_group
::
OperationMap
::
Init
();
fusion_group
::
CodeGenerator
code_generator
;
fusion_group
::
CodeGenerator
code_generator
;
std
::
string
code_str
=
code_generator
.
Generate
(
subgraph
);
std
::
string
code_str
=
code_generator
.
Generate
(
subgraph
);
VLOG
(
3
)
<<
code_str
;
VLOG
(
3
)
<<
code_str
;
TestMainImpl
(
subgraph
->
GetFuncName
(),
code_str
,
cpu_tensors
,
n
,
input_ids
,
output_ids
);
// Need to check the accuracy according to expressions.
// Need to check the accuracy according to expressions.
return
code_generator
.
ConvertToExpressions
(
subgraph
);
std
::
vector
<
fusion_group
::
OperationExpression
>
expressions
=
code_generator
.
ConvertToExpressions
(
subgraph
);
LOG
(
INFO
)
<<
"dtype: "
<<
subgraph
->
GetDataType
();
TestElementwiseMain
(
subgraph
->
GetFuncName
(),
code_str
,
expressions
,
input_ids
,
output_ids
,
subgraph
->
GetDataType
());
}
}
TEST
(
code_generator
,
elementwise
)
{
TEST
(
code_generator
,
elementwise
)
{
...
@@ -248,30 +326,16 @@ TEST(code_generator, elementwise) {
...
@@ -248,30 +326,16 @@ TEST(code_generator, elementwise) {
std
::
vector
<
fusion_group
::
OperationExpression
>
expressions
=
{
std
::
vector
<
fusion_group
::
OperationExpression
>
expressions
=
{
exp1
,
exp2
,
exp3
,
exp4
,
exp5
};
exp1
,
exp2
,
exp3
,
exp4
,
exp5
};
// Prepare CPU tensors
for
(
std
::
string
dtype
:
{
"float"
,
"float16"
})
{
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
cpu_tensors
(
9
);
auto
dims
=
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
256
),
static_cast
<
int64_t
>
(
1024
)});
for
(
size_t
i
=
0
;
i
<
cpu_tensors
.
size
();
++
i
)
{
cpu_tensors
[
i
].
mutable_data
<
float
>
(
dims
,
paddle
::
platform
::
CPUPlace
());
}
// Expressions:
// Expressions:
// Op(elementwise_mul), inputs:{0,1}, outputs:{2}
// Op(elementwise_mul), inputs:{0,1}, outputs:{2}
// Op(elementwise_add), inputs:{2,3}, outputs:{4}
// Op(elementwise_add), inputs:{2,3}, outputs:{4}
// Op(elementwise_sub), inputs:{4,5}, outputs:{6}
// Op(elementwise_sub), inputs:{4,5}, outputs:{6}
// Op(relu), inputs:{6}, outputs:{7}
// Op(relu), inputs:{6}, outputs:{7}
// Op(sigmoid), inputs:{7}, outputs:{8}
// Op(sigmoid), inputs:{7}, outputs:{8}
int
n
=
cpu_tensors
[
0
].
numel
();
std
::
vector
<
int
>
input_ids
=
{
0
,
1
,
3
,
5
};
std
::
vector
<
int
>
input_ids
=
{
0
,
1
,
3
,
5
};
std
::
vector
<
int
>
output_ids
=
{
2
,
4
,
6
,
7
,
8
};
std
::
vector
<
int
>
output_ids
=
{
2
,
4
,
6
,
7
,
8
};
TestMain
(
"elementwise_kernel_0"
,
expressions
,
cpu_tensors
,
n
,
input_ids
,
TestMain
(
"elementwise_kernel_0"
,
expressions
,
input_ids
,
output_ids
,
dtype
);
output_ids
);
// Check the results
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
fusion_group
::
CheckOutput
(
expressions
,
cpu_tensors
,
input_ids
,
output_ids
,
i
);
}
}
}
}
...
@@ -286,32 +350,19 @@ TEST(code_generator, elementwise_grad) {
...
@@ -286,32 +350,19 @@ TEST(code_generator, elementwise_grad) {
{
4
,
5
});
{
4
,
5
});
std
::
vector
<
fusion_group
::
OperationExpression
>
expressions
=
{
exp1
,
exp2
};
std
::
vector
<
fusion_group
::
OperationExpression
>
expressions
=
{
exp1
,
exp2
};
// Prepare CPU tensors
for
(
std
::
string
dtype
:
{
"float"
,
"float16"
})
{
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
cpu_tensors
(
8
);
auto
dims
=
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
256
),
static_cast
<
int64_t
>
(
1024
)});
for
(
size_t
i
=
0
;
i
<
cpu_tensors
.
size
();
++
i
)
{
cpu_tensors
[
i
].
mutable_data
<
float
>
(
dims
,
paddle
::
platform
::
CPUPlace
());
}
// Expressions:
// Expressions:
// Op(relu_grad), inputs:{2,3,7}, outputs:{6}
// Op(relu_grad), inputs:{2,3,7}, outputs:{6}
// Op(elementwise_mul_grad), inputs:{0,1,2,6}, outputs:{4,5}
// Op(elementwise_mul_grad), inputs:{0,1,2,6}, outputs:{4,5}
int
n
=
cpu_tensors
[
0
].
numel
();
std
::
vector
<
int
>
input_ids
=
{
0
,
1
,
2
,
3
,
7
};
std
::
vector
<
int
>
input_ids
=
{
0
,
1
,
2
,
3
,
7
};
std
::
vector
<
int
>
output_ids
=
{
4
,
5
,
6
};
std
::
vector
<
int
>
output_ids
=
{
4
,
5
,
6
};
TestMain
(
"elementwise_grad_kernel_0"
,
expressions
,
cpu_tensors
,
n
,
input_ids
,
TestMain
(
"elementwise_grad_kernel_0"
,
expressions
,
input_ids
,
output_ids
,
output_ids
);
dtype
);
// Check the results
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
fusion_group
::
CheckOutput
(
expressions
,
cpu_tensors
,
input_ids
,
output_ids
,
i
);
}
}
}
}
std
::
unique_ptr
<
paddle
::
framework
::
ir
::
Graph
>
BuildGraph
(
std
::
unique_ptr
<
paddle
::
framework
::
ir
::
Graph
>
BuildGraph
(
bool
backward
,
bool
backward
=
fals
e
)
{
std
::
string
dtyp
e
)
{
// inputs operator output
// inputs operator output
// --------------------------------------------------------
// --------------------------------------------------------
// x0 sigmoid -> tmp_0
// x0 sigmoid -> tmp_0
...
@@ -353,6 +404,14 @@ std::unique_ptr<paddle::framework::ir::Graph> BuildGraph(
...
@@ -353,6 +404,14 @@ std::unique_ptr<paddle::framework::ir::Graph> BuildGraph(
std
::
unique_ptr
<
paddle
::
framework
::
ir
::
Graph
>
graph
(
std
::
unique_ptr
<
paddle
::
framework
::
ir
::
Graph
>
graph
(
new
paddle
::
framework
::
ir
::
Graph
(
layers
.
main_program
()));
new
paddle
::
framework
::
ir
::
Graph
(
layers
.
main_program
()));
auto
proto_dtype
=
(
dtype
==
"float16"
)
?
paddle
::
framework
::
proto
::
VarType
::
FP16
:
paddle
::
framework
::
proto
::
VarType
::
FP32
;
for
(
auto
*
n
:
graph
->
Nodes
())
{
if
(
n
&&
n
->
IsVar
()
&&
n
->
Var
())
{
n
->
Var
()
->
SetDataType
(
proto_dtype
);
}
}
#ifdef __clang__
#ifdef __clang__
return
graph
;
return
graph
;
#else
#else
...
@@ -401,66 +460,40 @@ std::unordered_set<paddle::framework::ir::Node*> DistilGradNodes(
...
@@ -401,66 +460,40 @@ std::unordered_set<paddle::framework::ir::Node*> DistilGradNodes(
}
}
TEST
(
code_generator
,
subgraph
)
{
TEST
(
code_generator
,
subgraph
)
{
std
::
unique_ptr
<
paddle
::
framework
::
ir
::
Graph
>
graph
=
BuildGraph
(
false
);
for
(
std
::
string
dtype
:
{
"float"
,
"float16"
})
{
std
::
unique_ptr
<
paddle
::
framework
::
ir
::
Graph
>
graph
=
BuildGraph
(
false
,
dtype
);
fusion_group
::
SubGraph
subgraph
(
0
,
"elementwise_kernel_1"
,
true
,
fusion_group
::
SubGraph
subgraph
(
0
,
"elementwise_kernel_1"
,
true
,
graph
->
Nodes
());
graph
->
Nodes
());
// Prepare CPU tensors
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
cpu_tensors
(
9
);
auto
dims
=
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
256
),
static_cast
<
int64_t
>
(
1024
)});
for
(
size_t
i
=
0
;
i
<
cpu_tensors
.
size
();
++
i
)
{
cpu_tensors
[
i
].
mutable_data
<
float
>
(
dims
,
paddle
::
platform
::
CPUPlace
());
}
// Expressions generated by code_generator (they may be different):
// Expressions generated by code_generator (they may be different):
// Op(sigmoid), inputs:{0}, outputs:{4}
// Op(sigmoid), inputs:{0}, outputs:{4}
// Op(elementwise_mul), inputs:{4,1}, outputs:{7}
// Op(elementwise_mul), inputs:{4,1}, outputs:{7}
// Op(tanh), inputs:{2}, outputs:{5}
// Op(tanh), inputs:{2}, outputs:{5}
// Op(elementwise_mul), inputs:{3,5}, outputs:{6}
// Op(elementwise_mul), inputs:{3,5}, outputs:{6}
// Op(elementwise_add), inputs:{7,6}, outputs:{8}
// Op(elementwise_add), inputs:{7,6}, outputs:{8}
int
n
=
cpu_tensors
[
0
].
numel
();
std
::
vector
<
int
>
input_ids
=
{
0
,
1
,
2
,
3
};
std
::
vector
<
int
>
input_ids
=
{
0
,
1
,
2
,
3
};
std
::
vector
<
int
>
output_ids
=
{
4
,
5
,
6
,
7
,
8
};
std
::
vector
<
int
>
output_ids
=
{
4
,
5
,
6
,
7
,
8
};
std
::
vector
<
fusion_group
::
OperationExpression
>
expressions
=
TestMain
(
&
subgraph
,
input_ids
,
output_ids
);
TestMain
(
&
subgraph
,
cpu_tensors
,
n
,
input_ids
,
output_ids
);
// Check the results
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
fusion_group
::
CheckOutput
(
expressions
,
cpu_tensors
,
input_ids
,
output_ids
,
i
);
}
}
}
}
TEST
(
code_generator
,
subgraph_grad
)
{
TEST
(
code_generator
,
subgraph_grad
)
{
std
::
unique_ptr
<
paddle
::
framework
::
ir
::
Graph
>
graph
=
BuildGraph
(
true
);
for
(
std
::
string
dtype
:
{
"float"
,
"float16"
})
{
std
::
unique_ptr
<
paddle
::
framework
::
ir
::
Graph
>
graph
=
BuildGraph
(
true
,
dtype
);
fusion_group
::
SubGraph
subgraph
(
0
,
"elementwise_grad_kernel_1"
,
true
,
fusion_group
::
SubGraph
subgraph
(
0
,
"elementwise_grad_kernel_1"
,
true
,
DistilGradNodes
(
graph
));
DistilGradNodes
(
graph
));
// Prepare CPU tensors
std
::
vector
<
paddle
::
framework
::
LoDTensor
>
cpu_tensors
(
18
);
auto
dims
=
paddle
::
framework
::
make_ddim
(
{
static_cast
<
int64_t
>
(
256
),
static_cast
<
int64_t
>
(
1024
)});
for
(
size_t
i
=
0
;
i
<
cpu_tensors
.
size
();
++
i
)
{
cpu_tensors
[
i
].
mutable_data
<
float
>
(
dims
,
paddle
::
platform
::
CPUPlace
());
}
// Expressions generated by code_generator (they may be different):
// Expressions generated by code_generator (they may be different):
// Op(elementwise_add_grad), inputs:{1,2,3,0}, outputs:{11,10}
// Op(elementwise_add_grad), inputs:{1,2,3,0}, outputs:{11,10}
// Op(elementwise_mul_grad), inputs:{5,4,2,10}, outputs:{17,13}
// Op(elementwise_mul_grad), inputs:{5,4,2,10}, outputs:{17,13}
// Op(elementwise_mul_grad), inputs:{7,6,1,11}, outputs:{12,15}
// Op(elementwise_mul_grad), inputs:{7,6,1,11}, outputs:{12,15}
// Op(sigmoid_grad), inputs:{8,7,12}, outputs:{16}
// Op(sigmoid_grad), inputs:{8,7,12}, outputs:{16}
// Op(tanh_grad), inputs:{9,4,13}, outputs:{14}
// Op(tanh_grad), inputs:{9,4,13}, outputs:{14}
int
n
=
cpu_tensors
[
0
].
numel
();
std
::
vector
<
int
>
input_ids
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
};
std
::
vector
<
int
>
input_ids
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
};
std
::
vector
<
int
>
output_ids
=
{
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
};
std
::
vector
<
int
>
output_ids
=
{
10
,
11
,
12
,
13
,
14
,
15
,
16
,
17
};
std
::
vector
<
fusion_group
::
OperationExpression
>
expressions
=
TestMain
(
&
subgraph
,
input_ids
,
output_ids
);
TestMain
(
&
subgraph
,
cpu_tensors
,
n
,
input_ids
,
output_ids
);
// Check the results
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
fusion_group
::
CheckOutput
(
expressions
,
cpu_tensors
,
input_ids
,
output_ids
,
i
);
}
}
}
}
#endif
#endif
paddle/fluid/framework/ir/fusion_group/cuda_resources.h
0 → 100644
浏览文件 @
22bbd547
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace
paddle
{
namespace
framework
{
namespace
ir
{
namespace
fusion_group
{
// CUDA source text with the single-precision device helpers used by generated
// kernels: real_exp and real_log forward to ::expf and ::logf.
// CodeGenerator::Generate prepends this text to the formatted kernel template
// when dtype == "float".
// The raw string is emitted verbatim as device code, so any edit inside it
// changes the generated kernel.
static
constexpr
char
predefined_cuda_functions_fp32
[]
=
R"(
__device__ inline float real_exp(float x) { return ::expf(x); }
__device__ inline float real_log(float x) { return ::logf(x); }
)"
;
// CUDA source text with the double-precision device helpers used by generated
// kernels: real_exp and real_log forward to ::exp and ::log.
// CodeGenerator::Generate prepends this text to the formatted kernel template
// when dtype == "double".
// The raw string is emitted verbatim as device code, so any edit inside it
// changes the generated kernel.
static
constexpr
char
predefined_cuda_functions_fp64
[]
=
R"(
__device__ inline double real_exp(double x) { return ::exp(x); }
__device__ inline double real_log(double x) { return ::log(x); }
)"
;
// CUDA source text prepended to generated kernels when dtype == "float16".
//
// Contents of the emitted device code:
//   - real_exp / real_log take and return float. The code generator computes
//     float16 kernels in float (it passes "float" as the compute dtype and
//     converts on load/store), so only the float overloads are provided.
//   - A minimal 2-byte-aligned `__half` struct holding one unsigned short.
//   - __float2half / __half2float, implemented with inline PTX `cvt`
//     instructions. The __HALF_TO_US / __HALF_TO_CUS macros reinterpret the
//     struct's storage as (const) unsigned short for the "h" asm operands and
//     are #undef'd right after use.
//   - `typedef __half float16;`, making `float16` a usable type name in the
//     generated code.
//
// NOTE(review): the hand-written __half definition presumably exists so the
// runtime-compiled kernel does not have to include the CUDA fp16 header --
// confirm against the compile path in device_code.cc.
// The `\n` sequences inside the asm strings are literal backslash-n in this raw
// string and only become escapes when the emitted CUDA source is compiled.
static
constexpr
char
predefined_cuda_functions_fp16
[]
=
R"(
__device__ inline float real_exp(float x) { return ::expf(x); }
__device__ inline float real_log(float x) { return ::logf(x); }
#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
struct __align__(2) __half {
__device__ __half() { }
protected:
unsigned short __x;
};
__device__ __half __float2half(const float f) {
__half val;
asm("{ cvt.rn.f16.f32 %0, %1; }\n" : "=h"(__HALF_TO_US(val)
) : "f"(f));
return val;
}
__device__ float __half2float(const __half h) {
float val;
asm("{ cvt.f32.f16 %0, %1; }\n" : "=f"(val) : "h"(__HALF_TO_CUS(h)));
return val;
}
#undef __HALF_TO_US
#undef __HALF_TO_CUS
typedef __half float16;
)"
;
static
constexpr
char
cuda_kernel_template_1d
[]
=
R"(
extern "C" __global__ void $func_name($parameters) {
for(int idx = blockIdx.x * blockDim.x + threadIdx.x;
idx < N;
idx += gridDim.x * blockDim.x) {
$compute_body
}
}
)"
;
}
// namespace fusion_group
}
// namespace ir
}
// namespace framework
}
// namespace paddle
paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
浏览文件 @
22bbd547
...
@@ -32,8 +32,7 @@ void FusionGroupPass::ApplyImpl(ir::Graph* graph) const {
...
@@ -32,8 +32,7 @@ void FusionGroupPass::ApplyImpl(ir::Graph* graph) const {
if
(
Get
<
bool
>
(
"use_gpu"
))
{
if
(
Get
<
bool
>
(
"use_gpu"
))
{
fusion_group
::
OperationMap
::
Init
();
fusion_group
::
OperationMap
::
Init
();
int
num_elementwise_groups
=
DetectFusionGroup
(
graph
,
0
);
int
num_elementwise_groups
=
DetectFusionGroup
(
graph
,
0
);
VLOG
(
3
)
<<
"Detect "
<<
num_elementwise_groups
AddStatis
(
num_elementwise_groups
);
<<
" elementwise fusion groups."
;
}
}
}
}
...
@@ -49,23 +48,23 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const {
...
@@ -49,23 +48,23 @@ int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const {
size_t
min_subgraph_size
=
2
;
size_t
min_subgraph_size
=
2
;
bool
save_intermediate_out
=
true
;
bool
save_intermediate_out
=
true
;
for
(
auto
&
vec
:
subgraphs
)
{
for
(
auto
&
vec
:
subgraphs
)
{
if
(
vec
.
size
()
>=
min_subgraph_size
)
{
std
::
string
func_name
=
"fused_elementwise_"
+
std
::
to_string
(
index
++
);
fusion_group
::
SubGraph
subgraph
(
fusion_group
::
SubGraph
subgraph
(
type
,
func_name
,
save_intermediate_out
,
type
,
""
,
save_intermediate_out
,
std
::
unordered_set
<
Node
*>
(
vec
.
begin
(),
vec
.
end
()));
std
::
unordered_set
<
Node
*>
(
vec
.
begin
(),
vec
.
end
()));
VLOG
(
3
)
<<
"subgraph: {
\n
"
VLOG
(
3
)
<<
"subgraph: {
\n
"
<<
DebugString
(
subgraph
.
SortedNodes
())
<<
"}
\n
"
;
<<
DebugString
(
subgraph
.
SortedNodes
())
<<
"}
\n
"
;
GenerateCode
(
&
subgraph
);
if
(
subgraph
.
IsValid
(
min_subgraph_size
))
{
subgraph
.
SetFuncName
(
"fused_elementwise_"
+
std
::
to_string
(
index
++
));
if
(
GenerateCode
(
&
subgraph
))
{
InsertFusionGroupOp
(
graph
,
&
subgraph
);
InsertFusionGroupOp
(
graph
,
&
subgraph
);
num_subgraphs
++
;
num_subgraphs
++
;
}
}
}
}
}
return
num_subgraphs
;
return
num_subgraphs
;
}
}
void
FusionGroupPass
::
GenerateCode
(
fusion_group
::
SubGraph
*
subgraph
)
const
{
bool
FusionGroupPass
::
GenerateCode
(
fusion_group
::
SubGraph
*
subgraph
)
const
{
fusion_group
::
CodeGenerator
code_generator
;
fusion_group
::
CodeGenerator
code_generator
;
std
::
string
code_str
=
code_generator
.
Generate
(
subgraph
);
std
::
string
code_str
=
code_generator
.
Generate
(
subgraph
);
VLOG
(
3
)
<<
code_str
;
VLOG
(
3
)
<<
code_str
;
...
@@ -74,10 +73,12 @@ void FusionGroupPass::GenerateCode(fusion_group::SubGraph* subgraph) const {
...
@@ -74,10 +73,12 @@ void FusionGroupPass::GenerateCode(fusion_group::SubGraph* subgraph) const {
platform
::
CUDAPlace
place
=
platform
::
CUDAPlace
(
0
);
platform
::
CUDAPlace
place
=
platform
::
CUDAPlace
(
0
);
std
::
unique_ptr
<
platform
::
CUDADeviceCode
>
device_code
(
std
::
unique_ptr
<
platform
::
CUDADeviceCode
>
device_code
(
new
platform
::
CUDADeviceCode
(
place
,
subgraph
->
GetFuncName
(),
code_str
));
new
platform
::
CUDADeviceCode
(
place
,
subgraph
->
GetFuncName
(),
code_str
));
device_code
->
Compile
();
bool
is_compiled
=
device_code
->
Compile
();
if
(
is_compiled
)
{
platform
::
DeviceCodePool
&
pool
=
platform
::
DeviceCodePool
::
Init
({
place
});
platform
::
DeviceCodePool
&
pool
=
platform
::
DeviceCodePool
::
Init
({
place
});
pool
.
Set
(
std
::
move
(
device_code
));
pool
.
Set
(
std
::
move
(
device_code
));
}
return
is_compiled
;
}
}
static
int
ExtractOpRole
(
fusion_group
::
SubGraph
*
subgraph
)
{
static
int
ExtractOpRole
(
fusion_group
::
SubGraph
*
subgraph
)
{
...
...
paddle/fluid/framework/ir/fusion_group/fusion_group_pass.h
浏览文件 @
22bbd547
...
@@ -29,7 +29,7 @@ class FusionGroupPass : public FusePassBase {
...
@@ -29,7 +29,7 @@ class FusionGroupPass : public FusePassBase {
private:
private:
int
DetectFusionGroup
(
Graph
*
graph
,
int
type
=
0
)
const
;
int
DetectFusionGroup
(
Graph
*
graph
,
int
type
=
0
)
const
;
void
GenerateCode
(
fusion_group
::
SubGraph
*
subgraph
)
const
;
bool
GenerateCode
(
fusion_group
::
SubGraph
*
subgraph
)
const
;
void
InsertFusionGroupOp
(
Graph
*
graph
,
void
InsertFusionGroupOp
(
Graph
*
graph
,
fusion_group
::
SubGraph
*
subgraph
)
const
;
fusion_group
::
SubGraph
*
subgraph
)
const
;
...
...
paddle/fluid/framework/ir/fusion_group/fusion_group_pass_tester.cc
浏览文件 @
22bbd547
...
@@ -59,6 +59,11 @@ std::unique_ptr<Graph> BuildElementwiseListGraph(bool backward = false) {
...
@@ -59,6 +59,11 @@ std::unique_ptr<Graph> BuildElementwiseListGraph(bool backward = false) {
}
}
std
::
unique_ptr
<
Graph
>
graph
(
new
Graph
(
layers
.
main_program
()));
std
::
unique_ptr
<
Graph
>
graph
(
new
Graph
(
layers
.
main_program
()));
for
(
auto
*
n
:
graph
->
Nodes
())
{
if
(
n
&&
n
->
IsVar
()
&&
n
->
Var
())
{
n
->
Var
()
->
SetDataType
(
proto
::
VarType
::
FP32
);
}
}
#ifdef __clang__
#ifdef __clang__
return
graph
;
return
graph
;
#else
#else
...
@@ -116,6 +121,11 @@ std::unique_ptr<Graph> BuildElementwiseTreeGraph(bool backward = false) {
...
@@ -116,6 +121,11 @@ std::unique_ptr<Graph> BuildElementwiseTreeGraph(bool backward = false) {
}
}
std
::
unique_ptr
<
Graph
>
graph
(
new
Graph
(
layers
.
main_program
()));
std
::
unique_ptr
<
Graph
>
graph
(
new
Graph
(
layers
.
main_program
()));
for
(
auto
*
n
:
graph
->
Nodes
())
{
if
(
n
&&
n
->
IsVar
()
&&
n
->
Var
())
{
n
->
Var
()
->
SetDataType
(
proto
::
VarType
::
FP32
);
}
}
#ifdef __clang__
#ifdef __clang__
return
graph
;
return
graph
;
#else
#else
...
...
paddle/fluid/framework/ir/fusion_group/operation.cc
浏览文件 @
22bbd547
...
@@ -91,7 +91,7 @@ void OperationMap::InsertUnaryElementwiseOperations() {
...
@@ -91,7 +91,7 @@ void OperationMap::InsertUnaryElementwiseOperations() {
// relu:
// relu:
// out = f(x) = x > 0 ? x : 0
// out = f(x) = x > 0 ? x : 0
// dx = dout * (out > 0 ? 1 : 0)
// dx = dout * (out > 0 ? 1 : 0)
insert_handler
(
"relu"
,
"
real_max(${0}, 0)
"
,
{
"${1} > 0 ? ${2} : 0"
});
insert_handler
(
"relu"
,
"
${0} > 0 ? ${0} : 0
"
,
{
"${1} > 0 ? ${2} : 0"
});
// sigmoid:
// sigmoid:
// out = f(x) = 1.0 / (1.0 + exp(-x))
// out = f(x) = 1.0 / (1.0 + exp(-x))
// dx = dout * out * (1 - out)
// dx = dout * out * (1 - out)
...
@@ -133,9 +133,24 @@ void OperationMap::InsertBinaryElementwiseOperations() {
...
@@ -133,9 +133,24 @@ void OperationMap::InsertBinaryElementwiseOperations() {
// dy = dout * x
// dy = dout * x
insert_handler
(
"elementwise_mul"
,
"${0} * ${1}"
,
insert_handler
(
"elementwise_mul"
,
"${0} * ${1}"
,
{
"${3} * ${1}"
,
"${3} * ${0}"
});
{
"${3} * ${1}"
,
"${3} * ${0}"
});
insert_handler
(
"elementwise_div"
,
"${0} / ${1}"
,
{});
// elementwise_div:
insert_handler
(
"elementwise_min"
,
"real_min(${0}, ${1})"
,
{});
// out = x / y
insert_handler
(
"elementwise_max"
,
"real_max(${0}, ${1})"
,
{});
// dx = dout / y
// dy = - dout * out / y
insert_handler
(
"elementwise_div"
,
"${0} / ${1}"
,
{
"${3} / ${1}"
,
"- ${3} * ${2} / ${1}"
});
// elementwise_min:
// out = x < y ? x : y
// dx = dout * (x < y)
// dy = dout * (x >= y)
insert_handler
(
"elementwise_min"
,
"${0} < ${1} ? ${0} : ${1}"
,
{
"${3} * (${0} < ${1})"
,
"${3} * (${0} >= ${1})"
});
// elementwise_max:
// out = x > y ? x : y
// dx = dout * (x > y)
// dy = dout * (x <= y)
insert_handler
(
"elementwise_max"
,
"${0} > ${1} ? ${0} : ${1}"
,
{
"${3} * (${0} > ${1})"
,
"${3} * (${0} <= ${1})"
});
}
}
}
// namespace fusion_group
}
// namespace fusion_group
...
...
paddle/fluid/framework/ir/fusion_group/subgraph.h
浏览文件 @
22bbd547
...
@@ -49,11 +49,23 @@ class SubGraph {
...
@@ -49,11 +49,23 @@ class SubGraph {
}
}
}
}
}
}
ExtractDataType
();
}
}
bool
IsEmpty
()
{
return
nodes_set_
.
empty
();
}
bool
IsValid
(
int
min_subgraph_size
)
{
int
num_operations
=
GetNumOperations
();
if
(
num_operations
<
min_subgraph_size
)
{
VLOG
(
2
)
<<
"There are only "
<<
num_operations
<<
" operations in the subgraph. Expected at least "
<<
min_subgraph_size
;
return
false
;
}
return
ExtractDataType
();
}
int
GetType
()
const
{
return
type_
;
}
int
GetType
()
const
{
return
type_
;
}
// Returns a copy of data_type_, the element type name chosen by
// ExtractDataType(): "float", "double" or "float16". The member is a plain
// std::string, so it is presumably empty until ExtractDataType() has
// succeeded -- confirm with callers.
std
::
string
GetDataType
()
const
{
return
data_type_
;
}
void
SetFuncName
(
std
::
string
func_name
)
{
func_name_
=
func_name
;
}
void
SetFuncName
(
std
::
string
func_name
)
{
func_name_
=
func_name
;
}
std
::
string
GetFuncName
()
const
{
return
func_name_
;
}
std
::
string
GetFuncName
()
const
{
return
func_name_
;
}
...
@@ -150,6 +162,37 @@ class SubGraph {
...
@@ -150,6 +162,37 @@ class SubGraph {
}
}
private:
private:
bool
ExtractDataType
()
{
bool
is_first
=
true
;
proto
::
VarType
::
Type
data_type
=
proto
::
VarType
::
FP32
;
for
(
auto
*
n
:
nodes_set_
)
{
if
(
n
&&
n
->
IsVar
()
&&
n
->
Var
())
{
if
(
n
->
Var
()
->
GetType
()
!=
proto
::
VarType
::
LOD_TENSOR
)
{
// All var node in a subgraph should hold a LoDTensor.
return
false
;
}
if
(
is_first
)
{
data_type
=
n
->
Var
()
->
GetDataType
();
is_first
=
false
;
}
else
if
(
n
->
Var
()
->
GetDataType
()
!=
data_type
)
{
// DataType of VarDesc in a subgraph is not the same.
return
false
;
}
}
}
if
(
data_type
==
proto
::
VarType
::
FP32
)
{
data_type_
=
"float"
;
}
else
if
(
data_type
==
proto
::
VarType
::
FP64
)
{
data_type_
=
"double"
;
}
else
if
(
data_type
==
proto
::
VarType
::
FP16
)
{
data_type_
=
"float16"
;
}
else
{
VLOG
(
2
)
<<
"Only support fp32, fp64 and fp16 in fusion_group."
;
return
false
;
}
return
true
;
}
void
TopologicalSort
()
{
void
TopologicalSort
()
{
if
(
!
is_sorted_
)
{
if
(
!
is_sorted_
)
{
std
::
unordered_map
<
Node
*
,
std
::
vector
<
Node
*>>
inputs_map
;
std
::
unordered_map
<
Node
*
,
std
::
vector
<
Node
*>>
inputs_map
;
...
@@ -203,6 +246,7 @@ class SubGraph {
...
@@ -203,6 +246,7 @@ class SubGraph {
private:
private:
int
type_
{
-
1
};
int
type_
{
-
1
};
std
::
string
data_type_
;
std
::
string
func_name_
;
std
::
string
func_name_
;
bool
save_intermediate_out_
{
true
};
bool
save_intermediate_out_
{
true
};
...
...
paddle/fluid/framework/ir/pass_tester_helper.h
浏览文件 @
22bbd547
...
@@ -33,8 +33,9 @@ struct Layers {
...
@@ -33,8 +33,9 @@ struct Layers {
const
ProgramDesc
&
main_program
()
{
return
program_
;
}
const
ProgramDesc
&
main_program
()
{
return
program_
;
}
VarDesc
*
data
(
std
::
string
name
,
std
::
vector
<
int64_t
>
shape
=
{},
VarDesc
*
data
(
std
::
string
name
,
std
::
vector
<
int64_t
>
shape
=
{},
bool
is_persistable
=
false
)
{
bool
is_persistable
=
false
,
return
lod_tensor
(
name
,
shape
,
is_persistable
);
proto
::
VarType
::
Type
data_type
=
proto
::
VarType
::
FP32
)
{
return
lod_tensor
(
name
,
shape
,
is_persistable
,
data_type
);
}
}
VarDesc
*
conv2d
(
VarDesc
*
input
,
VarDesc
*
filter
,
VarDesc
*
bias
,
VarDesc
*
conv2d
(
VarDesc
*
input
,
VarDesc
*
filter
,
VarDesc
*
bias
,
...
@@ -379,9 +380,11 @@ struct Layers {
...
@@ -379,9 +380,11 @@ struct Layers {
private:
private:
VarDesc
*
lod_tensor
(
std
::
string
name
,
std
::
vector
<
int64_t
>
shape
=
{},
VarDesc
*
lod_tensor
(
std
::
string
name
,
std
::
vector
<
int64_t
>
shape
=
{},
bool
is_persistable
=
false
)
{
bool
is_persistable
=
false
,
proto
::
VarType
::
Type
data_type
=
proto
::
VarType
::
FP32
)
{
auto
*
var
=
program_
.
MutableBlock
(
0
)
->
Var
(
name
);
auto
*
var
=
program_
.
MutableBlock
(
0
)
->
Var
(
name
);
var
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
var
->
SetType
(
proto
::
VarType
::
LOD_TENSOR
);
var
->
SetDataType
(
data_type
);
var
->
SetShape
(
shape
);
var
->
SetShape
(
shape
);
var
->
SetPersistable
(
is_persistable
);
var
->
SetPersistable
(
is_persistable
);
return
var
;
return
var
;
...
...
paddle/fluid/operators/fused/fusion_group_op.cu.cc
浏览文件 @
22bbd547
...
@@ -13,10 +13,11 @@ See the License for the specific language governing permissions and
...
@@ -13,10 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_group_op.h"
#include "paddle/fluid/operators/fused/fusion_group_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
ops
=
paddle
::
operators
;
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
REGISTER_OP_CUDA_KERNEL
(
fusion_group
,
fusion_group
,
ops
::
FusionGroupKernel
<
plat
::
CUDADeviceContext
,
float
>
,
ops
::
FusionGroupKernel
<
p
addle
::
platform
::
CUDADeviceContext
,
double
>
,
ops
::
FusionGroupKernel
<
p
lat
::
CUDADeviceContext
,
double
>
,
ops
::
FusionGroupKernel
<
p
addle
::
platform
::
CUDADeviceContext
,
float
>
);
ops
::
FusionGroupKernel
<
p
lat
::
CUDADeviceContext
,
plat
::
float16
>
);
paddle/fluid/platform/device_code.cc
浏览文件 @
22bbd547
...
@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and
...
@@ -13,11 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */
limitations under the License. */
#include "paddle/fluid/platform/device_code.h"
#include "paddle/fluid/platform/device_code.h"
#include <sys/stat.h>
#include <algorithm>
#include <algorithm>
#include <set>
#include <set>
#include <utility>
#include <utility>
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
DECLARE_string
(
cuda_dir
);
namespace
paddle
{
namespace
paddle
{
namespace
platform
{
namespace
platform
{
...
@@ -79,6 +82,46 @@ DeviceCodePool::DeviceCodePool(const std::vector<platform::Place>& places) {
...
@@ -79,6 +82,46 @@ DeviceCodePool::DeviceCodePool(const std::vector<platform::Place>& places) {
}
}
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUDA
static
std
::
string
FindCUDAIncludePath
()
{
auto
EndWith
=
[](
std
::
string
str
,
std
::
string
substr
)
->
bool
{
size_t
pos
=
str
.
rfind
(
substr
);
return
pos
!=
std
::
string
::
npos
&&
pos
==
(
str
.
length
()
-
substr
.
length
());
};
struct
stat
st
;
std
::
string
cuda_include_path
;
if
(
!
FLAGS_cuda_dir
.
empty
())
{
cuda_include_path
=
FLAGS_cuda_dir
;
if
(
EndWith
(
cuda_include_path
,
"/"
))
{
cuda_include_path
.
erase
(
cuda_include_path
.
end
()
-
1
);
}
for
(
std
::
string
suffix
:
{
"/lib"
,
"/lib64"
})
{
if
(
EndWith
(
FLAGS_cuda_dir
,
suffix
))
{
cuda_include_path
.
erase
(
cuda_include_path
.
end
()
-
suffix
.
length
());
break
;
}
}
if
(
!
EndWith
(
cuda_include_path
,
"include"
))
{
cuda_include_path
+=
"/include"
;
}
// Whether the cuda_include_path exists on the file system.
if
(
stat
(
cuda_include_path
.
c_str
(),
&
st
)
==
0
)
{
return
cuda_include_path
;
}
}
cuda_include_path
=
"/usr/local/cuda/include"
;
if
(
stat
(
cuda_include_path
.
c_str
(),
&
st
)
==
0
)
{
return
cuda_include_path
;
}
LOG
(
WARNING
)
<<
"Cannot find CUDA include path."
<<
"Please check whether CUDA is installed in the default "
"installation path, or specify it by export "
"FLAGS_cuda_dir=xxx."
;
return
""
;
}
CUDADeviceCode
::
CUDADeviceCode
(
const
Place
&
place
,
const
std
::
string
&
name
,
CUDADeviceCode
::
CUDADeviceCode
(
const
Place
&
place
,
const
std
::
string
&
name
,
const
std
::
string
&
kernel
)
{
const
std
::
string
&
kernel
)
{
if
(
!
is_gpu_place
(
place
))
{
if
(
!
is_gpu_place
(
place
))
{
...
@@ -91,7 +134,7 @@ CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name,
...
@@ -91,7 +134,7 @@ CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name,
kernel_
=
kernel
;
kernel_
=
kernel
;
}
}
bool
CUDADeviceCode
::
Compile
()
{
bool
CUDADeviceCode
::
Compile
(
bool
include_path
)
{
is_compiled_
=
false
;
is_compiled_
=
false
;
if
(
!
dynload
::
HasNVRTC
()
||
!
dynload
::
HasCUDADriver
())
{
if
(
!
dynload
::
HasNVRTC
()
||
!
dynload
::
HasCUDADriver
())
{
LOG
(
WARNING
)
LOG
(
WARNING
)
...
@@ -116,8 +159,14 @@ bool CUDADeviceCode::Compile() {
...
@@ -116,8 +159,14 @@ bool CUDADeviceCode::Compile() {
int
compute_capability
=
dev_ctx
->
GetComputeCapability
();
int
compute_capability
=
dev_ctx
->
GetComputeCapability
();
std
::
string
compute_flag
=
std
::
string
compute_flag
=
"--gpu-architecture=compute_"
+
std
::
to_string
(
compute_capability
);
"--gpu-architecture=compute_"
+
std
::
to_string
(
compute_capability
);
const
std
::
vector
<
const
char
*>
options
=
{
"--std=c++11"
,
std
::
vector
<
const
char
*>
options
=
{
"--std=c++11"
,
compute_flag
.
c_str
()};
compute_flag
.
c_str
()};
if
(
include_path
)
{
std
::
string
cuda_include_path
=
FindCUDAIncludePath
();
if
(
!
cuda_include_path
.
empty
())
{
std
::
string
include_option
=
"--include-path="
+
cuda_include_path
;
options
.
push_back
(
include_option
.
c_str
());
}
}
nvrtcResult
compile_result
=
nvrtcResult
compile_result
=
dynload
::
nvrtcCompileProgram
(
program
,
// program
dynload
::
nvrtcCompileProgram
(
program
,
// program
options
.
size
(),
// numOptions
options
.
size
(),
// numOptions
...
...
paddle/fluid/platform/device_code.h
浏览文件 @
22bbd547
...
@@ -31,7 +31,7 @@ namespace platform {
...
@@ -31,7 +31,7 @@ namespace platform {
class
DeviceCode
{
class
DeviceCode
{
public:
public:
virtual
~
DeviceCode
()
{}
virtual
~
DeviceCode
()
{}
virtual
bool
Compile
()
=
0
;
virtual
bool
Compile
(
bool
include_path
=
false
)
=
0
;
virtual
void
Launch
(
const
size_t
n
,
std
::
vector
<
void
*>*
args
)
const
=
0
;
virtual
void
Launch
(
const
size_t
n
,
std
::
vector
<
void
*>*
args
)
const
=
0
;
Place
GetPlace
()
const
{
return
place_
;
}
Place
GetPlace
()
const
{
return
place_
;
}
...
@@ -48,7 +48,7 @@ class CUDADeviceCode : public DeviceCode {
...
@@ -48,7 +48,7 @@ class CUDADeviceCode : public DeviceCode {
public:
public:
explicit
CUDADeviceCode
(
const
Place
&
place
,
const
std
::
string
&
name
,
explicit
CUDADeviceCode
(
const
Place
&
place
,
const
std
::
string
&
name
,
const
std
::
string
&
kernel
);
const
std
::
string
&
kernel
);
bool
Compile
()
override
;
bool
Compile
(
bool
include_path
=
false
)
override
;
void
Launch
(
const
size_t
n
,
std
::
vector
<
void
*>*
args
)
const
override
;
void
Launch
(
const
size_t
n
,
std
::
vector
<
void
*>*
args
)
const
override
;
void
SetNumThreads
(
int
num_threads
)
{
num_threads_
=
num_threads
;
}
void
SetNumThreads
(
int
num_threads
)
{
num_threads_
=
num_threads
;
}
...
...
python/paddle/fluid/tests/unittests/ir/CMakeLists.txt
浏览文件 @
22bbd547
...
@@ -2,7 +2,7 @@ file(GLOB TEST_IR_PASSES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
...
@@ -2,7 +2,7 @@ file(GLOB TEST_IR_PASSES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string
(
REPLACE
".py"
""
TEST_IR_PASSES
"
${
TEST_IR_PASSES
}
"
)
string
(
REPLACE
".py"
""
TEST_IR_PASSES
"
${
TEST_IR_PASSES
}
"
)
if
(
NOT WITH_GPU OR WIN32 OR APPLE
)
if
(
NOT WITH_GPU OR WIN32 OR APPLE
)
LIST
(
REMOVE_ITEM TEST_IR_PASSES test_ir_fusion_group
)
LIST
(
REMOVE_ITEM TEST_IR_PASSES test_ir_fusion_group
_pass
)
endif
()
endif
()
foreach
(
target
${
TEST_IR_PASSES
}
)
foreach
(
target
${
TEST_IR_PASSES
}
)
...
...
python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group.py
→
python/paddle/fluid/tests/unittests/ir/test_ir_fusion_group
_pass
.py
浏览文件 @
22bbd547
...
@@ -17,92 +17,125 @@ import unittest
...
@@ -17,92 +17,125 @@ import unittest
import
numpy
as
np
import
numpy
as
np
from
pass_test
import
PassTest
from
pass_test
import
PassTest
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
import
paddle.fluid.layers
as
layers
import
paddle.fluid.core
as
core
import
paddle.fluid.core
as
core
class
FusionGroupPassTest
(
PassTest
):
class
FusionGroupPassTest
(
PassTest
):
def
setUp
(
self
):
def
build_program
(
self
,
dtype
):
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
data1
=
fluid
.
data
(
name
=
"data1"
,
shape
=
[
32
,
128
],
dtype
=
"float32"
)
self
.
feed_vars
=
self
.
_prepare_feed_vars
([
32
,
128
],
dtype
,
2
)
data2
=
fluid
.
data
(
name
=
"data2"
,
shape
=
[
32
,
128
],
dtype
=
"float32"
)
self
.
feed_vars
.
append
(
data3
=
fluid
.
data
(
name
=
"data3"
,
shape
=
[
32
,
128
],
dtype
=
"float32"
)
fluid
.
data
(
tmp_1
=
fluid
.
layers
.
elementwise_add
(
data1
,
data2
)
name
=
"data2"
,
shape
=
[
128
,
128
],
dtype
=
dtype
))
tmp_2
=
fluid
.
layers
.
elementwise_mul
(
data3
,
tmp_1
)
# subgraph with only 1 op node
self
.
feeds
=
{
tmp_0
=
self
.
feed_vars
[
0
]
*
self
.
feed_vars
[
1
]
"data1"
:
np
.
random
.
random
((
32
,
128
)).
astype
(
"float32"
),
tmp_1
=
layers
.
mul
(
tmp_0
,
self
.
feed_vars
[
2
])
"data2"
:
np
.
random
.
random
((
32
,
128
)).
astype
(
"float32"
),
# subgraph with 2 op nodes
"data3"
:
np
.
random
.
random
((
32
,
128
)).
astype
(
"float32"
)
tmp_2
=
layers
.
relu
(
tmp_0
+
tmp_1
)
}
self
.
fetch_list
=
[
tmp_1
,
tmp_2
]
self
.
fetch_list
=
[
tmp_2
]
self
.
num_fused_ops
=
1
def
setUp
(
self
):
self
.
build_program
(
"float32"
)
self
.
feeds
=
self
.
_feed_random_data
(
self
.
feed_vars
)
self
.
pass_names
=
"fusion_group_pass"
self
.
pass_names
=
"fusion_group_pass"
self
.
fused_op_type
=
"fusion_group"
self
.
fused_op_type
=
"fusion_group"
self
.
num_fused_ops
=
1
def _prepare_feed_vars(self, shape, dtype, num_data):
    """Declare ``num_data`` feed variables via ``fluid.data``.

    The variables are named ``data0``, ``data1``, ... and all share the given
    ``shape`` and ``dtype``. They are returned as a list in creation order.
    """
    return [
        fluid.data(name=("data" + str(index)), shape=shape, dtype=dtype)
        for index in range(num_data)
    ]
def _feed_random_data(self, feed_vars):
    """Build a feed dict of random arrays, one entry per variable.

    Each key is the variable's name; each value comes from
    ``np.random.random(var.shape)`` cast to the numpy dtype that matches the
    variable's dtype (FP32 -> float32, FP64 -> float64, FP16 -> float16).

    Raises:
        TypeError: a variable's type is not LOD_TENSOR.
        ValueError: a variable's dtype is not FP32, FP64 or FP16.
    """
    feeds = {}
    for var in feed_vars:
        var_type = fluid.core.VarDesc.VarType
        if var.type != var_type.LOD_TENSOR:
            raise TypeError("Feed data of non LoDTensor is not supported.")

        shape = var.shape
        numpy_dtype = None
        # Checked in the order FP32, FP64, FP16; the first match wins.
        for candidate, numpy_name in ((var_type.FP32, "float32"),
                                      (var_type.FP64, "float64"),
                                      (var_type.FP16, "float16")):
            if var.dtype == candidate:
                numpy_dtype = numpy_name
                break
        if numpy_dtype is None:
            raise ValueError("Unsupported dtype %s" % var.dtype)

        feeds[var.name] = np.random.random(shape).astype(numpy_dtype)
    return feeds
def
test_check_output
(
self
):
def
test_check_output
(
self
):
use_gpu_set
=
[]
if
core
.
is_compiled_with_cuda
():
if
core
.
is_compiled_with_cuda
():
use_gpu_set
.
append
(
True
)
self
.
pass_attrs
=
{
"fusion_group_pass"
:
{
"use_gpu"
:
True
}}
for
use_gpu
in
use_gpu_set
:
self
.
check_output_with_place
(
fluid
.
CUDAPlace
(
0
))
self
.
pass_attrs
=
{
"fusion_group_pass"
:
{
"use_gpu"
:
use_gpu
}}
place
=
fluid
.
CUDAPlace
(
0
)
if
use_gpu
else
fluid
.
CPUPlace
()
self
.
check_output_with_place
(
place
,
startup_on_cpu
=
False
)
class
FusionGroupPassTest1
(
FusionGroupPassTest
):
class
FusionGroupPassTest1
(
FusionGroupPassTest
):
def
setUp
(
self
):
def
build_program
(
self
,
dtype
):
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
data
=
[]
self
.
feed_vars
=
self
.
_prepare_feed_vars
([
32
,
128
],
dtype
,
5
)
for
i
in
range
(
5
):
data
.
append
(
tmp_0
=
layers
.
assign
(
self
.
feed_vars
[
0
])
fluid
.
data
(
# subgraph with 9 op nodes
name
=
(
"data"
+
str
(
i
)),
tmp_1
=
tmp_0
*
layers
.
sigmoid
(
self
.
feed_vars
[
1
])
+
layers
.
sigmoid
(
shape
=
[
32
,
128
],
self
.
feed_vars
[
2
])
*
layers
.
tanh
(
self
.
feed_vars
[
3
])
dtype
=
"float32"
))
tmp_2
=
layers
.
tanh
(
tmp_1
)
+
layers
.
sigmoid
(
self
.
feed_vars
[
4
])
tmp_1
=
(
fluid
.
layers
.
assign
(
data
[
0
])
*
fluid
.
layers
.
sigmoid
(
data
[
1
])
)
+
(
fluid
.
layers
.
sigmoid
(
data
[
2
])
*
fluid
.
layers
.
tanh
(
data
[
3
]))
tmp_2
=
fluid
.
layers
.
tanh
(
tmp_1
)
+
fluid
.
layers
.
sigmoid
(
data
[
4
])
self
.
feeds
=
{}
for
i
in
range
(
5
):
self
.
feeds
[
"data"
+
str
(
i
)]
=
np
.
random
.
random
(
(
32
,
128
)).
astype
(
"float32"
)
self
.
fetch_list
=
[
tmp_1
,
tmp_2
]
self
.
fetch_list
=
[
tmp_1
,
tmp_2
]
self
.
pass_names
=
"fusion_group_pass"
self
.
fused_op_type
=
"fusion_group"
self
.
num_fused_ops
=
1
self
.
num_fused_ops
=
1
class
FusionGroupPassTest2
(
FusionGroupPassTest
):
class
FusionGroupPassTest2
(
FusionGroupPassTest
):
def
setUp
(
self
):
def
build_program
(
self
,
dtype
):
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
with
fluid
.
program_guard
(
self
.
main_program
,
self
.
startup_program
):
data
=
[]
self
.
feed_vars
=
self
.
_prepare_feed_vars
([
32
,
128
],
dtype
,
3
)
for
i
in
range
(
3
):
self
.
feed_vars
.
append
(
data
.
append
(
fluid
.
data
(
name
=
(
"data"
+
str
(
i
)),
shape
=
[
32
,
128
],
dtype
=
"float32"
))
data
.
append
(
fluid
.
data
(
fluid
.
data
(
name
=
"data3"
,
shape
=
[
128
,
32
],
dtype
=
"float32"
))
name
=
"data3"
,
shape
=
[
128
,
32
],
dtype
=
dtype
))
tmp_1
=
fluid
.
layers
.
relu
((
data
[
0
]
-
data
[
1
])
*
data
[
2
])
tmp_2
=
fluid
.
layers
.
sigmoid
(
data
[
3
])
# subgraph with 3 op nodes
tmp_3
=
fluid
.
layers
.
relu
(
tmp_2
)
tmp_1
=
layers
.
relu
(
tmp_4
=
fluid
.
layers
.
mul
(
tmp_1
,
tmp_3
)
(
self
.
feed_vars
[
0
]
-
self
.
feed_vars
[
1
])
*
self
.
feed_vars
[
2
])
# subgraph with 2 op nodes
self
.
feeds
=
{}
tmp_2
=
layers
.
relu
(
layers
.
sigmoid
(
self
.
feed_vars
[
3
]))
for
i
in
range
(
3
):
tmp_3
=
layers
.
mul
(
tmp_1
,
tmp_2
)
self
.
feeds
[
"data"
+
str
(
i
)]
=
np
.
random
.
random
(
(
32
,
128
)).
astype
(
"float32"
)
self
.
fetch_list
=
[
tmp_1
,
tmp_2
,
tmp_3
]
self
.
feeds
[
"data3"
]
=
np
.
random
.
random
((
128
,
32
)).
astype
(
"float32"
)
self
.
num_fused_ops
=
2
self
.
fetch_list
=
[
tmp_1
,
tmp_2
,
tmp_3
,
tmp_4
]
class
FusionGroupPassTestFP64
(
FusionGroupPassTest
):
def
setUp
(
self
):
self
.
build_program
(
"float64"
)
self
.
feeds
=
self
.
_feed_random_data
(
self
.
feed_vars
)
self
.
pass_names
=
"fusion_group_pass"
self
.
pass_names
=
"fusion_group_pass"
self
.
fused_op_type
=
"fusion_group"
self
.
fused_op_type
=
"fusion_group"
self
.
num_fused_ops
=
2
class FusionGroupPassTestFP16(FusionGroupPassTest):
    """Variant of the base test whose elementwise part runs in float16."""

    def build_program(self, dtype):
        """Build the test program.

        Two [32, 128] inputs and one [128, 128] input of ``dtype`` are
        declared. Their element-wise product and its ``mul`` with the third
        input are both cast to float16, added, passed through relu and cast
        back to ``dtype``. Only that final tensor is fetched.
        """
        with fluid.program_guard(self.main_program, self.startup_program):
            inputs = self._prepare_feed_vars([32, 128], dtype, 2)
            inputs.append(
                fluid.data(
                    name="data2", shape=[128, 128], dtype=dtype))
            self.feed_vars = inputs

            # Sub-graph containing a single op node.
            product = self.feed_vars[0] * self.feed_vars[1]
            mul_out = layers.mul(product, self.feed_vars[2])
            product_fp16 = layers.cast(product, dtype="float16")
            mul_out_fp16 = layers.cast(mul_out, dtype="float16")
            # Sub-graph containing 2 op nodes.
            activated = layers.relu(product_fp16 + mul_out_fp16)
            restored = layers.cast(activated, dtype=dtype)

        self.fetch_list = [restored]
        self.num_fused_ops = 1
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录