Commit f265a313 (unverified)
Authored Feb 28, 2023 by zhupengyang; committed via GitHub on Feb 28, 2023
[XPU] support convert fp16 model (#50790)
Parent: 569b018e

Showing 15 changed files with 426 additions and 138 deletions (+426 −138)
paddle/fluid/framework/ir/CMakeLists.txt (+1 −1)
paddle/fluid/framework/ir/auto_mixed_precision_pass.cc (+55 −25)
paddle/fluid/framework/ir/auto_mixed_precision_pass.h (+2 −2)
paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc (+6 −0)
paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc (+112 −24)
paddle/fluid/framework/ir/xpu/quant_utils.cc (+85 −24)
paddle/fluid/framework/ir/xpu/quant_utils.h (+5 −2)
paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc (+29 −17)
paddle/fluid/inference/api/analysis_predictor.cc (+6 −1)
paddle/fluid/inference/api/paddle_pass_builder.cc (+1 −1)
paddle/phi/backends/xpu/xpu2_op_list.cc (+4 −2)
paddle/phi/common/backend.h (+2 −0)
paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc (+30 −21)
paddle/phi/kernels/fusion/xpu/multi_encoder_xpu_kernel.cc (+32 −18)
python/paddle/fluid/tests/unittests/ir/inference/test_xpu_convert_mixed_precision.py (+56 −0, new file)
paddle/fluid/framework/ir/CMakeLists.txt
@@ -215,7 +215,7 @@ if(WITH_XPU)
   cc_library(
     xpu_quant_utils
     SRCS xpu/quant_utils.cc
-    DEPS pass)
+    DEPS pass phi)
   cc_library(
     xpu_pass_utils
     SRCS xpu/pass_utils.cc
paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
@@ -47,6 +47,23 @@ bool PhiKernelSupportPrecision(
   return phi::KernelFactory::Instance().HasKernel(op_type, kernel_key);
 }
 
+static phi::Backend ConvertPlaceToBackend(const phi::Place& place) {
+  switch (place.GetType()) {
+    case phi::AllocationType::CPU:
+      return phi::Backend::CPU;
+    case phi::AllocationType::GPU:
+      return phi::Backend::GPU;
+    case phi::AllocationType::XPU:
+      return phi::Backend::XPU;
+    case phi::AllocationType::NPU:
+      return phi::Backend::NPU;
+    default:
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Cannot convert place(%d).", static_cast<int>(place.GetType())));
+  }
+  return phi::Backend::UNDEFINED;
+}
+
 bool KernelSupportPrecision(
     const std::string& op_type,
     phi::Backend backend,
@@ -65,7 +82,7 @@ bool KernelSupportPrecision(
   auto it = all_kernels.find(op_type);
   if (it != all_kernels.end()) {
     for (const auto& kern_pair : it->second) {
-      if (platform::is_gpu_place(kern_pair.first.place_) &&
+      if (ConvertPlaceToBackend(kern_pair.first.place_) == backend &&
           kern_pair.first.data_type_ ==
               framework::TransToProtoVarType(precision)) {
         support = true;
@@ -150,20 +167,8 @@ bool OpSupportPrecision(const std::string& op_type,
                         phi::Backend backend,
                         phi::DataType precision,
                         const std::unordered_set<std::string>& black_list) {
-  bool support = false;
-  if (black_list.count(op_type) == 0) {
-    // Actual custom backend will be added after the NUM_BACKENDS.
-    // We use this feature to determine whether backend is custom device.
-    if (backend == phi::Backend::GPU ||
-        static_cast<size_t>(backend) >
-            static_cast<size_t>(phi::Backend::NUM_BACKENDS)) {
-      support = KernelSupportPrecision(op_type, backend, precision);
-    } else {
-      PADDLE_THROW(paddle::platform::errors::InvalidArgument(
-          "Now, only support backend of GPU and Custom Device ."));
-    }
-  }
-  return support;
+  return black_list.count(op_type) == 0 &&
+         KernelSupportPrecision(op_type, backend, precision);
 }
 
 // The set of ops that support fp16 calculation and are considered
@@ -192,15 +197,13 @@ void AutoMixedPrecisionPass::SetDefaultBlacklist() const {
 }
 
 void AutoMixedPrecisionPass::Init(Graph* graph) const {
-  bool enable_gpu_mixed = Get<bool>("enable_gpu_mixed");
-  bool enable_custom_device_mixed = false;
-  if (Has("enable_custom_device_mixed")) {
-    enable_custom_device_mixed = Get<bool>("enable_custom_device_mixed");
-  }
-  if (enable_gpu_mixed) {
+  if (Has("enable_gpu_mixed") && Get<bool>("enable_gpu_mixed")) {
     backend_ = phi::Backend::GPU;
-  } else if (enable_custom_device_mixed) {
-    // transform Backend::CUSTOM to actual backend.
+  } else if (Has("enable_xpu_mixed") && Get<bool>("enable_xpu_mixed")) {
+    backend_ = phi::Backend::XPU;
+  } else if (Has("enable_custom_device_mixed") &&
+             Get<bool>("enable_custom_device_mixed")) {
+    // transform Backend::CUSTOM to actual backend.
     // Here, we only consider one custom backend.
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
     auto device_type = phi::DeviceManager::GetAllCustomDeviceTypes()[0];
@@ -214,7 +217,7 @@ void AutoMixedPrecisionPass::Init(Graph* graph) const {
         "Cannot enable custom_device_mixed."));
 #endif
   }
-  skip_pass_ = !enable_gpu_mixed && !enable_custom_device_mixed;
+  skip_pass_ = backend_ == phi::Backend::UNDEFINED;
 
   low_precision_ =
       static_cast<phi::DataType>(Get<int>("mixed_precision_mode"));
@@ -225,7 +228,6 @@ void AutoMixedPrecisionPass::Init(Graph* graph) const {
     VLOG(4) << " - " << name;
   }
 
-  keep_io_types_ = true;
   if (Has("keep_io_types")) {
     keep_io_types_ = Get<bool>("keep_io_types");
   }
@@ -607,6 +609,20 @@ bool AutoMixedPrecisionPass::InputVarsNotConvert(
       return true;
     }
   }
+
+  if (backend_ == phi::Backend::XPU) {
+    if (GetOpOriginalType(op_desc->Type()) == "layer_norm") {
+      auto vecs = op_desc->Input("Bias");
+      if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) {
+        return true;
+      }
+      vecs = op_desc->Input("Scale");
+      if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) {
+        return true;
+      }
+    }
+  }
+
   return false;
 }
@@ -632,6 +648,20 @@ bool AutoMixedPrecisionPass::OutputVarsNotConvert(
       return true;
     }
   }
+
+  if (backend_ == phi::Backend::XPU) {
+    if (GetOpOriginalType(op_desc->Type()) == "layer_norm") {
+      auto vecs = op_desc->Output("Mean");
+      if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) {
+        return true;
+      }
+      vecs = op_desc->Output("Variance");
+      if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) {
+        return true;
+      }
+    }
+  }
+
   return false;
 }
paddle/fluid/framework/ir/auto_mixed_precision_pass.h
@@ -68,11 +68,11 @@ class AutoMixedPrecisionPass : public FusePassBase {
  private:
   mutable bool skip_pass_{false};
 
-  mutable bool keep_io_types_{false};
+  mutable bool keep_io_types_{true};
   // float16 or bfloat16 now
   mutable phi::DataType low_precision_{phi::DataType::FLOAT16};
 
-  mutable phi::Backend backend_{phi::Backend::GPU};
+  mutable phi::Backend backend_{phi::Backend::UNDEFINED};
 
   mutable std::unordered_set<std::string> black_list_;
paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc
@@ -245,6 +245,12 @@ void FcXPUFusePass::ApplyImpl(ir::Graph* graph,
     QuantWeight<int16_t>(mul_w_tensor, mul_w_max_tensor, !transpose_w);
   }
 
+  if (bias != nullptr) {
+    auto* bias_tensor =
+        scope->Var(bias->Name())->GetMutable<phi::DenseTensor>();
+    CastToFp32(bias_tensor);
+  }
+
   std::string fc_out_name;
   if (act_out) {
     fc_out_name = act_out->Name();
paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc
@@ -31,6 +31,7 @@
 #include "paddle/fluid/framework/ir/xpu/quant_utils.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/kernels/concat_kernel.h"
 
 namespace phi {
 class DenseTensor;
@@ -617,6 +618,9 @@ class MultiEncoderXPUFusePass : public FusePassBase {
   bool ApplyMultiEncoderXPUFuse(ir::Graph* graph) const;
 
+  // Mask must be fp32 even if model is fp16
+  int CastMask(ir::Graph* graph) const;
+
   // 1. Transpose q_w, k_w, v_w
   // 2. Concat q_w, k_w, v_w
   // 3. Generate qkv_w_max tensor
@@ -674,8 +678,11 @@ void MultiEncoderXPUFusePass::ApplyImpl(ir::Graph* graph) const {
       }
     }
   }
+  int cast_mask_counts = CastMask(graph);
 
   AddStatis(single_encoder_fused_counts);
   AddStatis(multi_encoder_fused_counts);
+  AddStatis(cast_mask_counts);
 }
 
 void MultiEncoderXPUFusePass::PrepareQKVWeight(
@@ -685,29 +692,28 @@ void MultiEncoderXPUFusePass::PrepareQKVWeight(
     phi::DenseTensor* qkv_w,
     phi::DenseTensor* qkv_w_max) const {
   // Transpose
-  phi::DenseTensor q_w_trans;
-  phi::DenseTensor k_w_trans;
-  phi::DenseTensor v_w_trans;
-  Transpose2D<float>(q_w, &q_w_trans);
-  Transpose2D<float>(k_w, &k_w_trans);
-  Transpose2D<float>(v_w, &v_w_trans);
+  phi::DenseTensor q_w_t;
+  phi::DenseTensor k_w_t;
+  phi::DenseTensor v_w_t;
+  Assign(q_w, &q_w_t);
+  Assign(k_w, &k_w_t);
+  Assign(v_w, &v_w_t);
+  Transpose2D(&q_w_t);
+  Transpose2D(&k_w_t);
+  Transpose2D(&v_w_t);
 
   // Concat
-  auto q_w_trans_dims = q_w_trans.dims();
-  auto k_w_trans_dims = k_w_trans.dims();
-  auto v_w_trans_dims = v_w_trans.dims();
-  qkv_w->Resize(DDim({q_w_trans_dims[0] + k_w_trans_dims[0] + v_w_trans_dims[0],
-                      q_w_trans_dims[1]}));
+  qkv_w->Resize(DDim(
+      {q_w_t.dims()[0] + k_w_t.dims()[0] + v_w_t.dims()[0], q_w_t.dims()[1]}));
+  qkv_w->set_type(q_w.type());
   auto* dev_ctx = static_cast<phi::CPUContext*>(
       platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
-  int size = q_w.numel();
-  auto* qkv_w_data = dev_ctx->Alloc<float>(qkv_w);
-  memcpy(qkv_w_data, q_w_trans.data(), size * sizeof(float));
-  qkv_w_data += size;
-  memcpy(qkv_w_data, k_w_trans.data(), size * sizeof(float));
-  qkv_w_data += size;
-  memcpy(qkv_w_data, v_w_trans.data(), size * sizeof(float));
+  std::vector<const phi::DenseTensor*> in_tensors{&q_w_t, &k_w_t, &v_w_t};
+  if (q_w.type() == phi::DataType::FLOAT16) {
+    phi::ConcatKernel<float16>(*dev_ctx, in_tensors, 0, qkv_w);
+  } else {
+    phi::ConcatKernel<float>(*dev_ctx, in_tensors, 0, qkv_w);
+  }
 
   // Quant to int16
   QuantWeight<int16_t>(qkv_w, qkv_w_max, false);
@@ -846,6 +852,9 @@ int MultiEncoderXPUFusePass::ApplySingleEncoderXPUFuse(
   auto* block = q_matmul->Op()->Block();
   auto* scope = param_scope();
 
+  bool enable_fp16 =
+      scope->FindVar(q_matmul_w->Name())->Get<phi::DenseTensor>().dtype() ==
+      phi::DataType::FLOAT16;
+
   // Prepare q,k,v weight
   std::string q_w_name = q_matmul_w->Name();
@@ -905,12 +914,32 @@ int MultiEncoderXPUFusePass::ApplySingleEncoderXPUFuse(
   auto* qkv_add_bias = graph->CreateVarNode(&qkv_add_bias_desc);
   auto* qkv_add_bias_var = block->Var(qkv_add_bias_name);
   qkv_add_bias_var->SetPersistable(true);
-  ConcatQKVBias(scope->FindVar(q_add_bias_name)->Get<phi::DenseTensor>(),
-                scope->FindVar(k_add_bias_name)->Get<phi::DenseTensor>(),
-                scope->FindVar(v_add_bias_name)->Get<phi::DenseTensor>(),
+  auto* q_add_bias_tensor =
+      scope->FindVar(q_add_bias_name)->GetMutable<phi::DenseTensor>();
+  auto* k_add_bias_tensor =
+      scope->FindVar(k_add_bias_name)->GetMutable<phi::DenseTensor>();
+  auto* v_add_bias_tensor =
+      scope->FindVar(v_add_bias_name)->GetMutable<phi::DenseTensor>();
+  CastToFp32(q_add_bias_tensor);
+  CastToFp32(k_add_bias_tensor);
+  CastToFp32(v_add_bias_tensor);
+  ConcatQKVBias(*q_add_bias_tensor,
+                *k_add_bias_tensor,
+                *v_add_bias_tensor,
                 scope->Var(qkv_add_bias_name)->GetMutable<phi::DenseTensor>());
 
+  // Prepare qkv_add_0_bias, qkv_add_2_bias, qkv_add_3_bias
+  auto qkv_add_0_bias_name = qkv_add_0_bias->Name();
+  CastToFp32(
+      scope->FindVar(qkv_add_0_bias_name)->GetMutable<phi::DenseTensor>());
+  auto qkv_add_2_bias_name = qkv_add_2_bias->Name();
+  CastToFp32(
+      scope->FindVar(qkv_add_2_bias_name)->GetMutable<phi::DenseTensor>());
+  auto qkv_add_3_bias_name = qkv_add_3_bias->Name();
+  CastToFp32(
+      scope->FindVar(qkv_add_3_bias_name)->GetMutable<phi::DenseTensor>());
+
   // Generate single_encoder_xpu op
   framework::OpDesc op_desc(block);
   op_desc.SetType("single_encoder_xpu");
@@ -927,9 +956,9 @@ int MultiEncoderXPUFusePass::ApplySingleEncoderXPUFuse(
                     qkv_matmul_3_w_max_name});
   op_desc.SetInput("fc_bias",
                    {qkv_add_bias_name,
-                    qkv_add_0_bias->Name(),
-                    qkv_add_2_bias->Name(),
-                    qkv_add_3_bias->Name()});
+                    qkv_add_0_bias_name,
+                    qkv_add_2_bias_name,
+                    qkv_add_3_bias_name});
   if (norm_before) {
     op_desc.SetInput("ln_scale", {ln_0_scale->Name(), ln_1_scale->Name()});
     op_desc.SetInput("ln_bias", {ln_0_bias->Name(), ln_1_bias->Name()});
@@ -953,6 +982,7 @@ int MultiEncoderXPUFusePass::ApplySingleEncoderXPUFuse(
       static_cast<int>(qkv_matmul_2_w_shape[1] / qkv_matmul_2_w_shape[0]));
   op_desc.SetAttr("act_type", ConvertActivationType(act_type));
   op_desc.SetAttr("relative_type", static_cast<int>(0));
+  op_desc.SetAttr("enable_fp16", enable_fp16);
   if (norm_before) {
     op_desc.SetOutput("out", {qkv_add_4_out->Name()});
   } else {
@@ -1186,6 +1216,9 @@ bool MultiEncoderXPUFusePass::ApplyMultiEncoderXPUFuse(ir::Graph* graph) const {
         PADDLE_GET_CONST(int, single_encoders[0]->Op()->GetAttr(attr_name)));
   }
   op_desc.SetAttr("slice_idx", static_cast<int>(-1));
+  op_desc.SetAttr(
+      "enable_fp16",
+      PADDLE_GET_CONST(bool, single_encoders[0]->Op()->GetAttr("enable_fp16")));
   op_desc.SetOutput("out", {out_name});
   op_desc.SetOutput("x_fp16", {x_fp16_name});
   op_desc.SetOutput("out_fp16", {out_fp16_name});
@@ -1213,6 +1246,61 @@ bool MultiEncoderXPUFusePass::ApplyMultiEncoderXPUFuse(ir::Graph* graph) const {
   return true;
 }
 
+int MultiEncoderXPUFusePass::CastMask(ir::Graph* graph) const {
+  int cast_counts = 0;
+  auto nodes = graph->Nodes();
+  for (auto node : nodes) {
+    if (node->IsVar()) continue;
+    auto op_desc = node->Op();
+    if (node->IsVar() ||  //
+        op_desc->Type() != "multi_encoder_xpu" ||
+        !op_desc->GetAttrIfExists<bool>("enable_fp16") ||
+        op_desc->Inputs().count("mask") == 0)
+      continue;
+
+    auto* block = op_desc->Block();
+    auto* scope = param_scope();
+
+    // Find mask node
+    std::string mask_name = op_desc->Inputs().at("mask")[0];
+    Node* mask = nullptr;
+    for (auto* in_node : node->inputs) {
+      if (in_node->Var()->Name() == mask_name) {
+        mask = in_node;
+        break;
+      }
+    }
+
+    // Create new_mask node/var/tensor
+    std::string new_mask_name = mask_name + "_fp32";
+    VarDesc new_mask_desc(new_mask_name);
+    auto* new_mask = graph->CreateVarNode(&new_mask_desc);
+    block->Var(new_mask_name);
+    scope->Var(new_mask_name)->GetMutable<phi::DenseTensor>();
+
+    // Create cast op
+    framework::OpDesc cast_op_desc(block);
+    cast_op_desc.SetType("cast");
+    cast_op_desc.SetInput("X", {mask_name});
+    cast_op_desc.SetAttr("in_dtype",
+                         static_cast<int>(framework::proto::VarType::FP16));
+    cast_op_desc.SetAttr("out_dtype",
+                         static_cast<int>(framework::proto::VarType::FP32));
+    cast_op_desc.SetOutput("Out", {new_mask_name});
+    auto* cast = graph->CreateOpNode(&cast_op_desc);
+    IR_NODE_LINK_TO(mask, cast);
+    IR_NODE_LINK_TO(cast, new_mask);
+
+    // Update encoder
+    op_desc->SetInput("mask", {new_mask_name});
+    IR_NODE_LINK_TO(new_mask, node);
+    IR_NODE_UNLINK(node, mask);
+
+    cast_counts++;
+  }
+  return cast_counts;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
paddle/fluid/framework/ir/xpu/quant_utils.cc
@@ -16,33 +16,92 @@
 #include <vector>
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/phi/core/enforce.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/assign_kernel.h"
+#include "paddle/phi/kernels/cast_kernel.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
 
 namespace paddle {
 namespace framework {
 namespace ir {
 
-template <typename T>
-void Transpose2D(const phi::DenseTensor& in, phi::DenseTensor* out) {
-  auto in_dims = in.dims();
+void Assign(const phi::DenseTensor& in, phi::DenseTensor* out) {
+  auto* cpu_ctx = static_cast<phi::CPUContext*>(
+      platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
+  out->Resize(in.dims());
+  out->set_type(in.dtype());
+  out->set_layout(in.layout());
+  phi::AssignKernel(*cpu_ctx, in, out);
+}
+
+void Transpose2D(phi::DenseTensor* in, phi::DenseTensor* out) {
+  auto in_dims = in->dims();
   PADDLE_ENFORCE_EQ(
       in_dims.size(),
       2,
       platform::errors::InvalidArgument(
           "In dims rank should be 2, but received in dims size is [%d].",
           in_dims.size()));
-  out->Resize({in_dims[1], in_dims[0]});
-  out->set_type(in.type());
-  auto* dev_ctx = static_cast<phi::CPUContext*>(
+
+  phi::DenseTensor trans_tensor;
+  phi::DenseTensor* out_ptr = out == nullptr ? &trans_tensor : out;
+  out_ptr->Resize({in_dims[1], in_dims[0]});
+  out_ptr->set_type(in->type());
+  out_ptr->set_layout(in->layout());
+
+  auto* cpu_ctx = static_cast<phi::CPUContext*>(
       platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
-  dev_ctx->Alloc<T>(out);
   std::vector<int> axis{1, 0};
-  phi::funcs::Transpose<phi::CPUContext, T, 2> trans2d;
-  trans2d(*dev_ctx, in, out, axis);
+  switch (in->dtype()) {
+    case phi::DataType::FLOAT16:
+      phi::TransposeKernel<float16>(*cpu_ctx, *in, axis, out_ptr);
+      break;
+    case phi::DataType::FLOAT32:
+      phi::TransposeKernel<float>(*cpu_ctx, *in, axis, out_ptr);
+      break;
+    default:
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Only support fp16 and fp32, but received dtype is %s.",
+          phi::DataTypeToString(in->dtype())));
+      break;
+  }
+
+  if (out == nullptr) {
+    Assign(*out_ptr, in);
+  }
 }
 
-template void Transpose2D<float>(const phi::DenseTensor& in,
-                                 phi::DenseTensor* out);
+void CastToFp32(phi::DenseTensor* in, phi::DenseTensor* out) {
+  auto* cpu_ctx = static_cast<phi::CPUContext*>(
+      platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
+
+  phi::DenseTensor fp32_tensor;
+  phi::DenseTensor* out_ptr = out == nullptr ? &fp32_tensor : out;
+  out_ptr->Resize(in->dims());
+  out_ptr->set_type(phi::DataType::FLOAT32);
+  out_ptr->set_layout(in->layout());
+
+  switch (in->dtype()) {
+    case phi::DataType::FLOAT16:
+      phi::CastKernel<float16>(*cpu_ctx, *in, phi::DataType::FLOAT32, out_ptr);
+      break;
+    case phi::DataType::FLOAT32:
+      if (out == nullptr) {
+        return;
+      } else {
+        phi::AssignKernel(*cpu_ctx, *in, out_ptr);
+      }
+      break;
+    default:
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Only support fp16 and fp32, but received dtype is %s.",
+          phi::DataTypeToString(in->dtype())));
+      break;
+  }
+
+  if (out == nullptr) {
+    Assign(*out_ptr, in);
+  }
+}
 
 static float FindMaxAbs(const float* data, int len) {
   float max_f = 0.0f;
@@ -151,14 +210,15 @@ template <typename T>
 void QuantWeight(phi::DenseTensor* weight,
                  phi::DenseTensor* weight_max,
                  bool transpose) {
+  // Convert fp16 to fp32
+  phi::DenseTensor weight_fp32;
+  CastToFp32(weight, &weight_fp32);
   // Transpose
-  auto* weight_data = weight->data<float>();
-  phi::DenseTensor weight_trans;
   if (transpose) {
-    Transpose2D<float>(*weight, &weight_trans);
-    weight_data = weight_trans.data<float>();
-    weight->Resize(weight_trans.dims());
+    Transpose2D(&weight_fp32);
   }
   // Find max
   paddle::platform::DeviceContextPool& pool =
       paddle::platform::DeviceContextPool::Instance();
@@ -171,21 +231,22 @@ void QuantWeight(phi::DenseTensor* weight,
   }
   phi::XPUContext* xpu_ctx = static_cast<phi::XPUContext*>(pool.Get(place));
   int max_ptr_size = xpu_ctx->x_context()->max_ptr_size();
-  int size = weight->numel();
+  int size = weight_fp32.numel();
+  auto* weight_data = weight_fp32.data<float>();
   float max_val = FindMaxAbs(weight_data, size);
   std::vector<float> max_vec(max_ptr_size, max_val);
-  weight_max->set_type(paddle::experimental::CppTypeToDataType<float>::Type());
+  weight_max->set_type(phi::DataType::FLOAT32);
   weight_max->Resize({max_ptr_size});
-  auto* dev_ctx = static_cast<phi::CPUContext*>(
+  auto* cpu_ctx = static_cast<phi::CPUContext*>(
       platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
-  memcpy(dev_ctx->Alloc<float>(weight_max),
+  memcpy(cpu_ctx->Alloc<float>(weight_max),
         max_vec.data(),
         max_ptr_size * sizeof(float));
   // Quant
-  std::vector<T> quant_data(size);
-  QuantFP32ToIntX(weight_data, quant_data.data(), max_val, size);
   weight->set_type(paddle::experimental::CppTypeToDataType<T>::Type());
-  memcpy(dev_ctx->Alloc<T>(weight), quant_data.data(), size * sizeof(T));
+  weight->Resize(weight_fp32.dims());
+  QuantFP32ToIntX(weight_data, cpu_ctx->Alloc<T>(weight), max_val, size);
 }
 
 template void QuantWeight<int16_t>(phi::DenseTensor* weight,
paddle/fluid/framework/ir/xpu/quant_utils.h
@@ -19,8 +19,11 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
-template <typename T>
-void Transpose2D(const phi::DenseTensor& in, phi::DenseTensor* out);
+void Assign(const phi::DenseTensor& in, phi::DenseTensor* out);
+
+void Transpose2D(phi::DenseTensor* in, phi::DenseTensor* out = nullptr);
+
+void CastToFp32(phi::DenseTensor* in, phi::DenseTensor* out = nullptr);
 
 // 1. Quant weight from fp32 to int16/int31
 // 2. Weight data is in-place update.
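
For orientation: the new helpers dispatch on dtype and default to in-place operation when no output tensor is passed. A minimal usage sketch based only on the signatures above (the tensor `w` is a hypothetical 2-D fp16 weight, not taken from this diff):

  // Sketch, assuming w is a 2-D fp16 phi::DenseTensor already in scope.
  phi::DenseTensor w_fp32;
  CastToFp32(&w, &w_fp32);  // out-of-place: w stays fp16, w_fp32 holds fp32
  CastToFp32(&w);           // in-place: w itself is rewritten as fp32
  Transpose2D(&w);          // in-place 2-D transpose (fp16 or fp32 only)
  phi::DenseTensor w_copy;
  Assign(w, &w_copy);       // copy with dims/dtype/layout propagated
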
paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc
@@ -41,18 +41,31 @@ ConvertToMixedPrecisionPass::ConvertToMixedPrecisionPass(
       backend_(backend),
       keep_io_types_(keep_io_types),
       black_list_(black_list) {
-  if (mixed_precision_ != phi::DataType::FLOAT16 &&
-      mixed_precision_ != phi::DataType::BFLOAT16) {
-    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
-        "mixed_precision currently not supported dtype %d, we now only "
-        "support fp16 and bf16.",
-        static_cast<int>(mixed_precision_)));
-  }
-  if (backend_ != phi::Backend::GPU && backend_ != phi::Backend::CUSTOM) {
-    PADDLE_THROW(paddle::platform::errors::InvalidArgument(
-        "mixed_precision currently not supported place %d, we now only "
-        "support gpu and custom device .",
-        static_cast<int>(backend_)));
+  switch (backend_) {
+    case phi::Backend::GPU:
+      PADDLE_ENFORCE(mixed_precision_ == phi::DataType::FLOAT16 ||
+                         mixed_precision_ == phi::DataType::BFLOAT16,
+                     platform::errors::InvalidArgument(
+                         "mixed_precision of %s currently only supported fp16 "
+                         "and bf16, not support %s.",
+                         experimental::BackendToString(backend_),
+                         phi::DataTypeToString(mixed_precision_)));
+      break;
+    case phi::Backend::XPU:
+    case phi::Backend::CUSTOM:
+      PADDLE_ENFORCE(mixed_precision_ == phi::DataType::FLOAT16,
+                     platform::errors::InvalidArgument(
+                         "mixed_precision of %s currently only supported fp16 "
+                         "and bf16, not support %s.",
+                         experimental::BackendToString(backend_),
+                         phi::DataTypeToString(mixed_precision_)));
+      break;
+    default:
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "mixed_precision currently not supported place GPU or XPU or "
+          "CUSTOM, not support %s.",
+          experimental::BackendToString(backend_)));
+      break;
   }
 }
@@ -70,17 +83,16 @@ void ConvertToMixedPrecisionPass::Run() {
   framework::ir::AutoMixedPrecisionPass pass;
   pass.Set("mixed_precision_mode",
            new int{static_cast<int>(mixed_precision_)});
-  pass.Set("mixed_black_list",
-           new std::unordered_set<std::string>{black_list_});
   if (backend_ == phi::Backend::GPU) {
     pass.Set("enable_gpu_mixed", new bool{true});
-    pass.Set("enable_custom_device_mixed", new bool{false});
+  } else if (backend_ == phi::Backend::XPU) {
+    pass.Set("enable_xpu_mixed", new bool{true});
   } else if (backend_ == phi::Backend::CUSTOM) {
-    pass.Set("enable_gpu_mixed", new bool{false});
     pass.Set("enable_custom_device_mixed", new bool{true});
   }
+  pass.Set("mixed_black_list",
+           new std::unordered_set<std::string>{black_list_});
   pass.Set("keep_io_types", new bool{keep_io_types_});
 
   pass.Apply(main_graph_.get());
 
   SaveMixedModel();
paddle/fluid/inference/api/analysis_predictor.cc
@@ -1302,18 +1302,23 @@ void AnalysisPredictor::PrepareArgument() {
           << ", we will use a new PassStrategy. Note that only the GPU "
              "backend is supported for now.";
   if (!config_.use_cinn_compiler_) {
-    pass_builder->ClearPasses();
     const auto& deleted_passes = pass_builder->GetAllDeletedPasses();
     if (config_.tensorrt_engine_enabled()) {
+      pass_builder->ClearPasses();
       for (const auto& pass : kTrtLowerPrecisionPasses) {
         if (deleted_passes.count(pass)) continue;
         pass_builder->AppendPass(pass);
       }
     } else if (config_.use_gpu()) {
+      pass_builder->ClearPasses();
       for (const auto& pass : kGpuLowerPrecisionPasses) {
         if (deleted_passes.count(pass)) continue;
         pass_builder->AppendPass(pass);
       }
+    } else if (config_.use_xpu()) {
+      // All passes support fp16. Not reset pass_builder.
+    } else {
+      pass_builder->ClearPasses();
     }
   }
paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -519,9 +519,9 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
       "delete_dropout_op_pass",
       "identity_scale_op_clean_pass",
       "generate_sequence_xpu_fuse_pass",
-      "embedding_with_eltwise_add_xpu_fuse_pass",
       "multi_encoder_xpu_fuse_pass",
       "multi_encoder_xpu_slice_fuse_pass",
+      "embedding_with_eltwise_add_xpu_fuse_pass",
       "fc_xpu_fuse_pass",
       "link_xpu_op_max_pass",
   });
paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -253,7 +253,8 @@ XPUOpMap& get_kl2_ops() {
                     phi::DataType::BOOL,
                     phi::DataType::FLOAT16,
                     phi::DataType::FLOAT32})},
-    {"fc_xpu", XPUKernelSet({phi::DataType::FLOAT32})},
+    {"fc_xpu",
+     XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
     {"fill",
      XPUKernelSet({phi::DataType::INT64,
                    phi::DataType::INT32,
@@ -461,7 +462,8 @@ XPUOpMap& get_kl2_ops() {
                     phi::DataType::FLOAT16,
                     phi::DataType::INT32,
                     phi::DataType::INT64})},
-    {"multi_encoder_xpu", XPUKernelSet({phi::DataType::FLOAT32})},
+    {"multi_encoder_xpu",
+     XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
    {"nearest_interp_v2", XPUKernelSet({phi::DataType::FLOAT32})},
    {"nearest_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})},
    {"not_equal",
paddle/phi/common/backend.h
@@ -210,6 +210,8 @@ inline std::string BackendToString(const Backend& backend) {
       return "KPS";
     case Backend::IPU:
       return "IPU";
+    case Backend::CUSTOM:
+      return "CUSTOM";
     default: {
       size_t device_type_id_ = static_cast<size_t>(backend) -
                                static_cast<size_t>(Backend::NUM_BACKENDS);
paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc
@@ -33,44 +33,53 @@ void FcXPUKernel(const Context& ctx,
                  float act_alpha,
                  DenseTensor* out,
                  DenseTensor* out_max) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
   auto in_mat_dims = flatten_to_2d(x.dims(), in_num_col_dims);
   int m = in_mat_dims[0];
   int k = in_mat_dims[1];
   int n = w.dims()[0];
+  auto* x_data = reinterpret_cast<const XPUType*>(x.data<T>());
   const float* x_max_data =
       x_max.get_ptr() == nullptr ? nullptr : x_max.get_ptr()->data<float>();
   const float* bias_data =
       bias.get_ptr() == nullptr ? nullptr : bias.get_ptr()->data<float>();
+  auto* out_data = reinterpret_cast<XPUType*>(ctx.template Alloc<T>(out));
   xpu::Activation_t act(static_cast<xpu::Activation_t::act_enum>(act_type));
   if (act_type == 5) {
     act.leaky_alpha = act_alpha;
   } else if (act_type == 15) {
     act.hard_sigmoid_slope = act_alpha;
   }
-  int r = xpu::fc_fusion<T, int16_t, T, int16_t>(  // TX, TW. TY, TGEMM
-      ctx.x_context(),                             // ctx
-      x.data<T>(),                                 // x
-      w.data<int16_t>(),                           // w
-      ctx.template Alloc<T>(out),                  // y
-      m,                                           // m
-      n,                                           // n
-      k,                                           // k
-      transpose_x,                                 // x_trans
-      true,                                        // w_trans
-      x_max_data,                                  // x_maxptr
-      w_max.data<float>(),                         // w_maxptr
-      ctx.template Alloc<float>(out_max),          // y_maxptr
-      transpose_x ? m : k,                         // ldx
-      k,                                           // ldw
-      n,                                           // ldy
-      alpha,                                       // alpha
-      beta,                                        // beta
-      bias_data,                                   // bias
-      act);
+  int r = xpu::fc_fusion<XPUType, int16_t, XPUType, int16_t>(  // TX, TW. TY, TGEMM
+      ctx.x_context(),                     // ctx
+      x_data,                              // x
+      w.data<int16_t>(),                   // w
+      out_data,                            // y
+      m,                                   // m
+      n,                                   // n
+      k,                                   // k
+      transpose_x,                         // x_trans
+      true,                                // w_trans
+      x_max_data,                          // x_maxptr
+      w_max.data<float>(),                 // w_maxptr
+      ctx.template Alloc<float>(out_max),  // y_maxptr
+      transpose_x ? m : k,                 // ldx
+      k,                                   // ldw
+      n,                                   // ldy
+      alpha,                               // alpha
+      beta,                                // beta
+      bias_data,                           // bias
+      act);
   PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_xpu");
 }
 
 }  // namespace fusion
 }  // namespace phi
 
-PD_REGISTER_KERNEL(fc_xpu, XPU, ALL_LAYOUT, phi::fusion::FcXPUKernel, float) {}
+PD_REGISTER_KERNEL(fc_xpu,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::fusion::FcXPUKernel,
+                   float,
+                   phi::dtype::float16) {}
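
Side note: the `XPUType` indirection above relies on `XPUTypeTrait` to map the framework's element type onto the type the XDNN library expects, so one templated kernel body serves both registered dtypes. A simplified, hypothetical sketch of the idea (not the actual trait definition in Paddle's XPU headers):

  // Illustrative only: float stays float, while phi's float16 maps to the
  // XDNN fp16 type.
  template <typename T>
  struct XPUTypeTraitSketch {
    using Type = T;
  };
  template <>
  struct XPUTypeTraitSketch<phi::dtype::float16> {
    using Type = float16;  // XDNN's fp16 type
  };
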
paddle/phi/kernels/fusion/xpu/multi_encoder_xpu_kernel.cc
@@ -40,18 +40,26 @@ void MultiEncoderXPUKernel(const Context& ctx,
                            DenseTensor* out,
                            DenseTensor* x_fp16,
                            DenseTensor* out_fp16) {
   using float16 = typename XPUTypeTrait<phi::dtype::float16>::Type;
   // XPU2 only support fp16 input/output.
-  float16* x_fp16_data = reinterpret_cast<float16*>(
-      ctx.template Alloc<phi::dtype::float16>(x_fp16));
-  int r_cast_x = xpu::cast_v2<float, float16>(
-      ctx.x_context(), x.data<T>(), x_fp16_data, x.numel());
-  PADDLE_ENFORCE_XDNN_SUCCESS(
-      r_cast_x, "multi_encoder_xpu(cast x from fp32 to fp16)");
-  float16* out_fp16_data = reinterpret_cast<float16*>(
-      ctx.template Alloc<phi::dtype::float16>(out_fp16));
+  auto x_dtype = x.dtype();
+  const float16* x_fp16_data = nullptr;
+  float16* out_fp16_data = nullptr;
+  if (x_dtype == phi::DataType::FLOAT32) {
+    auto* x_fp16_data_t = reinterpret_cast<float16*>(
+        ctx.template Alloc<phi::dtype::float16>(x_fp16));
+    int r_cast_x = xpu::cast_v2<float, float16>(
+        ctx.x_context(), x.data<float>(), x_fp16_data_t, x.numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(
+        r_cast_x, "multi_encoder_xpu(cast x from fp32 to fp16)");
+    x_fp16_data = x_fp16_data_t;
+    out_fp16_data = reinterpret_cast<float16*>(
+        ctx.template Alloc<phi::dtype::float16>(out_fp16));
+  } else {
+    x_fp16_data =
+        reinterpret_cast<const float16*>(x.data<phi::dtype::float16>());
+    out_fp16_data = reinterpret_cast<float16*>(
+        ctx.template Alloc<phi::dtype::float16>(out));
+  }
 
   // q,k,v weight are fused.
   // Each encoder's weight should be: w0, null, null, w3, w4, w5
@@ -78,8 +86,8 @@ void MultiEncoderXPUKernel(const Context& ctx,
     ln_scale_data.push_back(ln_scale[i]->data<float>());
     ln_bias_data.push_back(ln_bias[i]->data<float>());
   }
-  const T* mask_data =
-      mask.get_ptr() == nullptr ? nullptr : mask.get_ptr()->data<T>();
+  const float* mask_data =
+      mask.get_ptr() == nullptr ? nullptr : mask.get_ptr()->data<float>();
   xpu::Activation_t qkv_act(static_cast<xpu::Activation_t::act_enum>(act_type));
   int batch = x.dims()[0];
@@ -152,10 +160,15 @@ void MultiEncoderXPUKernel(const Context& ctx,
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "multi_encoder_xpu");
   }
 
-  int r_cast_out = xpu::cast_v2<float16, float>(
-      ctx.x_context(), out_fp16_data, ctx.template Alloc<T>(out), out->numel());
-  PADDLE_ENFORCE_XDNN_SUCCESS(
-      r_cast_out, "multi_encoder_xpu(cast out from fp16 to fp32)");
+  if (x_dtype == phi::DataType::FLOAT32) {
+    int r_cast_out =
+        xpu::cast_v2<float16, float>(ctx.x_context(),
+                                     out_fp16_data,
+                                     ctx.template Alloc<float>(out),
+                                     out->numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(
+        r_cast_out, "multi_encoder_xpu(cast out from fp16 to fp32)");
+  }
 }
 
 }  // namespace fusion
@@ -165,4 +178,5 @@ PD_REGISTER_KERNEL(multi_encoder_xpu,
                    XPU,
                    ALL_LAYOUT,
                    phi::fusion::MultiEncoderXPUKernel,
-                   float) {}
+                   float,
+                   phi::dtype::float16) {}
python/paddle/fluid/tests/unittests/ir/inference/test_xpu_convert_mixed_precision.py
new file (0 → 100644)

# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import tempfile
import unittest

import paddle
from paddle.inference import (
    PlaceType,
    PrecisionType,
    convert_to_mixed_precision,
)
from paddle.jit import to_static
from paddle.static import InputSpec
from paddle.vision.models import resnet50


class ConvertMixedPrecison(unittest.TestCase):
    def test(self):
        self.temp_dir = tempfile.TemporaryDirectory()
        model = resnet50(True)
        net = to_static(
            model,
            input_spec=[InputSpec(shape=[None, 3, 224, 224], name='x')],
        )
        paddle.jit.save(
            net, os.path.join(self.temp_dir.name, 'resnet50/inference')
        )
        convert_to_mixed_precision(
            os.path.join(self.temp_dir.name, 'resnet50/inference.pdmodel'),
            os.path.join(self.temp_dir.name, 'resnet50/inference.pdiparams'),
            os.path.join(
                self.temp_dir.name, 'mixed_precision/inference.pdmodel'
            ),
            os.path.join(
                self.temp_dir.name, 'mixed_precision/inference.pdiparams'
            ),
            backend=PlaceType.XPU,
            mixed_precision=PrecisionType.Half,
        )
        self.temp_dir.cleanup()


if __name__ == "__main__":
    unittest.main()
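
Once converted, the saved fp16 model loads like any other inference model. A hedged C++ sketch, assuming the standard paddle_infer API and the output paths used in the test above (not part of this diff):

  #include "paddle_inference_api.h"

  int main() {
    // Load the converted fp16 model and run it on XPU; fp16 kernels such as
    // fc_xpu and multi_encoder_xpu are selected where registered.
    paddle_infer::Config config("mixed_precision/inference.pdmodel",
                                "mixed_precision/inference.pdiparams");
    config.EnableXpu();
    auto predictor = paddle_infer::CreatePredictor(config);
    return 0;
  }
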