Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
wmsofts
Paddle
提交
f265a313
P
Paddle
项目概览
wmsofts
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
f265a313
编写于
2月 28, 2023
作者:
Z
zhupengyang
提交者:
GitHub
2月 28, 2023
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[XPU] support convert fp16 model (#50790)
上级
569b018e
变更
15
隐藏空白更改
内联
并排
Showing
15 changed file
with
426 addition
and
138 deletion
+426
-138
paddle/fluid/framework/ir/CMakeLists.txt
paddle/fluid/framework/ir/CMakeLists.txt
+1
-1
paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
+55
-25
paddle/fluid/framework/ir/auto_mixed_precision_pass.h
paddle/fluid/framework/ir/auto_mixed_precision_pass.h
+2
-2
paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc
paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc
+6
-0
paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc
paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc
+112
-24
paddle/fluid/framework/ir/xpu/quant_utils.cc
paddle/fluid/framework/ir/xpu/quant_utils.cc
+85
-24
paddle/fluid/framework/ir/xpu/quant_utils.h
paddle/fluid/framework/ir/xpu/quant_utils.h
+5
-2
paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc
...d/inference/analysis/passes/convert_to_mixed_precision.cc
+29
-17
paddle/fluid/inference/api/analysis_predictor.cc
paddle/fluid/inference/api/analysis_predictor.cc
+6
-1
paddle/fluid/inference/api/paddle_pass_builder.cc
paddle/fluid/inference/api/paddle_pass_builder.cc
+1
-1
paddle/phi/backends/xpu/xpu2_op_list.cc
paddle/phi/backends/xpu/xpu2_op_list.cc
+4
-2
paddle/phi/common/backend.h
paddle/phi/common/backend.h
+2
-0
paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc
paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc
+30
-21
paddle/phi/kernels/fusion/xpu/multi_encoder_xpu_kernel.cc
paddle/phi/kernels/fusion/xpu/multi_encoder_xpu_kernel.cc
+32
-18
python/paddle/fluid/tests/unittests/ir/inference/test_xpu_convert_mixed_precision.py
...nittests/ir/inference/test_xpu_convert_mixed_precision.py
+56
-0
未找到文件。
paddle/fluid/framework/ir/CMakeLists.txt
浏览文件 @
f265a313
...
...
@@ -215,7 +215,7 @@ if(WITH_XPU)
cc_library
(
xpu_quant_utils
SRCS xpu/quant_utils.cc
DEPS pass
)
DEPS pass
phi
)
cc_library
(
xpu_pass_utils
SRCS xpu/pass_utils.cc
...
...
paddle/fluid/framework/ir/auto_mixed_precision_pass.cc
浏览文件 @
f265a313
...
...
@@ -47,6 +47,23 @@ bool PhiKernelSupportPrecision(
return
phi
::
KernelFactory
::
Instance
().
HasKernel
(
op_type
,
kernel_key
);
}
static
phi
::
Backend
ConvertPlaceToBackend
(
const
phi
::
Place
&
place
)
{
switch
(
place
.
GetType
())
{
case
phi
::
AllocationType
::
CPU
:
return
phi
::
Backend
::
CPU
;
case
phi
::
AllocationType
::
GPU
:
return
phi
::
Backend
::
GPU
;
case
phi
::
AllocationType
::
XPU
:
return
phi
::
Backend
::
XPU
;
case
phi
::
AllocationType
::
NPU
:
return
phi
::
Backend
::
NPU
;
default:
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Cannot convert place(%d)."
,
static_cast
<
int
>
(
place
.
GetType
())));
}
return
phi
::
Backend
::
UNDEFINED
;
}
bool
KernelSupportPrecision
(
const
std
::
string
&
op_type
,
phi
::
Backend
backend
,
...
...
@@ -65,7 +82,7 @@ bool KernelSupportPrecision(
auto
it
=
all_kernels
.
find
(
op_type
);
if
(
it
!=
all_kernels
.
end
())
{
for
(
const
auto
&
kern_pair
:
it
->
second
)
{
if
(
platform
::
is_gpu_place
(
kern_pair
.
first
.
place_
)
&&
if
(
ConvertPlaceToBackend
(
kern_pair
.
first
.
place_
)
==
backend
&&
kern_pair
.
first
.
data_type_
==
framework
::
TransToProtoVarType
(
precision
))
{
support
=
true
;
...
...
@@ -150,20 +167,8 @@ bool OpSupportPrecision(const std::string& op_type,
phi
::
Backend
backend
,
phi
::
DataType
precision
,
const
std
::
unordered_set
<
std
::
string
>&
black_list
)
{
bool
support
=
false
;
if
(
black_list
.
count
(
op_type
)
==
0
)
{
// Actual custom backend will be added after the NUM_BACKENDS.
// We use this feature to determine whether backend is custom device.
if
(
backend
==
phi
::
Backend
::
GPU
||
static_cast
<
size_t
>
(
backend
)
>
static_cast
<
size_t
>
(
phi
::
Backend
::
NUM_BACKENDS
))
{
support
=
KernelSupportPrecision
(
op_type
,
backend
,
precision
);
}
else
{
PADDLE_THROW
(
paddle
::
platform
::
errors
::
InvalidArgument
(
"Now, only support backend of GPU and Custom Device ."
));
}
}
return
support
;
return
black_list
.
count
(
op_type
)
==
0
&&
KernelSupportPrecision
(
op_type
,
backend
,
precision
);
}
// The set of ops that support fp16 calculation and are considered
...
...
@@ -192,15 +197,13 @@ void AutoMixedPrecisionPass::SetDefaultBlacklist() const {
}
void
AutoMixedPrecisionPass
::
Init
(
Graph
*
graph
)
const
{
bool
enable_gpu_mixed
=
Get
<
bool
>
(
"enable_gpu_mixed"
);
bool
enable_custom_device_mixed
=
false
;
if
(
Has
(
"enable_custom_device_mixed"
))
{
enable_custom_device_mixed
=
Get
<
bool
>
(
"enable_custom_device_mixed"
);
}
if
(
enable_gpu_mixed
)
{
if
(
Has
(
"enable_gpu_mixed"
)
&&
Get
<
bool
>
(
"enable_gpu_mixed"
))
{
backend_
=
phi
::
Backend
::
GPU
;
}
else
if
(
enable_custom_device_mixed
)
{
// transform Backend::CUSTOM to actual backend.
}
else
if
(
Has
(
"enable_xpu_mixed"
)
&&
Get
<
bool
>
(
"enable_xpu_mixed"
))
{
backend_
=
phi
::
Backend
::
XPU
;
}
else
if
(
Has
(
"enable_custom_device_mixed"
)
&&
Get
<
bool
>
(
"enable_custom_device_mixed"
))
{
// transform Backend::CUSTOM to actual backend.
// Here, we only consider one custom backend.
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto
device_type
=
phi
::
DeviceManager
::
GetAllCustomDeviceTypes
()[
0
];
...
...
@@ -214,7 +217,7 @@ void AutoMixedPrecisionPass::Init(Graph* graph) const {
"Cannot enable custom_device_mixed."
));
#endif
}
skip_pass_
=
!
enable_gpu_mixed
&&
!
enable_custom_device_mixed
;
skip_pass_
=
backend_
==
phi
::
Backend
::
UNDEFINED
;
low_precision_
=
static_cast
<
phi
::
DataType
>
(
Get
<
int
>
(
"mixed_precision_mode"
));
...
...
@@ -225,7 +228,6 @@ void AutoMixedPrecisionPass::Init(Graph* graph) const {
VLOG
(
4
)
<<
" - "
<<
name
;
}
keep_io_types_
=
true
;
if
(
Has
(
"keep_io_types"
))
{
keep_io_types_
=
Get
<
bool
>
(
"keep_io_types"
);
}
...
...
@@ -607,6 +609,20 @@ bool AutoMixedPrecisionPass::InputVarsNotConvert(
return
true
;
}
}
if
(
backend_
==
phi
::
Backend
::
XPU
)
{
if
(
GetOpOriginalType
(
op_desc
->
Type
())
==
"layer_norm"
)
{
auto
vecs
=
op_desc
->
Input
(
"Bias"
);
if
(
std
::
find
(
vecs
.
begin
(),
vecs
.
end
(),
var_name
)
!=
vecs
.
end
())
{
return
true
;
}
vecs
=
op_desc
->
Input
(
"Scale"
);
if
(
std
::
find
(
vecs
.
begin
(),
vecs
.
end
(),
var_name
)
!=
vecs
.
end
())
{
return
true
;
}
}
}
return
false
;
}
...
...
@@ -632,6 +648,20 @@ bool AutoMixedPrecisionPass::OutputVarsNotConvert(
return
true
;
}
}
if
(
backend_
==
phi
::
Backend
::
XPU
)
{
if
(
GetOpOriginalType
(
op_desc
->
Type
())
==
"layer_norm"
)
{
auto
vecs
=
op_desc
->
Output
(
"Mean"
);
if
(
std
::
find
(
vecs
.
begin
(),
vecs
.
end
(),
var_name
)
!=
vecs
.
end
())
{
return
true
;
}
vecs
=
op_desc
->
Output
(
"Variance"
);
if
(
std
::
find
(
vecs
.
begin
(),
vecs
.
end
(),
var_name
)
!=
vecs
.
end
())
{
return
true
;
}
}
}
return
false
;
}
...
...
paddle/fluid/framework/ir/auto_mixed_precision_pass.h
浏览文件 @
f265a313
...
...
@@ -68,11 +68,11 @@ class AutoMixedPrecisionPass : public FusePassBase {
private:
mutable
bool
skip_pass_
{
false
};
mutable
bool
keep_io_types_
{
fals
e
};
mutable
bool
keep_io_types_
{
tru
e
};
// float16 or bfloat16 now
mutable
phi
::
DataType
low_precision_
{
phi
::
DataType
::
FLOAT16
};
mutable
phi
::
Backend
backend_
{
phi
::
Backend
::
GPU
};
mutable
phi
::
Backend
backend_
{
phi
::
Backend
::
UNDEFINED
};
mutable
std
::
unordered_set
<
std
::
string
>
black_list_
;
...
...
paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc
浏览文件 @
f265a313
...
...
@@ -245,6 +245,12 @@ void FcXPUFusePass::ApplyImpl(ir::Graph* graph,
QuantWeight
<
int16_t
>
(
mul_w_tensor
,
mul_w_max_tensor
,
!
transpose_w
);
}
if
(
bias
!=
nullptr
)
{
auto
*
bias_tensor
=
scope
->
Var
(
bias
->
Name
())
->
GetMutable
<
phi
::
DenseTensor
>
();
CastToFp32
(
bias_tensor
);
}
std
::
string
fc_out_name
;
if
(
act_out
)
{
fc_out_name
=
act_out
->
Name
();
...
...
paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc
浏览文件 @
f265a313
...
...
@@ -31,6 +31,7 @@
#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/kernels/concat_kernel.h"
namespace
phi
{
class
DenseTensor
;
...
...
@@ -617,6 +618,9 @@ class MultiEncoderXPUFusePass : public FusePassBase {
bool
ApplyMultiEncoderXPUFuse
(
ir
::
Graph
*
graph
)
const
;
// Mask must be fp32 even if model is fp16
int
CastMask
(
ir
::
Graph
*
graph
)
const
;
// 1. Transpose q_w, k_w, v_w
// 2. Concat q_w, k_w, v_w
// 3. Generate qkv_w_max tensor
...
...
@@ -674,8 +678,11 @@ void MultiEncoderXPUFusePass::ApplyImpl(ir::Graph* graph) const {
}
}
}
int
cast_mask_counts
=
CastMask
(
graph
);
AddStatis
(
single_encoder_fused_counts
);
AddStatis
(
multi_encoder_fused_counts
);
AddStatis
(
cast_mask_counts
);
}
void
MultiEncoderXPUFusePass
::
PrepareQKVWeight
(
...
...
@@ -685,29 +692,28 @@ void MultiEncoderXPUFusePass::PrepareQKVWeight(
phi
::
DenseTensor
*
qkv_w
,
phi
::
DenseTensor
*
qkv_w_max
)
const
{
// Transpose
phi
::
DenseTensor
q_w_trans
;
phi
::
DenseTensor
k_w_trans
;
phi
::
DenseTensor
v_w_trans
;
Transpose2D
<
float
>
(
q_w
,
&
q_w_trans
);
Transpose2D
<
float
>
(
k_w
,
&
k_w_trans
);
Transpose2D
<
float
>
(
v_w
,
&
v_w_trans
);
phi
::
DenseTensor
q_w_t
;
phi
::
DenseTensor
k_w_t
;
phi
::
DenseTensor
v_w_t
;
Assign
(
q_w
,
&
q_w_t
);
Assign
(
k_w
,
&
k_w_t
);
Assign
(
v_w
,
&
v_w_t
);
Transpose2D
(
&
q_w_t
);
Transpose2D
(
&
k_w_t
);
Transpose2D
(
&
v_w_t
);
// Concat
auto
q_w_trans_dims
=
q_w_trans
.
dims
();
auto
k_w_trans_dims
=
k_w_trans
.
dims
();
auto
v_w_trans_dims
=
v_w_trans
.
dims
();
qkv_w
->
Resize
(
DDim
({
q_w_trans_dims
[
0
]
+
k_w_trans_dims
[
0
]
+
v_w_trans_dims
[
0
],
q_w_trans_dims
[
1
]}));
qkv_w
->
Resize
(
DDim
(
{
q_w_t
.
dims
()[
0
]
+
k_w_t
.
dims
()[
0
]
+
v_w_t
.
dims
()[
0
],
q_w_t
.
dims
()[
1
]}));
qkv_w
->
set_type
(
q_w
.
type
());
auto
*
dev_ctx
=
static_cast
<
phi
::
CPUContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
phi
::
CPUPlace
()));
int
size
=
q_w
.
numel
();
auto
*
qkv_w_data
=
dev_ctx
->
Alloc
<
float
>
(
qkv_w
);
memcpy
(
qkv_w_data
,
q_w_trans
.
data
(),
size
*
sizeof
(
float
));
qkv_w_data
+=
size
;
memcpy
(
qkv_w_data
,
k_w_trans
.
data
(),
size
*
sizeof
(
float
));
qkv_w_data
+=
size
;
memcpy
(
qkv_w_data
,
v_w_trans
.
data
(),
size
*
sizeof
(
float
));
std
::
vector
<
const
phi
::
DenseTensor
*>
in_tensors
{
&
q_w_t
,
&
k_w_t
,
&
v_w_t
};
if
(
q_w
.
type
()
==
phi
::
DataType
::
FLOAT16
)
{
phi
::
ConcatKernel
<
float16
>
(
*
dev_ctx
,
in_tensors
,
0
,
qkv_w
);
}
else
{
phi
::
ConcatKernel
<
float
>
(
*
dev_ctx
,
in_tensors
,
0
,
qkv_w
);
}
// Quant to int16
QuantWeight
<
int16_t
>
(
qkv_w
,
qkv_w_max
,
false
);
...
...
@@ -846,6 +852,9 @@ int MultiEncoderXPUFusePass::ApplySingleEncoderXPUFuse(
auto
*
block
=
q_matmul
->
Op
()
->
Block
();
auto
*
scope
=
param_scope
();
bool
enable_fp16
=
scope
->
FindVar
(
q_matmul_w
->
Name
())
->
Get
<
phi
::
DenseTensor
>
().
dtype
()
==
phi
::
DataType
::
FLOAT16
;
// Prepare q,k,v weight
std
::
string
q_w_name
=
q_matmul_w
->
Name
();
...
...
@@ -905,12 +914,32 @@ int MultiEncoderXPUFusePass::ApplySingleEncoderXPUFuse(
auto
*
qkv_add_bias
=
graph
->
CreateVarNode
(
&
qkv_add_bias_desc
);
auto
*
qkv_add_bias_var
=
block
->
Var
(
qkv_add_bias_name
);
qkv_add_bias_var
->
SetPersistable
(
true
);
auto
*
q_add_bias_tensor
=
scope
->
FindVar
(
q_add_bias_name
)
->
GetMutable
<
phi
::
DenseTensor
>
();
auto
*
k_add_bias_tensor
=
scope
->
FindVar
(
k_add_bias_name
)
->
GetMutable
<
phi
::
DenseTensor
>
();
auto
*
v_add_bias_tensor
=
scope
->
FindVar
(
v_add_bias_name
)
->
GetMutable
<
phi
::
DenseTensor
>
();
CastToFp32
(
q_add_bias_tensor
);
CastToFp32
(
k_add_bias_tensor
);
CastToFp32
(
v_add_bias_tensor
);
ConcatQKVBias
(
scope
->
FindVar
(
q_add_bias_name
)
->
Get
<
phi
::
DenseTensor
>
()
,
scope
->
FindVar
(
k_add_bias_name
)
->
Get
<
phi
::
DenseTensor
>
()
,
scope
->
FindVar
(
v_add_bias_name
)
->
Get
<
phi
::
DenseTensor
>
()
,
*
q_add_bias_tensor
,
*
k_add_bias_tensor
,
*
v_add_bias_tensor
,
scope
->
Var
(
qkv_add_bias_name
)
->
GetMutable
<
phi
::
DenseTensor
>
());
// Prepare qkv_add_0_bias, qkv_add_2_bias, qkv_add_3_bias
auto
qkv_add_0_bias_name
=
qkv_add_0_bias
->
Name
();
CastToFp32
(
scope
->
FindVar
(
qkv_add_0_bias_name
)
->
GetMutable
<
phi
::
DenseTensor
>
());
auto
qkv_add_2_bias_name
=
qkv_add_2_bias
->
Name
();
CastToFp32
(
scope
->
FindVar
(
qkv_add_2_bias_name
)
->
GetMutable
<
phi
::
DenseTensor
>
());
auto
qkv_add_3_bias_name
=
qkv_add_3_bias
->
Name
();
CastToFp32
(
scope
->
FindVar
(
qkv_add_3_bias_name
)
->
GetMutable
<
phi
::
DenseTensor
>
());
// Generate single_encoder_xpu op
framework
::
OpDesc
op_desc
(
block
);
op_desc
.
SetType
(
"single_encoder_xpu"
);
...
...
@@ -927,9 +956,9 @@ int MultiEncoderXPUFusePass::ApplySingleEncoderXPUFuse(
qkv_matmul_3_w_max_name
});
op_desc
.
SetInput
(
"fc_bias"
,
{
qkv_add_bias_name
,
qkv_add_0_bias
->
Name
()
,
qkv_add_2_bias
->
Name
()
,
qkv_add_3_bias
->
Name
()
});
qkv_add_0_bias
_name
,
qkv_add_2_bias
_name
,
qkv_add_3_bias
_name
});
if
(
norm_before
)
{
op_desc
.
SetInput
(
"ln_scale"
,
{
ln_0_scale
->
Name
(),
ln_1_scale
->
Name
()});
op_desc
.
SetInput
(
"ln_bias"
,
{
ln_0_bias
->
Name
(),
ln_1_bias
->
Name
()});
...
...
@@ -953,6 +982,7 @@ int MultiEncoderXPUFusePass::ApplySingleEncoderXPUFuse(
static_cast
<
int
>
(
qkv_matmul_2_w_shape
[
1
]
/
qkv_matmul_2_w_shape
[
0
]));
op_desc
.
SetAttr
(
"act_type"
,
ConvertActivationType
(
act_type
));
op_desc
.
SetAttr
(
"relative_type"
,
static_cast
<
int
>
(
0
));
op_desc
.
SetAttr
(
"enable_fp16"
,
enable_fp16
);
if
(
norm_before
)
{
op_desc
.
SetOutput
(
"out"
,
{
qkv_add_4_out
->
Name
()});
}
else
{
...
...
@@ -1186,6 +1216,9 @@ bool MultiEncoderXPUFusePass::ApplyMultiEncoderXPUFuse(ir::Graph* graph) const {
PADDLE_GET_CONST
(
int
,
single_encoders
[
0
]
->
Op
()
->
GetAttr
(
attr_name
)));
}
op_desc
.
SetAttr
(
"slice_idx"
,
static_cast
<
int
>
(
-
1
));
op_desc
.
SetAttr
(
"enable_fp16"
,
PADDLE_GET_CONST
(
bool
,
single_encoders
[
0
]
->
Op
()
->
GetAttr
(
"enable_fp16"
)));
op_desc
.
SetOutput
(
"out"
,
{
out_name
});
op_desc
.
SetOutput
(
"x_fp16"
,
{
x_fp16_name
});
op_desc
.
SetOutput
(
"out_fp16"
,
{
out_fp16_name
});
...
...
@@ -1213,6 +1246,61 @@ bool MultiEncoderXPUFusePass::ApplyMultiEncoderXPUFuse(ir::Graph* graph) const {
return
true
;
}
int
MultiEncoderXPUFusePass
::
CastMask
(
ir
::
Graph
*
graph
)
const
{
int
cast_counts
=
0
;
auto
nodes
=
graph
->
Nodes
();
for
(
auto
node
:
nodes
)
{
if
(
node
->
IsVar
())
continue
;
auto
op_desc
=
node
->
Op
();
if
(
node
->
IsVar
()
||
//
op_desc
->
Type
()
!=
"multi_encoder_xpu"
||
!
op_desc
->
GetAttrIfExists
<
bool
>
(
"enable_fp16"
)
||
op_desc
->
Inputs
().
count
(
"mask"
)
==
0
)
continue
;
auto
*
block
=
op_desc
->
Block
();
auto
*
scope
=
param_scope
();
// Find mask node
std
::
string
mask_name
=
op_desc
->
Inputs
().
at
(
"mask"
)[
0
];
Node
*
mask
=
nullptr
;
for
(
auto
*
in_node
:
node
->
inputs
)
{
if
(
in_node
->
Var
()
->
Name
()
==
mask_name
)
{
mask
=
in_node
;
break
;
}
}
// Create new_mask node/var/tensor
std
::
string
new_mask_name
=
mask_name
+
"_fp32"
;
VarDesc
new_mask_desc
(
new_mask_name
);
auto
*
new_mask
=
graph
->
CreateVarNode
(
&
new_mask_desc
);
block
->
Var
(
new_mask_name
);
scope
->
Var
(
new_mask_name
)
->
GetMutable
<
phi
::
DenseTensor
>
();
// Create cast op
framework
::
OpDesc
cast_op_desc
(
block
);
cast_op_desc
.
SetType
(
"cast"
);
cast_op_desc
.
SetInput
(
"X"
,
{
mask_name
});
cast_op_desc
.
SetAttr
(
"in_dtype"
,
static_cast
<
int
>
(
framework
::
proto
::
VarType
::
FP16
));
cast_op_desc
.
SetAttr
(
"out_dtype"
,
static_cast
<
int
>
(
framework
::
proto
::
VarType
::
FP32
));
cast_op_desc
.
SetOutput
(
"Out"
,
{
new_mask_name
});
auto
*
cast
=
graph
->
CreateOpNode
(
&
cast_op_desc
);
IR_NODE_LINK_TO
(
mask
,
cast
);
IR_NODE_LINK_TO
(
cast
,
new_mask
);
// Update encoder
op_desc
->
SetInput
(
"mask"
,
{
new_mask_name
});
IR_NODE_LINK_TO
(
new_mask
,
node
);
IR_NODE_UNLINK
(
node
,
mask
);
cast_counts
++
;
}
return
cast_counts
;
}
}
// namespace ir
}
// namespace framework
}
// namespace paddle
...
...
paddle/fluid/framework/ir/xpu/quant_utils.cc
浏览文件 @
f265a313
...
...
@@ -16,33 +16,92 @@
#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/assign_kernel.h"
#include "paddle/phi/kernels/cast_kernel.h"
#include "paddle/phi/kernels/transpose_kernel.h"
namespace
paddle
{
namespace
framework
{
namespace
ir
{
template
<
typename
T
>
void
Transpose2D
(
const
phi
::
DenseTensor
&
in
,
phi
::
DenseTensor
*
out
)
{
auto
in_dims
=
in
.
dims
();
void
Assign
(
const
phi
::
DenseTensor
&
in
,
phi
::
DenseTensor
*
out
)
{
auto
*
cpu_ctx
=
static_cast
<
phi
::
CPUContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
phi
::
CPUPlace
()));
out
->
Resize
(
in
.
dims
());
out
->
set_type
(
in
.
dtype
());
out
->
set_layout
(
in
.
layout
());
phi
::
AssignKernel
(
*
cpu_ctx
,
in
,
out
);
}
void
Transpose2D
(
phi
::
DenseTensor
*
in
,
phi
::
DenseTensor
*
out
)
{
auto
in_dims
=
in
->
dims
();
PADDLE_ENFORCE_EQ
(
in_dims
.
size
(),
2
,
platform
::
errors
::
InvalidArgument
(
"In dims rank should be 2, but received in dims size is [%d]."
,
in_dims
.
size
()));
out
->
Resize
({
in_dims
[
1
],
in_dims
[
0
]});
out
->
set_type
(
in
.
type
());
auto
*
dev_ctx
=
static_cast
<
phi
::
CPUContext
*>
(
phi
::
DenseTensor
trans_tensor
;
phi
::
DenseTensor
*
out_ptr
=
out
==
nullptr
?
&
trans_tensor
:
out
;
out_ptr
->
Resize
({
in_dims
[
1
],
in_dims
[
0
]});
out_ptr
->
set_type
(
in
->
type
());
out_ptr
->
set_layout
(
in
->
layout
());
auto
*
cpu_ctx
=
static_cast
<
phi
::
CPUContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
phi
::
CPUPlace
()));
dev_ctx
->
Alloc
<
T
>
(
out
);
std
::
vector
<
int
>
axis
{
1
,
0
};
phi
::
funcs
::
Transpose
<
phi
::
CPUContext
,
T
,
2
>
trans2d
;
trans2d
(
*
dev_ctx
,
in
,
out
,
axis
);
switch
(
in
->
dtype
())
{
case
phi
::
DataType
::
FLOAT16
:
phi
::
TransposeKernel
<
float16
>
(
*
cpu_ctx
,
*
in
,
axis
,
out_ptr
);
break
;
case
phi
::
DataType
::
FLOAT32
:
phi
::
TransposeKernel
<
float
>
(
*
cpu_ctx
,
*
in
,
axis
,
out_ptr
);
break
;
default:
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Only support fp16 and fp32, but received dtype is %s."
,
phi
::
DataTypeToString
(
in
->
dtype
())));
break
;
}
if
(
out
==
nullptr
)
{
Assign
(
*
out_ptr
,
in
);
}
}
template
void
Transpose2D
<
float
>(
const
phi
::
DenseTensor
&
in
,
phi
::
DenseTensor
*
out
);
void
CastToFp32
(
phi
::
DenseTensor
*
in
,
phi
::
DenseTensor
*
out
)
{
auto
*
cpu_ctx
=
static_cast
<
phi
::
CPUContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
phi
::
CPUPlace
()));
phi
::
DenseTensor
fp32_tensor
;
phi
::
DenseTensor
*
out_ptr
=
out
==
nullptr
?
&
fp32_tensor
:
out
;
out_ptr
->
Resize
(
in
->
dims
());
out_ptr
->
set_type
(
phi
::
DataType
::
FLOAT32
);
out_ptr
->
set_layout
(
in
->
layout
());
switch
(
in
->
dtype
())
{
case
phi
::
DataType
::
FLOAT16
:
phi
::
CastKernel
<
float16
>
(
*
cpu_ctx
,
*
in
,
phi
::
DataType
::
FLOAT32
,
out_ptr
);
break
;
case
phi
::
DataType
::
FLOAT32
:
if
(
out
==
nullptr
)
{
return
;
}
else
{
phi
::
AssignKernel
(
*
cpu_ctx
,
*
in
,
out_ptr
);
}
break
;
default:
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Only support fp16 and fp32, but received dtype is %s."
,
phi
::
DataTypeToString
(
in
->
dtype
())));
break
;
}
if
(
out
==
nullptr
)
{
Assign
(
*
out_ptr
,
in
);
}
}
static
float
FindMaxAbs
(
const
float
*
data
,
int
len
)
{
float
max_f
=
0.0
f
;
...
...
@@ -151,14 +210,15 @@ template <typename T>
void
QuantWeight
(
phi
::
DenseTensor
*
weight
,
phi
::
DenseTensor
*
weight_max
,
bool
transpose
)
{
// Convert fp16 to fp32
phi
::
DenseTensor
weight_fp32
;
CastToFp32
(
weight
,
&
weight_fp32
);
// Transpose
auto
*
weight_data
=
weight
->
data
<
float
>
();
phi
::
DenseTensor
weight_trans
;
if
(
transpose
)
{
Transpose2D
<
float
>
(
*
weight
,
&
weight_trans
);
weight_data
=
weight_trans
.
data
<
float
>
();
weight
->
Resize
(
weight_trans
.
dims
());
Transpose2D
(
&
weight_fp32
);
}
// Find max
paddle
::
platform
::
DeviceContextPool
&
pool
=
paddle
::
platform
::
DeviceContextPool
::
Instance
();
...
...
@@ -171,21 +231,22 @@ void QuantWeight(phi::DenseTensor* weight,
}
phi
::
XPUContext
*
xpu_ctx
=
static_cast
<
phi
::
XPUContext
*>
(
pool
.
Get
(
place
));
int
max_ptr_size
=
xpu_ctx
->
x_context
()
->
max_ptr_size
();
int
size
=
weight
->
numel
();
int
size
=
weight_fp32
.
numel
();
auto
*
weight_data
=
weight_fp32
.
data
<
float
>
();
float
max_val
=
FindMaxAbs
(
weight_data
,
size
);
std
::
vector
<
float
>
max_vec
(
max_ptr_size
,
max_val
);
weight_max
->
set_type
(
p
addle
::
experimental
::
CppTypeToDataType
<
float
>::
Type
()
);
weight_max
->
set_type
(
p
hi
::
DataType
::
FLOAT32
);
weight_max
->
Resize
({
max_ptr_size
});
auto
*
dev
_ctx
=
static_cast
<
phi
::
CPUContext
*>
(
auto
*
cpu
_ctx
=
static_cast
<
phi
::
CPUContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
phi
::
CPUPlace
()));
memcpy
(
dev
_ctx
->
Alloc
<
float
>
(
weight_max
),
memcpy
(
cpu
_ctx
->
Alloc
<
float
>
(
weight_max
),
max_vec
.
data
(),
max_ptr_size
*
sizeof
(
float
));
// Quant
std
::
vector
<
T
>
quant_data
(
size
);
QuantFP32ToIntX
(
weight_data
,
quant_data
.
data
(),
max_val
,
size
);
weight
->
set_type
(
paddle
::
experimental
::
CppTypeToDataType
<
T
>::
Type
());
memcpy
(
dev_ctx
->
Alloc
<
T
>
(
weight
),
quant_data
.
data
(),
size
*
sizeof
(
T
));
weight
->
Resize
(
weight_fp32
.
dims
());
QuantFP32ToIntX
(
weight_data
,
cpu_ctx
->
Alloc
<
T
>
(
weight
),
max_val
,
size
);
}
template
void
QuantWeight
<
int16_t
>(
phi
::
DenseTensor
*
weight
,
...
...
paddle/fluid/framework/ir/xpu/quant_utils.h
浏览文件 @
f265a313
...
...
@@ -19,8 +19,11 @@ namespace paddle {
namespace
framework
{
namespace
ir
{
template
<
typename
T
>
void
Transpose2D
(
const
phi
::
DenseTensor
&
in
,
phi
::
DenseTensor
*
out
);
void
Assign
(
const
phi
::
DenseTensor
&
in
,
phi
::
DenseTensor
*
out
);
void
Transpose2D
(
phi
::
DenseTensor
*
in
,
phi
::
DenseTensor
*
out
=
nullptr
);
void
CastToFp32
(
phi
::
DenseTensor
*
in
,
phi
::
DenseTensor
*
out
=
nullptr
);
// 1. Quant weight from fp32 to int16/int31
// 2. Weight data is in-place update.
...
...
paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc
浏览文件 @
f265a313
...
...
@@ -41,18 +41,31 @@ ConvertToMixedPrecisionPass::ConvertToMixedPrecisionPass(
backend_
(
backend
),
keep_io_types_
(
keep_io_types
),
black_list_
(
black_list
)
{
if
(
mixed_precision_
!=
phi
::
DataType
::
FLOAT16
&&
mixed_precision_
!=
phi
::
DataType
::
BFLOAT16
)
{
PADDLE_THROW
(
paddle
::
platform
::
errors
::
InvalidArgument
(
"mixed_precision currently not supported dtype %d, we now only "
"support fp16 and bf16."
,
static_cast
<
int
>
(
mixed_precision_
)));
}
if
(
backend_
!=
phi
::
Backend
::
GPU
&&
backend_
!=
phi
::
Backend
::
CUSTOM
)
{
PADDLE_THROW
(
paddle
::
platform
::
errors
::
InvalidArgument
(
"mixed_precision currently not supported place %d, we now only "
"support gpu and custom device ."
,
static_cast
<
int
>
(
backend_
)));
switch
(
backend_
)
{
case
phi
::
Backend
::
GPU
:
PADDLE_ENFORCE
(
mixed_precision_
==
phi
::
DataType
::
FLOAT16
||
mixed_precision_
==
phi
::
DataType
::
BFLOAT16
,
platform
::
errors
::
InvalidArgument
(
"mixed_precision of %s currently only supported fp16 "
"and bf16, not support %s."
,
experimental
::
BackendToString
(
backend_
),
phi
::
DataTypeToString
(
mixed_precision_
)));
break
;
case
phi
::
Backend
::
XPU
:
case
phi
::
Backend
::
CUSTOM
:
PADDLE_ENFORCE
(
mixed_precision_
==
phi
::
DataType
::
FLOAT16
,
platform
::
errors
::
InvalidArgument
(
"mixed_precision of %s currently only supported fp16 "
"and bf16, not support %s."
,
experimental
::
BackendToString
(
backend_
),
phi
::
DataTypeToString
(
mixed_precision_
)));
break
;
default:
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"mixed_precision currently not supported place GPU or XPU or CUSTOM, "
"not support %s."
,
experimental
::
BackendToString
(
backend_
)));
break
;
}
}
...
...
@@ -70,17 +83,16 @@ void ConvertToMixedPrecisionPass::Run() {
framework
::
ir
::
AutoMixedPrecisionPass
pass
;
pass
.
Set
(
"mixed_precision_mode"
,
new
int
{
static_cast
<
int
>
(
mixed_precision_
)});
pass
.
Set
(
"mixed_black_list"
,
new
std
::
unordered_set
<
std
::
string
>
{
black_list_
});
if
(
backend_
==
phi
::
Backend
::
GPU
)
{
pass
.
Set
(
"enable_gpu_mixed"
,
new
bool
{
true
});
pass
.
Set
(
"enable_custom_device_mixed"
,
new
bool
{
false
});
}
else
if
(
backend_
==
phi
::
Backend
::
XPU
)
{
pass
.
Set
(
"enable_xpu_mixed"
,
new
bool
{
true
});
}
else
if
(
backend_
==
phi
::
Backend
::
CUSTOM
)
{
pass
.
Set
(
"enable_gpu_mixed"
,
new
bool
{
false
});
pass
.
Set
(
"enable_custom_device_mixed"
,
new
bool
{
true
});
}
pass
.
Set
(
"mixed_black_list"
,
new
std
::
unordered_set
<
std
::
string
>
{
black_list_
});
pass
.
Set
(
"keep_io_types"
,
new
bool
{
keep_io_types_
});
pass
.
Apply
(
main_graph_
.
get
());
SaveMixedModel
();
...
...
paddle/fluid/inference/api/analysis_predictor.cc
浏览文件 @
f265a313
...
...
@@ -1302,18 +1302,23 @@ void AnalysisPredictor::PrepareArgument() {
<<
", we will use a new PassStrategy. Note that only the GPU "
"backend is supported for now."
;
if
(
!
config_
.
use_cinn_compiler_
)
{
pass_builder
->
ClearPasses
();
const
auto
&
deleted_passes
=
pass_builder
->
GetAllDeletedPasses
();
if
(
config_
.
tensorrt_engine_enabled
())
{
pass_builder
->
ClearPasses
();
for
(
const
auto
&
pass
:
kTrtLowerPrecisionPasses
)
{
if
(
deleted_passes
.
count
(
pass
))
continue
;
pass_builder
->
AppendPass
(
pass
);
}
}
else
if
(
config_
.
use_gpu
())
{
pass_builder
->
ClearPasses
();
for
(
const
auto
&
pass
:
kGpuLowerPrecisionPasses
)
{
if
(
deleted_passes
.
count
(
pass
))
continue
;
pass_builder
->
AppendPass
(
pass
);
}
}
else
if
(
config_
.
use_xpu
())
{
// All passes support fp16. Not reset pass_builder.
}
else
{
pass_builder
->
ClearPasses
();
}
}
}
...
...
paddle/fluid/inference/api/paddle_pass_builder.cc
浏览文件 @
f265a313
...
...
@@ -519,9 +519,9 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
"delete_dropout_op_pass"
,
"identity_scale_op_clean_pass"
,
"generate_sequence_xpu_fuse_pass"
,
"embedding_with_eltwise_add_xpu_fuse_pass"
,
"multi_encoder_xpu_fuse_pass"
,
"multi_encoder_xpu_slice_fuse_pass"
,
"embedding_with_eltwise_add_xpu_fuse_pass"
,
"fc_xpu_fuse_pass"
,
"link_xpu_op_max_pass"
,
});
...
...
paddle/phi/backends/xpu/xpu2_op_list.cc
浏览文件 @
f265a313
...
...
@@ -253,7 +253,8 @@ XPUOpMap& get_kl2_ops() {
phi
::
DataType
::
BOOL
,
phi
::
DataType
::
FLOAT16
,
phi
::
DataType
::
FLOAT32
})},
{
"fc_xpu"
,
XPUKernelSet
({
phi
::
DataType
::
FLOAT32
})},
{
"fc_xpu"
,
XPUKernelSet
({
phi
::
DataType
::
FLOAT32
,
phi
::
DataType
::
FLOAT16
})},
{
"fill"
,
XPUKernelSet
({
phi
::
DataType
::
INT64
,
phi
::
DataType
::
INT32
,
...
...
@@ -461,7 +462,8 @@ XPUOpMap& get_kl2_ops() {
phi
::
DataType
::
FLOAT16
,
phi
::
DataType
::
INT32
,
phi
::
DataType
::
INT64
})},
{
"multi_encoder_xpu"
,
XPUKernelSet
({
phi
::
DataType
::
FLOAT32
})},
{
"multi_encoder_xpu"
,
XPUKernelSet
({
phi
::
DataType
::
FLOAT32
,
phi
::
DataType
::
FLOAT16
})},
{
"nearest_interp_v2"
,
XPUKernelSet
({
phi
::
DataType
::
FLOAT32
})},
{
"nearest_interp_v2_grad"
,
XPUKernelSet
({
phi
::
DataType
::
FLOAT32
})},
{
"not_equal"
,
...
...
paddle/phi/common/backend.h
浏览文件 @
f265a313
...
...
@@ -210,6 +210,8 @@ inline std::string BackendToString(const Backend& backend) {
return
"KPS"
;
case
Backend
::
IPU
:
return
"IPU"
;
case
Backend
::
CUSTOM
:
return
"CUSTOM"
;
default:
{
size_t
device_type_id_
=
static_cast
<
size_t
>
(
backend
)
-
static_cast
<
size_t
>
(
Backend
::
NUM_BACKENDS
);
...
...
paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc
浏览文件 @
f265a313
...
...
@@ -33,44 +33,53 @@ void FcXPUKernel(const Context& ctx,
float
act_alpha
,
DenseTensor
*
out
,
DenseTensor
*
out_max
)
{
using
XPUType
=
typename
XPUTypeTrait
<
T
>::
Type
;
auto
in_mat_dims
=
flatten_to_2d
(
x
.
dims
(),
in_num_col_dims
);
int
m
=
in_mat_dims
[
0
];
int
k
=
in_mat_dims
[
1
];
int
n
=
w
.
dims
()[
0
];
auto
*
x_data
=
reinterpret_cast
<
const
XPUType
*>
(
x
.
data
<
T
>
());
const
float
*
x_max_data
=
x_max
.
get_ptr
()
==
nullptr
?
nullptr
:
x_max
.
get_ptr
()
->
data
<
float
>
();
const
float
*
bias_data
=
bias
.
get_ptr
()
==
nullptr
?
nullptr
:
bias
.
get_ptr
()
->
data
<
float
>
();
auto
*
out_data
=
reinterpret_cast
<
XPUType
*>
(
ctx
.
template
Alloc
<
T
>(
out
));
xpu
::
Activation_t
act
(
static_cast
<
xpu
::
Activation_t
::
act_enum
>
(
act_type
));
if
(
act_type
==
5
)
{
act
.
leaky_alpha
=
act_alpha
;
}
else
if
(
act_type
==
15
)
{
act
.
hard_sigmoid_slope
=
act_alpha
;
}
int
r
=
xpu
::
fc_fusion
<
T
,
int16_t
,
T
,
int16_t
>
(
// TX, TW. TY, TGEMM
ctx
.
x_context
(),
// ctx
x
.
data
<
T
>
(),
// x
w
.
data
<
int16_t
>
(),
// w
ctx
.
template
Alloc
<
T
>(
out
),
// y
m
,
// m
n
,
// n
k
,
// k
transpose_x
,
// x_trans
true
,
// w_trans
x_max_data
,
// x_maxptr
w_max
.
data
<
float
>
(),
// w_maxptr
ctx
.
template
Alloc
<
float
>(
out_max
),
// y_maxptr
transpose_x
?
m
:
k
,
// ldx
k
,
// ldw
n
,
// ldy
alpha
,
// alpha
beta
,
// beta
bias_data
,
// bias
act
);
int
r
=
xpu
::
fc_fusion
<
XPUType
,
int16_t
,
XPUType
,
int16_t
>
(
// TX, TW. TY, TGEMM
ctx
.
x_context
(),
// ctx
x_data
,
// x
w
.
data
<
int16_t
>
(),
// w
out_data
,
// y
m
,
// m
n
,
// n
k
,
// k
transpose_x
,
// x_trans
true
,
// w_trans
x_max_data
,
// x_maxptr
w_max
.
data
<
float
>
(),
// w_maxptr
ctx
.
template
Alloc
<
float
>(
out_max
),
// y_maxptr
transpose_x
?
m
:
k
,
// ldx
k
,
// ldw
n
,
// ldy
alpha
,
// alpha
beta
,
// beta
bias_data
,
// bias
act
);
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"fc_xpu"
);
}
}
// namespace fusion
}
// namespace phi
PD_REGISTER_KERNEL
(
fc_xpu
,
XPU
,
ALL_LAYOUT
,
phi
::
fusion
::
FcXPUKernel
,
float
)
{}
PD_REGISTER_KERNEL
(
fc_xpu
,
XPU
,
ALL_LAYOUT
,
phi
::
fusion
::
FcXPUKernel
,
float
,
phi
::
dtype
::
float16
)
{}
paddle/phi/kernels/fusion/xpu/multi_encoder_xpu_kernel.cc
浏览文件 @
f265a313
...
...
@@ -40,18 +40,26 @@ void MultiEncoderXPUKernel(const Context& ctx,
DenseTensor
*
out
,
DenseTensor
*
x_fp16
,
DenseTensor
*
out_fp16
)
{
using
float16
=
typename
XPUTypeTrait
<
phi
::
dtype
::
float16
>::
Type
;
// XPU2 only support fp16 input/output.
float16
*
x_fp16_data
=
reinterpret_cast
<
float16
*>
(
ctx
.
template
Alloc
<
phi
::
dtype
::
float16
>(
x_fp16
));
int
r_cast_x
=
xpu
::
cast_v2
<
float
,
float16
>
(
ctx
.
x_context
(),
x
.
data
<
T
>
(),
x_fp16_data
,
x
.
numel
());
PADDLE_ENFORCE_XDNN_SUCCESS
(
r_cast_x
,
"multi_encoder_xpu(cast x from fp32 to fp16)"
);
float16
*
out_fp16_data
=
reinterpret_cast
<
float16
*>
(
ctx
.
template
Alloc
<
phi
::
dtype
::
float16
>(
out_fp16
));
auto
x_dtype
=
x
.
dtype
();
const
float16
*
x_fp16_data
=
nullptr
;
float16
*
out_fp16_data
=
nullptr
;
if
(
x_dtype
==
phi
::
DataType
::
FLOAT32
)
{
auto
*
x_fp16_data_t
=
reinterpret_cast
<
float16
*>
(
ctx
.
template
Alloc
<
phi
::
dtype
::
float16
>(
x_fp16
));
int
r_cast_x
=
xpu
::
cast_v2
<
float
,
float16
>
(
ctx
.
x_context
(),
x
.
data
<
float
>
(),
x_fp16_data_t
,
x
.
numel
());
PADDLE_ENFORCE_XDNN_SUCCESS
(
r_cast_x
,
"multi_encoder_xpu(cast x from fp32 to fp16)"
);
x_fp16_data
=
x_fp16_data_t
;
out_fp16_data
=
reinterpret_cast
<
float16
*>
(
ctx
.
template
Alloc
<
phi
::
dtype
::
float16
>(
out_fp16
));
}
else
{
x_fp16_data
=
reinterpret_cast
<
const
float16
*>
(
x
.
data
<
phi
::
dtype
::
float16
>
());
out_fp16_data
=
reinterpret_cast
<
float16
*>
(
ctx
.
template
Alloc
<
phi
::
dtype
::
float16
>(
out
));
}
// q,k,v weight are fused.
// Each encoder's weight should be: w0, null, null, w3, w4, w5
...
...
@@ -78,8 +86,8 @@ void MultiEncoderXPUKernel(const Context& ctx,
ln_scale_data
.
push_back
(
ln_scale
[
i
]
->
data
<
float
>
());
ln_bias_data
.
push_back
(
ln_bias
[
i
]
->
data
<
float
>
());
}
const
T
*
mask_data
=
mask
.
get_ptr
()
==
nullptr
?
nullptr
:
mask
.
get_ptr
()
->
data
<
T
>
();
const
float
*
mask_data
=
mask
.
get_ptr
()
==
nullptr
?
nullptr
:
mask
.
get_ptr
()
->
data
<
float
>
();
xpu
::
Activation_t
qkv_act
(
static_cast
<
xpu
::
Activation_t
::
act_enum
>
(
act_type
));
int
batch
=
x
.
dims
()[
0
];
...
...
@@ -152,10 +160,15 @@ void MultiEncoderXPUKernel(const Context& ctx,
PADDLE_ENFORCE_XDNN_SUCCESS
(
r
,
"multi_encoder_xpu"
);
}
int
r_cast_out
=
xpu
::
cast_v2
<
float16
,
float
>
(
ctx
.
x_context
(),
out_fp16_data
,
ctx
.
template
Alloc
<
T
>(
out
),
out
->
numel
());
PADDLE_ENFORCE_XDNN_SUCCESS
(
r_cast_out
,
"multi_encoder_xpu(cast out from fp16 to fp32)"
);
if
(
x_dtype
==
phi
::
DataType
::
FLOAT32
)
{
int
r_cast_out
=
xpu
::
cast_v2
<
float16
,
float
>
(
ctx
.
x_context
(),
out_fp16_data
,
ctx
.
template
Alloc
<
float
>(
out
),
out
->
numel
());
PADDLE_ENFORCE_XDNN_SUCCESS
(
r_cast_out
,
"multi_encoder_xpu(cast out from fp16 to fp32)"
);
}
}
}
// namespace fusion
...
...
@@ -165,4 +178,5 @@ PD_REGISTER_KERNEL(multi_encoder_xpu,
XPU
,
ALL_LAYOUT
,
phi
::
fusion
::
MultiEncoderXPUKernel
,
float
)
{}
float
,
phi
::
dtype
::
float16
)
{}
python/paddle/fluid/tests/unittests/ir/inference/test_xpu_convert_mixed_precision.py
0 → 100644
浏览文件 @
f265a313
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
os
import
tempfile
import
unittest
import
paddle
from
paddle.inference
import
(
PlaceType
,
PrecisionType
,
convert_to_mixed_precision
,
)
from
paddle.jit
import
to_static
from
paddle.static
import
InputSpec
from
paddle.vision.models
import
resnet50
class
ConvertMixedPrecison
(
unittest
.
TestCase
):
def
test
(
self
):
self
.
temp_dir
=
tempfile
.
TemporaryDirectory
()
model
=
resnet50
(
True
)
net
=
to_static
(
model
,
input_spec
=
[
InputSpec
(
shape
=
[
None
,
3
,
224
,
224
],
name
=
'x'
)]
)
paddle
.
jit
.
save
(
net
,
os
.
path
.
join
(
self
.
temp_dir
.
name
,
'resnet50/inference'
)
)
convert_to_mixed_precision
(
os
.
path
.
join
(
self
.
temp_dir
.
name
,
'resnet50/inference.pdmodel'
),
os
.
path
.
join
(
self
.
temp_dir
.
name
,
'resnet50/inference.pdiparams'
),
os
.
path
.
join
(
self
.
temp_dir
.
name
,
'mixed_precision/inference.pdmodel'
),
os
.
path
.
join
(
self
.
temp_dir
.
name
,
'mixed_precision/inference.pdiparams'
),
backend
=
PlaceType
.
XPU
,
mixed_precision
=
PrecisionType
.
Half
,
)
self
.
temp_dir
.
cleanup
()
if
__name__
==
"__main__"
:
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录