PaddlePaddle/Paddle, commit 1c97aa69 (unverified)

xpu quant weight only (#53306)

Authored by zhupengyang on Apr 27, 2023; committed via GitHub on Apr 27, 2023.
Parent commit: 2d17df97
Showing 9 changed files with 113 additions and 30 deletions (+113, -30).
paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_quant_pass.cc (+35, -30)
paddle/fluid/framework/ir/xpu/pass_utils.cc (+7, -0)
paddle/fluid/framework/ir/xpu/quant_utils.cc (+13, -0)
paddle/fluid/inference/analysis/argument.h (+6, -0)
paddle/fluid/inference/analysis/ir_pass_manager.cc (+8, -0)
paddle/fluid/inference/api/analysis_config.cc (+21, -0)
paddle/fluid/inference/api/analysis_predictor.cc (+4, -0)
paddle/fluid/inference/api/paddle_analysis_config.h (+14, -0)
paddle/fluid/pybind/inference_api.cc (+5, -0)
paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_quant_pass.cc

@@ -280,6 +280,8 @@ int FusedMultiTransformerXPUQuantPass::ApplyImpl(ir::Graph* graph,
                                               with_time_step,
                                               with_seq_lengths,
                                               with_src_mask);
+  int quant_weight_bits =
+      Has("quant_weight_bits") ? Get<int>("quant_weight_bits") : -1;
 
   int found_subgraph_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
@@ -312,36 +314,39 @@ int FusedMultiTransformerXPUQuantPass::ApplyImpl(ir::Graph* graph,
     // quant weight nodes
     // w_nodes_vec: [QKVW, OutLinearW, FFN1Weight, FFN2Weight]
     std::vector<std::vector<Node*>> w_nodes_vec(4);
-    std::vector<std::vector<Node*>> w_int16_nodes_vec(4);
+    std::vector<std::vector<Node*>> w_intx_nodes_vec(4);
     std::vector<std::vector<Node*>> w_max_nodes_vec(4);
-    std::vector<std::vector<std::string>> w_int16_names_vec(4);
+    std::vector<std::vector<std::string>> w_intx_names_vec(4);
     std::vector<std::vector<std::string>> w_max_names_vec(4);
     auto quant_func = [&](const std::string& input_name,
                           std::vector<Node*>* w_nodes,
-                          std::vector<Node*>* w_int16_nodes,
+                          std::vector<Node*>* w_intx_nodes,
                           std::vector<Node*>* w_max_nodes,
-                          std::vector<std::string>* w_int16_names,
+                          std::vector<std::string>* w_intx_names,
                           std::vector<std::string>* w_max_names,
                           bool need_transpose) {
-      typedef int16_t TW;
       auto w_names = fused_mt->Op()->Input(input_name);
       for (auto w_name : w_names) {
         Node* w_node = FindNodeWithName(graph, w_name);
-        Node* w_int16 = nullptr;
+        Node* w_intx = nullptr;
         Node* w_max = nullptr;
         PADDLE_ENFORCE_NE(
             w_node,
             nullptr,
             platform::errors::Fatal("w node should not be nullptr"));
-        PrepareWeight<TW>(
-            graph, scope, block, w_node, &w_int16, &w_max, need_transpose);
+        if (quant_weight_bits == 8) {
+          PrepareWeight<int8_t>(
+              graph, scope, block, w_node, &w_intx, &w_max, need_transpose);
+        } else {
+          PrepareWeight<int16_t>(
+              graph, scope, block, w_node, &w_intx, &w_max, need_transpose);
+        }
         w_nodes->push_back(w_node);
-        w_int16_nodes->push_back(w_int16);
+        w_intx_nodes->push_back(w_intx);
         w_max_nodes->push_back(w_max);
       }
       for (size_t i = 0; i < w_names.size(); ++i) {
-        w_int16_names->push_back(w_int16_nodes->at(i)->Name());
+        w_intx_names->push_back(w_intx_nodes->at(i)->Name());
         w_max_names->push_back(w_max_nodes->at(i)->Name());
       }
       PADDLE_ENFORCE_EQ(
@@ -353,11 +358,11 @@ int FusedMultiTransformerXPUQuantPass::ApplyImpl(ir::Graph* graph,
               static_cast<int>(w_nodes->size())));
       PADDLE_ENFORCE_EQ(
           w_names.size(),
-          w_int16_nodes->size(),
+          w_intx_nodes->size(),
           platform::errors::Fatal(
-              "The size of w_names(%d) should be equal to w_int16_nodes(%d)",
+              "The size of w_names(%d) should be equal to w_intx_nodes(%d)",
              static_cast<int>(w_names.size()),
-              static_cast<int>(w_int16_nodes->size())));
+              static_cast<int>(w_intx_nodes->size())));
       PADDLE_ENFORCE_EQ(
           w_names.size(),
           w_max_nodes->size(),
@@ -367,11 +372,11 @@ int FusedMultiTransformerXPUQuantPass::ApplyImpl(ir::Graph* graph,
               static_cast<int>(w_max_nodes->size())));
       PADDLE_ENFORCE_EQ(
           w_names.size(),
-          w_int16_names->size(),
+          w_intx_names->size(),
           platform::errors::Fatal(
-              "The size of w_names(%d) should be equal to w_int16_names(%d)",
+              "The size of w_names(%d) should be equal to w_intx_names(%d)",
              static_cast<int>(w_names.size()),
-              static_cast<int>(w_int16_names->size())));
+              static_cast<int>(w_intx_names->size())));
       PADDLE_ENFORCE_EQ(
           w_names.size(),
           w_max_names->size(),
@@ -382,30 +387,30 @@ int FusedMultiTransformerXPUQuantPass::ApplyImpl(ir::Graph* graph,
     };
     quant_func("QKVW",
                &(w_nodes_vec[0]),
-               &(w_int16_nodes_vec[0]),
+               &(w_intx_nodes_vec[0]),
                &(w_max_nodes_vec[0]),
-               &(w_int16_names_vec[0]),
+               &(w_intx_names_vec[0]),
                &(w_max_names_vec[0]),
                false);
     quant_func("OutLinearW",
                &(w_nodes_vec[1]),
-               &(w_int16_nodes_vec[1]),
+               &(w_intx_nodes_vec[1]),
                &(w_max_nodes_vec[1]),
-               &(w_int16_names_vec[1]),
+               &(w_intx_names_vec[1]),
                &(w_max_names_vec[1]),
                true);
     quant_func("FFN1Weight",
                &(w_nodes_vec[2]),
-               &(w_int16_nodes_vec[2]),
+               &(w_intx_nodes_vec[2]),
                &(w_max_nodes_vec[2]),
-               &(w_int16_names_vec[2]),
+               &(w_intx_names_vec[2]),
                &(w_max_names_vec[2]),
                true);
     quant_func("FFN2Weight",
                &(w_nodes_vec[3]),
-               &(w_int16_nodes_vec[3]),
+               &(w_intx_nodes_vec[3]),
                &(w_max_nodes_vec[3]),
-               &(w_int16_names_vec[3]),
+               &(w_intx_names_vec[3]),
                &(w_max_names_vec[3]),
                true);
@@ -482,13 +487,13 @@ int FusedMultiTransformerXPUQuantPass::ApplyImpl(ir::Graph* graph,
                                      name_caches.at("CacheKVOut"));
     fused_mt_xpu_op_desc->SetOutput("out", name_caches.at("Out"));
-    fused_mt_xpu_op_desc->SetInput("qkvw", w_int16_names_vec[0]);
+    fused_mt_xpu_op_desc->SetInput("qkvw", w_intx_names_vec[0]);
     fused_mt_xpu_op_desc->SetInput("qkvw_max", w_max_names_vec[0]);
-    fused_mt_xpu_op_desc->SetInput("out_linear_w", w_int16_names_vec[1]);
+    fused_mt_xpu_op_desc->SetInput("out_linear_w", w_intx_names_vec[1]);
     fused_mt_xpu_op_desc->SetInput("out_linear_wmax", w_max_names_vec[1]);
-    fused_mt_xpu_op_desc->SetInput("ffn1_weight", w_int16_names_vec[2]);
+    fused_mt_xpu_op_desc->SetInput("ffn1_weight", w_intx_names_vec[2]);
     fused_mt_xpu_op_desc->SetInput("ffn1_weight_max", w_max_names_vec[2]);
-    fused_mt_xpu_op_desc->SetInput("ffn2_weight", w_int16_names_vec[3]);
+    fused_mt_xpu_op_desc->SetInput("ffn2_weight", w_intx_names_vec[3]);
     fused_mt_xpu_op_desc->SetInput("ffn2_weight_max", w_max_names_vec[3]);
     if (!fused_mt_xpu_op_desc->HasAttr("rotary_emb_dims")) {
       fused_mt_xpu_op_desc->SetAttr("rotary_emb_dims", 0);
@@ -501,7 +506,7 @@ int FusedMultiTransformerXPUQuantPass::ApplyImpl(ir::Graph* graph,
     }
     // link int16 format of QKVW/OutLinearW/FFN1Weight/FFN2Weight to
     // fused_mt_xpu
-    for (auto nodes : w_int16_nodes_vec) {
+    for (auto nodes : w_intx_nodes_vec) {
       for (auto node : nodes) {
         IR_NODE_LINK_TO(node, fused_mt);
       }
paddle/fluid/framework/ir/xpu/pass_utils.cc

@@ -193,6 +193,13 @@ template void PrepareWeight<int16_t>(Graph* graph,
                                      Node** dst,
                                      Node** dst_max,
                                      bool transpose);
+template void PrepareWeight<int8_t>(Graph* graph,
+                                    Scope* scope,
+                                    BlockDesc* block,
+                                    Node* src,
+                                    Node** dst,
+                                    Node** dst_max,
+                                    bool transpose);
 
 void PrepareBias(
     Graph* graph, Scope* scope, BlockDesc* block, Node* src, Node** dst) {
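The hunk above adds an explicit instantiation: PrepareWeight is a template whose definition lives in pass_utils.cc, so each element type used from other translation units has to be instantiated there by hand. A minimal standalone sketch of that pattern, with illustrative names rather than Paddle's:

// quantize.h: declaration only; the definition is hidden in the .cc file.
#include <cstdint>
template <typename T>
void Quantize(const float* src, T* dst, int numel);

// quantize.cc: definition plus explicit instantiations, mirroring the
// int16_t/int8_t pair in pass_utils.cc above.
template <typename T>
void Quantize(const float* src, T* dst, int numel) {
  for (int i = 0; i < numel; ++i) dst[i] = static_cast<T>(src[i]);
}
template void Quantize<int16_t>(const float*, int16_t*, int);
template void Quantize<int8_t>(const float*, int8_t*, int);  // the newly added type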
paddle/fluid/framework/ir/xpu/quant_utils.cc

@@ -207,6 +207,16 @@ void QuantFP32ToIntX<int16_t>(const float* src_ptr,
   }
 }
 
+template <>
+void QuantFP32ToIntX<int8_t>(const float* src_ptr,
+                             int8_t* dst_ptr,
+                             float max_val,
+                             int numel) {
+  for (int i = 0; i < numel; i++) {
+    dst_ptr[i] = Fp32ToIntx<int8_t, 127>(src_ptr[i], max_val);
+  }
+}
+
 template <typename T>
 void PrepareWeight(phi::DenseTensor* weight,
                    phi::DenseTensor* weight_max,
@@ -253,6 +263,9 @@ void PrepareWeight(phi::DenseTensor* weight,
 template void PrepareWeight<int16_t>(phi::DenseTensor* weight,
                                      phi::DenseTensor* weight_max,
                                      bool transpose);
+template void PrepareWeight<int8_t>(phi::DenseTensor* weight,
+                                    phi::DenseTensor* weight_max,
+                                    bool transpose);
 
 }  // namespace ir
 }  // namespace framework
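Fp32ToIntx<int8_t, 127> itself is not part of this diff; the template argument 127 suggests standard symmetric max-abs quantization onto the int8 range. A standalone sketch of that scheme, offered as an assumption about the helper rather than its actual source:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Symmetric max-abs quantization: scale by 127/max_val, round, then clamp.
int8_t QuantToInt8(float v, float max_val) {
  float scaled = std::round(v * 127.0f / max_val);
  scaled = std::min(127.0f, std::max(-127.0f, scaled));
  return static_cast<int8_t>(scaled);
}

// Example: with max_val = 2.0f, QuantToInt8(1.0f, 2.0f) == 64 and
// QuantToInt8(-2.0f, 2.0f) == -127.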
paddle/fluid/inference/analysis/argument.h

@@ -289,6 +289,12 @@ struct Argument {
   DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool);
   DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int);
   DECL_ARGUMENT_FIELD(xpu_enable_multi_stream, XpuEnableMultiStream, bool);
+  DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_weight_bits,
+                      XpuQuantPostDynamicWeightBits,
+                      int);
+  DECL_ARGUMENT_FIELD(xpu_quant_post_dynamic_op_types,
+                      XpuQuantPostDynamicOpTypss,
+                      std::vector<std::string>);
 
   DECL_ARGUMENT_FIELD(use_opencl, UseOpenCL, bool);
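DECL_ARGUMENT_FIELD is an existing Paddle macro; its exact expansion is not shown in this diff, but it generates the member plus accessor/setter pair that the rest of the commit relies on (xpu_quant_post_dynamic_op_types() in ir_pass_manager.cc, SetXpuQuantPostDynamicWeightBits() in analysis_predictor.cc). A hypothetical illustration of that macro pattern, not Paddle's actual definition:

#include <string>
#include <vector>

// Hypothetical accessor-generating macro, for illustration only.
#define DECL_FIELD(field, Field, Type)                   \
 private:                                                \
  Type field##_;                                         \
                                                         \
 public:                                                 \
  const Type& field() const { return field##_; }         \
  void Set##Field(const Type& v) { field##_ = v; }

struct DemoArgument {
  DECL_FIELD(xpu_quant_post_dynamic_weight_bits, XpuQuantPostDynamicWeightBits, int)
  DECL_FIELD(xpu_quant_post_dynamic_op_types,
             XpuQuantPostDynamicOpTypss,  // identifier spelling preserved from argument.h
             std::vector<std::string>)
};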
paddle/fluid/inference/analysis/ir_pass_manager.cc

@@ -308,6 +308,14 @@ void IRPassManager::CreatePasses(Argument *argument,
       }
       bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding();
       pass->Set("use_fc_padding", new bool(use_fc_padding));
+    } else if (pass_name == "fused_multi_transformer_xpu_quant_pass") {
+      auto op_types = argument->xpu_quant_post_dynamic_op_types();
+      if (std::count(op_types.begin(),
+                     op_types.end(),
+                     "fused_multi_transformer") > 0) {
+        pass->Set("quant_weight_bits",
+                  new int(argument->xpu_quant_post_dynamic_weight_bits()));
+      }
     }
 
     pre_pass = pass_name;
paddle/fluid/inference/api/analysis_config.cc

@@ -196,6 +196,14 @@ void AnalysisConfig::SetXpuDeviceId(int device_id) {
   Update();
 }
 
+void AnalysisConfig::SetXpuConfig(
+    int quant_post_dynamic_weight_bits,
+    const std::vector<std::string>& quant_post_dynamic_op_types) {
+  xpu_quant_post_dynamic_weight_bits_ = quant_post_dynamic_weight_bits;
+  xpu_quant_post_dynamic_op_types_ = quant_post_dynamic_op_types;
+  Update();
+}
+
 void AnalysisConfig::EnableCustomDevice(const std::string& device_type,
                                         int device_id,
                                         Precision precision_mode) {
@@ -489,6 +497,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(xpu_precision_);
   CP_MEMBER(xpu_adaptive_seqlen_);
   CP_MEMBER(xpu_enable_multi_stream_);
+  CP_MEMBER(xpu_quant_post_dynamic_weight_bits_);
+  CP_MEMBER(xpu_quant_post_dynamic_op_types_);
 
   // Lite OpenCL Related
   CP_MEMBER(use_opencl_);
@@ -1091,6 +1101,10 @@ std::string AnalysisConfig::SerializeInfoCache() {
   ss << xpu_precision_;
   ss << xpu_adaptive_seqlen_;
   ss << xpu_enable_multi_stream_;
+  ss << xpu_quant_post_dynamic_weight_bits_;
+  for (auto op_type : xpu_quant_post_dynamic_op_types_) {
+    ss << op_type;
+  }
 
   ss << use_npu_;
   ss << npu_device_id_;
@@ -1331,6 +1345,13 @@ std::string AnalysisConfig::Summary() {
     os.InsertRow({"xpu_device_id", std::to_string(xpu_device_id_)});
     os.InsertRow(
         {"xpu_l3_workspace_size", std::to_string(xpu_l3_workspace_size_)});
+    os.InsertRow({"xpu_quant_post_dynamic_weight_bits",
+                  std::to_string(xpu_quant_post_dynamic_weight_bits_)});
+    std::vector<std::string> op_types{"xpu_quant_post_dynamic_op_types"};
+    for (auto op_type : xpu_quant_post_dynamic_op_types_) {
+      op_types.push_back(op_type);
+    }
+    os.InsertRow(op_types);
   }
   os.InsetDivider();
paddle/fluid/inference/api/analysis_predictor.cc

@@ -1426,6 +1426,10 @@ void AnalysisPredictor::PrepareArgument() {
     argument_->SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_);
     argument_->SetXpuDeviceId(config_.xpu_device_id_);
     argument_->SetXpuEnableMultiStream(config_.xpu_enable_multi_stream_);
+    argument_->SetXpuQuantPostDynamicWeightBits(
+        config_.xpu_quant_post_dynamic_weight_bits_);
+    argument_->SetXpuQuantPostDynamicOpTypss(
+        config_.xpu_quant_post_dynamic_op_types_);
 #endif
 
   auto *pass_builder = config_.pass_builder();
paddle/fluid/inference/api/paddle_analysis_config.h

@@ -288,6 +288,18 @@ struct PD_INFER_DECL AnalysisConfig {
                  bool adaptive_seqlen = false,
                  bool enable_multi_stream = false);
 
+  ///
+  /// \brief configs of XPU
+  ///
+  /// \param quant_post_dynamic_weight_bits Weight bits used in dynamic post
+  /// quantization. Optional values: -1, 8, 16. The default is -1, which means
+  /// using the recommended way.
+  /// \param quant_post_dynamic_op_types Ops used in dynamic post quantization.
+  ///
+  void SetXpuConfig(
+      int quant_post_dynamic_weight_bits = -1,
+      const std::vector<std::string>& quant_post_dynamic_op_types = {});
+
   ///
   /// \brief configs of IPU
   ///
@@ -1181,6 +1193,8 @@ struct PD_INFER_DECL AnalysisConfig {
   std::string xpu_precision_;
   bool xpu_adaptive_seqlen_;
   bool xpu_enable_multi_stream_;
+  int xpu_quant_post_dynamic_weight_bits_{-1};
+  std::vector<std::string> xpu_quant_post_dynamic_op_types_;
 
   // LITE OPENCL SETTINGS
   bool use_opencl_{false};
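For orientation, a hedged usage sketch of the new option from application code. Config, EnableXpu, and CreatePredictor are pre-existing Paddle Inference APIs; the model directory is a placeholder:

#include "paddle_inference_api.h"

int main() {
  paddle_infer::Config config("./model_dir");  // placeholder model directory
  config.EnableXpu();
  // -1 (default) keeps the recommended behavior; 8 requests int8 weight-only
  // quantization for the listed op types, 16 requests int16.
  config.SetXpuConfig(8, {"fused_multi_transformer"});
  auto predictor = paddle_infer::CreatePredictor(config);
  return 0;
}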
paddle/fluid/pybind/inference_api.cc

@@ -767,6 +767,11 @@ void BindAnalysisConfig(py::module *m) {
       .def("set_xpu_device_id",
            &AnalysisConfig::SetXpuDeviceId,
            py::arg("device_id") = 0)
+      .def("set_xpu_config",
+           &AnalysisConfig::SetXpuConfig,
+           py::arg("quant_post_dynamic_weight_bits") = -1,
+           py::arg("quant_post_dynamic_op_types") =
+               std::vector<std::string>({}))
       .def("enable_custom_device",
            &AnalysisConfig::EnableCustomDevice,
            py::arg("device_type"),