Unverified commit 174726fc

Authored on Nov 28, 2022 by caozhou; committed via GitHub on Nov 28, 2022.

[Auto Parallel]Add pattern for auto search (#48316)

* add pattern for auto search
* add unittest

Parent: e7d459ac
Showing 4 changed files with 274 additions and 1 deletion (+274 −1):

  python/paddle/distributed/auto_parallel/graph.py                    +3    −0
  python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py   +137  −0
  python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt    +1    −1
  python/paddle/fluid/tests/unittests/auto_parallel/test_pattern.py   +133  −0
python/paddle/distributed/auto_parallel/graph.py

@@ -123,6 +123,8 @@ class Graph:

```python
        else:
            self._nodes[node_id].attrs.update(attrs)

        return self._nodes[node_id]

    def add_edge(self, src_id, tgt_id, **attrs):
        # add nodes
        if src_id is None:
```

@@ -140,6 +142,7 @@ class Graph:

```python
        # add the edge
        edge = Edge(src_id, tgt_id, **attrs)
        self._adjs[src_id][tgt_id] = edge
        return edge

    def __len__(self):
        return len(self._nodes)
```
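With this change, `add_node` and `add_edge` return the node and edge they create (or update), so callers can keep a handle and attach attributes afterwards; the new tuner code below relies on exactly that. A minimal usage sketch (not part of the commit, assuming the module path as of this revision):

```python
from paddle.distributed.auto_parallel.graph import Graph

graph = Graph()
src = graph.add_node(0, **{"type": "var"})        # returned Node handle
dst = graph.add_node(1, **{"type": "matmul_v2"})  # returned Node handle
edge = graph.add_edge(src.id, dst.id)             # returned Edge handle
edge.attrs["input_name"] = "X"                    # attach attrs after creation
print(len(graph))                                 # 2 nodes
```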
python/paddle/distributed/auto_parallel/tuner/rule_based_tuner.py

@@ -12,6 +12,143 @@

```python
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod

from ..graph import Graph

_PATTERNS = {}


def register_pattern(cls):
    """Register pattern for rule-based tuner."""
    name = cls.name

    def register(name):
        global _PATTERNS
        _PATTERNS[name] = cls()

    register(name)

    return cls


def convert_to_graph(ops, block):
    """Convert ops to graph."""
    graph = Graph()
    graph.attrs["var_to_id"] = {}  # {var_name: node_id}
    graph.attrs["id_to_var"] = {}  # {node_id: var_name}
    graph.attrs["op_to_id"] = {}  # {op_id: node_id}
    graph.attrs["id_to_op"] = {}  # {node_id: op_id}

    node_id = -1
    for op in ops:
        attrs = op.all_attrs()
        attrs["type"] = op.type
        node_id += 1
        # create op node
        op_node = graph.add_node(node_id, **attrs)
        graph.attrs["op_to_id"][op.desc.id()] = op_node.id
        graph.attrs["id_to_op"][op_node.id] = op.desc.id()

        for input_name in op.input_names:
            for var_name in op.input(input_name):
                if var_name not in graph.attrs["var_to_id"]:
                    # create var node
                    node_id += 1
                    var_node = graph.add_node(node_id)
                    var = block._var_recursive(var_name)
                    if var.is_parameter:
                        var_node.attrs["type"] = "param"
                    else:
                        var_node.attrs["type"] = "var"
                    graph.attrs["var_to_id"][var_name] = var_node.id
                    graph.attrs["id_to_var"][var_node.id] = var_name
                else:
                    var_node_id = graph.attrs["var_to_id"][var_name]
                    var_node = graph._nodes[var_node_id]

                # create edge that input -> op
                input_edge = graph.add_edge(var_node.id, op_node.id)
                input_edge.attrs["input_name"] = input_name

        for output_name in op.output_names:
            for var_name in op.output(output_name):
                if var_name not in graph.attrs["var_to_id"]:
                    # create var node
                    node_id += 1
                    var_node = graph.add_node(node_id)
                    var = block._var_recursive(var_name)
                    if var.is_parameter:
                        var_node.attrs["type"] = "param"
                    else:
                        var_node.attrs["type"] = "var"
                    graph.attrs["var_to_id"][var_name] = var_node.id
                    graph.attrs["id_to_var"][var_node.id] = var_name
                else:
                    var_node_id = graph.attrs["var_to_id"][var_name]
                    var_node = graph._nodes[var_node_id]

                # create edge that op -> output
                output_edge = graph.add_edge(op_node.id, var_node.id)
                output_edge.attrs["output_name"] = output_name

    return graph


class BasePattern(ABC):
    name = "base"

    def __init__(self):
        self.graph = None
        self.build()

    @abstractmethod
    def build(self):
        pass


@register_pattern
class QKVPattern(BasePattern):
    name = "qkv"

    def __init__(self):
        super().__init__()

    def build(self):
        self.graph = Graph()

        query = self.graph.add_node(0, **{"type": "var"})

        q_weight = self.graph.add_node(1, **{"dim": 2, "type": "param"})
        k_weight = self.graph.add_node(2, **{"dim": 2, "type": "param"})
        v_weight = self.graph.add_node(3, **{"dim": 2, "type": "param"})

        q_matmul = self.graph.add_node(4, **{"type": "matmul_v2"})
        k_matmul = self.graph.add_node(5, **{"type": "matmul_v2"})
        v_matmul = self.graph.add_node(6, **{"type": "matmul_v2"})

        q_x = self.graph.add_edge(0, 4, **{"input_name": "X"})
        k_x = self.graph.add_edge(0, 5, **{"input_name": "X"})
        v_x = self.graph.add_edge(0, 6, **{"input_name": "X"})
        q_y = self.graph.add_edge(1, 4, **{"input_name": "Y"})
        k_y = self.graph.add_edge(2, 5, **{"input_name": "Y"})
        v_y = self.graph.add_edge(3, 6, **{"input_name": "Y"})

        q = self.graph.add_node(7, **{"type": "var"})
        k = self.graph.add_node(8, **{"type": "var"})
        v = self.graph.add_node(9, **{"type": "var"})

        q_out = self.graph.add_edge(7, 4, **{"output_name": "Out"})
        k_out = self.graph.add_edge(8, 5, **{"output_name": "Out"})
        v_out = self.graph.add_edge(9, 6, **{"output_name": "Out"})

        # Pattern
        self.graph.attrs["shard_tensor"] = [
            (1, 2, 3),
            [[-1, 0], [-1, 1]],
        ]  # 2-tuple such as (tensor_id, patterns)


class OperatorGroupUtil:
    common_starts = ["layer_norm", "matmul_v2", "matmul"]
```
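Because `register_pattern` instantiates the class and stores it in `_PATTERNS` under its class-level `name`, adding a new pattern is just a matter of subclassing `BasePattern` and implementing `build`. A hypothetical sketch (`FFNPattern` is illustrative only, not part of this commit):

```python
@register_pattern
class FFNPattern(BasePattern):
    """Hypothetical var -> matmul_v2 -> var pattern, for illustration."""

    name = "ffn"

    def build(self):
        self.graph = Graph()
        self.graph.add_node(0, **{"type": "var"})        # input
        self.graph.add_node(1, **{"type": "matmul_v2"})  # op
        self.graph.add_node(2, **{"type": "var"})        # output
        self.graph.add_edge(0, 1, **{"input_name": "X"})
        self.graph.add_edge(1, 2, **{"output_name": "Out"})


# the decorator registers an instance at class-definition time
assert "ffn" in _PATTERNS
```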
python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt

@@ -119,5 +119,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU)

```cmake
  py_test_modules(test_engine_api_error MODULES test_engine_api_error)
  py_test_modules(test_fp16_assign MODULES test_fp16_assign)
  py_test_modules(test_group_operators MODULES test_group_operators)
  py_test_modules(test_pattern MODULES test_pattern)
endif()
```
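With the test registered here, it should be runnable like the other auto-parallel tests, e.g. via `ctest -R test_pattern` from the build directory (assuming a build configured with WITH_DISTRIBUTE and WITH_GPU, as the surrounding `if` guard requires).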
python/paddle/fluid/tests/unittests/auto_parallel/test_pattern.py (new file, 0 → 100644)

```python
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import unittest

import numpy as np

import paddle
import paddle.static as static

sys.path.append("..")
import auto_parallel_gpt_model as modeling
from auto_parallel_gpt_model import (
    GPTModel,
    GPTForPretraining,
    GPTPretrainingCriterion,
)


def get_gpt_model(
    train_program, start_program, place, batch_size, sequence_len, vocab_size
):
    with static.program_guard(train_program, start_program):
        tokens = paddle.static.data(
            name="tokens", shape=[batch_size, sequence_len], dtype='int64'
        )
        position_ids = paddle.static.data(
            name="position_ids", shape=[batch_size, sequence_len], dtype='int64'
        )
        attention_mask = paddle.static.data(
            name="attention_mask",
            shape=[batch_size, 1, sequence_len, sequence_len],
            dtype='float32',
        )
        labels = paddle.static.data(
            name="labels", shape=[batch_size, sequence_len], dtype='int64'
        )
        loss_mask = paddle.static.data(
            name="loss_mask", shape=[batch_size, sequence_len], dtype='float32'
        )

        gpt = GPTModel(
            vocab_size=1000,
            hidden_size=64,
            num_hidden_layers=2,
            num_attention_heads=8,
            intermediate_size=256,
            hidden_act="gelu",
            hidden_dropout_prob=0.0,
            attention_probs_dropout_prob=0.0,
            max_position_embeddings=1024,
            type_vocab_size=1,
            initializer_range=0.02,
            pad_token_id=0,
            eos_token_id=7,
            bos_token_id=0,
            eol_token_id=3,
        )

        model = GPTForPretraining(
            gpt, vocab_size=1000, hidden_size=64, initializer_range=0.02
        )
        preds = model(tokens, position_ids, attention_mask)
        criterion = GPTPretrainingCriterion()
        loss = criterion(preds, labels, loss_mask)

    def gen_data():
        np.random.seed(2021)
        tokens = []
        position_ids = []
        attention_mask = []
        labels = []
        loss_mask = []
        for _ in range(batch_size):
            tokens.append(np.random.randint(vocab_size, size=sequence_len))
            position_ids.append(np.arange(sequence_len))
            attention_mask.append([np.tril(np.ones(sequence_len))])
            labels.append(np.random.randint(vocab_size, size=sequence_len))
            loss_mask.append(np.ones(sequence_len))

        return tokens, position_ids, attention_mask, labels, loss_mask

    return train_program, start_program, loss, gen_data


class TestGroupOperators(unittest.TestCase):
    def test_gpt(self):
        modeling.init_global()
        train_program = static.Program()
        start_program = static.Program()
        place = paddle.set_device("gpu")
        batch_size = 8
        sequence_len = 512
        vocab_size = 1000
        train_program, start_program, loss, gen_data = get_gpt_model(
            train_program,
            start_program,
            place,
            batch_size,
            sequence_len,
            vocab_size,
        )
        from paddle.distributed.auto_parallel.tuner.rule_based_tuner import (
            RuleBasedTuner,
            convert_to_graph,
            _PATTERNS,
        )
        from paddle.distributed.auto_parallel.dist_context import (
            DistributedContext,
        )

        dist_context = DistributedContext()
        tuner = RuleBasedTuner(dist_context)
        layers = tuner.group_operators(train_program.global_block().ops)
        layer = layers[0]
        graph = convert_to_graph(layer, train_program.global_block())
        print(graph)
        print("qkv: ", _PATTERNS["qkv"].graph)


if __name__ == "__main__":
    unittest.main()
```
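The test exercises the whole path: it groups the GPT program's ops into layers, converts the first layer with `convert_to_graph`, and prints the result next to the registered `qkv` pattern graph. A minimal sketch of the pattern lookup on its own (not part of the commit):

```python
# importing the module triggers @register_pattern for QKVPattern
from paddle.distributed.auto_parallel.tuner.rule_based_tuner import _PATTERNS

qkv = _PATTERNS["qkv"]   # instance stored by register_pattern
print(qkv.name)          # "qkv"
print(len(qkv.graph))    # 10 nodes: 1 input, 3 weights, 3 matmuls, 3 outputs
```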