Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PGL
提交
86835b86
P
PGL
项目概览
PaddlePaddle
/
PGL
通知
76
Star
4
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
11
列表
看板
标记
里程碑
合并请求
1
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PGL
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
11
Issue
11
列表
看板
标记
里程碑
合并请求
1
合并请求
1
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
86835b86
编写于
8月 12, 2020
作者:
L
liujiaxiang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix the comment and delete useless code
上级
a9d30244
变更
2
隐藏空白更改
内联
并排
Showing
2 changed file
with
19 addition
and
60 deletion
+19
-60
examples/xformer/sparse_scaled_dot_product_attention.py
examples/xformer/sparse_scaled_dot_product_attention.py
+18
-59
examples/xformer/transformer_encoder_sparse.py
examples/xformer/transformer_encoder_sparse.py
+1
-1
未找到文件。
examples/xformer/sparse_scaled_dot_product_attention.py
浏览文件 @
86835b86
#!/usr/bin/env python
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# -*- coding: utf-8 -*-
#
########################################################################
# Licensed under the Apache License, Version 2.0 (the "License");
# #
# you may not use this file except in compliance with the License.
# Copyright (c) 2019 Baidu.com, Inc. All Rights Reserved #
# You may obtain a copy of the License at
# #
#
########################################################################
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
paddle.fluid
as
fluid
import
paddle.fluid
as
fluid
import
paddle.fluid.layers
as
L
import
paddle.fluid.layers
as
L
import
paddle.fluid.layers
as
layers
import
paddle.fluid.layers
as
layers
from
pgl.utils
import
paddle_helper
from
pgl.utils
import
paddle_helper
import
pgl
import
pgl
def
masked_select
(
input
,
mask
):
def
masked_select
(
input
,
mask
):
"""masked_select
"""masked_select
...
@@ -28,7 +37,6 @@ def masked_select(input, mask):
...
@@ -28,7 +37,6 @@ def masked_select(input, mask):
return
L
.
gather
(
input
,
index
,
overwrite
=
False
)
return
L
.
gather
(
input
,
index
,
overwrite
=
False
)
class
BigBirdWrapper
(
pgl
.
graph_wrapper
.
BaseGraphWrapper
):
class
BigBirdWrapper
(
pgl
.
graph_wrapper
.
BaseGraphWrapper
):
"""Implement of Edge Drop """
"""Implement of Edge Drop """
def
__init__
(
self
,
input_mask
):
def
__init__
(
self
,
input_mask
):
...
@@ -51,6 +59,7 @@ class BigBirdWrapper(pgl.graph_wrapper.BaseGraphWrapper):
...
@@ -51,6 +59,7 @@ class BigBirdWrapper(pgl.graph_wrapper.BaseGraphWrapper):
self
.
_edge_uniq_dst_count
=
L
.
concat
([
uniq_count
,
last
])
self
.
_edge_uniq_dst_count
=
L
.
concat
([
uniq_count
,
last
])
self
.
_edge_uniq_dst_count
.
stop_gradient
=
True
self
.
_edge_uniq_dst_count
.
stop_gradient
=
True
def
select_edges
(
src
,
dst
,
input_mask
,
num_nodes
,
max_seqlen
):
def
select_edges
(
src
,
dst
,
input_mask
,
num_nodes
,
max_seqlen
):
src
=
fluid
.
layers
.
elementwise_max
(
src
,
num_nodes
*
0
)
src
=
fluid
.
layers
.
elementwise_max
(
src
,
num_nodes
*
0
)
dst
=
fluid
.
layers
.
elementwise_max
(
dst
,
num_nodes
*
0
)
dst
=
fluid
.
layers
.
elementwise_max
(
dst
,
num_nodes
*
0
)
...
@@ -74,6 +83,7 @@ def select_edges(src, dst, input_mask, num_nodes, max_seqlen):
...
@@ -74,6 +83,7 @@ def select_edges(src, dst, input_mask, num_nodes, max_seqlen):
src
=
masked_select
(
src
,
mask
)
src
=
masked_select
(
src
,
mask
)
return
src
,
dst
return
src
,
dst
def
uniq_edges
(
src
,
dst
,
num_nodes
):
def
uniq_edges
(
src
,
dst
,
num_nodes
):
sorted_dst
=
L
.
cast
(
dst
,
dtype
=
"int64"
)
sorted_dst
=
L
.
cast
(
dst
,
dtype
=
"int64"
)
sorted_src
=
L
.
cast
(
src
,
dtype
=
"int64"
)
sorted_src
=
L
.
cast
(
src
,
dtype
=
"int64"
)
...
@@ -88,38 +98,6 @@ def uniq_edges(src, dst, num_nodes):
...
@@ -88,38 +98,6 @@ def uniq_edges(src, dst, num_nodes):
return
sorted_src
,
sorted_dst
return
sorted_src
,
sorted_dst
#def build_edges(num_nodes, input_mask, max_seqlen):
# edges = L.range(start=0, end=num_nodes, step=1, dtype="int32")
# all_edges = []
# # Window
# filter_func = lambda x, y: select_edges(x, y, input_mask, num_nodes, max_seqlen)
#
# all_edges.append(filter_func(edges - 1, edges)) # win-1
# all_edges.append(filter_func(edges + 1, edges)) # win-2
# all_edges.append(filter_func(edges, edges)) #self-loop
#
# # Global Assume [CLS] is the first token.
# cls_position = edges / max_seqlen * max_seqlen
# all_edges.append(filter_func(cls_position, edges))
# all_edges.append(filter_func(edges, cls_position))
#
# # Random
# for i in range(2):
# rand_edge = L.floor(L.uniform_random(min=0, max=1, shape=[num_nodes]) * L.cast(max_seqlen, dtype="float32"))
# rand_edge = L.cast(rand_edge, dtype="int32") + cls_position
# all_edges.append(filter_func(rand_edge, edges))
#
# if len(all_edges) > 1:
# src = L.concat([ s for s, d in all_edges], 0)
# dst = L.concat([ d for s, d in all_edges], 0)
# else:
# src = all_edges[0][0]
# dst = all_edges[0][1]
#
# # sort edges
# sorted_src, sorted_dst = uniq_edges(src, dst, num_nodes)
# return sorted_src, sorted_dst
def
build_edges
(
num_nodes
,
input_mask
,
max_seqlen
):
def
build_edges
(
num_nodes
,
input_mask
,
max_seqlen
):
edges
=
L
.
range
(
start
=
0
,
end
=
num_nodes
,
step
=
1
,
dtype
=
"int32"
)
edges
=
L
.
range
(
start
=
0
,
end
=
num_nodes
,
step
=
1
,
dtype
=
"int32"
)
all_edges
=
[]
all_edges
=
[]
...
@@ -127,35 +105,19 @@ def build_edges(num_nodes, input_mask, max_seqlen):
...
@@ -127,35 +105,19 @@ def build_edges(num_nodes, input_mask, max_seqlen):
filter_func
=
lambda
x
,
y
:
select_edges
(
x
,
y
,
input_mask
,
num_nodes
,
max_seqlen
)
filter_func
=
lambda
x
,
y
:
select_edges
(
x
,
y
,
input_mask
,
num_nodes
,
max_seqlen
)
all_edges
.
append
(
filter_func
(
edges
-
1
,
edges
))
# win-1
all_edges
.
append
(
filter_func
(
edges
-
1
,
edges
))
# win-1
#all_edges.append(filter_func(edges - 2, edges)) # win-1
#all_edges.append(filter_func(edges - 3, edges)) # win-1
all_edges
.
append
(
filter_func
(
edges
+
1
,
edges
))
# win-2
all_edges
.
append
(
filter_func
(
edges
+
1
,
edges
))
# win-2
#all_edges.append(filter_func(edges + 2, edges)) # win-2
#all_edges.append(filter_func(edges + 3, edges)) # win-2
all_edges
.
append
(
filter_func
(
edges
,
edges
))
#self-loop
all_edges
.
append
(
filter_func
(
edges
,
edges
))
#self-loop
# Global Assume [CLS] is the first token.
# Global Assume [CLS] is the first token.
# vertical cls-window attention
# vertical cls-window attention
cls_position
=
edges
/
max_seqlen
*
max_seqlen
cls_position
=
edges
/
max_seqlen
*
max_seqlen
#all_edges.append(filter_func(cls_position + 1, edges))
all_edges
.
append
(
filter_func
(
cls_position
,
edges
))
all_edges
.
append
(
filter_func
(
cls_position
,
edges
))
# vertical sliding attention
#all_edges.append(filter_func(cls_position + 6, edges))
#all_edges.append(filter_func(cls_position + max_seqlen - 6, edges))
# horizontal cls attention
# horizontal cls attention
all_edges
.
append
(
filter_func
(
edges
,
cls_position
))
all_edges
.
append
(
filter_func
(
edges
,
cls_position
))
#all_edges.append(filter_func(edges, cls_position))
# horizontal sliding attention
#all_edges.append(filter_func(edges, cls_position + 6)
#all_edges.append(filter_func(edges, cls_position + max_seq_len - 6)
# Random
# Random
#for i in range(2):
for
i
in
range
(
2
):
for
i
in
range
(
2
):
rand_edge
=
L
.
floor
(
L
.
uniform_random
(
min
=
0
,
max
=
1
,
shape
=
[
num_nodes
])
*
L
.
cast
(
max_seqlen
,
dtype
=
"float32"
))
rand_edge
=
L
.
floor
(
L
.
uniform_random
(
min
=
0
,
max
=
1
,
shape
=
[
num_nodes
])
*
L
.
cast
(
max_seqlen
,
dtype
=
"float32"
))
rand_edge
=
L
.
cast
(
rand_edge
,
dtype
=
"int32"
)
+
cls_position
rand_edge
=
L
.
cast
(
rand_edge
,
dtype
=
"int32"
)
+
cls_position
...
@@ -173,8 +135,6 @@ def build_edges(num_nodes, input_mask, max_seqlen):
...
@@ -173,8 +135,6 @@ def build_edges(num_nodes, input_mask, max_seqlen):
return
sorted_src
,
sorted_dst
return
sorted_src
,
sorted_dst
def
sparse_scaled_dot_product_attention
(
q
,
k
,
v
,
input_mask
,
dropout_rate
,
n_head
,
d_key
,
d_value
):
def
sparse_scaled_dot_product_attention
(
q
,
k
,
v
,
input_mask
,
dropout_rate
,
n_head
,
d_key
,
d_value
):
def
send_q_k_spmm
(
src_feat
,
dst_feat
,
edge_feat
):
def
send_q_k_spmm
(
src_feat
,
dst_feat
,
edge_feat
):
# q [ num_edges, n_head * dim]
# q [ num_edges, n_head * dim]
...
@@ -221,4 +181,3 @@ def sparse_scaled_dot_product_attention(q, k, v, input_mask, dropout_rate, n_hea
...
@@ -221,4 +181,3 @@ def sparse_scaled_dot_product_attention(q, k, v, input_mask, dropout_rate, n_hea
return
out
,
out
return
out
,
out
examples/xformer/transformer_encoder_sparse.py
浏览文件 @
86835b86
#
Copyright (c) 2018
PaddlePaddle Authors. All Rights Reserved.
#
Copyright (c) 2020
PaddlePaddle Authors. All Rights Reserved.
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# you may not use this file except in compliance with the License.
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录