Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PALM
提交
aff521f0
P
PALM
项目概览
PaddlePaddle
/
PALM
通知
7
Star
3
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
10
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PALM
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
10
Issue
10
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
aff521f0
编写于
12月 04, 2019
作者:
W
wangxiao
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
change tensorshape
上级
bba10bb6
变更
11
隐藏空白更改
内联
并排
Showing
11 changed file
with
48 addition
and
48 deletion
+48
-48
paddlepalm/backbone/bert.py
paddlepalm/backbone/bert.py
+4
-4
paddlepalm/backbone/ernie.py
paddlepalm/backbone/ernie.py
+5
-5
paddlepalm/reader/mlm.py
paddlepalm/reader/mlm.py
+7
-7
paddlepalm/reader/utils/batching4bert.py
paddlepalm/reader/utils/batching4bert.py
+5
-5
paddlepalm/reader/utils/batching4ernie.py
paddlepalm/reader/utils/batching4ernie.py
+5
-5
paddlepalm/reader/utils/mlm_batching.py
paddlepalm/reader/utils/mlm_batching.py
+4
-4
paddlepalm/reader/utils/reader4ernie.py
paddlepalm/reader/utils/reader4ernie.py
+8
-8
paddlepalm/task_paradigm/cls.py
paddlepalm/task_paradigm/cls.py
+1
-1
paddlepalm/task_paradigm/match.py
paddlepalm/task_paradigm/match.py
+1
-1
paddlepalm/task_paradigm/mlm.py
paddlepalm/task_paradigm/mlm.py
+2
-2
paddlepalm/task_paradigm/mrc.py
paddlepalm/task_paradigm/mrc.py
+6
-6
未找到文件。
paddlepalm/backbone/bert.py
浏览文件 @
aff521f0
...
...
@@ -52,10 +52,10 @@ class Model(backbone):
@
property
def
inputs_attr
(
self
):
return
{
"token_ids"
:
[[
-
1
,
-
1
,
1
],
'int64'
],
"position_ids"
:
[[
-
1
,
-
1
,
1
],
'int64'
],
"segment_ids"
:
[[
-
1
,
-
1
,
1
],
'int64'
],
"input_mask"
:
[[
-
1
,
-
1
,
1
],
'float32'
]}
return
{
"token_ids"
:
[[
-
1
,
-
1
],
'int64'
],
"position_ids"
:
[[
-
1
,
-
1
],
'int64'
],
"segment_ids"
:
[[
-
1
,
-
1
],
'int64'
],
"input_mask"
:
[[
-
1
,
-
1
],
'float32'
]}
@
property
def
outputs_attr
(
self
):
...
...
paddlepalm/backbone/ernie.py
浏览文件 @
aff521f0
...
...
@@ -62,11 +62,11 @@ class Model(backbone):
@
property
def
inputs_attr
(
self
):
return
{
"token_ids"
:
[[
-
1
,
-
1
,
1
],
'int64'
],
"position_ids"
:
[[
-
1
,
-
1
,
1
],
'int64'
],
"segment_ids"
:
[[
-
1
,
-
1
,
1
],
'int64'
],
"input_mask"
:
[[
-
1
,
-
1
,
1
],
'float32'
],
"task_ids"
:
[[
-
1
,
-
1
,
1
],
'int64'
]}
return
{
"token_ids"
:
[[
-
1
,
-
1
],
'int64'
],
"position_ids"
:
[[
-
1
,
-
1
],
'int64'
],
"segment_ids"
:
[[
-
1
,
-
1
],
'int64'
],
"input_mask"
:
[[
-
1
,
-
1
],
'float32'
],
"task_ids"
:
[[
-
1
,
-
1
],
'int64'
]}
@
property
def
outputs_attr
(
self
):
...
...
paddlepalm/reader/mlm.py
浏览文件 @
aff521f0
...
...
@@ -60,13 +60,13 @@ class Reader(reader):
@
property
def
outputs_attr
(
self
):
return
{
"token_ids"
:
[[
-
1
,
-
1
,
1
],
'int64'
],
"position_ids"
:
[[
-
1
,
-
1
,
1
],
'int64'
],
"segment_ids"
:
[[
-
1
,
-
1
,
1
],
'int64'
],
"input_mask"
:
[[
-
1
,
-
1
,
1
],
'float32'
],
"task_ids"
:
[[
-
1
,
-
1
,
1
],
'int64'
],
"mask_label"
:
[[
-
1
,
1
],
'int64'
],
"mask_pos"
:
[[
-
1
,
1
],
'int64'
],
return
{
"token_ids"
:
[[
-
1
,
-
1
],
'int64'
],
"position_ids"
:
[[
-
1
,
-
1
],
'int64'
],
"segment_ids"
:
[[
-
1
,
-
1
],
'int64'
],
"input_mask"
:
[[
-
1
,
-
1
],
'float32'
],
"task_ids"
:
[[
-
1
,
-
1
],
'int64'
],
"mask_label"
:
[[
-
1
],
'int64'
],
"mask_pos"
:
[[
-
1
],
'int64'
],
}
...
...
paddlepalm/reader/utils/batching4bert.py
浏览文件 @
aff521f0
...
...
@@ -67,8 +67,8 @@ def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
sent
[
token_index
]
=
MASK
mask_flag
=
True
mask_pos
.
append
(
sent_index
*
max_len
+
token_index
)
mask_label
=
np
.
array
(
mask_label
).
astype
(
"int64"
).
reshape
([
-
1
,
1
])
mask_pos
=
np
.
array
(
mask_pos
).
astype
(
"int64"
).
reshape
([
-
1
,
1
])
mask_label
=
np
.
array
(
mask_label
).
astype
(
"int64"
).
reshape
([
-
1
])
mask_pos
=
np
.
array
(
mask_pos
).
astype
(
"int64"
).
reshape
([
-
1
])
return
batch_tokens
,
mask_label
,
mask_pos
...
...
@@ -96,7 +96,7 @@ def prepare_batch_data(insts,
# or unique id
for
i
in
range
(
3
,
len
(
insts
[
0
]),
1
):
labels
=
[
inst
[
i
]
for
inst
in
insts
]
labels
=
np
.
array
(
labels
).
astype
(
"int64"
).
reshape
([
-
1
,
1
])
labels
=
np
.
array
(
labels
).
astype
(
"int64"
).
reshape
([
-
1
])
labels_list
.
append
(
labels
)
# First step: do mask without padding
if
mask_id
>=
0
:
...
...
@@ -154,14 +154,14 @@ def pad_batch_data(insts,
inst_data
=
np
.
array
([
list
(
inst
)
+
list
([
pad_idx
]
*
(
max_len
-
len
(
inst
)))
for
inst
in
insts
])
return_list
+=
[
inst_data
.
astype
(
"int64"
).
reshape
([
-
1
,
max_len
,
1
])]
return_list
+=
[
inst_data
.
astype
(
"int64"
).
reshape
([
-
1
,
max_len
])]
# position data
if
return_pos
:
inst_pos
=
np
.
array
([
list
(
range
(
0
,
len
(
inst
)))
+
[
pad_idx
]
*
(
max_len
-
len
(
inst
))
for
inst
in
insts
])
return_list
+=
[
inst_pos
.
astype
(
"int64"
).
reshape
([
-
1
,
max_len
,
1
])]
return_list
+=
[
inst_pos
.
astype
(
"int64"
).
reshape
([
-
1
,
max_len
])]
if
return_input_mask
:
# This is used to avoid attention on paddings.
input_mask_data
=
np
.
array
([[
1
]
*
len
(
inst
)
+
[
0
]
*
...
...
paddlepalm/reader/utils/batching4ernie.py
浏览文件 @
aff521f0
...
...
@@ -113,8 +113,8 @@ def mask(batch_tokens,
pre_sent_len
=
len
(
sent
)
mask_label
=
np
.
array
(
mask_label
).
astype
(
"int64"
).
reshape
([
-
1
,
1
])
mask_pos
=
np
.
array
(
mask_pos
).
astype
(
"int64"
).
reshape
([
-
1
,
1
])
mask_label
=
np
.
array
(
mask_label
).
astype
(
"int64"
).
reshape
([
-
1
])
mask_pos
=
np
.
array
(
mask_pos
).
astype
(
"int64"
).
reshape
([
-
1
])
return
batch_tokens
,
mask_label
,
mask_pos
...
...
@@ -136,7 +136,7 @@ def pad_batch_data(insts,
inst_data
=
np
.
array
(
[
inst
+
list
([
pad_idx
]
*
(
max_len
-
len
(
inst
)))
for
inst
in
insts
])
return_list
+=
[
inst_data
.
astype
(
"int64"
).
reshape
([
-
1
,
max_len
,
1
])]
return_list
+=
[
inst_data
.
astype
(
"int64"
).
reshape
([
-
1
,
max_len
])]
# position data
if
return_pos
:
...
...
@@ -145,7 +145,7 @@ def pad_batch_data(insts,
for
inst
in
insts
])
return_list
+=
[
inst_pos
.
astype
(
"int64"
).
reshape
([
-
1
,
max_len
,
1
])]
return_list
+=
[
inst_pos
.
astype
(
"int64"
).
reshape
([
-
1
,
max_len
])]
if
return_input_mask
:
# This is used to avoid attention on paddings.
...
...
@@ -165,7 +165,7 @@ def pad_batch_data(insts,
if
return_seq_lens
:
seq_lens
=
np
.
array
([
len
(
inst
)
for
inst
in
insts
])
return_list
+=
[
seq_lens
.
astype
(
"int64"
).
reshape
([
-
1
,
1
])]
return_list
+=
[
seq_lens
.
astype
(
"int64"
).
reshape
([
-
1
])]
return
return_list
if
len
(
return_list
)
>
1
else
return_list
[
0
]
...
...
paddlepalm/reader/utils/mlm_batching.py
浏览文件 @
aff521f0
...
...
@@ -67,8 +67,8 @@ def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3):
sent
[
token_index
]
=
MASK
mask_flag
=
True
mask_pos
.
append
(
sent_index
*
max_len
+
token_index
)
mask_label
=
np
.
array
(
mask_label
).
astype
(
"int64"
).
reshape
([
-
1
,
1
])
mask_pos
=
np
.
array
(
mask_pos
).
astype
(
"int64"
).
reshape
([
-
1
,
1
])
mask_label
=
np
.
array
(
mask_label
).
astype
(
"int64"
).
reshape
([
-
1
])
mask_pos
=
np
.
array
(
mask_pos
).
astype
(
"int64"
).
reshape
([
-
1
])
return
batch_tokens
,
mask_label
,
mask_pos
...
...
@@ -147,14 +147,14 @@ def pad_batch_data(insts,
inst_data
=
np
.
array
([
list
(
inst
)
+
list
([
pad_idx
]
*
(
max_len
-
len
(
inst
)))
for
inst
in
insts
])
return_list
+=
[
inst_data
.
astype
(
"int64"
).
reshape
([
-
1
,
max_len
,
1
])]
return_list
+=
[
inst_data
.
astype
(
"int64"
).
reshape
([
-
1
,
max_len
])]
# position data
if
return_pos
:
inst_pos
=
np
.
array
([
list
(
range
(
0
,
len
(
inst
)))
+
[
pad_idx
]
*
(
max_len
-
len
(
inst
))
for
inst
in
insts
])
return_list
+=
[
inst_pos
.
astype
(
"int64"
).
reshape
([
-
1
,
max_len
,
1
])]
return_list
+=
[
inst_pos
.
astype
(
"int64"
).
reshape
([
-
1
,
max_len
])]
if
return_input_mask
:
# This is used to avoid attention on paddings.
input_mask_data
=
np
.
array
([[
1
]
*
len
(
inst
)
+
[
0
]
*
...
...
paddlepalm/reader/utils/reader4ernie.py
浏览文件 @
aff521f0
...
...
@@ -479,17 +479,17 @@ class ClassifyReader(BaseReader):
batch_labels
=
[
record
.
label_id
for
record
in
batch_records
]
if
self
.
is_classify
:
batch_labels
=
np
.
array
(
batch_labels
).
astype
(
"int64"
).
reshape
(
[
-
1
,
1
])
[
-
1
])
elif
self
.
is_regression
:
batch_labels
=
np
.
array
(
batch_labels
).
astype
(
"float32"
).
reshape
(
[
-
1
,
1
])
[
-
1
])
if
batch_records
[
0
].
qid
:
batch_qids
=
[
record
.
qid
for
record
in
batch_records
]
batch_qids
=
np
.
array
(
batch_qids
).
astype
(
"int64"
).
reshape
(
[
-
1
,
1
])
[
-
1
])
else
:
batch_qids
=
np
.
array
([]).
astype
(
"int64"
).
reshape
([
-
1
,
1
])
batch_qids
=
np
.
array
([]).
astype
(
"int64"
).
reshape
([
-
1
])
# padding
padded_token_ids
,
input_mask
=
pad_batch_data
(
...
...
@@ -908,15 +908,15 @@ class MRCReader(BaseReader):
record
.
end_position
for
record
in
batch_records
]
batch_start_position
=
np
.
array
(
batch_start_position
).
astype
(
"int64"
).
reshape
([
-
1
,
1
])
"int64"
).
reshape
([
-
1
])
batch_end_position
=
np
.
array
(
batch_end_position
).
astype
(
"int64"
).
reshape
([
-
1
,
1
])
"int64"
).
reshape
([
-
1
])
else
:
batch_size
=
len
(
batch_token_ids
)
batch_start_position
=
np
.
zeros
(
shape
=
[
batch_size
,
1
],
dtype
=
"int64"
)
batch_end_position
=
np
.
zeros
(
shape
=
[
batch_size
,
1
],
dtype
=
"int64"
)
shape
=
[
batch_size
],
dtype
=
"int64"
)
batch_end_position
=
np
.
zeros
(
shape
=
[
batch_size
],
dtype
=
"int64"
)
batch_unique_ids
=
[
record
.
unique_id
for
record
in
batch_records
]
batch_unique_ids
=
np
.
array
(
batch_unique_ids
).
astype
(
"int64"
).
reshape
(
...
...
paddlepalm/task_paradigm/cls.py
浏览文件 @
aff521f0
...
...
@@ -43,7 +43,7 @@ class TaskParadigm(task_paradigm):
@
property
def
inputs_attrs
(
self
):
if
self
.
_is_training
:
reader
=
{
"label_ids"
:
[[
-
1
,
1
],
'int64'
]}
reader
=
{
"label_ids"
:
[[
-
1
],
'int64'
]}
else
:
reader
=
{}
bb
=
{
"sentence_embedding"
:
[[
-
1
,
self
.
_hidden_size
],
'float32'
]}
...
...
paddlepalm/task_paradigm/match.py
浏览文件 @
aff521f0
...
...
@@ -44,7 +44,7 @@ class TaskParadigm(task_paradigm):
@
property
def
inputs_attrs
(
self
):
if
self
.
_is_training
:
reader
=
{
"label_ids"
:
[[
-
1
,
1
],
'int64'
]}
reader
=
{
"label_ids"
:
[[
-
1
],
'int64'
]}
else
:
reader
=
{}
bb
=
{
"sentence_pair_embedding"
:
[[
-
1
,
self
.
_hidden_size
],
'float32'
]}
...
...
paddlepalm/task_paradigm/mlm.py
浏览文件 @
aff521f0
...
...
@@ -33,8 +33,8 @@ class TaskParadigm(task_paradigm):
@
property
def
inputs_attrs
(
self
):
reader
=
{
"mask_label"
:
[[
-
1
,
1
],
'int64'
],
"mask_pos"
:
[[
-
1
,
1
],
'int64'
]}
"mask_label"
:
[[
-
1
],
'int64'
],
"mask_pos"
:
[[
-
1
],
'int64'
]}
if
not
self
.
_is_training
:
del
reader
[
'mask_label'
]
del
reader
[
'batchsize_x_seqlen'
]
...
...
paddlepalm/task_paradigm/mrc.py
浏览文件 @
aff521f0
...
...
@@ -49,11 +49,11 @@ class TaskParadigm(task_paradigm):
@
property
def
inputs_attrs
(
self
):
if
self
.
_is_training
:
reader
=
{
"start_positions"
:
[[
-
1
,
1
],
'int64'
],
"end_positions"
:
[[
-
1
,
1
],
'int64'
],
reader
=
{
"start_positions"
:
[[
-
1
],
'int64'
],
"end_positions"
:
[[
-
1
],
'int64'
],
}
else
:
reader
=
{
'unique_ids'
:
[[
-
1
,
1
],
'int64'
]}
reader
=
{
'unique_ids'
:
[[
-
1
],
'int64'
]}
bb
=
{
"encoder_outputs"
:
[[
-
1
,
-
1
,
self
.
_hidden_size
],
'float32'
]}
return
{
'reader'
:
reader
,
'backbone'
:
bb
}
...
...
@@ -68,9 +68,9 @@ class TaskParadigm(task_paradigm):
if
self
.
_is_training
:
return
{
'loss'
:
[[
1
],
'float32'
]}
else
:
return
{
'start_logits'
:
[[
-
1
,
-
1
,
1
],
'float32'
],
'end_logits'
:
[[
-
1
,
-
1
,
1
],
'float32'
],
'unique_ids'
:
[[
-
1
,
1
],
'int64'
]}
return
{
'start_logits'
:
[[
-
1
,
-
1
],
'float32'
],
'end_logits'
:
[[
-
1
,
-
1
],
'float32'
],
'unique_ids'
:
[[
-
1
],
'int64'
]}
def
build
(
self
,
inputs
,
scope_name
=
""
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录