PaddlePaddle / PaddleHub
Commit 4b4b48ef
Authored on April 04, 2019 by Zeyu Chen
Parent: 60db0313

add create task and task_reader

Showing 6 changed files with 537 additions and 121 deletions (+537 −121)
demo/ernie-classification/finetune_with_hub.py    +1    -1
demo/ernie-seq-label/finetune_with_hub.py         +2    -2
paddlehub/__init__.py                             +4    -3
paddlehub/finetune/network.py                     +0    -115
paddlehub/finetune/task.py                        +132  -0
paddlehub/reader/task_reader.py                   +398  -0
demo/ernie-classification/finetune_with_hub.py (view file @ 4b4b48ef)

@@ -79,7 +79,7 @@ if __name__ == '__main__':
         label.name
     ]
     # Define a classfication finetune task by PaddleHub's API
-    cls_task = hub.append_mlp_classifier(
+    cls_task = hub.create_text_classification_task(
         pooled_output, label, num_classes=num_labels)
     # Finetune and evaluate by PaddleHub's API
demo/ernie-seq-label/finetune_with_hub.py (view file @ 4b4b48ef)

@@ -82,13 +82,13 @@ if __name__ == '__main__':
         label.name, seq_len
     ]
     # Define a classfication finetune task by PaddleHub's API
-    seq_label_task = hub.append_sequence_labeler(
+    seq_label_task = hub.create_seq_labeling_task(
         feature=sequence_output,
         labels=label,
         seq_len=seq_len,
         num_classes=num_labels)
-    # Finetune and evaluate by PaddleHub's API
+    # Finetune and evaluate model by PaddleHub's API
     # will finish training, evaluation, testing, save model automatically
     hub.finetune_and_eval(
         task=seq_label_task,
paddlehub/__init__.py (view file @ 4b4b48ef)

@@ -32,11 +32,12 @@ from .module.manager import default_module_manager
 from .io.type import DataType
-from .finetune.network import append_mlp_classifier
-from .finetune.network import append_sequence_labeler
+from .finetune.task import Task
+from .finetune.task import create_seq_labeling_task
+from .finetune.task import create_text_classification_task
+from .finetune.task import create_img_classification_task
 from .finetune.finetune import finetune_and_eval
 from .finetune.config import RunConfig
-from .finetune.task import Task
 from .finetune.strategy import BERTFinetuneStrategy
 from .finetune.strategy import DefaultStrategy
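Taken together with the demo changes above, the task-construction helpers now live in paddlehub/finetune/task.py and are re-exported from the package root. A rough usage sketch, not part of this commit; pooled_output, sequence_output, label, seq_len and num_labels are assumed to come from an ERNIE module's context program, as in the two demo scripts:

    import paddlehub as hub

    # Classification head on the pooled [CLS] feature (assumed inputs)
    cls_task = hub.create_text_classification_task(
        pooled_output, label, num_classes=num_labels)

    # Sequence-labeling head on the per-token feature (assumed inputs)
    seq_label_task = hub.create_seq_labeling_task(
        feature=sequence_output, labels=label, seq_len=seq_len,
        num_classes=num_labels)

    # Either task is then passed to hub.finetune_and_eval(task=..., ...)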
paddlehub/finetune/network.py (deleted, 100644 → 0, view file @ 60db0313)

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import collections
import time
import multiprocessing

import numpy as np
import paddle.fluid as fluid

from paddlehub.finetune.task import Task

__all__ = ['append_mlp_classifier']


def append_mlp_classifier(feature, label, num_classes=2, hidden_units=None):
    """
    Append a multi-layer perceptron classifier for binary classification base
    on input feature
    """
    cls_feats = fluid.layers.dropout(
        x=feature, dropout_prob=0.1, dropout_implementation="upscale_in_train")

    # append fully connected layer according to hidden_units
    if hidden_units is not None:
        for n_hidden in hidden_units:
            cls_feats = fluid.layers.fc(input=cls_feats, size=n_hidden)

    logits = fluid.layers.fc(
        input=cls_feats,
        size=num_classes,
        param_attr=fluid.ParamAttr(
            name="cls_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=fluid.ParamAttr(
            name="cls_out_b", initializer=fluid.initializer.Constant(0.)))

    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
        logits=logits, label=label, return_softmax=True)
    loss = fluid.layers.mean(x=ce_loss)

    num_example = fluid.layers.create_tensor(dtype='int64')
    accuracy = fluid.layers.accuracy(
        input=probs, label=label, total=num_example)

    graph_var_dict = {
        "loss": loss,
        "probs": probs,
        "accuracy": accuracy,
        "num_example": num_example
    }

    task = Task("text_classification", graph_var_dict,
                fluid.default_main_program(), fluid.default_startup_program())

    return task


def append_mlp_multi_classifier(feature, label, num_classes,
                                hidden_units=None, act=None):
    pass


def append_sequence_labeler(feature, labels, seq_len, num_classes=None):
    logits = fluid.layers.fc(
        input=feature,
        size=num_classes,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(
            name="cls_seq_label_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=fluid.ParamAttr(
            name="cls_seq_label_out_b",
            initializer=fluid.initializer.Constant(0.)))

    ret_labels = fluid.layers.reshape(x=labels, shape=[-1, 1])
    ret_infers = fluid.layers.reshape(
        x=fluid.layers.argmax(logits, axis=2), shape=[-1, 1])
    labels = fluid.layers.flatten(labels, axis=2)
    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
        logits=fluid.layers.flatten(logits, axis=2),
        label=labels,
        return_softmax=True)
    loss = fluid.layers.mean(x=ce_loss)
    # accuracy = fluid.layers.accuracy(
    #     input=probs, label=labels, total=num_example)

    graph_var_dict = {
        "loss": loss,
        "probs": probs,
        "labels": ret_labels,
        "infers": ret_infers,
        "seq_len": seq_len
    }

    task = Task("sequence_labeling", graph_var_dict,
                fluid.default_main_program(), fluid.default_startup_program())

    return task
paddlehub/finetune/task.py (view file @ 4b4b48ef)

@@ -22,6 +22,11 @@ import paddle.fluid as fluid

class Task(object):
    """
    A simple transfer learning task definition,
    including Paddle's main_program, startup_program and inference program
    """

    def __init__(self, task_type, graph_var_dict, main_program,
                 startup_program):
        self.task_type = task_type

@@ -51,3 +56,130 @@ class Task(object):
                metric_variable_names.append(var_name)

        return metric_variable_names


def create_text_classification_task(feature, label, num_classes,
                                    hidden_units=None):
    """
    Append a multi-layer perceptron classifier for binary classification base
    on input feature
    """
    cls_feats = fluid.layers.dropout(
        x=feature, dropout_prob=0.1, dropout_implementation="upscale_in_train")

    # append fully connected layer according to hidden_units
    if hidden_units is not None:
        for n_hidden in hidden_units:
            cls_feats = fluid.layers.fc(input=cls_feats, size=n_hidden)

    logits = fluid.layers.fc(
        input=cls_feats,
        size=num_classes,
        param_attr=fluid.ParamAttr(
            name="cls_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=fluid.ParamAttr(
            name="cls_out_b", initializer=fluid.initializer.Constant(0.)))

    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
        logits=logits, label=label, return_softmax=True)
    loss = fluid.layers.mean(x=ce_loss)

    num_example = fluid.layers.create_tensor(dtype='int64')
    accuracy = fluid.layers.accuracy(
        input=probs, label=label, total=num_example)

    graph_var_dict = {
        "loss": loss,
        "probs": probs,
        "accuracy": accuracy,
        "num_example": num_example
    }

    task = Task("text_classification", graph_var_dict,
                fluid.default_main_program(), fluid.default_startup_program())

    return task


def create_img_classification_task(feature, label, num_classes,
                                   hidden_units=None):
    """
    Append a multi-layer perceptron classifier for binary classification base
    on input feature
    """
    cls_feats = feature

    # append fully connected layer according to hidden_units
    if hidden_units is not None:
        for n_hidden in hidden_units:
            cls_feats = fluid.layers.fc(input=cls_feats, size=n_hidden)

    logits = fluid.layers.fc(
        input=cls_feats,
        size=num_classes,
        param_attr=fluid.ParamAttr(
            name="cls_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=fluid.ParamAttr(
            name="cls_out_b", initializer=fluid.initializer.Constant(0.)))

    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
        logits=logits, label=label, return_softmax=True)
    loss = fluid.layers.mean(x=ce_loss)

    num_example = fluid.layers.create_tensor(dtype='int64')
    accuracy = fluid.layers.accuracy(
        input=probs, label=label, total=num_example)

    graph_var_dict = {
        "loss": loss,
        "probs": probs,
        "accuracy": accuracy,
        "num_example": num_example
    }

    task = Task("text_classification", graph_var_dict,
                fluid.default_main_program(), fluid.default_startup_program())

    return task


def create_seq_labeling_task(feature, labels, seq_len, num_classes=None):
    logits = fluid.layers.fc(
        input=feature,
        size=num_classes,
        num_flatten_dims=2,
        param_attr=fluid.ParamAttr(
            name="cls_seq_label_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=fluid.ParamAttr(
            name="cls_seq_label_out_b",
            initializer=fluid.initializer.Constant(0.)))

    ret_labels = fluid.layers.reshape(x=labels, shape=[-1, 1])
    ret_infers = fluid.layers.reshape(
        x=fluid.layers.argmax(logits, axis=2), shape=[-1, 1])
    labels = fluid.layers.flatten(labels, axis=2)
    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
        logits=fluid.layers.flatten(logits, axis=2),
        label=labels,
        return_softmax=True)
    loss = fluid.layers.mean(x=ce_loss)

    graph_var_dict = {
        "loss": loss,
        "probs": probs,
        "labels": ret_labels,
        "infers": ret_infers,
        "seq_len": seq_len
    }

    task = Task("sequence_labeling", graph_var_dict,
                fluid.default_main_program(), fluid.default_startup_program())

    return task
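Each create_* helper builds its head on the default fluid programs and returns a Task that wraps the variables listed in graph_var_dict. A minimal sketch, assuming plain fluid data layers stand in for a pretrained module's outputs (names below are placeholders, not from this commit):

    import paddle.fluid as fluid
    import paddlehub as hub

    # Assumed stand-in inputs; in the demos these come from an ERNIE module.
    feature = fluid.layers.data(name="feature", shape=[768], dtype="float32")
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    cls_task = hub.create_text_classification_task(feature, label, num_classes=2)
    # cls_task now wraps fluid.default_main_program()/default_startup_program()
    # together with the "loss", "probs", "accuracy" and "num_example" variables.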
paddlehub/reader/task_reader.py (new file, 0 → 100644, view file @ 4b4b48ef)

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import csv
import json

import numpy as np
from collections import namedtuple

from paddlehub.reader import tokenization
from .batching import pad_batch_data


class BaseReader(object):
    def __init__(self,
                 dataset,
                 vocab_path,
                 label_map_config=None,
                 max_seq_len=512,
                 do_lower_case=True,
                 in_tokens=False,
                 random_seed=None):
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_path, do_lower_case=do_lower_case)
        self.vocab = self.tokenizer.vocab
        self.dataset = dataset
        self.pad_id = self.vocab["[PAD]"]
        self.cls_id = self.vocab["[CLS]"]
        self.sep_id = self.vocab["[SEP]"]
        self.in_tokens = in_tokens

        np.random.seed(random_seed)

        self.label_map = self.dataset.get_label_map()

        self.current_example = 0
        self.current_epoch = 0
        self.num_examples = 0

        # if label_map_config:
        #     with open(label_map_config) as f:
        #         self.label_map = json.load(f)
        # else:
        #     self.label_map = None

        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}

    def get_train_examples(self):
        """Gets a collection of `InputExample`s for the train set."""
        return self.dataset.get_train_examples()

    def get_dev_examples(self):
        """Gets a collection of `InputExample`s for the dev set."""
        return self.dataset.get_dev_examples()

    def get_val_examples(self):
        """Gets a collection of `InputExample`s for the val set."""
        return self.dataset.get_val_examples()

    def get_test_examples(self):
        """Gets a collection of `InputExample`s for prediction."""
        return self.dataset.get_test_examples()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        return self.dataset.get_labels()

    def get_train_progress(self):
        """Gets progress for training phase."""
        return self.current_example, self.current_epoch

    def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""

        # This is a simple heuristic which will always truncate the longer sequence
        # one token at a time. This makes more sense than truncating an equal percent
        # of tokens from each, since if one sequence is very short then each token
        # that's truncated likely contains more information than a longer sequence.
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
        """Converts a single `Example` into a single `Record`."""

        text_a = tokenization.convert_to_unicode(example.text_a)
        tokens_a = tokenizer.tokenize(text_a)
        tokens_b = None

        if example.text_b is not None:
            #if "text_b" in example._fields:
            text_b = tokenization.convert_to_unicode(example.text_b)
            tokens_b = tokenizer.tokenize(text_b)

        if tokens_b:
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[0:(max_seq_length - 2)]

        # The convention in BERT/ERNIE is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = []
        text_type_ids = []
        tokens.append("[CLS]")
        text_type_ids.append(0)
        for token in tokens_a:
            tokens.append(token)
            text_type_ids.append(0)
        tokens.append("[SEP]")
        text_type_ids.append(0)

        if tokens_b:
            for token in tokens_b:
                tokens.append(token)
                text_type_ids.append(1)
            tokens.append("[SEP]")
            text_type_ids.append(1)

        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        position_ids = list(range(len(token_ids)))

        if self.label_map:
            label_id = self.label_map[example.label]
        else:
            label_id = example.label

        # Record = namedtuple(
        #     'Record',
        #     ['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'])

        # qid = None
        # if "qid" in example._fields:
        #     qid = example.qid

        # record = Record(
        #     token_ids=token_ids,
        #     text_type_ids=text_type_ids,
        #     position_ids=position_ids,
        #     label_id=label_id,
        #     qid=qid)

        Record = namedtuple(
            'Record', ['token_ids', 'text_type_ids', 'position_ids', 'label_id'])

        record = Record(
            token_ids=token_ids,
            text_type_ids=text_type_ids,
            position_ids=position_ids,
            label_id=label_id)

        return record

    def _prepare_batch_data(self, examples, batch_size, phase=None):
        """generate batch records"""
        batch_records, max_len = [], 0
        for index, example in enumerate(examples):
            if phase == "train":
                self.current_example = index
            record = self._convert_example_to_record(example, self.max_seq_len,
                                                     self.tokenizer)
            max_len = max(max_len, len(record.token_ids))
            if self.in_tokens:
                to_append = (len(batch_records) + 1) * max_len <= batch_size
            else:
                to_append = len(batch_records) < batch_size
            if to_append:
                batch_records.append(record)
            else:
                yield self._pad_batch_records(batch_records)
                batch_records, max_len = [record], len(record.token_ids)

        if batch_records:
            yield self._pad_batch_records(batch_records)

    # def get_num_examples(self, input_file):
    #     examples = self._read_tsv(input_file)
    #     return len(examples)

    def get_num_examples(self, phase):
        """Get number of examples for train, dev or test."""
        if phase not in ['train', 'val', 'dev', 'test']:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'val'/'dev', 'test']."
            )
        return self.num_examples[phase]

    def data_generator(self, batch_size, phase='train', shuffle=True):
        if phase == 'train':
            examples = self.get_train_examples()
            self.num_examples['train'] = len(examples)
        elif phase == 'val' or phase == 'dev':
            examples = self.get_dev_examples()
            self.num_examples['dev'] = len(examples)
        elif phase == 'test':
            examples = self.get_test_examples()
            self.num_examples['test'] = len(examples)
        else:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'test'].")

        def wrapper():
            if shuffle:
                np.random.shuffle(examples)

            for batch_data in self._prepare_batch_data(
                    examples, batch_size, phase=phase):
                yield [batch_data]

        return wrapper


class ClassifyReader(BaseReader):
    def _pad_batch_records(self, batch_records):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [record.text_type_ids for record in batch_records]
        batch_position_ids = [record.position_ids for record in batch_records]
        batch_labels = [record.label_id for record in batch_records]
        batch_labels = np.array(batch_labels).astype("int64").reshape([-1, 1])

        # if batch_records[0].qid:
        #     batch_qids = [record.qid for record in batch_records]
        #     batch_qids = np.array(batch_qids).astype("int64").reshape([-1, 1])
        # else:
        #     batch_qids = np.array([]).astype("int64").reshape([-1, 1])

        # padding
        padded_token_ids, input_mask = pad_batch_data(
            batch_token_ids,
            max_seq_len=self.max_seq_len,
            pad_idx=self.pad_id,
            return_input_mask=True)
        padded_text_type_ids = pad_batch_data(
            batch_text_type_ids,
            max_seq_len=self.max_seq_len,
            pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(
            batch_position_ids,
            max_seq_len=self.max_seq_len,
            pad_idx=self.pad_id)

        return_list = [
            padded_token_ids, padded_position_ids, padded_text_type_ids,
            input_mask, batch_labels
        ]

        return return_list


class SequenceLabelReader(BaseReader):
    def _pad_batch_records(self, batch_records):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [record.text_type_ids for record in batch_records]
        batch_position_ids = [record.position_ids for record in batch_records]
        batch_label_ids = [record.label_ids for record in batch_records]

        # padding
        padded_token_ids, input_mask, batch_seq_lens = pad_batch_data(
            batch_token_ids,
            pad_idx=self.pad_id,
            max_seq_len=self.max_seq_len,
            return_input_mask=True,
            return_seq_lens=True)
        padded_text_type_ids = pad_batch_data(
            batch_text_type_ids,
            max_seq_len=self.max_seq_len,
            pad_idx=self.pad_id)
        padded_position_ids = pad_batch_data(
            batch_position_ids,
            max_seq_len=self.max_seq_len,
            pad_idx=self.pad_id)
        padded_label_ids = pad_batch_data(
            batch_label_ids,
            max_seq_len=self.max_seq_len,
            pad_idx=len(self.label_map) - 1)

        return_list = [
            padded_token_ids, padded_position_ids, padded_text_type_ids,
            input_mask, padded_label_ids, batch_seq_lens
        ]
        return return_list

    def _reseg_token_label(self, tokens, labels, tokenizer):
        assert len(tokens) == len(labels)
        ret_tokens = []
        ret_labels = []
        for token, label in zip(tokens, labels):
            sub_token = tokenizer.tokenize(token)
            if len(sub_token) == 0:
                continue
            ret_tokens.extend(sub_token)
            ret_labels.append(label)
            if len(sub_token) < 2:
                continue
            sub_label = label
            if label.startswith("B-"):
                sub_label = "I-" + label[2:]
            ret_labels.extend([sub_label] * (len(sub_token) - 1))

        assert len(ret_tokens) == len(ret_labels)
        return ret_tokens, ret_labels

    def _convert_example_to_record(self, example, max_seq_length, tokenizer):
        tokens = tokenization.convert_to_unicode(example.text_a).split(u"")
        labels = tokenization.convert_to_unicode(example.label).split(u"")
        tokens, labels = self._reseg_token_label(tokens, labels, tokenizer)

        if len(tokens) > max_seq_length - 2:
            tokens = tokens[0:(max_seq_length - 2)]
            labels = labels[0:(max_seq_length - 2)]

        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        position_ids = list(range(len(token_ids)))
        text_type_ids = [0] * len(token_ids)
        no_entity_id = len(self.label_map) - 1
        label_ids = [no_entity_id] + [
            self.label_map[label] for label in labels
        ] + [no_entity_id]

        Record = namedtuple(
            'Record',
            ['token_ids', 'text_type_ids', 'position_ids', 'label_ids'])
        record = Record(
            token_ids=token_ids,
            text_type_ids=text_type_ids,
            position_ids=position_ids,
            label_ids=label_ids)
        return record


class ExtractEmbeddingReader(BaseReader):
    def _pad_batch_records(self, batch_records):
        batch_token_ids = [record.token_ids for record in batch_records]
        batch_text_type_ids = [record.text_type_ids for record in batch_records]
        batch_position_ids = [record.position_ids for record in batch_records]

        # padding
        padded_token_ids, input_mask, seq_lens = pad_batch_data(
            batch_token_ids,
            pad_idx=self.pad_id,
            max_seq_len=self.max_seq_len,
            return_input_mask=True,
            return_seq_lens=True)
        padded_text_type_ids = pad_batch_data(
            batch_text_type_ids,
            pad_idx=self.pad_id,
            max_seq_len=self.max_seq_len)
        padded_position_ids = pad_batch_data(
            batch_position_ids,
            pad_idx=self.pad_id,
            max_seq_len=self.max_seq_len)

        return_list = [
            padded_token_ids, padded_text_type_ids, padded_position_ids,
            input_mask, seq_lens
        ]

        return return_list


if __name__ == '__main__':
    pass
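A rough usage sketch for the new reader, with assumed placeholders that are not part of this commit: `my_dataset` must expose get_train_examples()/get_dev_examples()/get_test_examples()/get_label_map(), and `vocab.txt` stands in for the vocabulary file of the ERNIE/BERT module being fine-tuned.

    from paddlehub.reader.task_reader import ClassifyReader

    reader = ClassifyReader(
        dataset=my_dataset,        # assumed dataset object
        vocab_path="vocab.txt",    # assumed vocabulary file path
        max_seq_len=128)

    train_generator = reader.data_generator(
        batch_size=32, phase="train", shuffle=True)
    for batch in train_generator():
        # each batch is [[padded_token_ids, padded_position_ids,
        #                 padded_text_type_ids, input_mask, batch_labels]]
        pass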