PaddlePaddle / PaddleHub
Commit 62a6b95c
Authored on Nov 20, 2019 by kinghuin
Committed by wuzewu on Nov 22, 2019

ernie-tiny support seq task

Parent: bc8a7ed3
Showing 4 changed files with 56 additions and 8 deletions (+56 −8)
demo/sequence-labeling/sequence_label.py    +9 −2
demo/text-classification/run_classifier.sh  +2 −2
paddlehub/reader/nlp_reader.py              +28 −0
paddlehub/reader/tokenization.py            +17 −4
demo/sequence-labeling/sequence_label.py

@@ -37,16 +37,23 @@ args = parser.parse_args()
 if __name__ == '__main__':
     # Load Paddlehub ERNIE pretrained model
-    module = hub.Module(name="roberta_wwm_ext_chinese_L-24_H-1024_A-16")
+    module = hub.Module(name="ernie_v2_chinese_tiny")
     inputs, outputs, program = module.context(
         trainable=True, max_seq_len=args.max_seq_len)
 
+    if module.name.startswith("ernie_v2"):
+        use_taskid = True
+    else:
+        use_taskid = False
 
     # Download dataset and use SequenceLabelReader to read dataset
     dataset = hub.dataset.MSRA_NER()
     reader = hub.reader.SequenceLabelReader(
         dataset=dataset,
         vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
+        max_seq_len=args.max_seq_len,
+        use_task_id=use_taskid,
+        sp_model_path=module.get_spm_path(),
+        word_dict_path=module.get_word_dict_path())
 
     # Construct transfer learning network
     # Use "sequence_output" for token-level output.
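For reference, the demo logic above can be wrapped in a small, module-agnostic helper. This is only a sketch built from the calls that already appear in the diff (hub.Module, module.get_vocab_path, module.get_spm_path, module.get_word_dict_path, hub.reader.SequenceLabelReader, hub.dataset.MSRA_NER); the helper name build_reader and the max_seq_len value are illustrative, not part of the demo.

# Sketch only: the demo logic above, wrapped in a hypothetical helper.
import paddlehub as hub

def build_reader(module, dataset, max_seq_len=128):
    # The demo turns task ids on only for ERNIE 2.0 style modules.
    use_taskid = module.name.startswith("ernie_v2")
    return hub.reader.SequenceLabelReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=max_seq_len,
        use_task_id=use_taskid,
        # For ernie_v2_chinese_tiny these point at its sentence-piece model
        # and word dict; the reader keeps its default tokenizer when they
        # are not both set (see the nlp_reader.py change below).
        sp_model_path=module.get_spm_path(),
        word_dict_path=module.get_word_dict_path())

module = hub.Module(name="ernie_v2_chinese_tiny")
reader = build_reader(module, hub.dataset.MSRA_NER())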
demo/text-classification/run_classifier.sh

@@ -13,8 +13,8 @@ python -u text_classifier.py \
                    --weight_decay=0.01 \
                    --max_seq_len=128 \
                    --num_epoch=3 \
-                   --use_pyreader=False \
-                   --use_data_parallel=False
+                   --use_pyreader=True \
+                   --use_data_parallel=True
 
 # Recommending hyper parameters for difference task
 # for ChineseGLUE:
paddlehub/reader/nlp_reader.py

@@ -361,6 +361,34 @@ class ClassifyReader(BaseReader):
 
 class SequenceLabelReader(BaseReader):
+    def __init__(self,
+                 vocab_path,
+                 dataset=None,
+                 label_map_config=None,
+                 max_seq_len=512,
+                 do_lower_case=True,
+                 random_seed=None,
+                 use_task_id=False,
+                 sp_model_path=None,
+                 word_dict_path=None,
+                 in_tokens=False):
+        super(SequenceLabelReader, self).__init__(
+            vocab_path=vocab_path,
+            dataset=dataset,
+            label_map_config=label_map_config,
+            max_seq_len=max_seq_len,
+            do_lower_case=do_lower_case,
+            random_seed=random_seed,
+            use_task_id=use_task_id,
+            sp_model_path=sp_model_path,
+            word_dict_path=word_dict_path,
+            in_tokens=in_tokens)
+        if sp_model_path and word_dict_path:
+            self.tokenizer = tokenization.FullTokenizer(
+                vocab_file=vocab_path,
+                do_lower_case=do_lower_case,
+                use_sentence_piece_vocab=True)
+
     def _pad_batch_records(self, batch_records, phase=None):
         batch_token_ids = [record.token_ids for record in batch_records]
         batch_text_type_ids = [record.text_type_ids for record in batch_records]
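The reader-side change means that when both sp_model_path and word_dict_path are supplied, SequenceLabelReader swaps its tokenizer for a FullTokenizer that understands a SentencePiece-style vocabulary. Below is a minimal sketch of that construction, using only the constructor arguments visible in this diff; the vocab file path is a placeholder, not a shipped asset.

# Sketch only: building the tokenizer the way the patched reader does.
from paddlehub.reader import tokenization

tokenizer = tokenization.FullTokenizer(
    vocab_file="vocab.txt",          # placeholder path, an assumption
    do_lower_case=True,
    use_sentence_piece_vocab=True)   # flag introduced by this commit
print(tokenizer.tokenize(u"unaffable"))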
paddlehub/reader/tokenization.py

@@ -113,11 +113,17 @@ def whitespace_tokenize(text):
 class FullTokenizer(object):
     """Runs end-to-end tokenziation."""
 
-    def __init__(self, vocab_file, do_lower_case=True):
+    def __init__(self,
+                 vocab_file,
+                 do_lower_case=True,
+                 use_sentence_piece_vocab=False):
         self.vocab = load_vocab(vocab_file)
         self.inv_vocab = {v: k for k, v in self.vocab.items()}
         self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
-        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+        self.use_sentence_piece_vocab = use_sentence_piece_vocab
+        self.wordpiece_tokenizer = WordpieceTokenizer(
+            vocab=self.vocab,
+            use_sentence_piece_vocab=self.use_sentence_piece_vocab)
 
     def tokenize(self, text):
         split_tokens = []

@@ -329,10 +335,15 @@ class BasicTokenizer(object):
 class WordpieceTokenizer(object):
     """Runs WordPiece tokenziation."""
 
-    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+    def __init__(self,
+                 vocab,
+                 unk_token="[UNK]",
+                 max_input_chars_per_word=100,
+                 use_sentence_piece_vocab=False):
         self.vocab = vocab
         self.unk_token = unk_token
         self.max_input_chars_per_word = max_input_chars_per_word
+        self.use_sentence_piece_vocab = use_sentence_piece_vocab
 
     def tokenize(self, text):
         """Tokenizes a piece of text into its word pieces.

@@ -369,7 +380,9 @@ class WordpieceTokenizer(object):
             cur_substr = None
             while start < end:
                 substr = "".join(chars[start:end])
-                if start > 0:
+                if start == 0 and self.use_sentence_piece_vocab:
+                    substr = u'\u2581' + substr
+                if start > 0 and not self.use_sentence_piece_vocab:
                     substr = "##" + substr
                 if substr in self.vocab:
                     cur_substr = substr
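The effect of use_sentence_piece_vocab is easiest to see in isolation. The sketch below re-implements just the greedy longest-match loop from the hunk above as a standalone function; it omits the max_input_chars_per_word and is_bad bookkeeping of the real class, and the toy vocabularies are made up for illustration. With a WordPiece vocabulary, continuation pieces are looked up with a "##" prefix; with a SentencePiece-style vocabulary, the word-initial piece is looked up with a leading U+2581 ("▁") marker instead.

# Illustrative re-implementation of the patched matching loop, not PaddleHub code.
def greedy_match(word, vocab, use_sentence_piece_vocab=False, unk_token="[UNK]"):
    chars = list(word)
    output = []
    start = 0
    while start < len(chars):
        end = len(chars)
        cur_substr = None
        while start < end:
            substr = "".join(chars[start:end])
            if start == 0 and use_sentence_piece_vocab:
                substr = u'\u2581' + substr      # SentencePiece word-start marker
            if start > 0 and not use_sentence_piece_vocab:
                substr = "##" + substr           # WordPiece continuation marker
            if substr in vocab:
                cur_substr = substr
                break
            end -= 1
        if cur_substr is None:
            return [unk_token]
        output.append(cur_substr)
        start = end
    return output

wordpiece_vocab = {"un", "##aff", "##able"}            # toy vocab, an assumption
sentencepiece_vocab = {u"\u2581un", "aff", "able"}     # toy vocab, an assumption
print(greedy_match("unaffable", wordpiece_vocab))
# -> ['un', '##aff', '##able']
print(greedy_match("unaffable", sentencepiece_vocab, use_sentence_piece_vocab=True))
# -> ['▁un', 'aff', 'able']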