PaddlePaddle / PaddleHub, commit 510f5407
Authored Aug 02, 2019 by zhangxuefei
Parent: 7f1a2c0b

Update text-cls demo and multi-label cls demo to adapt to ERNIE v2
Showing 7 changed files with 71 additions and 40 deletions (+71, -40)
demo/multi-label-classification/multi_label_classifier.py   +29 -15
demo/multi-label-classification/predict.py                   +31 -19
demo/multi-label-classification/run_classifier.sh             +2  -1
demo/multi-label-classification/run_predict.sh                +1  -1
paddlehub/module/module.py                                    +2  -1
paddlehub/reader/nlp_reader.py                                +5  -2
paddlehub/reader/tokenization.py                              +1  -1
demo/multi-label-classification/multi_label_classifier.py
```diff
@@ -30,37 +30,51 @@ parser.add_argument("--warmup_proportion", type=float, default=0.1, help="Warmup
 parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
 parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest seqence.")
 parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
+parser.add_argument("--use_taskid", type=ast.literal_eval, default=False, help="Whether to user ernie v2 , if not to use bert.")
 args = parser.parse_args()
 # yapf: enable.

 if __name__ == '__main__':
-    # Step1: load Paddlehub BERT pretrained model
-    module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
-    inputs, outputs, program = module.context(
-        trainable=True, max_seq_len=args.max_seq_len)
+    # Load Paddlehub BERT pretrained model
+    if args.use_taskid:
+        module = hub.Module(name="ernie_eng_base.hub_module")
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name,
+            inputs["position_ids"].name,
+            inputs["segment_ids"].name,
+            inputs["input_mask"].name,
+            inputs["task_ids"].name
+        ]
+    else:
+        module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name,
+            inputs["position_ids"].name,
+            inputs["segment_ids"].name,
+            inputs["input_mask"].name,
+        ]

-    # Step2: Download dataset and use MultiLabelReader to read dataset
+    # Download dataset and use MultiLabelReader to read dataset
     dataset = hub.dataset.Toxic()
     reader = hub.reader.MultiLabelClassifyReader(
         dataset=dataset,
         vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
+        max_seq_len=args.max_seq_len,
+        use_task_id=args.use_taskid)

     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     pooled_output = outputs["pooled_output"]

-    # Setup feed list for data feeder
-    # Must feed all the tensor of ERNIE's module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
-
     # Select finetune strategy, setup config and finetune
     strategy = hub.AdamWeightDecayStrategy(
         weight_decay=args.weight_decay,
```
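What the diff boils down to: ERNIE v2 modules consume an extra `task_ids` input, so the demo now branches on `--use_taskid` both to pick the pretrained module and to size the feed list. A minimal standalone sketch of that branching, with plain strings standing in for the PaddleHub module objects (an illustration, not the demo itself):

```python
import argparse
import ast

parser = argparse.ArgumentParser()
parser.add_argument("--use_taskid", type=ast.literal_eval, default=False)
args = parser.parse_args([])  # e.g. parse_args(["--use_taskid", "True"])

# ERNIE v2 vs. BERT: same demo, different module name.
module_name = ("ernie_eng_base.hub_module" if args.use_taskid
               else "bert_uncased_L-12_H-768_A-12")

# ERNIE v2 takes one extra input tensor, task_ids, so the feed list grows.
feed_names = ["input_ids", "position_ids", "segment_ids", "input_mask"]
if args.use_taskid:
    feed_names.append("task_ids")

print(module_name, feed_names)
```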
demo/multi-label-classification/predict.py
```diff
@@ -36,40 +36,52 @@ parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory
 parser.add_argument("--batch_size", type=int, default=1, help="Total examples' number in batch for training.")
 parser.add_argument("--max_seq_len", type=int, default=128, help="Number of words of the longest seqence.")
 parser.add_argument("--use_gpu", type=ast.literal_eval, default=True, help="Whether use GPU for finetuning, input should be True or False")
+parser.add_argument("--use_taskid", type=ast.literal_eval, default=False, help="Whether to user ernie v2 , if not to use bert.")
 args = parser.parse_args()
 # yapf: enable.

 if __name__ == '__main__':
-    # loading Paddlehub ERNIE pretrained model
-    module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
-    inputs, outputs, program = module.context(max_seq_len=args.max_seq_len)
+    # Load Paddlehub BERT pretrained model
+    if args.use_taskid:
+        module = hub.Module(name="ernie_eng_base.hub_module")
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name,
+            inputs["position_ids"].name,
+            inputs["segment_ids"].name,
+            inputs["input_mask"].name,
+            inputs["task_ids"].name
+        ]
+    else:
+        module = hub.Module(name="bert_uncased_L-12_H-768_A-12")
+        inputs, outputs, program = module.context(
+            trainable=True, max_seq_len=args.max_seq_len)
+        # Setup feed list for data feeder
+        feed_list = [
+            inputs["input_ids"].name,
+            inputs["position_ids"].name,
+            inputs["segment_ids"].name,
+            inputs["input_mask"].name,
+        ]

-    # Sentence classification dataset reader
+    # Download dataset and use MultiLabelReader to read dataset
     dataset = hub.dataset.Toxic()
-    num_label = len(dataset.get_labels())
     reader = hub.reader.MultiLabelClassifyReader(
         dataset=dataset,
         vocab_path=module.get_vocab_path(),
-        max_seq_len=args.max_seq_len)
-
-    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
-    exe = fluid.Executor(place)
+        max_seq_len=args.max_seq_len,
+        use_task_id=args.use_taskid)

     # Construct transfer learning network
     # Use "pooled_output" for classification tasks on an entire sentence.
     # Use "sequence_output" for token-level output.
     pooled_output = outputs["pooled_output"]

-    # Setup feed list for data feeder
-    # Must feed all the tensor of ERNIE's module need
-    feed_list = [
-        inputs["input_ids"].name,
-        inputs["position_ids"].name,
-        inputs["segment_ids"].name,
-        inputs["input_mask"].name,
-    ]
-
     # Setup runing config for PaddleHub Finetune API
     config = hub.RunConfig(
         use_data_parallel=False,
@@ -104,7 +116,7 @@ if __name__ == '__main__':
     for result in results:
         # get predict index
         label_ids = []
-        for i in range(num_label):
+        for i in range(dataset.num_labels):
             label_val = np.argmax(result[i])
             label_ids.append(label_val)
         print("%s\tpredict=%s" % (data[index][0], label_ids))
```
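The second hunk swaps the locally computed `num_label` for the dataset's own `num_labels` attribute; the decoding itself is unchanged. A multi-label model emits one two-way (negative/positive) distribution per label, and `np.argmax` turns each into a 0/1 decision. A small self-contained sketch with made-up probabilities:

```python
import numpy as np

num_labels = 6  # the Toxic dataset defines six labels
# Hypothetical per-label output of one example: [P(negative), P(positive)].
result = [np.array([0.9, 0.1]), np.array([0.2, 0.8]), np.array([0.7, 0.3]),
          np.array([0.4, 0.6]), np.array([0.95, 0.05]), np.array([0.1, 0.9])]

# argmax per label, exactly as the predict loop above does.
label_ids = [int(np.argmax(result[i])) for i in range(num_labels)]
print(label_ids)  # [0, 1, 0, 1, 0, 1]
```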
demo/multi-label-classification/run_classifier.sh
```diff
@@ -16,4 +16,5 @@ python -u multi_label_classifier.py \
                    --learning_rate=5e-5 \
                    --weight_decay=0.01 \
                    --max_seq_len=128 \
-                   --num_epoch=3
+                   --num_epoch=3 \
+                   --use_taskid=False
```
demo/multi-label-classification/run_predict.sh
```diff
@@ -2,4 +2,4 @@ export FLAGS_eager_delete_tensor_gb=0.0
 export CUDA_VISIBLE_DEVICES=0
 CKPT_DIR="./ckpt_toxic"
-python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True
+python -u predict.py --checkpoint_dir $CKPT_DIR --max_seq_len 128 --use_gpu True --use_taskid False
```
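Both shell scripts now pass `--use_taskid` as a literal string. The Python side declares these flags with `type=ast.literal_eval` rather than `bool` because `bool("False")` is truthy; a short demonstration:

```python
import ast

print(bool("False"))              # True  (the pitfall argparse's bool() would hit)
print(ast.literal_eval("False"))  # False (what --use_taskid False relies on)
print(ast.literal_eval("True"))   # True
```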
paddlehub/module/module.py
```diff
@@ -581,7 +581,8 @@ class Module(object):
                 "Set maximum sequence length of input tensor to {}".format(max_seq_len))
             for tensor_name in [
-                    "input_ids", "position_ids", "segment_ids", "input_mask"
+                    "input_ids", "position_ids", "segment_ids", "input_mask",
+                    "task_ids"
             ]:
                 seq_tensor_shape = [-1, max_seq_len, 1]
                 logger.info("The shape of input tensor[{}] set to {}".format(
```
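This hunk simply adds "task_ids" to the list of sequence inputs whose shape is rewritten when a maximum sequence length is set. A dependency-free sketch of the bookkeeping (the shape value comes from the diff; everything else is illustrative):

```python
# Every sequence input, including the newly added "task_ids",
# gets the same [-1, max_seq_len, 1] shape.
max_seq_len = 128
seq_tensor_shape = [-1, max_seq_len, 1]
input_tensors = ["input_ids", "position_ids", "segment_ids", "input_mask", "task_ids"]
shapes = {name: seq_tensor_shape for name in input_tensors}
print(shapes["task_ids"])  # [-1, 128, 1]
```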
paddlehub/reader/nlp_reader.py
```diff
@@ -690,8 +690,11 @@ class MultiLabelClassifyReader(BaseReader):
         position_ids = list(range(len(token_ids)))

         label_ids = []
-        for label in example.label:
-            label_ids.append(int(label))
+        if phase == "predict":
+            label_ids = [0, 0, 0, 0, 0, 0]
+        else:
+            for label in example.label:
+                label_ids.append(self.label_map[label])

         if phase != "predict":
             Record = namedtuple(
```
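At predict time there are no gold labels to read, presumably why the reader now fills in six placeholder zeros, one per class of the Toxic dataset, instead of iterating `example.label`. A standalone sketch of the branch, with a hypothetical `label_map` argument standing in for the reader's own mapping:

```python
def convert_labels(example_labels, label_map, phase):
    """Sketch of the label handling above (not the reader's actual method)."""
    if phase == "predict":
        # No gold labels at predict time: six placeholders, one per Toxic class.
        return [0, 0, 0, 0, 0, 0]
    return [label_map[label] for label in example_labels]

print(convert_labels(["1", "0"], {"0": 0, "1": 1}, "train"))  # [1, 0]
print(convert_labels([], {}, "predict"))                      # [0, 0, 0, 0, 0, 0]
```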
paddlehub/reader/tokenization.py
```diff
@@ -71,7 +71,7 @@ def printable_text(text):
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
-    fin = io.open(vocab_file, "r", "UTF-8")
+    fin = io.open(vocab_file, "r", encoding="UTF-8")
     for num, line in enumerate(fin):
         items = convert_to_unicode(line.strip()).split("\t")
         if len(items) > 2:
```
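This one-line fix matters because `io.open`'s third positional parameter is `buffering`, an integer; the old call passed "UTF-8" there and raised a `TypeError`, so the encoding must be given as a keyword. A self-contained before/after sketch of the pattern `load_vocab` uses:

```python
import collections
import io

# Create a tiny vocabulary file so the sketch is self-contained.
with io.open("vocab.txt", "w", encoding="UTF-8") as f:
    f.write(u"[PAD]\n[UNK]\nhello\nworld\n")

# Old form: "UTF-8" lands in the buffering slot and raises a TypeError.
try:
    io.open("vocab.txt", "r", "UTF-8")
except TypeError as err:
    print("old form fails:", err)

# Fixed form: encoding passed by keyword, as in the diff.
vocab = collections.OrderedDict()
with io.open("vocab.txt", "r", encoding="UTF-8") as fin:
    for num, line in enumerate(fin):
        vocab[line.strip()] = num
print(vocab["hello"])  # 2
```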