Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleHub
提交
b9ad9775
P
PaddleHub
项目概览
PaddlePaddle
/
PaddleHub
1 年多 前同步成功
通知
283
Star
12117
Fork
2091
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
200
列表
看板
标记
里程碑
合并请求
4
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleHub
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
200
Issue
200
列表
看板
标记
里程碑
合并请求
4
合并请求
4
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
b9ad9775
编写于
6月 11, 2019
作者:
B
BinLong
浏览文件
操作
浏览文件
下载
差异文件
Merge branch 'develop' of github.com:PaddlePaddle/PaddleHub into develop
上级
92fdb878
e6f4a801
变更
16
隐藏空白更改
内联
并排
Showing
16 changed file
with
240 addition
and
169 deletion
+240
-169
demo/image-classification/README.md
demo/image-classification/README.md
+3
-0
demo/image-classification/img_classifier.py
demo/image-classification/img_classifier.py
+11
-6
demo/image-classification/predict.py
demo/image-classification/predict.py
+8
-5
demo/lac/lac_demo.py
demo/lac/lac_demo.py
+4
-4
demo/senta/senta_demo.py
demo/senta/senta_demo.py
+2
-2
demo/sequence-labeling/predict.py
demo/sequence-labeling/predict.py
+60
-62
demo/sequence-labeling/run_predict.sh
demo/sequence-labeling/run_predict.sh
+1
-1
demo/sequence-labeling/run_sequence_label.sh
demo/sequence-labeling/run_sequence_label.sh
+3
-1
demo/sequence-labeling/sequence_label.py
demo/sequence-labeling/sequence_label.py
+17
-16
demo/text-classification/predict.py
demo/text-classification/predict.py
+49
-39
demo/text-classification/run_classifier.sh
demo/text-classification/run_classifier.sh
+4
-2
demo/text-classification/run_predict.sh
demo/text-classification/run_predict.sh
+1
-1
demo/text-classification/text_classifier.py
demo/text-classification/text_classifier.py
+22
-15
paddlehub/commands/clear.py
paddlehub/commands/clear.py
+1
-1
paddlehub/finetune/task.py
paddlehub/finetune/task.py
+43
-12
paddlehub/module/module.py
paddlehub/module/module.py
+11
-2
未找到文件。
demo/image-classification/README.md
浏览文件 @
b9ad9775
...
...
@@ -36,6 +36,8 @@ $ pip install --upgrade paddlepaddle
--checkpoint_dir
: 模型保存路径,PaddleHub会自动保存验证集上表现最好的模型。默认为paddlehub_finetune_ckpt
--dataset
: 使用什么数据集进行finetune, 脚本支持分别是
{
flowers/dogcat/stanforddogs/indoor67/food101
}
。默认为flowers
--use_gpu
: 是否使用GPU进行训练,如果机器支持GPU且安装了GPU版本的PaddlePaddle,我们建议您打开这个开关。默认关闭
--use_data_parallel
: 是否使用数据并行,打开该开关时,会将数据分散到不同的卡上进行训练(CPU下会分布到不同线程)。默认关闭
--use_pyreader
: 是否使用pyreader进行数据喂入。默认关闭
```
## 进行预测
...
...
@@ -51,6 +53,7 @@ $ pip install --upgrade paddlepaddle
--checkpoint_dir
: 模型保存路径,PaddleHub会自动保存验证集上表现最好的模型。默认为paddlehub_finetune_ckpt
--dataset
: 使用什么数据集进行finetune, 脚本支持分别是
{
flowers/dogcat
}
。默认为flowers
--use_gpu
: 是否使用GPU进行训练,如果本机支持GPU且安装了GPU版本的PaddlePaddle,我们建议您打开这个开关。默认关闭
--use_pyreader
: 是否使用pyreader进行数据喂入。默认关闭
```
`注意`
:进行预测时,所选择的module,checkpoint_dir,dataset必须和finetune所用的一样
demo/image-classification/img_classifier.py
浏览文件 @
b9ad9775
#coding:utf-8
import
argparse
import
os
import
ast
import
paddle.fluid
as
fluid
import
paddlehub
as
hub
...
...
@@ -8,12 +9,14 @@ import numpy as np
# yapf: disable
parser
=
argparse
.
ArgumentParser
(
__doc__
)
parser
.
add_argument
(
"--num_epoch"
,
type
=
int
,
default
=
1
,
help
=
"Number of epoches for fine-tuning."
)
parser
.
add_argument
(
"--use_gpu"
,
type
=
bool
,
default
=
True
,
help
=
"Whether use GPU for fine-tuning."
)
parser
.
add_argument
(
"--checkpoint_dir"
,
type
=
str
,
default
=
"paddlehub_finetune_ckpt"
,
help
=
"Path to save log data."
)
parser
.
add_argument
(
"--batch_size"
,
type
=
int
,
default
=
16
,
help
=
"Total examples' number in batch for training."
)
parser
.
add_argument
(
"--module"
,
type
=
str
,
default
=
"resnet50"
,
help
=
"Module used as feature extractor."
)
parser
.
add_argument
(
"--dataset"
,
type
=
str
,
default
=
"flowers"
,
help
=
"Dataset to finetune."
)
parser
.
add_argument
(
"--num_epoch"
,
type
=
int
,
default
=
1
,
help
=
"Number of epoches for fine-tuning."
)
parser
.
add_argument
(
"--use_gpu"
,
type
=
ast
.
literal_eval
,
default
=
False
,
help
=
"Whether use GPU for fine-tuning."
)
parser
.
add_argument
(
"--checkpoint_dir"
,
type
=
str
,
default
=
"paddlehub_finetune_ckpt"
,
help
=
"Path to save log data."
)
parser
.
add_argument
(
"--batch_size"
,
type
=
int
,
default
=
16
,
help
=
"Total examples' number in batch for training."
)
parser
.
add_argument
(
"--module"
,
type
=
str
,
default
=
"resnet50"
,
help
=
"Module used as feature extractor."
)
parser
.
add_argument
(
"--dataset"
,
type
=
str
,
default
=
"flowers"
,
help
=
"Dataset to finetune."
)
parser
.
add_argument
(
"--use_pyreader"
,
type
=
ast
.
literal_eval
,
default
=
False
,
help
=
"Whether use pyreader to feed data."
)
parser
.
add_argument
(
"--use_data_parallel"
,
type
=
ast
.
literal_eval
,
default
=
False
,
help
=
"Whether use data parallel."
)
# yapf: enable.
module_map
=
{
...
...
@@ -56,6 +59,8 @@ def finetune(args):
feed_list
=
[
img
.
name
]
config
=
hub
.
RunConfig
(
use_data_parallel
=
args
.
use_data_parallel
,
use_pyreader
=
args
.
use_pyreader
,
use_cuda
=
args
.
use_gpu
,
num_epoch
=
args
.
num_epoch
,
batch_size
=
args
.
batch_size
,
...
...
demo/image-classification/predict.py
浏览文件 @
b9ad9775
#coding:utf-8
import
argparse
import
os
import
ast
import
paddle.fluid
as
fluid
import
paddlehub
as
hub
...
...
@@ -8,11 +9,12 @@ import numpy as np
# yapf: disable
parser
=
argparse
.
ArgumentParser
(
__doc__
)
parser
.
add_argument
(
"--use_gpu"
,
type
=
bool
,
default
=
False
,
help
=
"Whether use GPU for predict."
)
parser
.
add_argument
(
"--checkpoint_dir"
,
type
=
str
,
default
=
"paddlehub_finetune_ckpt"
,
help
=
"Path to save log data."
)
parser
.
add_argument
(
"--batch_size"
,
type
=
int
,
default
=
16
,
help
=
"Total examples' number in batch for training."
)
parser
.
add_argument
(
"--module"
,
type
=
str
,
default
=
"resnet50"
,
help
=
"Module used as a feature extractor."
)
parser
.
add_argument
(
"--dataset"
,
type
=
str
,
default
=
"flowers"
,
help
=
"Dataset to finetune."
)
parser
.
add_argument
(
"--use_gpu"
,
type
=
ast
.
literal_eval
,
default
=
False
,
help
=
"Whether use GPU for predict."
)
parser
.
add_argument
(
"--checkpoint_dir"
,
type
=
str
,
default
=
"paddlehub_finetune_ckpt"
,
help
=
"Path to save log data."
)
parser
.
add_argument
(
"--batch_size"
,
type
=
int
,
default
=
16
,
help
=
"Total examples' number in batch for training."
)
parser
.
add_argument
(
"--module"
,
type
=
str
,
default
=
"resnet50"
,
help
=
"Module used as a feature extractor."
)
parser
.
add_argument
(
"--dataset"
,
type
=
str
,
default
=
"flowers"
,
help
=
"Dataset to finetune."
)
parser
.
add_argument
(
"--use_pyreader"
,
type
=
ast
.
literal_eval
,
default
=
False
,
help
=
"Whether use pyreader to feed data."
)
# yapf: enable.
module_map
=
{
...
...
@@ -56,6 +58,7 @@ def predict(args):
config
=
hub
.
RunConfig
(
use_data_parallel
=
False
,
use_pyreader
=
args
.
use_pyreader
,
use_cuda
=
args
.
use_gpu
,
batch_size
=
args
.
batch_size
,
enable_memory_optim
=
False
,
...
...
demo/lac/lac_demo.py
浏览文件 @
b9ad9775
...
...
@@ -19,10 +19,10 @@ if __name__ == "__main__":
results
=
lac
.
lexical_analysis
(
data
=
inputs
)
for
result
in
results
:
if
six
.
PY2
:
print
(
json
.
dumps
(
result
[
'word'
],
encoding
=
"utf8"
,
ensure_ascii
=
False
))
print
(
json
.
dumps
(
result
[
'tag'
],
encoding
=
"utf8"
,
ensure_ascii
=
False
))
print
(
json
.
dumps
(
result
[
'word'
],
encoding
=
"utf8"
,
ensure_ascii
=
False
))
print
(
json
.
dumps
(
result
[
'tag'
],
encoding
=
"utf8"
,
ensure_ascii
=
False
))
else
:
print
(
result
[
'word'
])
print
(
result
[
'tag'
])
demo/senta/senta_demo.py
浏览文件 @
b9ad9775
...
...
@@ -21,7 +21,7 @@ if __name__ == "__main__":
results
[
index
][
"text"
]
=
text
for
index
,
result
in
enumerate
(
results
):
if
six
.
PY2
:
print
(
json
.
dumps
(
results
[
index
],
encoding
=
"utf8"
,
ensure_ascii
=
False
))
print
(
json
.
dumps
(
results
[
index
],
encoding
=
"utf8"
,
ensure_ascii
=
False
))
else
:
print
(
results
[
index
])
demo/sequence-labeling/predict.py
浏览文件 @
b9ad9775
...
...
@@ -33,15 +33,16 @@ from paddlehub.finetune.evaluate import chunk_eval, calculate_f1
parser
=
argparse
.
ArgumentParser
(
__doc__
)
parser
.
add_argument
(
"--checkpoint_dir"
,
type
=
str
,
default
=
None
,
help
=
"Directory to model checkpoint"
)
parser
.
add_argument
(
"--max_seq_len"
,
type
=
int
,
default
=
512
,
help
=
"Number of words of the longest seqence."
)
parser
.
add_argument
(
"--batch_size"
,
type
=
int
,
default
=
1
,
help
=
"Total examples' number in batch for training."
)
parser
.
add_argument
(
"--use_gpu"
,
type
=
ast
.
literal_eval
,
default
=
False
,
help
=
"Whether use GPU for finetuning, input should be True or False"
)
parser
.
add_argument
(
"--use_pyreader"
,
type
=
ast
.
literal_eval
,
default
=
False
,
help
=
"Whether use pyreader to feed data."
)
args
=
parser
.
parse_args
()
# yapf: enable.
if
__name__
==
'__main__'
:
# loading Paddlehub ERNIE pretrained model
module
=
hub
.
Module
(
name
=
"ernie"
)
input_dict
,
output_dict
,
program
=
module
.
context
(
max_seq_len
=
args
.
max_seq_len
)
inputs
,
outputs
,
program
=
module
.
context
(
max_seq_len
=
args
.
max_seq_len
)
# Sentence labeling dataset reader
dataset
=
hub
.
dataset
.
MSRA_NER
()
...
...
@@ -53,70 +54,67 @@ if __name__ == '__main__':
place
=
fluid
.
CUDAPlace
(
0
)
if
args
.
use_gpu
else
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
with
fluid
.
program_guard
(
program
):
# Use "sequence_outputs" for token-level output.
sequence_output
=
output_dict
[
"sequence_output"
]
# Define a classification finetune task by PaddleHub's API
seq_label_task
=
hub
.
create_seq_label_task
(
feature
=
sequence_output
,
num_classes
=
dataset
.
num_labels
,
max_seq_len
=
args
.
max_seq_len
)
# Setup feed list for data feeder
# Must feed all the tensors that ERNIE's module needs
# Compared to the classification task, we also need to add the seq_len tensor to the feed list
feed_list
=
[
input_dict
[
"input_ids"
].
name
,
input_dict
[
"position_ids"
].
name
,
input_dict
[
"segment_ids"
].
name
,
input_dict
[
"input_mask"
].
name
,
seq_label_task
.
variable
(
'label'
).
name
,
seq_label_task
.
variable
(
'seq_len'
).
name
]
fetch_list
=
[
seq_label_task
.
variable
(
"labels"
).
name
,
seq_label_task
.
variable
(
"infers"
).
name
,
seq_label_task
.
variable
(
"seq_len"
).
name
]
# classification probability tensor
probs
=
seq_label_task
.
variable
(
"probs"
)
# load best model checkpoint
fluid
.
io
.
load_persistables
(
exe
,
args
.
checkpoint_dir
)
inference_program
=
program
.
clone
(
for_test
=
True
)
# calculate the num of label from probs variable shape
num_labels
=
seq_label_task
.
variable
(
"probs"
).
shape
[
1
]
data_feeder
=
fluid
.
DataFeeder
(
feed_list
=
feed_list
,
place
=
place
)
test_reader
=
reader
.
data_generator
(
phase
=
'test'
,
shuffle
=
False
)
test_examples
=
dataset
.
get_test_examples
()
total_label
,
total_infer
,
total_correct
=
0.0
,
0.0
,
0.0
for
index
,
batch
in
enumerate
(
test_reader
()):
np_labels
,
np_infers
,
np_lens
=
exe
.
run
(
feed
=
data_feeder
.
feed
(
batch
),
fetch_list
=
fetch_list
,
program
=
inference_program
)
label_num
,
infer_num
,
correct_num
=
chunk_eval
(
np_labels
,
np_infers
,
np_lens
,
num_labels
)
total_infer
+=
infer_num
total_label
+=
label_num
total_correct
+=
correct_num
labels
=
np_labels
.
reshape
([
-
1
]).
astype
(
np
.
int32
).
tolist
()
# Construct transfer learning network
# Use "sequence_output" for token-level output.
sequence_output
=
outputs
[
"sequence_output"
]
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list
=
[
inputs
[
"input_ids"
].
name
,
inputs
[
"position_ids"
].
name
,
inputs
[
"segment_ids"
].
name
,
inputs
[
"input_mask"
].
name
,
]
# Setup running config for PaddleHub Finetune API
config
=
hub
.
RunConfig
(
use_data_parallel
=
False
,
use_pyreader
=
args
.
use_pyreader
,
use_cuda
=
args
.
use_gpu
,
batch_size
=
args
.
batch_size
,
enable_memory_optim
=
False
,
checkpoint_dir
=
args
.
checkpoint_dir
,
strategy
=
hub
.
finetune
.
strategy
.
DefaultFinetuneStrategy
())
# Define a sequence labeling finetune task by PaddleHub's API
seq_label_task
=
hub
.
SequenceLabelTask
(
data_reader
=
reader
,
feature
=
sequence_output
,
feed_list
=
feed_list
,
max_seq_len
=
args
.
max_seq_len
,
num_classes
=
dataset
.
num_labels
,
config
=
config
)
# test data
data
=
[
[
"我们变而以书会友,以书结缘,把欧美、港台流行的食品类图谱、画册、工具书汇集一堂。"
],
[
"为了跟踪国际最新食品工艺、流行趋势,大量搜集海外专业书刊资料是提高技艺的捷径。"
],
[
"其中线装古籍逾千册;民国出版物几百种;珍本四册、稀见本四百余册,出版时间跨越三百余年。"
],
[
"有的古木交柯,春机荣欣,从诗人句中得之,而入画中,观之令人心驰。"
],
[
"不过重在晋趣,略增明人气息,妙在集古有道、不露痕迹罢了。"
],
]
results
=
seq_label_task
.
predict
(
data
=
data
)
for
num_batch
,
batch_results
in
enumerate
(
results
):
infers
=
batch_results
[
0
].
reshape
([
-
1
]).
astype
(
np
.
int32
).
tolist
()
np_lens
=
batch_results
[
1
]
for
index
,
np_len
in
enumerate
(
np_lens
):
labels
=
infers
[
index
*
args
.
max_seq_len
:(
index
+
1
)
*
args
.
max_seq_len
]
label_str
=
""
count
=
0
for
label_val
in
labels
:
label_str
+=
inv_label_map
[
label_val
]
count
+=
1
if
count
==
np_len
s
:
if
count
==
np_len
:
break
print
(
"%s
\t
predict=%s"
%
(
test_examples
[
index
],
label_str
))
precision
,
recall
,
f1
=
calculate_f1
(
total_label
,
total_infer
,
total_correct
)
print
(
"F1-Score=%f, precision=%f, recall=%f "
%
(
f1
,
precision
,
recall
))
# Drop the label results of CLS and SEP Token
print
(
"%s
\t
predict=%s"
%
(
data
[
num_batch
*
args
.
batch_size
+
index
][
0
],
label_str
[
1
:
-
1
]))
demo/sequence-labeling/run_predict.sh
浏览文件 @
b9ad9775
export
CUDA_VISIBLE_DEVICES
=
0
CKPT_DIR
=
"./ckpt_sequence_label
/best_model
"
CKPT_DIR
=
"./ckpt_sequence_label"
python
-u
predict.py
--checkpoint_dir
$CKPT_DIR
--max_seq_len
128
--use_gpu
True
demo/sequence-labeling/run_sequence_label.sh
浏览文件 @
b9ad9775
...
...
@@ -7,4 +7,6 @@ python -u sequence_label.py \
--num_epoch
3
\
--checkpoint_dir
$CKPT_DIR
\
--max_seq_len
256
\
--learning_rate
5e-5
--learning_rate
5e-5
\
--use_pyreader
True
\
--use_data_parallel
True
demo/sequence-labeling/sequence_label.py
浏览文件 @
b9ad9775
...
...
@@ -30,40 +30,34 @@ parser.add_argument("--warmup_proportion", type=float, default=0.0, help="Warmup
parser
.
add_argument
(
"--max_seq_len"
,
type
=
int
,
default
=
512
,
help
=
"Number of words of the longest seqence."
)
parser
.
add_argument
(
"--batch_size"
,
type
=
int
,
default
=
32
,
help
=
"Total examples' number in batch for training."
)
parser
.
add_argument
(
"--checkpoint_dir"
,
type
=
str
,
default
=
None
,
help
=
"Directory to model checkpoint"
)
parser
.
add_argument
(
"--use_pyreader"
,
type
=
ast
.
literal_eval
,
default
=
False
,
help
=
"Whether use pyreader to feed data."
)
parser
.
add_argument
(
"--use_data_parallel"
,
type
=
ast
.
literal_eval
,
default
=
False
,
help
=
"Whether use data parallel."
)
args
=
parser
.
parse_args
()
# yapf: enable.
if
__name__
==
'__main__'
:
#
Step1: l
oad Paddlehub ERNIE pretrained model
#
L
oad Paddlehub ERNIE pretrained model
module
=
hub
.
Module
(
name
=
"ernie"
)
inputs
,
outputs
,
program
=
module
.
context
(
trainable
=
True
,
max_seq_len
=
args
.
max_seq_len
)
#
Step2:
Download dataset and use SequenceLabelReader to read dataset
# Download dataset and use SequenceLabelReader to read dataset
dataset
=
hub
.
dataset
.
MSRA_NER
()
reader
=
hub
.
reader
.
SequenceLabelReader
(
dataset
=
dataset
,
vocab_path
=
module
.
get_vocab_path
(),
max_seq_len
=
args
.
max_seq_len
)
#
Step3: c
onstruct transfer learning network
#
C
onstruct transfer learning network
# Use "sequence_output" for token-level output.
sequence_output
=
outputs
[
"sequence_output"
]
# Define a sequence labeling finetune task by PaddleHub's API
seq_label_task
=
hub
.
create_seq_label_task
(
feature
=
sequence_output
,
max_seq_len
=
args
.
max_seq_len
,
num_classes
=
dataset
.
num_labels
)
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
# Compared to classification task, we need add seq_len tensor to feedlist
feed_list
=
[
inputs
[
"input_ids"
].
name
,
inputs
[
"position_ids"
].
name
,
inputs
[
"segment_ids"
].
name
,
inputs
[
"input_mask"
].
name
,
seq_label_task
.
variable
(
'label'
).
name
,
seq_label_task
.
variable
(
'seq_len'
).
name
inputs
[
"segment_ids"
].
name
,
inputs
[
"input_mask"
].
name
]
# Select a finetune strategy
...
...
@@ -75,16 +69,23 @@ if __name__ == '__main__':
# Setup runing config for PaddleHub Finetune API
config
=
hub
.
RunConfig
(
use_data_parallel
=
args
.
use_data_parallel
,
use_pyreader
=
args
.
use_pyreader
,
use_cuda
=
args
.
use_gpu
,
num_epoch
=
args
.
num_epoch
,
batch_size
=
args
.
batch_size
,
checkpoint_dir
=
args
.
checkpoint_dir
,
strategy
=
strategy
)
# Finetune and evaluate model by PaddleHub's API
# will finish training, evaluation, testing, save model automatically
hub
.
finetune_and_eval
(
task
=
seq_label_task
,
# Define a sequence labeling finetune task by PaddleHub's API
seq_label_task
=
hub
.
SequenceLabelTask
(
data_reader
=
reader
,
feature
=
sequence_output
,
feed_list
=
feed_list
,
max_seq_len
=
args
.
max_seq_len
,
num_classes
=
dataset
.
num_labels
,
config
=
config
)
# Finetune and evaluate model by PaddleHub's API
# will finish training, evaluation, testing, save model automatically
seq_label_task
.
finetune_and_eval
()
demo/text-classification/predict.py
浏览文件 @
b9ad9775
...
...
@@ -31,16 +31,17 @@ import paddlehub as hub
# yapf: disable
parser
=
argparse
.
ArgumentParser
(
__doc__
)
parser
.
add_argument
(
"--checkpoint_dir"
,
type
=
str
,
default
=
None
,
help
=
"Directory to model checkpoint"
)
parser
.
add_argument
(
"--batch_size"
,
type
=
int
,
default
=
1
,
help
=
"Total examples' number in batch for training."
)
parser
.
add_argument
(
"--max_seq_len"
,
type
=
int
,
default
=
512
,
help
=
"Number of words of the longest seqence."
)
parser
.
add_argument
(
"--use_gpu"
,
type
=
ast
.
literal_eval
,
default
=
False
,
help
=
"Whether use GPU for finetuning, input should be True or False"
)
parser
.
add_argument
(
"--use_pyreader"
,
type
=
ast
.
literal_eval
,
default
=
False
,
help
=
"Whether use pyreader to feed data."
)
args
=
parser
.
parse_args
()
# yapf: enable.
if
__name__
==
'__main__'
:
# loading Paddlehub ERNIE pretrained model
module
=
hub
.
Module
(
name
=
"ernie"
)
input_dict
,
output_dict
,
program
=
module
.
context
(
max_seq_len
=
args
.
max_seq_len
)
inputs
,
outputs
,
program
=
module
.
context
(
max_seq_len
=
args
.
max_seq_len
)
# Sentence classification dataset reader
dataset
=
hub
.
dataset
.
ChnSentiCorp
()
...
...
@@ -51,46 +52,55 @@ if __name__ == '__main__':
place
=
fluid
.
CUDAPlace
(
0
)
if
args
.
use_gpu
else
fluid
.
CPUPlace
()
exe
=
fluid
.
Executor
(
place
)
with
fluid
.
program_guard
(
program
):
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_outputs" for token-level output.
pooled_output
=
output_dict
[
"pooled_output"
]
# Define a classfication finetune task by PaddleHub's API
cls_task
=
hub
.
create_text_cls_task
(
feature
=
pooled_output
,
num_classes
=
dataset
.
num_labels
)
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list
=
[
input_dict
[
"input_ids"
].
name
,
input_dict
[
"position_ids"
].
name
,
input_dict
[
"segment_ids"
].
name
,
input_dict
[
"input_mask"
].
name
,
cls_task
.
variable
(
'label'
).
name
]
# Construct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output
=
outputs
[
"pooled_output"
]
# classification probability tensor
probs
=
cls_task
.
variable
(
"probs"
)
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list
=
[
inputs
[
"input_ids"
].
name
,
inputs
[
"position_ids"
].
name
,
inputs
[
"segment_ids"
].
name
,
inputs
[
"input_mask"
].
name
,
]
pred
=
fluid
.
layers
.
argmax
(
probs
,
axis
=
1
)
# Setup running config for PaddleHub Finetune API
config
=
hub
.
RunConfig
(
use_data_parallel
=
False
,
use_pyreader
=
args
.
use_pyreader
,
use_cuda
=
args
.
use_gpu
,
batch_size
=
args
.
batch_size
,
enable_memory_optim
=
False
,
checkpoint_dir
=
args
.
checkpoint_dir
,
strategy
=
hub
.
finetune
.
strategy
.
DefaultFinetuneStrategy
())
# load best model checkpoint
fluid
.
io
.
load_persistables
(
exe
,
args
.
checkpoint_dir
)
# Define a classification finetune task by PaddleHub's API
cls_task
=
hub
.
TextClassifierTask
(
data_reader
=
reader
,
feature
=
pooled_output
,
feed_list
=
feed_list
,
num_classes
=
dataset
.
num_labels
,
config
=
config
)
inference_program
=
program
.
clone
(
for_test
=
True
)
# Data to be predicted
data
=
[
[
"这个宾馆比较陈旧了,特价的房间也很一般。总体来说一般"
],
[
"交通方便;环境很好;服务态度很好 房间较小"
],
[
"还稍微重了点,可能是硬盘大的原故,还要再轻半斤就好了。其他要进一步验证。贴的几种膜气泡较多,用不了多久就要更换了,屏幕膜稍好点,但比没有要强多了。建议配赠几张膜让用用户自己贴。"
],
[
"前台接待太差,酒店有A B楼之分,本人check-in后,前台未告诉B楼在何处,并且B楼无明显指示;房间太小,根本不像4星级设施,下次不会再选择入住此店啦"
],
[
"19天硬盘就罢工了~~~算上运来的一周都没用上15天~~~可就是不能换了~~~唉~~~~你说这算什么事呀~~~"
]
]
data_feeder
=
fluid
.
DataFeeder
(
feed_list
=
feed_list
,
place
=
place
)
test_reader
=
reader
.
data_generator
(
phase
=
'test'
,
shuffle
=
False
)
test_examples
=
dataset
.
get_test_examples
()
total
=
0
correct
=
0
for
index
,
batch
in
enumerate
(
test_reader
()):
pred_v
=
exe
.
run
(
feed
=
data_feeder
.
feed
(
batch
),
fetch_list
=
[
pred
.
name
],
program
=
inference_program
)
total
+=
1
if
(
pred_v
[
0
][
0
]
==
int
(
test_examples
[
index
].
label
)):
correct
+=
1
acc
=
1.0
*
correct
/
total
print
(
"%s
\t
predict=%s"
%
(
test_examples
[
index
],
pred_v
[
0
][
0
]))
print
(
"accuracy = %f"
%
acc
)
index
=
0
results
=
cls_task
.
predict
(
data
=
data
)
for
batch_result
in
results
:
# get predict index
batch_result
=
np
.
argmax
(
batch_result
,
axis
=
2
)[
0
]
for
result
in
batch_result
:
print
(
"%s
\t
predict=%s"
%
(
data
[
index
][
0
],
result
))
index
+=
1
demo/text-classification/run_classifier.sh
浏览文件 @
b9ad9775
export
CUDA_VISIBLE_DEVICES
=
1
export
CUDA_VISIBLE_DEVICES
=
0
# User can select chnsenticorp, nlpcc_dbqa, lcqmc for different task
DATASET
=
"chnsenticorp"
...
...
@@ -16,4 +16,6 @@ python -u text_classifier.py \
--learning_rate
=
5e-5
\
--weight_decay
=
0.01
\
--max_seq_len
=
128
\
--num_epoch
=
3
--num_epoch
=
3
\
--use_pyreader
=
True
\
--use_data_parallel
=
True
\
demo/text-classification/run_predict.sh
浏览文件 @
b9ad9775
export
CUDA_VISIBLE_DEVICES
=
0
CKPT_DIR
=
"./ckpt_chnsenticorp
/best_model
"
CKPT_DIR
=
"./ckpt_chnsenticorp"
python
-u
predict.py
--checkpoint_dir
$CKPT_DIR
--max_seq_len
128
--use_gpu
False
demo/text-classification/text_classifier.py
浏览文件 @
b9ad9775
...
...
@@ -32,17 +32,19 @@ parser.add_argument("--data_dir", type=str, default=None, help="Path to training
parser
.
add_argument
(
"--checkpoint_dir"
,
type
=
str
,
default
=
None
,
help
=
"Directory to model checkpoint"
)
parser
.
add_argument
(
"--max_seq_len"
,
type
=
int
,
default
=
512
,
help
=
"Number of words of the longest seqence."
)
parser
.
add_argument
(
"--batch_size"
,
type
=
int
,
default
=
32
,
help
=
"Total examples' number in batch for training."
)
parser
.
add_argument
(
"--use_pyreader"
,
type
=
ast
.
literal_eval
,
default
=
False
,
help
=
"Whether use pyreader to feed data."
)
parser
.
add_argument
(
"--use_data_parallel"
,
type
=
ast
.
literal_eval
,
default
=
False
,
help
=
"Whether use data parallel."
)
args
=
parser
.
parse_args
()
# yapf: enable.
if
__name__
==
'__main__'
:
#
Step1: l
oad Paddlehub ERNIE pretrained model
#
L
oad Paddlehub ERNIE pretrained model
module
=
hub
.
Module
(
name
=
"ernie"
)
# module = hub.Module(name="bert_multi_cased_L-12_H-768_A-12")
inputs
,
outputs
,
program
=
module
.
context
(
trainable
=
True
,
max_seq_len
=
args
.
max_seq_len
)
#
Step2:
Download dataset and use ClassifyReader to read dataset
# Download dataset and use ClassifyReader to read dataset
dataset
=
None
if
args
.
dataset
.
lower
()
==
"chnsenticorp"
:
dataset
=
hub
.
dataset
.
ChnSentiCorp
()
...
...
@@ -58,39 +60,44 @@ if __name__ == '__main__':
vocab_path
=
module
.
get_vocab_path
(),
max_seq_len
=
args
.
max_seq_len
)
#
Step3: c
onstruct transfer learning network
#
C
onstruct transfer learning network
# Use "pooled_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
pooled_output
=
outputs
[
"pooled_output"
]
# Define a classfication finetune task by PaddleHub's API
cls_task
=
hub
.
create_text_cls_task
(
feature
=
pooled_output
,
num_classes
=
dataset
.
num_labels
)
# Setup feed list for data feeder
# Must feed all the tensor of ERNIE's module need
feed_list
=
[
inputs
[
"input_ids"
].
name
,
inputs
[
"position_ids"
].
name
,
inputs
[
"segment_ids"
].
name
,
inputs
[
"input_mask"
].
name
,
cls_task
.
variable
(
'label'
).
name
inputs
[
"input_ids"
].
name
,
inputs
[
"position_ids"
].
name
,
inputs
[
"segment_ids"
].
name
,
inputs
[
"input_mask"
].
name
,
]
# S
tep4: S
elect finetune strategy, setup config and finetune
# Select finetune strategy, setup config and finetune
strategy
=
hub
.
AdamWeightDecayStrategy
(
weight_decay
=
args
.
weight_decay
,
learning_rate
=
args
.
learning_rate
,
lr_scheduler
=
"linear_decay"
,
)
lr_scheduler
=
"linear_decay"
)
# Setup running config for PaddleHub Finetune API
config
=
hub
.
RunConfig
(
use_data_parallel
=
args
.
use_data_parallel
,
use_pyreader
=
args
.
use_pyreader
,
use_cuda
=
args
.
use_gpu
,
num_epoch
=
args
.
num_epoch
,
batch_size
=
args
.
batch_size
,
checkpoint_dir
=
args
.
checkpoint_dir
,
strategy
=
strategy
)
# Define a classification finetune task by PaddleHub's API
cls_task
=
hub
.
TextClassifierTask
(
data_reader
=
reader
,
feature
=
pooled_output
,
feed_list
=
feed_list
,
num_classes
=
dataset
.
num_labels
,
config
=
config
)
# Finetune and evaluate by PaddleHub's API
# will finish training, evaluation, testing, save model automatically
hub
.
finetune_and_eval
(
task
=
cls_task
,
data_reader
=
reader
,
feed_list
=
feed_list
,
config
=
config
)
cls_task
.
finetune_and_eval
()
paddlehub/commands/clear.py
浏览文件 @
b9ad9775
...
...
@@ -50,7 +50,7 @@ class ClearCommand(BaseCommand):
def
__init__
(
self
,
name
):
super
(
ClearCommand
,
self
).
__init__
(
name
)
self
.
show_in_help
=
True
self
.
description
=
"Clear all cache data."
self
.
description
=
"Clear all cache
d
data."
def
cache_dir
(
self
):
return
CACHE_HOME
...
...
paddlehub/finetune/task.py
浏览文件 @
b9ad9775
...
...
@@ -110,8 +110,17 @@ class BasicTask(object):
# run config
self
.
config
=
config
if
config
else
RunConfig
()
self
.
place
,
self
.
device_count
=
hub
.
common
.
get_running_device_info
(
self
.
config
)
self
.
place
=
self
.
places
[
0
]
self
.
device_count
=
len
(
self
.
places
)
if
self
.
config
.
batch_size
<
self
.
device_count
:
logger
.
warning
(
"Batch size({}) is less than the count of devices({}), which is not allowed in current Paddle versions"
.
format
(
self
.
config
.
batch_size
,
self
.
device_count
))
logger
.
warning
(
"Batch size automatically adjusted to {}"
.
format
(
self
.
device_count
))
self
.
config
.
_batch_size
=
self
.
device_count
self
.
exe
=
fluid
.
Executor
(
place
=
self
.
place
)
self
.
build_strategy
=
fluid
.
BuildStrategy
()
if
self
.
config
.
enable_memory_optim
:
...
...
@@ -239,6 +248,12 @@ class BasicTask(object):
self
.
exe
.
run
(
self
.
env
.
startup_program
)
self
.
_build_env_end_event
()
@
property
def
places
(
self
):
if
self
.
config
.
use_cuda
:
return
fluid
.
framework
.
cuda_places
()
return
fluid
.
framework
.
cpu_places
()
@
property
def
is_train_phase
(
self
):
return
self
.
phase
in
[
"train"
]
...
...
@@ -481,6 +496,9 @@ class BasicTask(object):
period_run_states
=
[]
for
run_step
,
batch
in
enumerate
(
self
.
reader
(),
start
=
1
):
if
self
.
config
.
use_data_parallel
and
len
(
batch
)
<
self
.
device_count
:
continue
step_run_state
=
RunState
(
len
(
self
.
fetch_list
))
step_run_state
.
run_step
=
1
num_batch_examples
=
len
(
batch
)
...
...
@@ -554,10 +572,10 @@ class BasicTask(object):
class
ClassifierTask
(
BasicTask
):
def
__init__
(
self
,
data_reader
,
feature
,
num_classes
,
feed_list
,
data_reader
,
startup_program
=
None
,
config
=
None
,
hidden_units
=
None
):
...
...
@@ -662,10 +680,10 @@ ImageClassifierTask = ClassifierTask
class
TextClassifierTask
(
ClassifierTask
):
def
__init__
(
self
,
data_reader
,
feature
,
num_classes
,
feed_list
,
data_reader
,
startup_program
=
None
,
config
=
None
,
hidden_units
=
None
):
...
...
@@ -711,8 +729,8 @@ class SequenceLabelTask(BasicTask):
feature
,
max_seq_len
,
num_classes
,
data_reader
,
feed_list
,
data_reader
,
startup_program
=
None
,
config
=
None
,
):
...
...
@@ -743,6 +761,14 @@ class SequenceLabelTask(BasicTask):
name
=
"cls_seq_label_out_b"
,
initializer
=
fluid
.
initializer
.
Constant
(
0.
)))
self
.
ret_infers
=
fluid
.
layers
.
reshape
(
x
=
fluid
.
layers
.
argmax
(
self
.
logits
,
axis
=
2
),
shape
=
[
-
1
,
1
])
ret_infers
=
fluid
.
layers
.
assign
(
self
.
ret_infers
)
self
.
seq_len
=
fluid
.
layers
.
data
(
name
=
"seq_len"
,
shape
=
[
1
],
dtype
=
'int64'
)
seq_len
=
fluid
.
layers
.
assign
(
self
.
seq_len
)
logits
=
self
.
logits
logits
=
fluid
.
layers
.
flatten
(
logits
,
axis
=
2
)
logits
=
fluid
.
layers
.
softmax
(
logits
)
...
...
@@ -761,13 +787,8 @@ class SequenceLabelTask(BasicTask):
return
loss
def
_add_metrics
(
self
):
ret_labels
=
fluid
.
layers
.
reshape
(
x
=
self
.
label
,
shape
=
[
-
1
,
1
])
ret_infers
=
fluid
.
layers
.
reshape
(
x
=
fluid
.
layers
.
argmax
(
self
.
logits
,
axis
=
2
),
shape
=
[
-
1
,
1
])
self
.
seq_len
=
fluid
.
layers
.
data
(
name
=
"seq_len"
,
shape
=
[
1
],
dtype
=
'int64'
)
seq_len
=
fluid
.
layers
.
assign
(
self
.
seq_len
)
return
[
ret_labels
,
ret_infers
,
seq_len
]
self
.
ret_labels
=
fluid
.
layers
.
reshape
(
x
=
self
.
label
,
shape
=
[
-
1
,
1
])
return
[
self
.
ret_labels
,
self
.
ret_infers
,
self
.
seq_len
]
def
_build_env_end_event
(
self
):
with
self
.
log_writer
.
mode
(
self
.
phase
)
as
logw
:
...
...
@@ -834,4 +855,14 @@ class SequenceLabelTask(BasicTask):
feed_list
=
[
varname
for
varname
in
self
.
_base_feed_list
]
if
self
.
is_train_phase
or
self
.
is_test_phase
:
feed_list
+=
[
self
.
label
.
name
,
self
.
seq_len
.
name
]
else
:
feed_list
+=
[
self
.
seq_len
.
name
]
return
feed_list
@
property
def
fetch_list
(
self
):
if
self
.
is_train_phase
or
self
.
is_test_phase
:
return
[
metric
.
name
for
metric
in
self
.
metrics
]
+
[
self
.
loss
.
name
]
elif
self
.
is_predict_phase
:
return
[
self
.
ret_infers
.
name
]
+
[
self
.
seq_len
.
name
]
return
[
self
.
output
.
name
]
paddlehub/module/module.py
浏览文件 @
b9ad9775
...
...
@@ -463,13 +463,22 @@ class Module(object):
with
fluid
.
program_guard
(
program
):
result
=
[]
index
=
0
place
=
fluid
.
CPUPlace
()
if
"PADDLEHUB_CUDA_ENABLE"
in
os
.
environ
:
place
=
fluid
.
CUDAPlace
(
0
)
else
:
place
=
fluid
.
CPUPlace
()
if
"PADDLEHUB_BATCH_SIZE"
in
os
.
environ
:
batch_size
=
os
.
environ
[
"PADDLEHUB_BATCH_SIZE"
]
else
:
batch_size
=
1
exe
=
fluid
.
Executor
(
place
=
place
)
data
=
self
.
processor
.
preprocess
(
sign_name
=
sign_name
,
data_dict
=
data
)
data_format
=
self
.
processor
.
data_format
(
sign_name
=
sign_name
)
reader
,
feeder
=
_get_reader_and_feeder
(
data_format
,
data
,
place
)
reader
=
paddle
.
batch
(
reader
,
batch_size
=
2
)
reader
=
paddle
.
batch
(
reader
,
batch_size
=
batch_size
)
for
batch
in
reader
():
data_out
=
exe
.
run
(
feed
=
feeder
.
feed
(
batch
),
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录