Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleHub
提交
e5e64166
P
PaddleHub
项目概览
PaddlePaddle
/
PaddleHub
大约 1 年 前同步成功
通知
282
Star
12117
Fork
2091
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
200
列表
看板
标记
里程碑
合并请求
4
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleHub
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
200
Issue
200
列表
看板
标记
里程碑
合并请求
4
合并请求
4
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e5e64166
编写于
4月 13, 2019
作者:
Z
Zeyu Chen
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add ernie classification prediction
上级
086f5f2c
变更
12
显示空白变更内容
内联
并排
Showing
12 changed file
with
140 addition
and
16 deletion
+140
-16
demo/ernie-classification/cls_predict.py
demo/ernie-classification/cls_predict.py
+83
-0
demo/ernie-classification/ernie_tiny_demo.py
demo/ernie-classification/ernie_tiny_demo.py
+3
-4
demo/ernie-classification/question_answering.py
demo/ernie-classification/question_answering.py
+3
-3
demo/ernie-classification/question_matching.py
demo/ernie-classification/question_matching.py
+3
-3
demo/ernie-classification/run_predict.sh
demo/ernie-classification/run_predict.sh
+4
-0
demo/ernie-classification/sentiment_cls.py
demo/ernie-classification/sentiment_cls.py
+5
-2
paddlehub/dataset/chnsenticorp.py
paddlehub/dataset/chnsenticorp.py
+7
-0
paddlehub/dataset/dataset.py
paddlehub/dataset/dataset.py
+10
-0
paddlehub/dataset/lcqmc.py
paddlehub/dataset/lcqmc.py
+7
-0
paddlehub/dataset/msra_ner.py
paddlehub/dataset/msra_ner.py
+7
-0
paddlehub/dataset/nlpcc_dbqa.py
paddlehub/dataset/nlpcc_dbqa.py
+7
-0
paddlehub/reader/nlp_reader.py
paddlehub/reader/nlp_reader.py
+1
-4
未找到文件。
demo/ernie-classification/cls_predict.py
0 → 100644
浏览文件 @
e5e64166
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on classification task """
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
os
import
time
import
argparse
import
numpy
as
np
import
paddle
import
paddle.fluid
as
fluid
import
paddlehub
as
hub
# yapf: disable
# NOTE: the first positional parameter of ArgumentParser is ``prog``; the
# module docstring has to be passed as ``description`` to show up as help text.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--checkpoint_dir", type=str, default=None, help="Directory to model checkpoint")
parser.add_argument("--max_seq_len", type=int, default=512, help="Number of words of the longest seqence.")
args = parser.parse_args()
# yapf: enable.

if __name__ == '__main__':
    # Fail fast with a readable message instead of handing ``None`` to
    # ``load_persistables`` after the pretrained module has been loaded.
    if not args.checkpoint_dir:
        parser.error("--checkpoint_dir is required")

    # Load the PaddleHub ERNIE pretrained module and its program context.
    module = hub.Module(name="ernie")
    input_dict, output_dict, program = module.context(
        max_seq_len=args.max_seq_len)

    # Sentence classification dataset and the reader that tokenizes it.
    dataset = hub.dataset.ChnSentiCorp()
    reader = hub.reader.ClassifyReader(
        dataset=dataset,
        vocab_path=module.get_vocab_path(),
        max_seq_len=args.max_seq_len)

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    with fluid.program_guard(program):
        label = fluid.layers.data(name="label", shape=[1], dtype='int64')

        # Use "pooled_output" for classification tasks on an entire sentence.
        # Use "sequence_outputs" for token-level output.
        pooled_output = output_dict["pooled_output"]

        # Define a classification finetune task by PaddleHub's API.
        cls_task = hub.create_text_classification_task(
            feature=pooled_output,
            label=label,
            num_classes=dataset.num_labels)

        # Classification probability tensor.
        probs = cls_task.variable("probs")

        # Load the best model checkpoint.
        fluid.io.load_persistables(exe, args.checkpoint_dir)

        # Set up the feed list for the data feeder: every input tensor the
        # ERNIE module needs must be fed, followed by the label.
        feed_list = [
            input_dict["input_ids"].name,
            input_dict["position_ids"].name,
            input_dict["segment_ids"].name,
            input_dict["input_mask"].name,
            label.name,
        ]
        data_feeder = fluid.DataFeeder(feed_list=feed_list, place=place)

        # batch_size=1 is spelled out because the loop below pairs the
        # index of each batch with the test example at the same index.
        test_reader = reader.data_generator(
            batch_size=1, phase='test', shuffle=False)
        test_examples = dataset.get_test_examples()

        for index, batch in enumerate(test_reader()):
            probs_v = exe.run(
                feed=data_feeder.feed(batch), fetch_list=[probs.name])
            # First fetched result, first row: the output for this example.
            print(test_examples[index], probs_v[0][0])
demo/ernie-classification/ernie_tiny_demo.py
浏览文件 @
e5e64166
...
...
@@ -6,10 +6,9 @@ module = hub.Module(name="ernie")
inputs
,
outputs
,
program
=
module
.
context
(
trainable
=
True
,
max_seq_len
=
128
)
# Step2
dataset
=
hub
.
dataset
.
ChnSentiCorp
()
reader
=
hub
.
reader
.
ClassifyReader
(
dataset
=
hub
.
dataset
.
ChnSentiCorp
(),
vocab_path
=
module
.
get_vocab_path
(),
max_seq_len
=
128
)
dataset
=
dataset
,
vocab_path
=
module
.
get_vocab_path
(),
max_seq_len
=
128
)
# Step3
with
fluid
.
program_guard
(
program
):
...
...
@@ -18,7 +17,7 @@ with fluid.program_guard(program):
pooled_output
=
outputs
[
"pooled_output"
]
cls_task
=
hub
.
create_text_classification_task
(
feature
=
pooled_output
,
label
=
label
,
num_classes
=
reader
.
get_num_labels
()
)
feature
=
pooled_output
,
label
=
label
,
num_classes
=
dataset
.
num_labels
)
# Step4
strategy
=
hub
.
AdamWeightDecayStrategy
(
...
...
demo/ernie-classification/question_answering.py
浏览文件 @
e5e64166
...
...
@@ -37,11 +37,11 @@ if __name__ == '__main__':
trainable
=
True
,
max_seq_len
=
args
.
max_seq_len
)
# Step2: Download dataset and use ClassifyReader to read dataset
dataset
=
hub
.
dataset
.
NLPCC_DBQA
()
reader
=
hub
.
reader
.
ClassifyReader
(
dataset
=
hub
.
dataset
.
NLPCC_DBQA
()
,
dataset
=
dataset
,
vocab_path
=
module
.
get_vocab_path
(),
max_seq_len
=
args
.
max_seq_len
)
num_labels
=
len
(
reader
.
get_labels
())
# Step3: construct transfer learning network
with
fluid
.
program_guard
(
program
):
...
...
@@ -59,7 +59,7 @@ if __name__ == '__main__':
]
# Define a classfication finetune task by PaddleHub's API
cls_task
=
hub
.
create_text_classification_task
(
pooled_output
,
label
,
num_classes
=
num_labels
)
pooled_output
,
label
,
num_classes
=
dataset
.
num_labels
)
# Step4: Select finetune strategy, setup config and finetune
strategy
=
hub
.
AdamWeightDecayStrategy
(
...
...
demo/ernie-classification/question_matching.py
浏览文件 @
e5e64166
...
...
@@ -37,11 +37,11 @@ if __name__ == '__main__':
trainable
=
True
,
max_seq_len
=
args
.
max_seq_len
)
# Step2: Download dataset and use ClassifyReader to read dataset
dataset
=
hub
.
dataset
.
LCQMC
()
reader
=
hub
.
reader
.
ClassifyReader
(
dataset
=
hub
.
dataset
.
LCQMC
()
,
dataset
=
dataset
,
vocab_path
=
module
.
get_vocab_path
(),
max_seq_len
=
args
.
max_seq_len
)
num_labels
=
len
(
reader
.
get_labels
())
# Step3: construct transfer learning network
with
fluid
.
program_guard
(
program
):
...
...
@@ -59,7 +59,7 @@ if __name__ == '__main__':
]
# Define a classfication finetune task by PaddleHub's API
cls_task
=
hub
.
create_text_classification_task
(
pooled_output
,
label
,
num_classes
=
num_labels
)
pooled_output
,
label
,
num_classes
=
dataset
.
num_labels
)
# Step4: Select finetune strategy, setup config and finetune
strategy
=
hub
.
AdamWeightDecayStrategy
(
...
...
demo/ernie-classification/run_predict.sh
0 → 100644
浏览文件 @
e5e64166
# Restrict the process to GPU device 1.
export CUDA_VISIBLE_DEVICES=1

# Checkpoint directory handed to the prediction script.
CKPT_DIR="./ckpt_sentiment_cls/best_model"

# Quote the expansion so a path containing spaces stays a single argument.
python -u cls_predict.py --checkpoint_dir "$CKPT_DIR" --max_seq_len 128
demo/ernie-classification/sentiment_cls.py
浏览文件 @
e5e64166
...
...
@@ -37,8 +37,9 @@ if __name__ == '__main__':
trainable
=
True
,
max_seq_len
=
args
.
max_seq_len
)
# Step2: Download dataset and use ClassifyReader to read dataset
dataset
=
hub
.
dataset
.
ChnSentiCorp
()
reader
=
hub
.
reader
.
ClassifyReader
(
dataset
=
hub
.
dataset
.
ChnSentiCorp
()
,
dataset
=
dataset
,
vocab_path
=
module
.
get_vocab_path
(),
max_seq_len
=
args
.
max_seq_len
)
...
...
@@ -58,7 +59,9 @@ if __name__ == '__main__':
]
# Define a classfication finetune task by PaddleHub's API
cls_task
=
hub
.
create_text_classification_task
(
pooled_output
,
label
,
num_classes
=
reader
.
get_num_labels
())
feature
=
pooled_output
,
label
=
label
,
num_classes
=
dataset
.
num_labels
())
# Step4: Select finetune strategy, setup config and finetune
strategy
=
hub
.
AdamWeightDecayStrategy
(
...
...
paddlehub/dataset/chnsenticorp.py
浏览文件 @
e5e64166
...
...
@@ -70,6 +70,13 @@ class ChnSentiCorp(HubDataset):
def
get_labels
(
self
):
return
[
"0"
,
"1"
]
@
property
def
num_labels
(
self
):
"""
Return the number of labels in the dataset.
"""
return
len
(
self
.
get_labels
())
def
_read_tsv
(
self
,
input_file
,
quotechar
=
None
):
"""Reads a tab separated value file."""
with
open
(
input_file
,
"r"
)
as
f
:
...
...
paddlehub/dataset/dataset.py
浏览文件 @
e5e64166
...
...
@@ -40,6 +40,13 @@ class InputExample(object):
self
.
text_b
=
text_b
self
.
label
=
label
def
__str__
(
self
):
if
self
.
text_b
is
None
:
return
"text={}
\t
label={}"
.
format
(
self
.
text_a
,
self
.
label
)
else
:
return
"text_a={}
\t
text_b{},label={}"
.
format
(
self
.
text_a
,
self
.
text_b
,
label
)
class
HubDataset
(
object
):
def
get_train_examples
(
self
):
...
...
@@ -56,3 +63,6 @@ class HubDataset(object):
def
get_labels
(
self
):
raise
NotImplementedError
()
def
num_labels
(
self
):
raise
NotImplementedError
()
paddlehub/dataset/lcqmc.py
浏览文件 @
e5e64166
...
...
@@ -66,6 +66,13 @@ class LCQMC(HubDataset):
"""See base class."""
return
[
"0"
,
"1"
]
@
property
def
num_labels
(
self
):
"""
Return the number of labels in the dataset.
"""
return
len
(
self
.
get_labels
())
def
_read_tsv
(
self
,
input_file
,
quotechar
=
None
):
"""Reads a tab separated value file."""
with
open
(
input_file
,
"r"
)
as
f
:
...
...
paddlehub/dataset/msra_ner.py
浏览文件 @
e5e64166
...
...
@@ -79,6 +79,13 @@ class MSRA_NER(HubDataset):
def
get_labels
(
self
):
return
[
"B-PER"
,
"I-PER"
,
"B-ORG"
,
"I-ORG"
,
"B-LOC"
,
"I-LOC"
,
"O"
]
@
property
def
num_labels
(
self
):
"""
Return the number of labels in the dataset.
"""
return
len
(
self
.
get_labels
())
def
get_label_map
(
self
):
return
self
.
label_map
...
...
paddlehub/dataset/nlpcc_dbqa.py
浏览文件 @
e5e64166
...
...
@@ -72,6 +72,13 @@ class NLPCC_DBQA(HubDataset):
"""See base class."""
return
[
"0"
,
"1"
]
@
property
def
num_labels
(
self
):
"""
Return the number of labels in the dataset.
"""
return
len
(
self
.
get_labels
())
def
_read_tsv
(
self
,
input_file
,
quotechar
=
None
):
"""Reads a tab separated value file."""
with
open
(
input_file
,
"r"
)
as
f
:
...
...
paddlehub/reader/nlp_reader.py
浏览文件 @
e5e64166
...
...
@@ -80,9 +80,6 @@ class BaseReader(object):
"""Gets the list of labels for this data set."""
return
self
.
dataset
.
get_labels
()
def
get_num_labels
(
self
):
return
len
(
self
.
dataset
.
get_labels
())
def
get_train_progress
(
self
):
"""Gets progress for training phase."""
return
self
.
current_example
,
self
.
current_epoch
...
...
@@ -211,7 +208,7 @@ class BaseReader(object):
)
return
self
.
num_examples
[
phase
]
def
data_generator
(
self
,
batch_size
,
phase
=
'train'
,
shuffle
=
True
):
def
data_generator
(
self
,
batch_size
=
1
,
phase
=
'train'
,
shuffle
=
True
):
if
phase
==
'train'
:
examples
=
self
.
get_train_examples
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录