Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleHub
提交
82e1494a
P
PaddleHub
项目概览
PaddlePaddle
/
PaddleHub
1 年多 前同步成功
通知
283
Star
12117
Fork
2091
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
200
列表
看板
标记
里程碑
合并请求
4
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleHub
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
200
Issue
200
列表
看板
标记
里程碑
合并请求
4
合并请求
4
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
82e1494a
编写于
4月 19, 2019
作者:
Z
Zeyu Chen
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
update senta demo
上级
5bd9a50a
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
53 addition
and
20 deletion
+53
-20
demo/senta/run_classifier.sh
demo/senta/run_classifier.sh
+2
-2
demo/senta/text_classifier.py
demo/senta/text_classifier.py
+7
-8
demo/text-classification/simple_demo.py
demo/text-classification/simple_demo.py
+3
-3
paddlehub/reader/__init__.py
paddlehub/reader/__init__.py
+1
-1
paddlehub/reader/nlp_reader.py
paddlehub/reader/nlp_reader.py
+40
-6
未找到文件。
demo/senta/run_classifier.sh
浏览文件 @
82e1494a
export
CUDA_VISIBLE_DEVICES
=
0
export
CUDA_VISIBLE_DEVICES
=
2
# Users can select chnsenticorp, nlpcc_dbqa, or lcqmc for different tasks
DATASET
=
"chnsenticorp"
...
...
@@ -8,4 +8,4 @@ python -u text_classifier.py \
--batch_size
=
24
\
--use_gpu
=
True
\
--checkpoint_dir
=
${
CKPT_DIR
}
\
--num_epoch
=
3
--num_epoch
=
10
demo/senta/text_classifier.py
浏览文件 @
82e1494a
...
...
@@ -21,29 +21,28 @@ if __name__ == '__main__':
# Step2: Download dataset and use TextClassificationReader to read dataset
dataset
=
hub
.
dataset
.
ChnSentiCorp
()
reader
=
hub
.
reader
.
TextClassification
Reader
(
reader
=
hub
.
reader
.
LACTokenize
Reader
(
dataset
=
dataset
,
vocab_path
=
module
.
get_vocab_path
())
# Step3: construct transfer learning network
# Use "sequence_output" for classification tasks on an entire sentence.
# Use "sequence_output" for token-level output.
sequence_output
=
outputs
[
"sequence_output"
]
sent_feature
=
outputs
[
"sequence_output"
]
# Define a classification finetune task using PaddleHub's API
cls_task
=
hub
.
create_text_cls_task
(
feature
=
se
quence_output
,
num_classes
=
dataset
.
num_labels
)
feature
=
se
nt_feature
,
num_classes
=
dataset
.
num_labels
)
# Setup feed list for data feeder
# Must feed all the tensors that senta's module needs
feed_list
=
[
inputs
[
"words"
].
name
,
cls_task
.
variable
(
'label'
).
name
]
# Set up running config for PaddleHub Finetune API
strategy
=
hub
.
finetune
.
strategy
.
AdamWeightDecayStrategy
(
learning_rate
=
1e-3
,
weight_decay
=
0.01
,
warmup_proportion
=
0.01
)
config
=
hub
.
RunConfig
(
use_cuda
=
args
.
use_gpu
,
num_epoch
=
args
.
num_epoch
,
batch_size
=
args
.
batch_size
,
checkpoint_dir
=
args
.
checkpoint_dir
,
strategy
=
hub
.
finetune
.
strategy
.
DefaultFinetuneStrategy
()
)
strategy
=
strategy
)
# Finetune and evaluate using PaddleHub's API;
# this runs training, evaluation, and testing, and saves the model automatically
...
...
demo/text-classification/simple_demo.py
浏览文件 @
82e1494a
...
...
@@ -2,9 +2,9 @@ import paddle.fluid as fluid
import
paddlehub
as
hub
module
=
hub
.
Module
(
name
=
"ernie"
)
inputs
,
outputs
,
program
=
module
.
context
(
trainable
=
True
)
reader
=
hub
.
reader
.
ClassifyReader
(
hub
.
dataset
.
ChnSentiCorp
(),
module
.
get_vocab_path
()
)
inputs
,
outputs
,
program
=
module
.
context
(
trainable
=
True
,
max_seq_len
=
128
)
reader
=
hub
.
reader
.
ClassifyReader
(
hub
.
dataset
.
ChnSentiCorp
(),
module
.
get_vocab_path
(),
max_seq_len
=
128
)
task
=
hub
.
create_text_cls_task
(
feature
=
outputs
[
"pooled_output"
],
num_classes
=
2
)
strategy
=
hub
.
AdamWeightDecayStrategy
(
learning_rate
=
5e-5
)
config
=
hub
.
RunConfig
(
...
...
paddlehub/reader/__init__.py
浏览文件 @
82e1494a
...
...
@@ -14,5 +14,5 @@
from
.nlp_reader
import
ClassifyReader
from
.nlp_reader
import
SequenceLabelReader
from
.nlp_reader
import
TextClassification
Reader
from
.nlp_reader
import
LACTokenize
Reader
from
.cv_reader
import
ImageClassificationReader
paddlehub/reader/nlp_reader.py
浏览文件 @
82e1494a
...
...
@@ -58,7 +58,6 @@ class BaseReader(object):
self
.
current_example
=
0
self
.
current_epoch
=
0
self
.
num_examples
=
0
self
.
num_examples
=
{
'train'
:
-
1
,
'dev'
:
-
1
,
'test'
:
-
1
}
...
...
@@ -383,18 +382,47 @@ class ExtractEmbeddingReader(BaseReader):
return
return_list
class
TextClassificationReader
(
object
):
def
__init__
(
self
,
dataset
,
vocab_path
,
do_lower_case
=
False
):
class
LACTokenizeReader
(
object
):
def
__init__
(
self
,
dataset
,
vocab_path
):
self
.
dataset
=
dataset
self
.
lac
=
hub
.
Module
(
name
=
"lac"
)
self
.
tokenizer
=
tokenization
.
FullTokenizer
(
vocab_file
=
vocab_path
,
do_lower_case
=
do_lower_ca
se
)
vocab_file
=
vocab_path
,
do_lower_case
=
Fal
se
)
self
.
vocab
=
self
.
tokenizer
.
vocab
self
.
lac
=
hub
.
Module
(
name
=
"lac"
)
self
.
feed_key
=
list
(
self
.
lac
.
processor
.
data_format
(
sign_name
=
"lexical_analysis"
).
keys
())[
0
]
self
.
num_examples
=
{
'train'
:
-
1
,
'dev'
:
-
1
,
'test'
:
-
1
}
def
get_num_examples
(
self
,
phase
):
"""Get number of examples for train, dev or test."""
if
phase
not
in
[
'train'
,
'val'
,
'dev'
,
'test'
]:
raise
ValueError
(
"Unknown phase, which should be in ['train', 'val'/'dev', 'test']."
)
return
self
.
num_examples
[
phase
]
def
get_train_examples
(
self
):
"""Gets a collection of `InputExample`s for the train set."""
return
self
.
dataset
.
get_train_examples
()
def
get_dev_examples
(
self
):
"""Gets a collection of `InputExample`s for the dev set."""
return
self
.
dataset
.
get_dev_examples
()
def
get_val_examples
(
self
):
"""Gets a collection of `InputExample`s for the val set."""
return
self
.
dataset
.
get_val_examples
()
def
get_test_examples
(
self
):
"""Gets a collection of `InputExample`s for prediction."""
return
self
.
dataset
.
get_test_examples
()
def
get_train_progress
(
self
):
"""Gets progress for training phase."""
return
self
.
current_example
,
self
.
current_epoch
def
data_generator
(
self
,
batch_size
=
1
,
phase
=
"train"
,
...
...
@@ -402,14 +430,20 @@ class TextClassificationReader(object):
data
=
None
):
if
phase
==
"train"
:
data
=
self
.
dataset
.
get_train_examples
()
self
.
num_examples
[
'train'
]
=
len
(
data
)
elif
phase
==
"test"
:
shuffle
=
False
data
=
self
.
dataset
.
get_test_examples
()
self
.
num_examples
[
'train'
]
=
len
(
data
)
elif
phase
==
"val"
or
phase
==
"dev"
:
shuffle
=
False
data
=
self
.
dataset
.
get_dev_examples
()
self
.
num_examples
[
'test'
]
=
len
(
data
)
elif
phase
==
"predict"
:
data
=
data
else
:
raise
ValueError
(
"Unknown phase, which should be in ['train', 'dev', 'test']."
)
def
preprocess
(
text
):
data_dict
=
{
self
.
feed_key
:
[
text
]}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录