Commit 07e21b51

Authored on Dec 24, 2019 by kinghuin; committed by wuzewu on Dec 24, 2019.

reconsitution reader and dataset (#279)

* reconsitution reader and dataset

Parent commit: 537d3c58
Showing 34 changed files with 950 additions and 1157 deletions (+950 −1157).
- demo/reading-comprehension/predict.py (+0 −13)
- paddlehub/dataset/__init__.py (+1 −1)
- paddlehub/dataset/base_cv_dataset.py (+49 −3)
- paddlehub/dataset/base_nlp_dataset.py (+80 −0)
- paddlehub/dataset/bq.py (+22 −61)
- paddlehub/dataset/chnsenticorp.py (+19 −52)
- paddlehub/dataset/cmrc2018.py (+15 −36)
- paddlehub/dataset/dataset.py (+115 −6)
- paddlehub/dataset/dogcat.py (+24 −9)
- paddlehub/dataset/drcd.py (+15 −38)
- paddlehub/dataset/flowers.py (+24 −9)
- paddlehub/dataset/food101.py (+23 −9)
- paddlehub/dataset/glue.py (+54 −94)
- paddlehub/dataset/iflytek.py (+23 −49)
- paddlehub/dataset/indoor67.py (+24 −9)
- paddlehub/dataset/inews.py (+24 −49)
- paddlehub/dataset/lcqmc.py (+25 −51)
- paddlehub/dataset/msra_ner.py (+27 −54)
- paddlehub/dataset/nlpcc_dbqa.py (+25 −51)
- paddlehub/dataset/squad.py (+39 −62)
- paddlehub/dataset/stanford_dogs.py (+24 −9)
- paddlehub/dataset/thucnews.py (+23 −49)
- paddlehub/dataset/tnews.py (+22 −51)
- paddlehub/dataset/toxic.py (+25 −51)
- paddlehub/dataset/xnli.py (+28 −57)
- paddlehub/finetune/task/__init__.py (+1 −1)
- paddlehub/finetune/task/base_task.py (+2 −2)
- paddlehub/finetune/task/classifier_task.py (+2 −2)
- paddlehub/finetune/task/reading_comprehension_task.py (+16 −17)
- paddlehub/finetune/task/regression_task.py (+2 −2)
- paddlehub/finetune/task/sequence_task.py (+2 −2)
- paddlehub/reader/base_reader.py (+20 −0)
- paddlehub/reader/cv_reader.py (+31 −22)
- paddlehub/reader/nlp_reader.py (+124 −236)
demo/reading-comprehension/predict.py

```diff
@@ -20,20 +20,7 @@ from __future__ import print_function

 import argparse
 import ast
-import collections
-import json
-import io
-import math
-import numpy as np
-import os
-import six
-import sys
-import time
-import paddle
-import paddle.fluid as fluid
 import paddlehub as hub
-from paddlehub.finetune.task.reading_comprehension_task import write_predictions
-
-hub.common.logger.logger.setLevel("INFO")
```
paddlehub/dataset/__init__.py

```diff
@@ -14,7 +14,7 @@
 # limitations under the License.

 # NLP Dataset
-from .dataset import InputExample, HubDataset
+from .dataset import InputExample, BaseDataset
 from .chnsenticorp import ChnSentiCorp
 from .msra_ner import MSRA_NER
 from .nlpcc_dbqa import NLPCC_DBQA
```
paddlehub/dataset/base_cv_dataset.py

```diff
@@ -18,15 +18,61 @@ from __future__ import division
 from __future__ import print_function

 import os
 import numpy as np

+from paddlehub.dataset import BaseDataset
 import paddlehub as hub
 from paddlehub.common.downloader import default_downloader
 from paddlehub.common.logger import logger


+class BaseCVDatast(BaseDataset):
+    def __init__(self,
+                 base_path,
+                 train_list_file=None,
+                 validate_list_file=None,
+                 test_list_file=None,
+                 predict_list_file=None,
+                 label_list_file=None,
+                 label_list=None):
+        super(BaseCVDatast, self).__init__(
+            base_path=base_path,
+            train_file=train_list_file,
+            dev_file=validate_list_file,
+            test_file=test_list_file,
+            predict_file=predict_list_file,
+            label_file=label_list_file,
+            label_list=label_list)
+
+    def _read_file(self, data_path, phase=None):
+        data = []
+        with open(data_path, "r") as file:
+            while True:
+                line = file.readline()
+                if not line:
+                    break
+                line = line.strip()
+                items = line.split(" ")
+                if len(items) > 2:
+                    image_path = " ".join(items[0:-1])
+                else:
+                    image_path = items[0]
+                if not os.path.isabs(image_path):
+                    if self.base_path is not None:
+                        image_path = os.path.join(self.base_path, image_path)
+                label = items[-1]
+                data.append((image_path, label))
+        return data
+
+
+# discarded. please use BaseCVDatast
 class ImageClassificationDataset(object):
     def __init__(self):
+        logger.warning(
+            "ImageClassificationDataset is no longer recommended from PaddleHub v1.5.0, "
+            "please use BaseCVDataset instead of ImageClassificationDataset. "
+            "It's more easy-to-use with more functions and support evaluating test set "
+            "in the end of finetune automatically.")
         self.base_path = None
         self.train_list_file = None
         self.test_list_file = None
@@ -99,12 +145,12 @@ class ImageClassificationDataset(object):
     def test_data(self, shuffle=False):
         test_data_path = os.path.join(self.base_path, self.test_list_file)
-        return self._parse_data(test_data_path, shuffle, phase='dev')
+        return self._parse_data(test_data_path, shuffle, phase='test')

     def validate_data(self, shuffle=False):
         validate_data_path = os.path.join(self.base_path,
                                           self.validate_list_file)
-        return self._parse_data(validate_data_path, shuffle, phase='test')
+        return self._parse_data(validate_data_path, shuffle, phase='dev')

     def get_train_examples(self):
         return self.train_examples
```
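With this change a custom image-classification dataset is declared rather than assembled field by field. Below is a minimal sketch against the new `BaseCVDatast` (class name spelled as in this commit), assuming PaddleHub ≥ 1.5.0 is installed; the directory and list-file names are hypothetical, with each list file holding `<image_path> <label>` lines as parsed by `_read_file` above.

```python
from paddlehub.dataset.base_cv_dataset import BaseCVDatast


class MyImageDataset(BaseCVDatast):
    """Hypothetical dataset rooted at a local directory (no download step)."""

    def __init__(self):
        super(MyImageDataset, self).__init__(
            base_path="/path/to/my_images",         # hypothetical directory
            train_list_file="train_list.txt",       # "<image_path> <label>" per line
            validate_list_file="validate_list.txt",
            test_list_file="test_list.txt",
            label_list_file="label_list.txt")       # one label name per line


if __name__ == "__main__":
    ds = MyImageDataset()
    print(ds.num_labels)  # derived from label_list.txt by the base class
    for path, label in ds.get_train_examples()[:3]:
        print(path, label)
```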
paddlehub/dataset/base_nlp_dataset.py (new file, mode 100644)

```python
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import io
import csv

from paddlehub.dataset import InputExample, BaseDataset


class BaseNLPDatast(BaseDataset):
    def __init__(self,
                 base_path,
                 train_file=None,
                 dev_file=None,
                 test_file=None,
                 predict_file=None,
                 label_file=None,
                 label_list=None,
                 train_file_with_head=False,
                 dev_file_with_head=False,
                 test_file_with_head=False,
                 predict_file_with_head=False):
        super(BaseNLPDatast, self).__init__(
            base_path=base_path,
            train_file=train_file,
            dev_file=dev_file,
            test_file=test_file,
            predict_file=predict_file,
            label_file=label_file,
            label_list=label_list,
            train_file_with_head=train_file_with_head,
            dev_file_with_head=dev_file_with_head,
            test_file_with_head=test_file_with_head,
            predict_file_with_head=predict_file_with_head)

    def _read_file(self, input_file, phase=None):
        """Reads a tab separated value file."""
        with io.open(input_file, "r", encoding="UTF-8") as file:
            reader = csv.reader(file, delimiter="\t", quotechar=None)
            examples = []
            for (i, line) in enumerate(reader):
                if i == 0:
                    ncol = len(line)
                    if self.if_file_with_head[phase]:
                        continue
                if ncol == 1:
                    if phase != "predict":
                        example = InputExample(guid=i, text_a=line[0])
                    else:
                        raise Exception(
                            "the %s file: %s only has one column but it is not a predict file"
                            % (phase, input_file))
                elif ncol == 2:
                    example = InputExample(
                        guid=i, text_a=line[0], label=line[1])
                elif ncol == 3:
                    example = InputExample(
                        guid=i, text_a=line[0], text_b=line[1], label=line[2])
                else:
                    raise Exception(
                        "the %s file: %s has too many columns (should <=3)" %
                        (phase, input_file))
                examples.append(example)
            return examples
```
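`BaseNLPDatast._read_file` infers the example layout from the column count: one column is plain text, two columns are `text_a` plus a label, three are `text_a`, `text_b`, and a label, and the `*_file_with_head` flags skip a header row per split. A minimal sketch of a custom sentence-pair dataset built on it, assuming PaddleHub ≥ 1.5.0; the path and file names here are hypothetical:

```python
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast


class MyPairDataset(BaseNLPDatast):
    """Hypothetical sentence-pair dataset stored as 3-column TSV files."""

    def __init__(self):
        super(MyPairDataset, self).__init__(
            base_path="/path/to/my_nlp_data",  # hypothetical local directory
            train_file="train.tsv",            # text_a \t text_b \t label
            dev_file="dev.tsv",
            label_list=["0", "1"],
            train_file_with_head=True,         # first row is a header, skip it
            dev_file_with_head=True)


if __name__ == "__main__":
    ds = MyPairDataset()
    for e in ds.get_train_examples()[:3]:
        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
```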
paddlehub/dataset/bq.py

```diff
@@ -17,76 +17,37 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-from collections import namedtuple
-import io
 import os
-import csv

-from paddlehub.dataset import InputExample, HubDataset
-from paddlehub.common.downloader import default_downloader
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.common.logger import logger
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast

-_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/bq.tar.gz"

-class BQ(HubDataset):
+class BQ(BaseNLPDatast):
     def __init__(self):
-        self.dataset_dir = os.path.join(DATA_HOME, "bq")
-        if not os.path.exists(self.dataset_dir):
-            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
-        else:
-            logger.info("Dataset {} already cached.".format(self.dataset_dir))
-
-        self._load_train_examples()
-        self._load_test_examples()
-        self._load_dev_examples()
-
-    def _load_train_examples(self):
-        self.train_file = os.path.join(self.dataset_dir, "train.txt")
-        self.train_examples = self._read_file(self.train_file)
-
-    def _load_dev_examples(self):
-        self.dev_file = os.path.join(self.dataset_dir, "dev.txt")
-        self.dev_examples = self._read_file(self.dev_file)
-
-    def _load_test_examples(self):
-        self.test_file = os.path.join(self.dataset_dir, "test.txt")
-        self.test_examples = self._read_file(self.test_file)
-
-    def get_train_examples(self):
-        return self.train_examples
-
-    def get_dev_examples(self):
-        return self.dev_examples
-
-    def get_test_examples(self):
-        return self.test_examples
-
-    def get_labels(self):
-        return ["0", "1"]
-
-    @property
-    def num_labels(self):
-        """
-        Return the number of labels in the dataset.
-        """
-        return len(self.get_labels())
-
-    def _read_file(self, input_file):
-        """Reads a tab separated value file."""
-        with io.open(input_file, "r", encoding="UTF-8") as file:
-            examples = []
-            for (i, line) in enumerate(file):
-                data = line.strip().split("\t")
-                example = InputExample(
-                    guid=i, label=data[2], text_a=data[0], text_b=data[1])
-                examples.append(example)
-            return examples
+        dataset_dir = os.path.join(DATA_HOME, "bq")
+        base_path = self._download_dataset(
+            dataset_dir,
+            url="https://bj.bcebos.com/paddlehub-dataset/bq.tar.gz")
+        super(BQ, self).__init__(
+            base_path=base_path,
+            train_file="train.txt",
+            dev_file="dev.txt",
+            test_file="test.txt",
+            label_file=None,
+            label_list=["0", "1"],
+        )
+
+
+if __name__ == "__main__":
+    ds = BQ()
+    print("first 10 dev")
+    for e in ds.get_dev_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 train")
+    for e in ds.get_train_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 test")
+    for e in ds.get_test_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print(ds)
```
paddlehub/dataset/chnsenticorp.py

```diff
@@ -17,72 +17,39 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-from collections import namedtuple
 import codecs
 import os
 import csv

-from paddlehub.dataset import InputExample, HubDataset
-from paddlehub.common.downloader import default_downloader
+from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.common.logger import logger
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/chnsenticorp.tar.gz"


-class ChnSentiCorp(HubDataset):
+class ChnSentiCorp(BaseNLPDatast):
     """
     ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for
     opinion mining)
     """

     def __init__(self):
-        self.dataset_dir = os.path.join(DATA_HOME, "chnsenticorp")
-        if not os.path.exists(self.dataset_dir):
-            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
-        else:
-            logger.info("Dataset {} already cached.".format(self.dataset_dir))
-
-        self._load_train_examples()
-        self._load_test_examples()
-        self._load_dev_examples()
-
-    def _load_train_examples(self):
-        self.train_file = os.path.join(self.dataset_dir, "train.tsv")
-        self.train_examples = self._read_tsv(self.train_file)
-
-    def _load_dev_examples(self):
-        self.dev_file = os.path.join(self.dataset_dir, "dev.tsv")
-        self.dev_examples = self._read_tsv(self.dev_file)
-
-    def _load_test_examples(self):
-        self.test_file = os.path.join(self.dataset_dir, "test.tsv")
-        self.test_examples = self._read_tsv(self.test_file)
-
-    def get_train_examples(self):
-        return self.train_examples
-
-    def get_dev_examples(self):
-        return self.dev_examples
-
-    def get_test_examples(self):
-        return self.test_examples
-
-    def get_labels(self):
-        return ["0", "1"]
-
-    @property
-    def num_labels(self):
-        """
-        Return the number of labels in the dataset.
-        """
-        return len(self.get_labels())
-
-    def _read_tsv(self, input_file, quotechar=None):
+        dataset_dir = os.path.join(DATA_HOME, "chnsenticorp")
+        base_path = self._download_dataset(
+            dataset_dir,
+            url="https://bj.bcebos.com/paddlehub-dataset/chnsenticorp.tar.gz")
+        super(ChnSentiCorp, self).__init__(
+            base_path=base_path,
+            train_file="train.tsv",
+            dev_file="dev.tsv",
+            test_file="test.tsv",
+            label_file=None,
+            label_list=["0", "1"],
+        )
+
+    def _read_file(self, input_file, phase=None):
         """Reads a tab separated value file."""
         with codecs.open(input_file, "r", encoding="UTF-8") as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            reader = csv.reader(f, delimiter="\t", quotechar=None)
             examples = []
             seq_id = 0
             header = next(reader)  # skip header
@@ -97,5 +64,5 @@ class ChnSentiCorp(HubDataset):
 if __name__ == "__main__":
     ds = ChnSentiCorp()
-    for e in ds.get_train_examples():
+    for e in ds.get_train_examples()[:10]:
         print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
```
paddlehub/dataset/cmrc2018.py

```diff
@@ -16,12 +16,11 @@
 import json
 import os
-import sys

 from paddlehub.reader import tokenization
-from paddlehub.common.downloader import default_downloader
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/cmrc2018.tar.gz"
 SPIECE_UNDERLINE = '▁'
@@ -63,42 +62,22 @@ class CMRC2018Example(object):
         return s


-class CMRC2018(object):
+class CMRC2018(BaseNLPDatast):
     """A single set of features of data."""

     def __init__(self):
-        self.dataset_dir = os.path.join(DATA_HOME, "cmrc2018")
-        if not os.path.exists(self.dataset_dir):
-            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
-        else:
-            logger.info("Dataset {} already cached.".format(self.dataset_dir))
-
-        self._load_train_examples()
-        self._load_dev_examples()
-        self._load_test_examples()
-
-    def _load_train_examples(self):
-        self.train_file = os.path.join(self.dataset_dir, "cmrc2018_train.json")
-        self.train_examples = self._read_json(self.train_file, is_training=True)
-
-    def _load_dev_examples(self):
-        self.dev_file = os.path.join(self.dataset_dir, "cmrc2018_dev.json")
-        self.dev_examples = self._read_json(self.dev_file, is_training=False)
-
-    def _load_test_examples(self):
-        pass
-
-    def get_train_examples(self):
-        return self.train_examples
-
-    def get_dev_examples(self):
-        return self.dev_examples
-
-    def get_test_examples(self):
-        return []
-
-    def _read_json(self, input_file, is_training=False):
+        dataset_dir = os.path.join(DATA_HOME, "cmrc2018")
+        base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
+        super(CMRC2018, self).__init__(
+            base_path=base_path,
+            train_file="cmrc2018_train.json",
+            dev_file="cmrc2018_dev.json",
+            test_file=None,
+            label_file=None,
+            label_list=None,
+        )
+
+    def _read_file(self, input_file, phase=False):
         """Read a cmrc2018 json file into a list of CRCDExample."""

         def _is_chinese_char(cp):
@@ -197,7 +176,7 @@ class CMRC2018(object):
                     #
                     # Note that this means for training mode, every example is NOT
                     # guaranteed to be preserved.
-                    if is_training:
+                    if phase == "train":
                         actual_text = "".join(
                             doc_tokens[start_position:(end_position + 1)])
                         cleaned_answer_text = "".join(
```
paddlehub/dataset/dataset.py

```diff
@@ -17,6 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import os
+
+import paddlehub as hub
+from paddlehub.common.downloader import default_downloader
+from paddlehub.common.logger import logger
+

 class InputExample(object):
     """
@@ -49,21 +55,124 @@ class InputExample(object):
             self.text_a, self.text_b, self.label)


-class HubDataset(object):
+class BaseDataset(object):
+    def __init__(self,
+                 base_path,
+                 train_file=None,
+                 dev_file=None,
+                 test_file=None,
+                 predict_file=None,
+                 label_file=None,
+                 label_list=None,
+                 train_file_with_head=False,
+                 dev_file_with_head=False,
+                 test_file_with_head=False,
+                 predict_file_with_head=False):
+        if not (train_file or dev_file or test_file):
+            raise ValueError("At least one file should be assigned")
+        self.base_path = base_path
+        self.train_file = train_file
+        self.dev_file = dev_file
+        self.test_file = test_file
+        self.predict_file = predict_file
+        self.label_file = label_file
+        self.label_list = label_list
+
+        self.train_examples = []
+        self.dev_examples = []
+        self.test_examples = []
+        self.predict_examples = []
+
+        self.if_file_with_head = {
+            "train": train_file_with_head,
+            "dev": dev_file_with_head,
+            "test": test_file_with_head,
+            "predict": predict_file_with_head
+        }
+
+        if train_file:
+            self._load_train_examples()
+        if dev_file:
+            self._load_dev_examples()
+        if test_file:
+            self._load_test_examples()
+        if predict_file:
+            self._load_predict_examples()
+        if self.label_file:
+            if not self.label_list:
+                self.label_list = self._load_label_data()
+            else:
+                logger.warning(
+                    "As label_list has been assigned, label_file is noneffective")
+
     def get_train_examples(self):
-        raise NotImplementedError()
+        return self.train_examples

     def get_dev_examples(self):
-        raise NotImplementedError()
+        return self.dev_examples

     def get_test_examples(self):
-        raise NotImplementedError()
+        return self.test_examples

     def get_val_examples(self):
         return self.get_dev_examples()

+    def get_predict_examples(self):
+        return self.predict_examples
+
     def get_labels(self):
-        raise NotImplementedError()
+        return self.label_list

     @property
     def num_labels(self):
-        raise NotImplementedError()
+        return len(self.label_list)
+
+    def label_dict(self):
+        return {index: key for index, key in enumerate(self.label_list)}
+
+    def _download_dataset(self, dataset_path, url):
+        if not os.path.exists(dataset_path):
+            result, tips, dataset_path = default_downloader.download_file_and_uncompress(
+                url=url,
+                save_path=hub.common.dir.DATA_HOME,
+                print_progress=True,
+                replace=True)
+            if not result:
+                raise Exception(tips)
+        else:
+            logger.info("Dataset {} already cached.".format(dataset_path))
+        return dataset_path
+
+    def _load_train_examples(self):
+        self.train_path = os.path.join(self.base_path, self.train_file)
+        self.train_examples = self._read_file(self.train_path, phase="train")
+
+    def _load_dev_examples(self):
+        self.dev_path = os.path.join(self.base_path, self.dev_file)
+        self.dev_examples = self._read_file(self.dev_path, phase="dev")
+
+    def _load_test_examples(self):
+        self.test_path = os.path.join(self.base_path, self.test_file)
+        self.test_examples = self._read_file(self.test_path, phase="test")
+
+    def _load_predict_examples(self):
+        self.predict_path = os.path.join(self.base_path, self.predict_file)
+        self.predict_examples = self._read_file(self.predict_path, phase="predict")
+
+    def _read_file(self, path, phase=None):
+        raise NotImplementedError
+
+    def _load_label_data(self):
+        with open(os.path.join(self.base_path, self.label_file), "r") as file:
+            return file.read().split("\n")
+
+    def __str__(self):
+        return "Dataset: %s with %i train examples, %i dev examples and %i test examples" % (
+            self.__class__.__name__, len(self.train_examples),
+            len(self.dev_examples), len(self.test_examples))
+
+
+# add alias, compatible with old version
+HubDataset = BaseDataset
```
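Subclasses now only supply file locations and a `_read_file` implementation; download, eager per-split loading, and label bookkeeping all live in the base class. A minimal sketch of a custom dataset against this API, assuming PaddleHub ≥ 1.5.0; the path, file names, and "text,label" format below are hypothetical:

```python
from paddlehub.dataset import InputExample, BaseDataset


class CommaDataset(BaseDataset):
    """Hypothetical dataset stored as "text,label" lines, one per example."""

    def __init__(self):
        super(CommaDataset, self).__init__(
            base_path="/path/to/comma_data",  # hypothetical local directory
            train_file="train.csv",
            dev_file="dev.csv",
            label_list=["neg", "pos"])

    def _read_file(self, path, phase=None):
        examples = []
        with open(path, "r") as f:
            for i, line in enumerate(f):
                # rsplit so commas inside the text don't break the label off
                text, label = line.strip().rsplit(",", 1)
                examples.append(InputExample(guid=i, text_a=text, label=label))
        return examples
```

The `HubDataset = BaseDataset` alias at the bottom keeps code written against the old name importable.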
paddlehub/dataset/dogcat.py

```diff
@@ -20,18 +20,33 @@ from __future__ import print_function
 import os

 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import ImageClassificationDataset
+from paddlehub.dataset.base_cv_dataset import BaseCVDatast


-class DogCatDataset(ImageClassificationDataset):
+class DogCatDataset(BaseCVDatast):
     def __init__(self):
-        super(DogCatDataset, self).__init__()
         dataset_path = os.path.join(hub.common.dir.DATA_HOME, "dog-cat")
-        self.base_path = self._download_dataset(
+        base_path = self._download_dataset(
             dataset_path=dataset_path,
             url="https://bj.bcebos.com/paddlehub-dataset/dog-cat.tar.gz")
-        self.train_list_file = "train_list.txt"
-        self.test_list_file = "test_list.txt"
-        self.validate_list_file = "validate_list.txt"
-        self.label_list_file = "label_list.txt"
-        self.num_labels = 2
+        super(DogCatDataset, self).__init__(
+            base_path=base_path,
+            train_list_file="train_list.txt",
+            validate_list_file="validate_list.txt",
+            test_list_file="test_list.txt",
+            label_list_file="label_list.txt",
+            label_list=None)
+
+
+if __name__ == "__main__":
+    ds = DogCatDataset()
+    print("first 10 dev")
+    for e in ds.get_dev_examples()[:10]:
+        print(e)
+    print("first 10 train")
+    for e in ds.get_train_examples()[:10]:
+        print(e)
+    print("first 10 test")
+    for e in ds.get_test_examples()[:10]:
+        print(e)
+    print(ds)
```
paddlehub/dataset/drcd.py

```diff
@@ -16,12 +16,11 @@
 import json
 import os
-import sys

 from paddlehub.reader import tokenization
-from paddlehub.common.downloader import default_downloader
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/drcd.tar.gz"
 SPIECE_UNDERLINE = '▁'
@@ -39,8 +38,7 @@ class DRCDExample(object):
                  doc_tokens,
                  orig_answer_text=None,
                  start_position=None,
-                 end_position=None,
-                 is_impossible=False):
+                 end_position=None):
         self.qas_id = qas_id
         self.question_text = question_text
         self.doc_tokens = doc_tokens
@@ -64,43 +62,22 @@ class DRCDExample(object):
         return s


-class DRCD(object):
+class DRCD(BaseNLPDatast):
     """A single set of features of data."""

     def __init__(self):
-        self.dataset_dir = os.path.join(DATA_HOME, "drcd")
-        if not os.path.exists(self.dataset_dir):
-            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
-        else:
-            logger.info("Dataset {} already cached.".format(self.dataset_dir))
-
-        self._load_train_examples()
-        self._load_dev_examples()
-        self._load_test_examples()
-
-    def _load_train_examples(self):
-        self.train_file = os.path.join(self.dataset_dir, "DRCD_training.json")
-        self.train_examples = self._read_json(self.train_file)
-
-    def _load_dev_examples(self):
-        self.dev_file = os.path.join(self.dataset_dir, "DRCD_dev.json")
-        self.dev_examples = self._read_json(self.dev_file)
-
-    def _load_test_examples(self):
-        self.test_file = os.path.join(self.dataset_dir, "DRCD_test.json")
-        self.test_examples = self._read_json(self.test_file)
-
-    def get_train_examples(self):
-        return self.train_examples
-
-    def get_dev_examples(self):
-        return self.dev_examples
-
-    def get_test_examples(self):
-        return self.test_examples
-
-    def _read_json(self, input_file):
+        dataset_dir = os.path.join(DATA_HOME, "drcd")
+        base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
+        super(DRCD, self).__init__(
+            base_path=base_path,
+            train_file="DRCD_training.json",
+            dev_file="DRCD_dev.json",
+            test_file="DRCD_test.json",
+            label_file=None,
+            label_list=None,
+        )
+
+    def _read_file(self, input_file, phase=None):
         """Read a DRCD json file into a list of CRCDExample."""

         def _is_chinese_char(cp):
```
paddlehub/dataset/flowers.py

```diff
@@ -20,18 +20,33 @@ from __future__ import print_function
 import os

 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import ImageClassificationDataset
+from paddlehub.dataset.base_cv_dataset import BaseCVDatast


-class FlowersDataset(ImageClassificationDataset):
+class FlowersDataset(BaseCVDatast):
     def __init__(self):
-        super(FlowersDataset, self).__init__()
         dataset_path = os.path.join(hub.common.dir.DATA_HOME, "flower_photos")
-        self.base_path = self._download_dataset(
+        base_path = self._download_dataset(
             dataset_path=dataset_path,
             url="https://bj.bcebos.com/paddlehub-dataset/flower_photos.tar.gz")
-        self.train_list_file = "train_list.txt"
-        self.test_list_file = "test_list.txt"
-        self.validate_list_file = "validate_list.txt"
-        self.label_list_file = "label_list.txt"
-        self.num_labels = 5
+        super(FlowersDataset, self).__init__(
+            base_path=base_path,
+            train_list_file="train_list.txt",
+            validate_list_file="validate_list.txt",
+            test_list_file="test_list.txt",
+            label_list_file="label_list.txt",
+            label_list=None)
+
+
+if __name__ == "__main__":
+    ds = FlowersDataset()
+    print("first 10 dev")
+    for e in ds.get_dev_examples()[:10]:
+        print(e)
+    print("first 10 train")
+    for e in ds.get_train_examples()[:10]:
+        print(e)
+    print("first 10 test")
+    for e in ds.get_test_examples()[:10]:
+        print(e)
+    print(ds)
```
paddlehub/dataset/food101.py

```diff
@@ -20,19 +20,33 @@ from __future__ import print_function
 import os

 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import ImageClassificationDataset
+from paddlehub.dataset.base_cv_dataset import BaseCVDatast


-class Food101Dataset(ImageClassificationDataset):
+class Food101Dataset(BaseCVDatast):
     def __init__(self):
-        super(Food101Dataset, self).__init__()
         dataset_path = os.path.join(hub.common.dir.DATA_HOME, "food-101",
                                     "images")
-        self.base_path = self._download_dataset(
+        base_path = self._download_dataset(
             dataset_path=dataset_path,
             url="https://bj.bcebos.com/paddlehub-dataset/Food101.tar.gz")
-        self.train_list_file = "train_list.txt"
-        self.test_list_file = "test_list.txt"
-        self.validate_list_file = "validate_list.txt"
-        self.label_list_file = "label_list.txt"
-        self.num_labels = 101
+        super(Food101Dataset, self).__init__(
+            base_path=base_path,
+            train_list_file="train_list.txt",
+            test_list_file="test_list.txt",
+            validate_list_file="validate_list.txt",
+            label_list_file="label_list.txt")
+
+
+if __name__ == "__main__":
+    ds = Food101Dataset()
+    print("first 10 dev")
+    for e in ds.get_dev_examples()[:10]:
+        print(e)
+    print("first 10 train")
+    for e in ds.get_train_examples()[:10]:
+        print(e)
+    print("first 10 test")
+    for e in ds.get_test_examples()[:10]:
+        print(e)
+    print(ds)
```
paddlehub/dataset/glue.py

```diff
@@ -21,15 +21,15 @@ import os
 import csv
 import io

-from paddlehub.dataset import InputExample, HubDataset
-from paddlehub.common.downloader import default_downloader
-from paddlehub.common.dir import DATA_HOME
+from paddlehub.dataset import InputExample
 from paddlehub.common.logger import logger
+from paddlehub.common.dir import DATA_HOME
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/glue_data.tar.gz"


-class GLUE(HubDataset):
+class GLUE(BaseNLPDatast):
     """
     Please refer to
     https://gluebenchmark.com
@@ -43,147 +43,107 @@
                 'RTE', 'SST-2', 'STS-B'
         ]:
             raise Exception(
-                sub_dataset +
-                " is not in GLUE benchmark. Please confirm the data set")
-        self.mismatch = False
+                "%s is not in GLUE benchmark. Please confirm the data set" %
+                sub_dataset)
+        mismatch = False
         if sub_dataset == 'MNLI_mm':
             sub_dataset = 'MNLI'
-            self.mismatch = True
+            mismatch = True
         elif sub_dataset == 'MNLI_m':
             sub_dataset = 'MNLI'
         self.sub_dataset = sub_dataset
-        self.dataset_dir = os.path.join(DATA_HOME, "glue_data")
-        if not os.path.exists(self.dataset_dir):
-            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
-        else:
-            logger.info("Dataset {} already cached.".format(self.dataset_dir))
-
-        self._load_train_examples()
-        self._load_dev_examples()
-        self._load_test_examples()
-        self._load_predict_examples()
-
-    def _load_train_examples(self):
-        self.train_file = os.path.join(self.dataset_dir, self.sub_dataset,
-                                       "train.tsv")
-        self.train_examples = self._read_tsv(self.train_file)
-
-    def _load_dev_examples(self):
-        if self.sub_dataset == 'MNLI' and not self.mismatch:
-            self.dev_file = os.path.join(self.dataset_dir, self.sub_dataset,
-                                         "dev_matched.tsv")
-        elif self.sub_dataset == 'MNLI' and self.mismatch:
-            self.dev_file = os.path.join(self.dataset_dir, self.sub_dataset,
-                                         "dev_mismatched.tsv")
-        else:
-            self.dev_file = os.path.join(self.dataset_dir, self.sub_dataset,
-                                         "dev.tsv")
-        self.dev_examples = self._read_tsv(self.dev_file)
-
-    def _load_test_examples(self):
-        self.test_examples = []
-
-    def _load_predict_examples(self):
-        if self.sub_dataset == 'MNLI' and not self.mismatch:
-            self.predict_file = os.path.join(self.dataset_dir,
-                                             self.sub_dataset,
-                                             "test_matched.tsv")
-        elif self.sub_dataset == 'MNLI' and self.mismatch:
-            self.predict_file = os.path.join(self.dataset_dir,
-                                             self.sub_dataset,
-                                             "test_mismatched.tsv")
-        else:
-            self.predict_file = os.path.join(self.dataset_dir,
-                                             self.sub_dataset, "test.tsv")
-        self.predict_examples = self._read_tsv(self.predict_file, wo_label=True)
-
-    def get_train_examples(self):
-        return self.train_examples
-
-    def get_dev_examples(self):
-        return self.dev_examples
-
-    def get_test_examples(self):
-        return self.test_examples
-
-    def get_predict_examples(self):
-        return self.predict_examples
-
-    def get_labels(self):
-        """See base class."""
-        if self.sub_dataset in ['MRPC', 'QQP', 'SST-2', 'CoLA']:
-            return ["0", "1"]
-        elif self.sub_dataset in ['QNLI', 'RTE']:
-            return ['not_entailment', 'entailment']
-        elif self.sub_dataset in ['MNLI']:
-            return ["neutral", "contradiction", "entailment"]
-        elif self.sub_dataset in ['STS-B']:
-            return Exception("No category labels for regreesion tasks")
-
-    @property
-    def num_labels(self):
-        """
-        Return the number of labels in the dataset.
-        """
-        return len(self.get_labels())
-
-    def _read_tsv(self, input_file, quotechar=None, wo_label=False):
+        # test.tsv has not label,so it is a predict file
+        dev_file = "dev.tsv"
+        predict_file = "test.tsv"
+        if sub_dataset == 'MNLI' and not mismatch:
+            dev_file = 'dev_matched.tsv'
+            predict_file = "test_matched.tsv"
+        elif sub_dataset == 'MNLI' and mismatch:
+            dev_file = 'dev_mismatched.tsv'
+            predict_file = "test_mismatched.tsv"
+        dataset_dir = os.path.join(DATA_HOME, "glue_data")
+        dataset_dir = self._download_dataset(dataset_dir, url=_DATA_URL)
+        base_path = os.path.join(dataset_dir, self.sub_dataset)
+
+        label_list = None
+        if sub_dataset in ['MRPC', 'QQP', 'SST-2', 'CoLA']:
+            label_list = ["0", "1"]
+        elif sub_dataset in ['QNLI', 'RTE']:
+            label_list = ['not_entailment', 'entailment']
+        elif sub_dataset in ['MNLI']:
+            label_list = ["neutral", "contradiction", "entailment"]
+        elif sub_dataset in ['STS-B']:
+            label_list = None
+
+        super(GLUE, self).__init__(
+            base_path=base_path,
+            train_file="train.tsv",
+            dev_file=dev_file,
+            predict_file=predict_file,
+            label_file=None,
+            label_list=label_list,
+        )
+
+    def _read_file(self, input_file, phase=None):
         """Reads a tab separated value file."""
         with io.open(input_file, "r", encoding="UTF-8") as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            reader = csv.reader(f, delimiter="\t", quotechar=None)
             examples = []
             seq_id = 0

-            if self.sub_dataset != 'CoLA' or wo_label:
+            if self.sub_dataset != 'CoLA' or phase == "predict":
                 header = next(reader)  # skip header

             if self.sub_dataset in ['MRPC',]:
-                if wo_label:
+                if phase == "predict":
                     label_index, text_a_index, text_b_index = [None, -2, -1]
                 else:
                     label_index, text_a_index, text_b_index = [0, -2, -1]
             elif self.sub_dataset in ['QNLI',]:
-                if wo_label:
+                if phase == "predict":
                     label_index, text_a_index, text_b_index = [None, 1, 2]
                 else:
                     label_index, text_a_index, text_b_index = [3, 1, 2]
             elif self.sub_dataset in ['QQP',]:
-                if wo_label:
+                if phase == "predict":
                     label_index, text_a_index, text_b_index = [None, 1, 2]
                 else:
                     label_index, text_a_index, text_b_index = [5, 3, 4]
             elif self.sub_dataset in ['RTE',]:
-                if wo_label:
+                if phase == "predict":
                     label_index, text_a_index, text_b_index = [None, 1, 2]
                 else:
                     label_index, text_a_index, text_b_index = [3, 1, 2]
             elif self.sub_dataset in ['SST-2',]:
-                if wo_label:
+                if phase == "predict":
                     label_index, text_a_index, text_b_index = [None, 1, None]
                 else:
                     label_index, text_a_index, text_b_index = [1, 0, None]
             elif self.sub_dataset in ['MNLI',]:
-                if wo_label:
+                if phase == "predict":
                     label_index, text_a_index, text_b_index = [None, 8, 9]
                 else:
                     label_index, text_a_index, text_b_index = [-1, 8, 9]
             elif self.sub_dataset in ['CoLA']:
-                if wo_label:
+                if phase == "predict":
                     label_index, text_a_index, text_b_index = [None, 1, None]
                 else:
                     label_index, text_a_index, text_b_index = [1, 3, None]
             elif self.sub_dataset in ['STS-B']:
-                if wo_label:
+                if phase == "predict":
                     label_index, text_a_index, text_b_index = [None, -2, -1]
                 else:
                     label_index, text_a_index, text_b_index = [-1, -3, -2]
```
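The GLUE wrapper now resolves its split files and label list up front and hands them to the base class, with the predict split (the unlabeled test.tsv) loaded through the same `phase` mechanism. A short usage sketch, assuming PaddleHub ≥ 1.5.0 and a working download of the GLUE archive; the output comments reflect the label lists hard-coded above:

```python
from paddlehub.dataset.glue import GLUE

sst = GLUE(sub_dataset="SST-2")
print(sst.get_labels())                 # ["0", "1"]
print(len(sst.get_predict_examples()))  # rows of test.tsv, loaded without labels

# MNLI_m / MNLI_mm select the matched / mismatched dev and test files.
mnli_mm = GLUE(sub_dataset="MNLI_mm")
print(mnli_mm.get_labels())             # ["neutral", "contradiction", "entailment"]
```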
paddlehub/dataset/iflytek.py

```diff
@@ -17,64 +17,30 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-from collections import namedtuple
 import io
 import os
 import csv

-from paddlehub.dataset import InputExample, HubDataset
-from paddlehub.common.downloader import default_downloader
+from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.common.logger import logger
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/iflytek.tar.gz"


-class IFLYTEK(HubDataset):
+class IFLYTEK(BaseNLPDatast):
     def __init__(self):
-        self.dataset_dir = os.path.join(DATA_HOME, "iflytek")
-        if not os.path.exists(self.dataset_dir):
-            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
-        else:
-            logger.info("Dataset {} already cached.".format(self.dataset_dir))
-
-        self._load_train_examples()
-        self._load_test_examples()
-        self._load_dev_examples()
-
-    def _load_train_examples(self):
-        self.train_file = os.path.join(self.dataset_dir, "train.txt")
-        self.train_examples = self._read_file(self.train_file)
-
-    def _load_dev_examples(self):
-        self.dev_file = os.path.join(self.dataset_dir, "dev.txt")
-        self.dev_examples = self._read_file(self.dev_file)
-
-    def _load_test_examples(self):
-        self.test_file = os.path.join(self.dataset_dir, "test.txt")
-        self.test_examples = self._read_file(self.test_file)
-
-    def get_train_examples(self):
-        return self.train_examples
-
-    def get_dev_examples(self):
-        return self.dev_examples
-
-    def get_test_examples(self):
-        return self.test_examples
-
-    def get_labels(self):
-        return [str(i) for i in range(119)]
-
-    @property
-    def num_labels(self):
-        """
-        Return the number of labels in the dataset.
-        """
-        return len(self.get_labels())
-
-    def _read_file(self, input_file):
+        dataset_dir = os.path.join(DATA_HOME, "iflytek")
+        base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
+        super(IFLYTEK, self).__init__(
+            base_path=base_path,
+            train_file="train.txt",
+            dev_file="dev.txt",
+            test_file="test.txt",
+            label_file=None,
+            label_list=[str(i) for i in range(119)],
+        )
+
+    def _read_file(self, input_file, phase=None):
         """Reads a tab separated value file."""
         with io.open(input_file, "r", encoding="UTF-8") as file:
             examples = []
@@ -91,5 +57,13 @@ class IFLYTEK(HubDataset):
 if __name__ == "__main__":
     ds = IFLYTEK()
+    print("first 10 dev")
+    for e in ds.get_dev_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 train")
+    for e in ds.get_train_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 test")
+    for e in ds.get_test_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print(ds)
```
paddlehub/dataset/indoor67.py

```diff
@@ -20,18 +20,33 @@ from __future__ import print_function
 import os

 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import ImageClassificationDataset
+from paddlehub.dataset.base_cv_dataset import BaseCVDatast


-class Indoor67Dataset(ImageClassificationDataset):
+class Indoor67Dataset(BaseCVDatast):
     def __init__(self):
-        super(Indoor67Dataset, self).__init__()
         dataset_path = os.path.join(hub.common.dir.DATA_HOME, "Indoor67")
-        self.base_path = self._download_dataset(
+        base_path = self._download_dataset(
             dataset_path=dataset_path,
             url="https://bj.bcebos.com/paddlehub-dataset/Indoor67.tar.gz")
-        self.train_list_file = "train_list.txt"
-        self.test_list_file = "test_list.txt"
-        self.validate_list_file = "validate_list.txt"
-        self.label_list_file = "label_list.txt"
-        self.num_labels = 67
+        super(Indoor67Dataset, self).__init__(
+            base_path=base_path,
+            train_list_file="train_list.txt",
+            validate_list_file="validate_list.txt",
+            test_list_file="test_list.txt",
+            label_list_file="label_list.txt",
+            label_list=None)
+
+
+if __name__ == "__main__":
+    ds = Indoor67Dataset()
+    print("first 10 dev")
+    for e in ds.get_dev_examples()[:10]:
+        print(e)
+    print("first 10 train")
+    for e in ds.get_train_examples()[:10]:
+        print(e)
+    print("first 10 test")
+    for e in ds.get_test_examples()[:10]:
+        print(e)
+    print(ds)
```
paddlehub/dataset/inews.py

```diff
@@ -17,73 +17,40 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-from collections import namedtuple
 import io
 import os
-import csv

-from paddlehub.dataset import InputExample, HubDataset
-from paddlehub.common.downloader import default_downloader
+from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.common.logger import logger
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/inews.tar.gz"


-class INews(HubDataset):
+class INews(BaseNLPDatast):
     """
     INews is a sentiment analysis dataset for Internet News
     """

     def __init__(self):
-        self.dataset_dir = os.path.join(DATA_HOME, "inews")
-        if not os.path.exists(self.dataset_dir):
-            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
-        else:
-            logger.info("Dataset {} already cached.".format(self.dataset_dir))
-
-        self._load_train_examples()
-        self._load_test_examples()
-        self._load_dev_examples()
-
-    def _load_train_examples(self):
-        self.train_file = os.path.join(self.dataset_dir, "train.txt")
-        self.train_examples = self._read_file(self.train_file, is_training=True)
-
-    def _load_dev_examples(self):
-        self.dev_file = os.path.join(self.dataset_dir, "dev.txt")
-        self.dev_examples = self._read_file(self.dev_file, is_training=False)
-
-    def _load_test_examples(self):
-        self.test_file = os.path.join(self.dataset_dir, "test.txt")
-        self.test_examples = self._read_file(self.test_file, is_training=False)
-
-    def get_train_examples(self):
-        return self.train_examples
-
-    def get_dev_examples(self):
-        return self.dev_examples
-
-    def get_test_examples(self):
-        return self.test_examples
-
-    def get_labels(self):
-        return ["0", "1", "2"]
-
-    @property
-    def num_labels(self):
-        """
-        Return the number of labels in the dataset.
-        """
-        return len(self.get_labels())
-
-    def _read_file(self, input_file, is_training):
+        dataset_dir = os.path.join(DATA_HOME, "inews")
+        base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
+        super(INews, self).__init__(
+            base_path=base_path,
+            train_file="train.txt",
+            dev_file="dev.txt",
+            test_file="test.txt",
+            label_file=None,
+            label_list=["0", "1", "2"],
+        )
+
+    def _read_file(self, input_file, phase=None):
         """Reads a tab separated value file."""
         with io.open(input_file, "r", encoding="UTF-8") as file:
             examples = []
             for (i, line) in enumerate(file):
-                if i == 0 and is_training:
+                if i == 0 and phase == 'train':
                     continue
                 data = line.strip().split("_!_")
                 example = InputExample(
@@ -94,5 +61,13 @@ class INews(HubDataset):
 if __name__ == "__main__":
     ds = INews()
+    print("first 10 dev")
+    for e in ds.get_dev_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 train")
+    for e in ds.get_train_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 test")
+    for e in ds.get_test_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print(ds)
```
paddlehub/dataset/lcqmc.py

```diff
@@ -17,68 +17,34 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-from collections import namedtuple
 import codecs
 import os
 import csv

-from paddlehub.dataset import InputExample, HubDataset
-from paddlehub.common.downloader import default_downloader
+from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.common.logger import logger
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/lcqmc.tar.gz"


-class LCQMC(HubDataset):
+class LCQMC(BaseNLPDatast):
     def __init__(self):
-        self.dataset_dir = os.path.join(DATA_HOME, "lcqmc")
-        if not os.path.exists(self.dataset_dir):
-            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
-        else:
-            logger.info("Dataset {} already cached.".format(self.dataset_dir))
-
-        self._load_train_examples()
-        self._load_test_examples()
-        self._load_dev_examples()
-
-    def _load_train_examples(self):
-        self.train_file = os.path.join(self.dataset_dir, "train.tsv")
-        self.train_examples = self._read_tsv(self.train_file)
-
-    def _load_dev_examples(self):
-        self.dev_file = os.path.join(self.dataset_dir, "dev.tsv")
-        self.dev_examples = self._read_tsv(self.dev_file)
-
-    def _load_test_examples(self):
-        self.test_file = os.path.join(self.dataset_dir, "test.tsv")
-        self.test_examples = self._read_tsv(self.test_file)
-
-    def get_train_examples(self):
-        return self.train_examples
-
-    def get_dev_examples(self):
-        return self.dev_examples
-
-    def get_test_examples(self):
-        return self.test_examples
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
-    @property
-    def num_labels(self):
-        """
-        Return the number of labels in the dataset.
-        """
-        return len(self.get_labels())
-
-    def _read_tsv(self, input_file, quotechar=None):
+        dataset_dir = os.path.join(DATA_HOME, "lcqmc")
+        base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
+        super(LCQMC, self).__init__(
+            base_path=base_path,
+            train_file="train.tsv",
+            dev_file="dev.tsv",
+            test_file="test.tsv",
+            label_file=None,
+            label_list=["0", "1"],
+        )
+
+    def _read_file(self, input_file, phase=None):
         """Reads a tab separated value file."""
         with codecs.open(input_file, "r", encoding="UTF-8") as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            reader = csv.reader(f, delimiter="\t", quotechar=None)
             examples = []
             seq_id = 0
             header = next(reader)  # skip header
@@ -93,5 +59,13 @@ class LCQMC(HubDataset):
 if __name__ == "__main__":
     ds = LCQMC()
-    for e in ds.get_train_examples():
+    print("first 10 dev")
+    for e in ds.get_dev_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 train")
+    for e in ds.get_train_examples()[:10]:
         print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 test")
+    for e in ds.get_test_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print(ds)
```
paddlehub/dataset/msra_ner.py

```diff
@@ -20,18 +20,15 @@ from __future__ import print_function
 import os
 import codecs
 import csv
-import json

-from collections import namedtuple
-from paddlehub.dataset import InputExample, HubDataset
-from paddlehub.common.downloader import default_downloader
+from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.common.logger import logger
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/msra_ner.tar.gz"


-class MSRA_NER(HubDataset):
+class MSRA_NER(BaseNLPDatast):
     """
     A set of manually annotated Chinese word-segmentation data and
     specifications for training and testing a Chinese word-segmentation system
@@ -40,55 +37,23 @@ class MSRA_NER(HubDataset):
     """

     def __init__(self):
-        self.dataset_dir = os.path.join(DATA_HOME, "msra_ner")
-        if not os.path.exists(self.dataset_dir):
-            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
-        else:
-            logger.info("Dataset {} already cached.".format(self.dataset_dir))
-
-        self._load_train_examples()
-        self._load_test_examples()
-        self._load_dev_examples()
-
-    def _load_train_examples(self):
-        train_file = os.path.join(self.dataset_dir, "train.tsv")
-        self.train_examples = self._read_tsv(train_file)
-
-    def _load_dev_examples(self):
-        self.dev_file = os.path.join(self.dataset_dir, "dev.tsv")
-        self.dev_examples = self._read_tsv(self.dev_file)
-
-    def _load_test_examples(self):
-        self.test_file = os.path.join(self.dataset_dir, "test.tsv")
-        self.test_examples = self._read_tsv(self.test_file)
-
-    def get_train_examples(self):
-        return self.train_examples
-
-    def get_dev_examples(self):
-        return self.dev_examples
-
-    def get_test_examples(self):
-        return self.test_examples
-
-    def get_labels(self):
-        return ["B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"]
-
-    @property
-    def num_labels(self):
-        """
-        Return the number of labels in the dataset.
-        """
-        return len(self.get_labels())
-
-    def get_label_map(self):
-        return self.label_map
-
-    def _read_tsv(self, input_file, quotechar=None):
+        dataset_dir = os.path.join(DATA_HOME, "msra_ner")
+        base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
+        super(MSRA_NER, self).__init__(
+            base_path=base_path,
+            train_file="train.tsv",
+            dev_file="dev.tsv",
+            test_file="test.tsv",
+            label_file=None,
+            label_list=[
+                "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O"
+            ],
+        )
+
+    def _read_file(self, input_file, phase=None):
         """Reads a tab separated value file."""
         with codecs.open(input_file, "r", encoding="UTF-8") as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            reader = csv.reader(f, delimiter="\t", quotechar=None)
             examples = []
             seq_id = 0
             header = next(reader)  # skip header
@@ -103,5 +68,13 @@ class MSRA_NER(HubDataset):
 if __name__ == "__main__":
     ds = MSRA_NER()
-    for e in ds.get_train_examples():
+    print("first 10 dev")
+    for e in ds.get_dev_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 train")
+    for e in ds.get_train_examples()[:10]:
         print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 test")
+    for e in ds.get_test_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print(ds)
```
paddlehub/dataset/nlpcc_dbqa.py

```diff
@@ -17,20 +17,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-from collections import namedtuple
 import codecs
 import os
 import csv

-from paddlehub.dataset import InputExample, HubDataset
-from paddlehub.common.downloader import default_downloader
+from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.common.logger import logger
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/nlpcc-dbqa.tar.gz"


-class NLPCC_DBQA(HubDataset):
+class NLPCC_DBQA(BaseNLPDatast):
     """
     Please refer to
     http://tcci.ccf.org.cn/conference/2017/dldoc/taskgline05.pdf
@@ -38,53 +36,21 @@ class NLPCC_DBQA(HubDataset):
     """

     def __init__(self):
-        self.dataset_dir = os.path.join(DATA_HOME, "nlpcc-dbqa")
-        if not os.path.exists(self.dataset_dir):
-            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
-        else:
-            logger.info("Dataset {} already cached.".format(self.dataset_dir))
-
-        self._load_train_examples()
-        self._load_test_examples()
-        self._load_dev_examples()
-
-    def _load_train_examples(self):
-        self.train_file = os.path.join(self.dataset_dir, "train.tsv")
-        self.train_examples = self._read_tsv(self.train_file)
-
-    def _load_dev_examples(self):
-        self.dev_file = os.path.join(self.dataset_dir, "dev.tsv")
-        self.dev_examples = self._read_tsv(self.dev_file)
-
-    def _load_test_examples(self):
-        self.test_file = os.path.join(self.dataset_dir, "test.tsv")
-        self.test_examples = self._read_tsv(self.test_file)
-
-    def get_train_examples(self):
-        return self.train_examples
-
-    def get_dev_examples(self):
-        return self.dev_examples
-
-    def get_test_examples(self):
-        return self.test_examples
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1"]
-
-    @property
-    def num_labels(self):
-        """
-        Return the number of labels in the dataset.
-        """
-        return len(self.get_labels())
-
-    def _read_tsv(self, input_file, quotechar=None):
+        dataset_dir = os.path.join(DATA_HOME, "nlpcc-dbqa")
+        base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
+        super(NLPCC_DBQA, self).__init__(
+            base_path=base_path,
+            train_file="train.tsv",
+            dev_file="dev.tsv",
+            test_file="test.tsv",
+            label_file=None,
+            label_list=["0", "1"],
+        )
+
+    def _read_file(self, input_file, phase=None):
         """Reads a tab separated value file."""
         with codecs.open(input_file, "r", encoding="UTF-8") as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            reader = csv.reader(f, delimiter="\t", quotechar=None)
             examples = []
             seq_id = 0
             header = next(reader)  # skip header
@@ -99,5 +65,13 @@ class NLPCC_DBQA(HubDataset):
 if __name__ == "__main__":
     ds = NLPCC_DBQA()
-    for e in ds.get_train_examples():
+    print("first 10 dev")
+    for e in ds.get_dev_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 train")
+    for e in ds.get_train_examples()[:10]:
         print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 test")
+    for e in ds.get_test_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print(ds)
```
paddlehub/dataset/squad.py  (view file @ 07e21b51)

@@ -16,12 +16,11 @@
 import json
 import os
-import sys

 from paddlehub.reader import tokenization
-from paddlehub.common.downloader import default_downloader
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz"
...

@@ -66,61 +65,31 @@ class SquadExample(object):
         return s


-class SQUAD(object):
+class SQUAD(BaseNLPDatast):
     """A single set of features of data."""

     def __init__(self, version_2_with_negative=False):
-        self.dataset_dir = os.path.join(DATA_HOME, "squad_data")
-        if not os.path.exists(self.dataset_dir):
-            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
-        else:
-            logger.info("Dataset {} already cached.".format(self.dataset_dir))
         self.version_2_with_negative = version_2_with_negative
-        self._load_train_examples(version_2_with_negative, if_has_answer=True)
-        self._load_dev_examples(version_2_with_negative, if_has_answer=True)
-
-    def _load_train_examples(self, version_2_with_negative=False,
-                             if_has_answer=True):
         if not version_2_with_negative:
-            self.train_file = os.path.join(self.dataset_dir, "train-v1.1.json")
-        else:
-            self.train_file = os.path.join(self.dataset_dir, "train-v2.0.json")
-        self.train_examples = self._read_json(self.train_file, if_has_answer,
-                                              version_2_with_negative)
-
-    def _load_dev_examples(self, version_2_with_negative=False,
-                           if_has_answer=True):
-        if not version_2_with_negative:
-            self.dev_file = os.path.join(self.dataset_dir, "dev-v1.1.json")
+            train_file = "train-v1.1.json"
+            dev_file = "dev-v1.1.json"
         else:
-            self.dev_file = os.path.join(self.dataset_dir, "dev-v2.0.json")
-            self.dev_examples = self._read_json(self.dev_file, if_has_answer,
-                                                version_2_with_negative)
-
-    def _load_test_examples(self, version_2_with_negative=False,
-                            is_training=False):
-        self.test_file = None
-        logger.error("not test_file")
-
-    def get_train_examples(self):
-        return self.train_examples
-
-    def get_dev_examples(self):
-        return self.dev_examples
-
-    def get_test_examples(self):
-        return []
-
-    def _read_json(self, input_file, if_has_answer,
-                   version_2_with_negative=False):
+            train_file = "train-v2.0.json"
+            dev_file = "dev-v2.0.json"
+        dataset_dir = os.path.join(DATA_HOME, "squad_data")
+        base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
+        super(SQUAD, self).__init__(
+            base_path=base_path,
+            train_file=train_file,
+            dev_file=dev_file,
+            test_file=None,
+            label_file=None,
+            label_list=None,
+        )

+    def _read_file(self, input_file, phase=None):
         """Read a SQuAD json file into a list of SquadExample."""
         with open(input_file, "r") as reader:
             input_data = json.load(reader)["data"]
...

@@ -156,13 +125,15 @@ class SQUAD(object):
                 end_position = None
                 orig_answer_text = None
                 is_impossible = False
-                if if_has_answer:
-                    if version_2_with_negative:
+                if phase in ["train", "dev"]:
+                    if self.version_2_with_negative:
                         is_impossible = qa["is_impossible"]
-                    # if (len(qa["answers"]) != 1) and (not is_impossible):
-                    #     raise ValueError(
-                    #         "For training, each question should have exactly 1 answer."
-                    #     )
+                    if phase == "train" and (len(qa["answers"]) != 1) and (
+                            not is_impossible):
+                        print(qa)
+                        raise ValueError(
+                            "For training, each question should have exactly 1 answer."
+                        )
                     if not is_impossible:
                         answer = qa["answers"][0]
                         orig_answer_text = answer["text"]
...

@@ -206,8 +177,14 @@ class SQUAD(object):
 if __name__ == "__main__":
-    ds = SQUAD(version_2_with_negative=False)
-    examples = ds.get_train_examples()
-    for index, e in enumerate(examples):
-        if index < 10:
-            print(e)
+    ds = SQUAD(version_2_with_negative=True)
+    print("first 10 dev")
+    for e in ds.get_dev_examples()[:2]:
+        print(e)
+    print("first 10 train")
+    for e in ds.get_train_examples()[:2]:
+        print(e)
+    print("first 10 test")
+    for e in ds.get_test_examples()[:2]:
+        print(e)
+    print(ds)
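With the download and split boilerplate gone, choosing SQuAD v1.1 or v2.0 is just the constructor flag, and the base class resolves the split files against the downloaded squad_data directory. A usage sketch, assuming PaddleHub at this commit is installed:

from paddlehub.dataset.squad import SQUAD

ds = SQUAD(version_2_with_negative=True)  # resolves train-v2.0.json / dev-v2.0.json
print(len(ds.get_train_examples()))
for example in ds.get_dev_examples()[:2]:
    print(example)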
paddlehub/dataset/stanford_dogs.py  (view file @ 07e21b51)

@@ -20,20 +20,35 @@ from __future__ import print_function
 import os

 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import ImageClassificationDataset
+from paddlehub.dataset.base_cv_dataset import BaseCVDatast


-class StanfordDogsDataset(ImageClassificationDataset):
+class StanfordDogsDataset(BaseCVDatast):
     def __init__(self):
-        super(StanfordDogsDataset, self).__init__()
         dataset_path = os.path.join(hub.common.dir.DATA_HOME,
                                     "StanfordDogs-120")
-        self.base_path = self._download_dataset(
+        base_path = self._download_dataset(
             dataset_path=dataset_path,
             url="https://bj.bcebos.com/paddlehub-dataset/StanfordDogs-120.tar.gz")
-        self.train_list_file = "train_list.txt"
-        self.test_list_file = "test_list.txt"
-        self.validate_list_file = "validate_list.txt"
-        self.label_list_file = "label_list.txt"
-        self.num_labels = 120
+        super(StanfordDogsDataset, self).__init__(
+            base_path=base_path,
+            train_list_file="train_list.txt",
+            validate_list_file="validate_list.txt",
+            test_list_file="test_list.txt",
+            label_list_file="label_list.txt",
+            label_list=None)
+
+
+if __name__ == "__main__":
+    ds = StanfordDogsDataset()
+    print("first 10 dev")
+    for e in ds.get_dev_examples()[:10]:
+        print(e)
+    print("first 10 train")
+    for e in ds.get_train_examples()[:10]:
+        print(e)
+    print("first 10 test")
+    for e in ds.get_test_examples()[:10]:
+        print(e)
+    print(ds)
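The CV datasets follow the same scheme as the NLP ones: list files are passed to the BaseCVDatast constructor (again, the name as spelled in this commit) instead of being set as instance attributes. A sketch of a custom image-classification dataset; the my_images directory and its list files are hypothetical:

import os

import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import BaseCVDatast


class MyImageDataset(BaseCVDatast):
    def __init__(self):
        base_path = os.path.join(hub.common.dir.DATA_HOME, "my_images")
        super(MyImageDataset, self).__init__(
            base_path=base_path,
            train_list_file="train_list.txt",
            validate_list_file="validate_list.txt",
            test_list_file="test_list.txt",
            label_list_file="label_list.txt",
            label_list=None)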
paddlehub/dataset/thucnews.py  (view file @ 07e21b51)

@@ -17,64 +17,30 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-from collections import namedtuple
 import io
 import os
 import csv

-from paddlehub.dataset import InputExample, HubDataset
-from paddlehub.common.downloader import default_downloader
+from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.common.logger import logger
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/thucnews.tar.gz"


-class THUCNEWS(HubDataset):
+class THUCNEWS(BaseNLPDatast):
     def __init__(self):
-        self.dataset_dir = os.path.join(DATA_HOME, "thucnews")
-        if not os.path.exists(self.dataset_dir):
-            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
-        else:
-            logger.info("Dataset {} already cached.".format(self.dataset_dir))
-
-        self._load_train_examples()
-        self._load_test_examples()
-        self._load_dev_examples()
-
-    def _load_train_examples(self):
-        self.train_file = os.path.join(self.dataset_dir, "train.txt")
-        self.train_examples = self._read_file(self.train_file)
-
-    def _load_dev_examples(self):
-        self.dev_file = os.path.join(self.dataset_dir, "dev.txt")
-        self.dev_examples = self._read_file(self.dev_file)
-
-    def _load_test_examples(self):
-        self.test_file = os.path.join(self.dataset_dir, "test.txt")
-        self.test_examples = self._read_file(self.test_file)
-
-    def get_train_examples(self):
-        return self.train_examples
-
-    def get_dev_examples(self):
-        return self.dev_examples
-
-    def get_test_examples(self):
-        return self.test_examples
-
-    def get_labels(self):
-        return [str(i) for i in range(14)]
-
-    @property
-    def num_labels(self):
-        """
-        Return the number of labels in the dataset.
-        """
-        return len(self.get_labels())
-
-    def _read_file(self, input_file):
+        dataset_dir = os.path.join(DATA_HOME, "thucnews")
+        base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
+        super(THUCNEWS, self).__init__(
+            base_path=base_path,
+            train_file="train.txt",
+            dev_file="dev.txt",
+            test_file="test.txt",
+            label_file=None,
+            label_list=[str(i) for i in range(14)],
+        )

+    def _read_file(self, input_file, phase=None):
         """Reads a tab separated value file."""
         with io.open(input_file, "r", encoding="UTF-8") as file:
             examples = []
...

@@ -91,5 +57,13 @@ class THUCNEWS(HubDataset):
 if __name__ == "__main__":
     ds = THUCNEWS()
+    print("first 10 dev")
+    for e in ds.get_dev_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 train")
     for e in ds.get_train_examples()[:10]:
         print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 test")
+    for e in ds.get_test_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print(ds)
paddlehub/dataset/tnews.py  (view file @ 07e21b51)

@@ -17,15 +17,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-from collections import namedtuple
 import io
 import os
 import csv

-from paddlehub.dataset import InputExample, HubDataset
-from paddlehub.common.downloader import default_downloader
+from paddlehub.dataset import InputExample, BaseDataset
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/tnews.tar.gz"
...

@@ -48,64 +44,31 @@ LABEL_NAME = {
 }


-class TNews(HubDataset):
+class TNews(BaseDataset):
     """
     TNews is the chinese news classification dataset on Jinri Toutiao App.
     """

     def __init__(self):
-        self.dataset_dir = os.path.join(DATA_HOME, "tnews")
-        if not os.path.exists(self.dataset_dir):
-            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
-        else:
-            logger.info("Dataset {} already cached.".format(self.dataset_dir))
-
-        self._load_train_examples()
-        self._load_test_examples()
-        self._load_dev_examples()
-
-    def _load_train_examples(self):
-        self.train_file = os.path.join(self.dataset_dir,
-                                       "toutiao_category_train.txt")
-        self.train_examples = self._read_file(self.train_file)
-
-    def _load_dev_examples(self):
-        self.dev_file = os.path.join(self.dataset_dir,
-                                     "toutiao_category_dev.txt")
-        self.dev_examples = self._read_file(self.dev_file)
-
-    def _load_test_examples(self):
-        self.test_file = os.path.join(self.dataset_dir,
-                                      "toutiao_category_test.txt")
-        self.test_examples = self._read_file(self.test_file)
-
-    def get_train_examples(self):
-        return self.train_examples
-
-    def get_dev_examples(self):
-        return self.dev_examples
-
-    def get_test_examples(self):
-        return self.test_examples
-
-    def get_labels(self):
-        return [
+        dataset_dir = os.path.join(DATA_HOME, "tnews")
+        base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
+        label_list = [
             '100', '101', '102', '103', '104', '106', '107', '108', '109',
             '110', '112', '113', '114', '115', '116'
         ]
+        super(TNews, self).__init__(
+            base_path=base_path,
+            train_file="toutiao_category_train.txt",
+            dev_file="toutiao_category_dev.txt",
+            test_file="toutiao_category_test.txt",
+            label_file=None,
+            label_list=label_list,
+        )

     def get_label_name(self, id):
         return LABEL_NAME[id]

-    @property
-    def num_labels(self):
-        """
-        Return the number of labels in the dataset.
-        """
-        return len(self.get_labels())
-
-    def _read_file(self, input_file):
+    def _read_file(self, input_file, phase=None):
         """Reads a tab separated value file."""
         with io.open(input_file, "r", encoding="UTF-8") as file:
             examples = []
...

@@ -120,5 +83,13 @@ class TNews(HubDataset):
 if __name__ == "__main__":
     ds = TNews()
+    print("first 10 dev")
+    for e in ds.get_dev_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 train")
     for e in ds.get_train_examples()[:10]:
         print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 test")
+    for e in ds.get_test_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print(ds)
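Because get_label_name simply indexes the module-level LABEL_NAME dict, mapping a numeric class code back to a readable category stays a one-liner. A usage sketch, assuming LABEL_NAME is keyed by the same code strings as label_list:

ds = TNews()
for e in ds.get_dev_examples()[:3]:
    print(e.label, ds.get_label_name(e.label))  # e.g. code '104' -> its category name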
paddlehub/dataset/toxic.py  (view file @ 07e21b51)

@@ -17,73 +17,39 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-from collections import namedtuple
 import codecs
 import os
 import pandas as pd
 from numpy import nan

-from paddlehub.dataset import InputExample, HubDataset
-from paddlehub.common.downloader import default_downloader
+from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.common.logger import logger
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/toxic.tar.gz"


-class Toxic(HubDataset):
+class Toxic(BaseNLPDatast):
     """
     The kaggle Toxic dataset:
     https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge
     """

     def __init__(self):
-        self.dataset_dir = os.path.join(DATA_HOME, "toxic")
-        if not os.path.exists(self.dataset_dir):
-            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
-        else:
-            logger.info("Dataset {} already cached.".format(self.dataset_dir))
-
-        self._load_train_examples()
-        self._load_test_examples()
-        self._load_dev_examples()
-
-    def _load_train_examples(self):
-        self.train_file = os.path.join(self.dataset_dir, "train.csv")
-        self.train_examples = self._read_csv(self.train_file)
-
-    def _load_dev_examples(self):
-        self.dev_file = os.path.join(self.dataset_dir, "dev.csv")
-        self.dev_examples = self._read_csv(self.dev_file)
-
-    def _load_test_examples(self):
-        self.test_file = os.path.join(self.dataset_dir, "test.csv")
-        self.test_examples = self._read_csv(self.test_file)
-
-    def get_train_examples(self):
-        return self.train_examples
-
-    def get_dev_examples(self):
-        return self.dev_examples
-
-    def get_test_examples(self):
-        return self.test_examples
-
-    def get_labels(self):
-        return [
+        dataset_dir = os.path.join(DATA_HOME, "toxic")
+        base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
+        label_list = [
             'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
             'identity_hate'
         ]
-
-    @property
-    def num_labels(self):
-        """
-        Return the number of labels in the dataset.
-        """
-        return len(self.get_labels())
-
-    def _read_csv(self, input_file, quotechar=None):
+        super(Toxic, self).__init__(
+            base_path=base_path,
+            train_file="train.csv",
+            dev_file="dev.csv",
+            test_file="test.csv",
+            label_file=None,
+            label_list=label_list,
+        )

+    def _read_file(self, input_file, phase=None):
         """Reads a tab separated value file."""
         data = pd.read_csv(input_file, encoding="UTF-8")
         examples = []
...

@@ -99,5 +65,13 @@ class Toxic(HubDataset):
 if __name__ == "__main__":
     ds = Toxic()
-    for e in ds.get_train_examples():
+    print("first 10 dev")
+    for e in ds.get_dev_examples()[:10]:
         print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 train")
+    for e in ds.get_train_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 test")
+    for e in ds.get_test_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print(ds)
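Toxic is a multi-label set: each CSV row carries six independent 0/1 columns rather than one class. The body of _read_file is truncated in this view, so the sketch below is an assumption about the row-to-example step rather than the commit's exact code; column names follow the Kaggle CSV layout:

import pandas as pd

label_list = [
    'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
]
data = pd.read_csv("train.csv", encoding="UTF-8")  # hypothetical local copy
for index, row in data.iterrows():
    # collapse the six 0/1 columns into one comma-separated label string
    labels = [str(int(row[name])) for name in label_list]
    print(index, row["comment_text"][:40], ",".join(labels))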
paddlehub/dataset/xnli.py  (view file @ 07e21b51)

@@ -23,15 +23,14 @@ import io
 import os
 import csv

-from paddlehub.dataset import InputExample, HubDataset
-from paddlehub.common.downloader import default_downloader
+from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.common.logger import logger
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/XNLI-lan.tar.gz"


-class XNLI(HubDataset):
+class XNLI(BaseNLPDatast):
     """
     Please refer to
     https://arxiv.org/pdf/1809.05053.pdf
...

@@ -43,61 +42,25 @@ class XNLI(HubDataset):
                 "ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw",
                 "th", "tr", "ur", "vi", "zh"
         ]:
-            raise Exception(language +
-                            "is not in XNLI. Please confirm the language")
+            raise Exception(
+                "%s is not in XNLI. Please confirm the language" % language)
         self.language = language
-        self.dataset_dir = os.path.join(DATA_HOME, "XNLI-lan")
-        if not os.path.exists(self.dataset_dir):
-            ret, tips, self.dataset_dir = default_downloader.download_file_and_uncompress(
-                url=_DATA_URL, save_path=DATA_HOME, print_progress=True)
-        else:
-            logger.info("Dataset {} already cached.".format(self.dataset_dir))
-
-        self._load_train_examples()
-        self._load_test_examples()
-        self._load_dev_examples()
-
-    def _load_train_examples(self):
-        self.train_file = os.path.join(self.dataset_dir, self.language,
                                        self.language + "_train.tsv")
-        self.train_examples = self._read_tsv(self.train_file)
-
-    def _load_dev_examples(self):
-        self.dev_file = os.path.join(self.dataset_dir, self.language,
                                      self.language + "_dev.tsv")
-        self.dev_examples = self._read_tsv(self.dev_file)
-
-    def _load_test_examples(self):
-        self.test_file = os.path.join(self.dataset_dir, self.language,
                                       self.language + "_test.tsv")
-        self.test_examples = self._read_tsv(self.test_file)
-
-    def get_train_examples(self):
-        return self.train_examples
-
-    def get_dev_examples(self):
-        return self.dev_examples
-
-    def get_test_examples(self):
-        return self.test_examples
-
-    def get_labels(self):
-        """See base class."""
-        return ["neutral", "contradiction", "entailment"]
-
-    @property
-    def num_labels(self):
-        """
-        Return the number of labels in the dataset.
-        """
-        return len(self.get_labels())
-
-    def _read_tsv(self, input_file, quotechar=None):
+        dataset_dir = os.path.join(DATA_HOME, "XNLI-lan")
+        dataset_dir = self._download_dataset(dataset_dir, url=_DATA_URL)
+        base_path = os.path.join(dataset_dir, language)
+        super(XNLI, self).__init__(
+            base_path=base_path,
+            train_file="%s_train.tsv" % language,
+            dev_file="%s_dev.tsv" % language,
+            test_file="%s_test.tsv" % language,
+            label_file=None,
+            label_list=["neutral", "contradiction", "entailment"],
+        )

+    def _read_file(self, input_file, phase=None):
         """Reads a tab separated value file."""
         with io.open(input_file, "r", encoding="UTF-8") as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            reader = csv.reader(f, delimiter="\t", quotechar=None)
             examples = []
             seq_id = 0
             header = next(reader)  # skip header
...

@@ -112,5 +75,13 @@ class XNLI(HubDataset):
 if __name__ == "__main__":
     ds = XNLI()
-    for e in ds.get_train_examples()[:3]:
+    print("first 10 dev")
+    for e in ds.get_dev_examples()[:10]:
         print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 train")
+    for e in ds.get_train_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print("first 10 test")
+    for e in ds.get_test_examples()[:10]:
+        print("{}\t{}\t{}\t{}".format(e.guid, e.text_a, e.text_b, e.label))
+    print(ds)
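Each language now maps to its own base_path subdirectory and "%s_train.tsv"-style file names, so switching languages is a single constructor argument. Usage sketch:

ds = XNLI(language="zh")  # any of the fifteen codes checked above
for e in ds.get_train_examples()[:3]:
    print(e.guid, e.text_a, e.text_b, e.label)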
paddlehub/finetune/task/__init__.py  (view file @ 07e21b51)

@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from .basic_task import BasicTask, RunEnv, RunState
+from .base_task import BaseTask, RunEnv, RunState
 from .classifier_task import ClassifierTask, ImageClassifierTask, TextClassifierTask, MultiLabelClassifierTask
 from .reading_comprehension_task import ReadingComprehensionTask
 from .regression_task import RegressionTask
...
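Any downstream code that imported the old module path needs the one-line change below; the rename does not change the class's behavior:

# before this commit:
# from paddlehub.finetune.task import BasicTask
# after:
from paddlehub.finetune.task import BaseTask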
paddlehub/finetune/task/basic_task.py → paddlehub/finetune/task/base_task.py  (view file @ 07e21b51)

@@ -192,7 +192,7 @@ class TaskHooks():
         return self.info(only_customized=False)


-class BasicTask(object):
+class BaseTask(object):
     def __init__(self,
                  feed_list,
                  data_reader,
...

@@ -265,7 +265,7 @@ class BasicTask(object):
         for hook_type, event_hooks in self._hooks._registered_hooks.items():
             self._hooks.add(hook_type, "default",
                             eval("self._default_%s_event" % hook_type))
-            setattr(BasicTask, "_%s_event" % hook_type,
+            setattr(BaseTask, "_%s_event" % hook_type,
                     self.create_event_function(hook_type))

         # accelerate predict
...
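The setattr(BaseTask, ...) call above binds a generated "_<hook>_event" method on the class at runtime, one per registered hook type. A self-contained sketch of that pattern outside PaddleHub; the hook names here are hypothetical:

class Demo(object):
    def create_event_function(self, hook_type):
        def event(*args, **kwargs):
            print("dispatching %s hooks" % hook_type)
        return event


demo = Demo()
for hook_type in ["build", "finetune", "predict"]:  # hypothetical hook names
    # bind a generated method on the class, as BaseTask does for its hooks
    setattr(Demo, "_%s_event" % hook_type,
            demo.create_event_function(hook_type))

demo._build_event()  # prints "dispatching build hooks"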
paddlehub/finetune/task/classifier_task.py  (view file @ 07e21b51)

@@ -23,10 +23,10 @@ import numpy as np
 import paddle.fluid as fluid

 from paddlehub.finetune.evaluate import calculate_f1_np, matthews_corrcoef
-from .basic_task import BasicTask
+from .base_task import BaseTask


-class ClassifierTask(BasicTask):
+class ClassifierTask(BaseTask):
     def __init__(self,
                  feature,
                  num_classes,
...
paddlehub/finetune/task/reading_comprehension_task.py  (view file @ 07e21b51)

@@ -28,7 +28,7 @@ from collections import OrderedDict
 import numpy as np
 import paddle.fluid as fluid

-from .basic_task import BasicTask
+from .base_task import BaseTask
 from paddlehub.common.logger import logger
 from paddlehub.reader import tokenization
 from paddlehub.finetune.evaluator import squad1_evaluate
...

@@ -176,6 +176,13 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
                       output_nbest_file, output_null_log_odds_file,
                       version_2_with_negative, null_score_diff_threshold,
                       is_english):
+    _PrelimPrediction = collections.namedtuple("PrelimPrediction", [
+        "feature_index", "start_index", "end_index", "start_logit", "end_logit"
+    ])
+    _NbestPrediction = collections.namedtuple(
+        "NbestPrediction", ["text", "start_logit", "end_logit"])
+
     example_index_to_features = collections.defaultdict(list)
     for feature in all_features:
         example_index_to_features[feature.example_index].append(feature)
...

@@ -184,10 +191,6 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
     for result in all_results:
         unique_id_to_result[result.unique_id] = result

-    _PrelimPrediction = collections.namedtuple("PrelimPrediction", [
-        "feature_index", "start_index", "end_index", "start_logit", "end_logit"
-    ])
-
     all_predictions = collections.OrderedDict()
     all_nbest_json = collections.OrderedDict()
     scores_diff_json = collections.OrderedDict()
...

@@ -262,9 +265,6 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
             key=lambda x: (x.start_logit + x.end_logit),
             reverse=True)

-        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
-            "NbestPrediction", ["text", "start_logit", "end_logit"])
-
         seen_predictions = {}
         nbest = []
         if not prelim_predictions:
...

@@ -384,7 +384,7 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
                 + "\n")


-class ReadingComprehensionTask(BasicTask):
+class ReadingComprehensionTask(BaseTask):
     def __init__(self,
                  feature,
                  feed_list,
...

@@ -420,6 +420,9 @@ class ReadingComprehensionTask(BasicTask):
         self.n_best_size = n_best_size
         self.max_answer_length = max_answer_length
+        self.RawResult = collections.namedtuple(
+            "RawResult", ["unique_id", "start_logits", "end_logits"])

     def _build_net(self):
         self.unique_ids = fluid.layers.data(
             name="unique_ids", shape=[-1, 1], lod_level=0, dtype="int64")
...

@@ -493,8 +496,6 @@ class ReadingComprehensionTask(BasicTask):
     def _calculate_metrics(self, run_states):
         total_cost, total_num_seqs, all_results = [], [], []
         run_step = 0
-        RawResult = collections.namedtuple(
-            "RawResult", ["unique_id", "start_logits", "end_logits"])
         for run_state in run_states:
             np_loss = run_state.run_results[0]
             np_num_seqs = run_state.run_results[1]
...

@@ -510,7 +511,7 @@ class ReadingComprehensionTask(BasicTask):
                 start_logits = [float(x) for x in np_start_logits[idx].flat]
                 end_logits = [float(x) for x in np_end_logits[idx].flat]
                 all_results.append(
-                    RawResult(
+                    self.RawResult(
                         unique_id=unique_id,
                         start_logits=start_logits,
                         end_logits=end_logits))
...

@@ -544,13 +545,13 @@ class ReadingComprehensionTask(BasicTask):
             is_english=self.is_english)
         if self.phase == 'val' or self.phase == 'dev':
             with open(
-                    self.data_reader.dataset.dev_file, 'r',
+                    self.data_reader.dataset.dev_path, 'r',
                     encoding="utf8") as dataset_file:
                 dataset_json = json.load(dataset_file)
                 dataset = dataset_json['data']
         elif self.phase == 'test':
             with open(
-                    self.data_reader.dataset.test_file, 'r',
+                    self.data_reader.dataset.test_path, 'r',
                     encoding="utf8") as dataset_file:
                 dataset_json = json.load(dataset_file)
                 dataset = dataset_json['data']
...

@@ -577,8 +578,6 @@ class ReadingComprehensionTask(BasicTask):
     def _default_predict_end_event(self, run_states):
         all_results = []
-        RawResult = collections.namedtuple(
-            "RawResult", ["unique_id", "start_logits", "end_logits"])
         for run_state in run_states:
             np_unique_ids = run_state.run_results[0]
             np_start_logits = run_state.run_results[1]
...

@@ -588,7 +587,7 @@ class ReadingComprehensionTask(BasicTask):
             start_logits = [float(x) for x in np_start_logits[idx].flat]
             end_logits = [float(x) for x in np_end_logits[idx].flat]
             all_results.append(
-                RawResult(
+                self.RawResult(
                     unique_id=unique_id,
                     start_logits=start_logits,
                     end_logits=end_logits))
...
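Hoisting the RawResult, _PrelimPrediction, and _NbestPrediction namedtuple definitions out of the per-batch methods avoids re-creating the class object on every call and keeps all call sites on one shared type. A minimal, self-contained sketch of the pattern:

import collections


class ResultCollector(object):
    def __init__(self):
        # define the record type once, as ReadingComprehensionTask now does
        self.RawResult = collections.namedtuple(
            "RawResult", ["unique_id", "start_logits", "end_logits"])
        self.all_results = []

    def add(self, unique_id, start_logits, end_logits):
        self.all_results.append(
            self.RawResult(
                unique_id=unique_id,
                start_logits=start_logits,
                end_logits=end_logits))


collector = ResultCollector()
collector.add(0, [0.1, 0.9], [0.8, 0.2])
print(collector.all_results[0].unique_id)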
paddlehub/finetune/task/regression_task.py  (view file @ 07e21b51)

@@ -23,10 +23,10 @@ from collections import OrderedDict
 import numpy as np
 import paddle.fluid as fluid
 from scipy.stats import spearmanr

-from .basic_task import BasicTask
+from .base_task import BaseTask


-class RegressionTask(BasicTask):
+class RegressionTask(BaseTask):
     def __init__(self,
                  feature,
                  feed_list,
...
paddlehub/finetune/task/sequence_task.py  (view file @ 07e21b51)

@@ -25,10 +25,10 @@ import paddle
 import paddle.fluid as fluid

 from paddlehub.finetune.evaluate import chunk_eval, calculate_f1
 from paddlehub.common.utils import version_compare
-from .basic_task import BasicTask
+from .base_task import BaseTask


-class SequenceLabelTask(BasicTask):
+class SequenceLabelTask(BaseTask):
     def __init__(self,
                  feature,
                  max_seq_len,
...
paddlehub/reader/base_reader.py  (new file, 0 → 100644, view file @ 07e21b51)

import numpy as np


class BaseReader(object):
    def __init__(self, dataset, random_seed=None):
        self.dataset = dataset
        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
        np.random.seed(random_seed)

    def get_train_examples(self):
        return self.dataset.get_train_examples()

    def get_dev_examples(self):
        return self.dataset.get_dev_examples()

    def get_test_examples(self):
        return self.dataset.get_test_examples()

    def data_generator(self):
        raise NotImplementedError
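BaseReader fixes the dataset handle, the num_examples bookkeeping dict, and the RNG seed, and leaves data_generator abstract. A toy subclass showing the minimal contract; the yielding policy here is illustrative, not the actual NLP/CV readers' batching logic:

from paddlehub.reader.base_reader import BaseReader


class ListReader(BaseReader):
    def data_generator(self, phase="train"):
        # pick the split via the accessors inherited from BaseReader
        examples = {
            "train": self.get_train_examples(),
            "dev": self.get_dev_examples(),
            "test": self.get_test_examples(),
        }[phase]
        self.num_examples[phase] = len(examples)

        def reader():
            for example in examples:
                yield example

        return reader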
paddlehub/reader/cv_reader.py  (view file @ 07e21b51)

@@ -22,6 +22,7 @@ import numpy as np
 from PIL import Image

 import paddlehub.io.augmentation as image_augmentation
+from .base_reader import BaseReader

 channel_order_dict = {
     "RGB": [0, 1, 2],
...

@@ -33,7 +34,7 @@ channel_order_dict = {
 }


-class ImageClassificationReader(object):
+class ImageClassificationReader(BaseReader):
     def __init__(self,
                  image_width,
                  image_height,
...

@@ -41,15 +42,15 @@ class ImageClassificationReader(object):
                  channel_order="RGB",
                  images_mean=None,
                  images_std=None,
-                 data_augmentation=False):
+                 data_augmentation=False,
+                 random_seed=None):
+        super(ImageClassificationReader, self).__init__(dataset, random_seed)
         self.image_width = image_width
         self.image_height = image_height
         self.channel_order = channel_order
-        self.dataset = dataset
         self.data_augmentation = data_augmentation
         self.images_std = images_std
         self.images_mean = images_mean
-        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}

         if self.images_mean is None:
             try:
...

@@ -73,24 +74,38 @@ class ImageClassificationReader(object):
             raise ValueError("Image width and height should not be negative.")

     def data_generator(self,
-                       batch_size,
+                       batch_size=1,
                        phase="train",
                        shuffle=False,
                        data=None):
+        if phase != 'predict' and not self.dataset:
+            raise ValueError("The dataset is none and it's not allowed!")
         if phase == "train":
-            data = self.dataset.train_data(shuffle)
-            self.num_examples['train'] = len(self.get_train_examples())
-        elif phase == "test":
-            shuffle = False
-            data = self.dataset.test_data(shuffle)
-            self.num_examples['test'] = len(self.get_test_examples())
+            shuffle = True
+            if hasattr(self.dataset, "train_data"):
+                # Compatible with ImageClassificationDataset which has done shuffle
+                self.dataset.train_data()
+                shuffle = False
+            data = self.get_train_examples()
+            self.num_examples['train'] = len(data)
         elif phase == "val" or phase == "dev":
             shuffle = False
-            data = self.dataset.validate_data(shuffle)
-            self.num_examples['dev'] = len(self.get_dev_examples())
+            if hasattr(self.dataset, "validate_data"):
+                # Compatible with ImageClassificationDataset
+                self.dataset.validate_data()
+            data = self.get_dev_examples()
+            self.num_examples['dev'] = len(data)
+        elif phase == "test":
+            shuffle = False
+            if hasattr(self.dataset, "test_data"):
+                # Compatible with ImageClassificationDataset
+                self.dataset.test_data()
+            data = self.get_test_examples()
+            self.num_examples['test'] = len(data)
         elif phase == "predict":
+            shuffle = False
             data = data

         def preprocess(image_path):
...

@@ -118,6 +133,9 @@ class ImageClassificationReader(object):
             return image

         def _data_reader():
+            if shuffle:
+                np.random.shuffle(data)
             if phase == "predict":
                 for image_path in data:
                     image = preprocess(image_path)
...

@@ -128,12 +146,3 @@ class ImageClassificationReader(object):
                     yield (image, label)

         return paddle.batch(_data_reader, batch_size=batch_size)
-
-    def get_train_examples(self):
-        return self.dataset.train_examples
-
-    def get_dev_examples(self):
-        return self.dataset.dev_examples
-
-    def get_test_examples(self):
-        return self.dataset.test_examples
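End to end, the reader now draws examples through the get_*_examples accessors inherited from BaseReader and falls back to the old train_data()/validate_data()/test_data() hooks only when the dataset still provides them. A usage sketch; the batch size and image size are arbitrary, and _data_reader yields (image, label) pairs, so paddle.batch groups them into lists:

from paddlehub.dataset.stanford_dogs import StanfordDogsDataset
from paddlehub.reader.cv_reader import ImageClassificationReader

ds = StanfordDogsDataset()  # or any BaseCVDatast subclass
reader = ImageClassificationReader(
    image_width=224, image_height=224, dataset=ds, random_seed=0)

train_reader = reader.data_generator(batch_size=32, phase="train")
for batch in train_reader():  # paddle.batch returns a callable reader
    images, labels = zip(*batch)
    break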
paddlehub/reader/nlp_reader.py  (view file @ 07e21b51)

(This diff is collapsed in the page view and is not reproduced here.)