Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleHub
提交
a000c6ff
P
PaddleHub
项目概览
PaddlePaddle
/
PaddleHub
大约 1 年 前同步成功
通知
282
Star
12117
Fork
2091
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
200
列表
看板
标记
里程碑
合并请求
4
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleHub
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
200
Issue
200
列表
看板
标记
里程碑
合并请求
4
合并请求
4
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
a000c6ff
编写于
10月 14, 2019
作者:
Z
zhangxuefei
提交者:
wuzewu
10月 16, 2019
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add dataset tnews
上级
bb1a88bd
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
108 addition
and
2 deletion
+108
-2
paddlehub/dataset/__init__.py
paddlehub/dataset/__init__.py
+1
-0
paddlehub/dataset/tnews.py
paddlehub/dataset/tnews.py
+105
-0
paddlehub/dataset/toxic.py
paddlehub/dataset/toxic.py
+2
-2
未找到文件。
paddlehub/dataset/__init__.py
浏览文件 @
a000c6ff
...
...
@@ -23,6 +23,7 @@ from .toxic import Toxic
from
.squad
import
SQUAD
from
.xnli
import
XNLI
from
.glue
import
GLUE
from
.tnews
import
TNews
# CV Dataset
from
.dogcat
import
DogCatDataset
as
DogCat
...
...
paddlehub/dataset/tnews.py
0 → 100644
浏览文件 @
a000c6ff
# coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
from
collections
import
namedtuple
import
io
import
os
import
csv
from
paddlehub.dataset
import
InputExample
,
HubDataset
from
paddlehub.common.downloader
import
default_downloader
from
paddlehub.common.dir
import
DATA_HOME
from
paddlehub.common.logger
import
logger
# URL of the TNews dataset archive. TNews.__init__ passes it to
# default_downloader.download_file_and_uncompress when no "tnews" directory
# exists under DATA_HOME.
_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/tnews.tar.gz"
class TNews(HubDataset):
    """
    TNews is a Chinese news classification dataset from the Jinri Toutiao app.

    Constructing the dataset makes sure the data is available locally (it is
    downloaded into ``DATA_HOME`` when no ``tnews`` directory exists there)
    and then reads the train, test and dev splits into memory.
    """

    def __init__(self):
        self.dataset_dir = os.path.join(DATA_HOME, "tnews")
        if not os.path.exists(self.dataset_dir):
            # The third return value replaces dataset_dir; presumably it is
            # the directory the archive was extracted to -- confirm against
            # the downloader. The first two values are not used here.
            ret, tips, self.dataset_dir = \
                default_downloader.download_file_and_uncompress(
                    url=_DATA_URL,
                    save_path=DATA_HOME,
                    print_progress=True)
        else:
            logger.info("Dataset {} already cached.".format(self.dataset_dir))

        self._load_train_examples()
        self._load_test_examples()
        self._load_dev_examples()

    def _load_train_examples(self):
        """Read the training split into ``self.train_examples``."""
        self.train_file = os.path.join(self.dataset_dir,
                                       "toutiao_category_train.txt")
        self.train_examples = self._read_file(self.train_file)

    def _load_dev_examples(self):
        """Read the development split into ``self.dev_examples``."""
        self.dev_file = os.path.join(self.dataset_dir,
                                     "toutiao_category_dev.txt")
        self.dev_examples = self._read_file(self.dev_file)

    def _load_test_examples(self):
        """Read the test split into ``self.test_examples``."""
        self.test_file = os.path.join(self.dataset_dir,
                                      "toutiao_category_test.txt")
        self.test_examples = self._read_file(self.test_file)

    def get_train_examples(self):
        """Return the examples loaded from the training split."""
        return self.train_examples

    def get_dev_examples(self):
        """Return the examples loaded from the development split."""
        return self.dev_examples

    def get_test_examples(self):
        """Return the examples loaded from the test split."""
        return self.test_examples

    def get_labels(self):
        """Return the list of label names, as a new list on every call."""
        return [
            'news_game',
            'news_sports',
            'news_finance',
            'news_entertainment',
            'news_tech',
            'news_house',
            'news_car',
            'news_culture',
            'news_world',
            'news_travel',
            'news_agriculture',
            'news_military',
            'news_edu',
            'news_story',
            'stock',
        ]

    @property
    def num_labels(self):
        """
        Return the number of labels in the dataset.
        """
        return len(self.get_labels())

    def _read_file(self, input_file):
        """Read a ``_!_``-delimited file into a list of ``InputExample``.

        Every non-blank line is stripped and split on ``_!_``. Field 0 is
        used as the guid, field 2 as the label and field 3 as ``text_a``.
        Blank lines are skipped instead of aborting the whole load.

        Args:
            input_file: Path of the UTF-8 encoded file to read.

        Returns:
            A list with one ``InputExample`` per non-blank line, in file
            order.

        Raises:
            IndexError: If a non-blank line has fewer than four fields.
        """
        examples = []
        # io.open keeps the ``encoding`` argument usable on Python 2 as well.
        with io.open(input_file, "r", encoding="UTF-8") as reader:
            for line in reader:
                line = line.strip()
                if not line:
                    # An empty line has no fields to index into.
                    continue
                data = line.split("_!_")
                examples.append(
                    InputExample(guid=data[0], label=data[2], text_a=data[3]))
        return examples
if __name__ == "__main__":
    # Manual smoke check: build the dataset and print the first ten training
    # examples as tab-separated guid, text_a, text_b and label.
    dataset = TNews()
    preview = dataset.get_train_examples()[:10]
    row_template = "{}\t{}\t{}\t{}"
    for example in preview:
        row = row_template.format(example.guid, example.text_a,
                                  example.text_b, example.label)
        print(row)
paddlehub/dataset/toxic.py
浏览文件 @
a000c6ff
...
...
@@ -33,8 +33,8 @@ _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/toxic.tar.gz"
class
Toxic
(
HubDataset
):
"""
ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for
opinion mining)
The kaggle Toxic dataset:
https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge
"""
def
__init__
(
self
):
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录