PaddlePaddle / PaddleHub

Commit 7f1a2c0b
Authored Aug 02, 2019 by zhangxuefei
Fix the tokenize encoding bug and Update cv dataset (add train dev test examples)
Parent: f0accf42
Showing 5 changed files with 93 additions and 10 deletions (+93 −10):
paddlehub/dataset/base_cv_dataset.py   +24 −4
paddlehub/finetune/task.py             +4 −2
paddlehub/reader/cv_reader.py          +9 −0
paddlehub/reader/nlp_reader.py         +54 −2
paddlehub/reader/tokenization.py       +2 −2
paddlehub/dataset/base_cv_dataset.py

```diff
@@ -35,6 +35,10 @@ class ImageClassificationDataset(object):
         self.num_labels = 0
         self.label_list = []
+        self.train_examples = []
+        self.dev_examples = []
+        self.test_examples = []
 
     def _download_dataset(self, dataset_path, url):
         if not os.path.exists(dataset_path):
             result, tips, dataset_path = default_downloader.download_file_and_uncompress(
@@ -47,7 +51,7 @@ class ImageClassificationDataset(object):
                 exit()
         return dataset_path
 
-    def _parse_data(self, data_path, shuffle=False):
+    def _parse_data(self, data_path, shuffle=False, phase=None):
         def _base_reader():
             data = []
             with open(data_path, "r") as file:
@@ -68,6 +72,13 @@ class ImageClassificationDataset(object):
                     label = items[-1]
                     data.append((image_path, items[-1]))
 
+            if phase == 'train':
+                self.train_examples = data
+            elif phase == 'dev':
+                self.dev_examples = data
+            elif phase == 'test':
+                self.test_examples = data
+
             if shuffle:
                 np.random.shuffle(data)
@@ -85,13 +96,22 @@ class ImageClassificationDataset(object):
     def train_data(self, shuffle=True):
         train_data_path = os.path.join(self.base_path, self.train_list_file)
-        return self._parse_data(train_data_path, shuffle)
+        return self._parse_data(train_data_path, shuffle, phase='train')
 
     def test_data(self, shuffle=False):
         test_data_path = os.path.join(self.base_path, self.test_list_file)
-        return self._parse_data(test_data_path, shuffle)
+        return self._parse_data(test_data_path, shuffle, phase='dev')
 
     def validate_data(self, shuffle=False):
         validate_data_path = os.path.join(self.base_path, self.validate_list_file)
-        return self._parse_data(validate_data_path, shuffle)
+        return self._parse_data(validate_data_path, shuffle, phase='test')
+
+    def get_train_examples(self):
+        return self.train_examples
+
+    def get_dev_examples(self):
+        return self.dev_examples
+
+    def get_test_examples(self):
+        return self.test_examples
```
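The net effect is a per-phase cache: building a phase reader records the parsed (image_path, label) pairs on the dataset, so other components can fetch them without re-reading the list files. Note that in this diff test_data() caches under the 'dev' phase and validate_data() under 'test'. A hedged usage sketch follows; the subclass name and consumption pattern are illustrative, not part of the commit:

```python
# MyImageDataset stands in for any ImageClassificationDataset subclass
# whose train list file exists on disk.
dataset = MyImageDataset()

# _parse_data returns the _base_reader generator shown above, and the
# generator builds its full data list before the first yield, so pulling
# a single item fills the whole cache.
reader = dataset.train_data(shuffle=False)
for _ in reader():
    break

examples = dataset.get_train_examples()  # cached (image_path, label) pairs
print(len(examples), examples[0])
```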
paddlehub/finetune/task.py

```diff
@@ -520,7 +520,9 @@ class BasicTask(object):
                     self.save_checkpoint()
 
         # Final evaluation
-        self.eval(phase="dev")
-        self.eval(phase="test")
+        if self._base_data_reader.get_dev_examples() != []:
+            self.eval(phase="dev")
+        if self._base_data_reader.get_test_examples() != []:
+            self.eval(phase="test")
         self._finetune_end_event(run_states)
```
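The guard matters because a dataset without a dev or test split now leaves the corresponding example list empty (see base_cv_dataset.py above), so the final evaluation is skipped rather than run on a missing split. A minimal illustration of the check, with `examples` standing in for `self._base_data_reader.get_dev_examples()`:

```python
# For a plain list, `examples != []` behaves like the truthiness test
# `if examples:`; an empty dev split therefore skips the evaluation.
examples = []
if examples != []:
    print("run final dev evaluation")
else:
    print("no dev examples; skip final dev evaluation")
```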
paddlehub/reader/cv_reader.py

```diff
@@ -122,3 +122,12 @@ class ImageClassificationReader(object):
                     yield (image, label)
 
         return paddle.batch(_data_reader, batch_size=batch_size)
+
+    def get_train_examples(self):
+        return self.dataset.train_examples
+
+    def get_dev_examples(self):
+        return self.dataset.dev_examples
+
+    def get_test_examples(self):
+        return self.dataset.test_examples
```
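These one-line delegators give image readers the same get_*_examples() surface that the NLP readers expose, which is what lets the reader-agnostic guard in task.py above serve both families. A sketch of the shared contract; the function name is illustrative:

```python
# Any reader with the three accessors can drive the final-eval logic.
def run_final_eval(task, reader):
    if reader.get_dev_examples() != []:
        task.eval(phase="dev")
    if reader.get_test_examples() != []:
        task.eval(phase="test")
```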
paddlehub/reader/nlp_reader.py

```diff
@@ -42,7 +42,8 @@ class BaseReader(object):
                  label_map_config=None,
                  max_seq_len=512,
                  do_lower_case=True,
-                 random_seed=None):
+                 random_seed=None,
+                 use_task_id=False):
         self.max_seq_len = max_seq_len
         self.tokenizer = tokenization.FullTokenizer(
             vocab_file=vocab_path, do_lower_case=do_lower_case)
@@ -52,6 +53,10 @@ class BaseReader(object):
         self.cls_id = self.vocab["[CLS]"]
         self.sep_id = self.vocab["[SEP]"]
         self.in_tokens = False
+        self.use_task_id = use_task_id
+
+        if self.use_task_id:
+            self.task_id = 0
 
         np.random.seed(random_seed)
```
```diff
@@ -232,7 +237,6 @@
                        phase='train',
                        shuffle=True,
                        data=None):
         if phase == 'train':
-            shuffle = True
             examples = self.get_train_examples()
```
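With the forced shuffle = True gone, the caller's shuffle argument is honored even when phase='train'. A hedged sketch of what this enables; the reader object and batch size are illustrative, while data_generator and its parameters come from nlp_reader.py:

```python
# Previously data_generator(phase='train', shuffle=False) silently
# re-enabled shuffling; now a deterministic pass over the training
# examples is possible, e.g. when debugging example order.
gen = reader.data_generator(batch_size=32, phase='train', shuffle=False)
```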
```diff
@@ -313,12 +317,25 @@ class ClassifyReader(BaseReader):
                 padded_token_ids, padded_position_ids, padded_text_type_ids,
                 input_mask, batch_labels
             ]
+            if self.use_task_id:
+                padded_task_ids = np.ones_like(
+                    padded_token_ids, dtype="int64") * self.task_id
+                return_list = [
+                    padded_token_ids, padded_position_ids,
+                    padded_text_type_ids, input_mask, padded_task_ids,
+                    batch_labels
+                ]
         else:
             return_list = [
                 padded_token_ids, padded_position_ids, padded_text_type_ids,
                 input_mask
             ]
+            if self.use_task_id:
+                return_list = [
+                    padded_token_ids, padded_position_ids,
+                    padded_text_type_ids, input_mask, padded_task_ids
+                ]
 
         return return_list
```
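The task-id column is materialized by broadcasting a scalar over the shape of the padded token ids, so every token position in the batch carries the same id. A standalone sketch with illustrative shapes:

```python
import numpy as np

# [batch_size, max_seq_len, 1] mirrors the padded-batch layout; with
# task_id == 0 the result is an all-zeros array of matching shape.
padded_token_ids = np.zeros((2, 4, 1), dtype="int64")
task_id = 0
padded_task_ids = np.ones_like(padded_token_ids, dtype="int64") * task_id
assert padded_task_ids.shape == padded_token_ids.shape
```

Note that, as extracted, the predict branch of ClassifyReader above references padded_task_ids without recomputing it, unlike the SequenceLabelReader and MultiLabelClassifyReader hunks below.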
```diff
@@ -355,11 +372,30 @@ class SequenceLabelReader(BaseReader):
                 padded_token_ids, padded_position_ids, padded_text_type_ids,
                 input_mask, padded_label_ids, batch_seq_lens
             ]
+            if self.use_task_id:
+                padded_task_ids = np.ones_like(
+                    padded_token_ids, dtype="int64") * self.task_id
+                return_list = [
+                    padded_token_ids, padded_position_ids,
+                    padded_text_type_ids, input_mask, padded_task_ids,
+                    padded_label_ids, batch_seq_lens
+                ]
         else:
             return_list = [
                 padded_token_ids, padded_position_ids, padded_text_type_ids,
                 input_mask, batch_seq_lens
             ]
+            if self.use_task_id:
+                padded_task_ids = np.ones_like(
+                    padded_token_ids, dtype="int64") * self.task_id
+                return_list = [
+                    padded_token_ids, padded_position_ids,
+                    padded_text_type_ids, input_mask, padded_task_ids,
+                    batch_seq_lens
+                ]
 
         return return_list
 
     def _reseg_token_label(self, tokens, tokenizer, phase, labels=None):
```

```diff
@@ -585,11 +621,27 @@ class MultiLabelClassifyReader(BaseReader):
                 padded_token_ids, padded_position_ids, padded_text_type_ids,
                 input_mask, batch_labels
             ]
+            if self.use_task_id:
+                padded_task_ids = np.ones_like(
+                    padded_token_ids, dtype="int64") * self.task_id
+                return_list = [
+                    padded_token_ids, padded_position_ids,
+                    padded_text_type_ids, input_mask, padded_task_ids,
+                    batch_labels
+                ]
         else:
             return_list = [
                 padded_token_ids, padded_position_ids, padded_text_type_ids,
                 input_mask
             ]
+            if self.use_task_id:
+                padded_task_ids = np.ones_like(
+                    padded_token_ids, dtype="int64") * self.task_id
+                return_list = [
+                    padded_token_ids, padded_position_ids,
+                    padded_text_type_ids, input_mask, padded_task_ids
+                ]
 
         return return_list
 
     def _convert_example_to_record(self,
```
paddlehub/reader/tokenization.py

```diff
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import codecs
 import collections
+import io
 import unicodedata
 
 import six
@@ -71,7 +71,7 @@ def printable_text(text):
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
     vocab = collections.OrderedDict()
-    fin = codecs.open(vocab_file, "r", "UTF-8")
+    fin = io.open(vocab_file, "r", encoding="UTF-8")
     for num, line in enumerate(fin):
         items = convert_to_unicode(line.strip()).split("\t")
         if len(items) > 2:
```
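The switch from codecs.open to io.open appears to be the encoding fix named in the commit title: io.open is the built-in open() on Python 3 and its backport on Python 2, decodes through the encoding= keyword, and applies universal-newline handling, whereas codecs.open works on the raw binary stream and does not translate newlines. A minimal sketch of the new-style read; the vocab path is illustrative:

```python
import io

# Reads and decodes each line as UTF-8 text on both Python 2 and 3.
with io.open("vocab.txt", "r", encoding="UTF-8") as fin:
    for num, line in enumerate(fin):
        token = line.rstrip("\n").split("\t")[0]
        print(num, token)
```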