Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleHub
提交
014f4f40
P
PaddleHub
项目概览
PaddlePaddle
/
PaddleHub
大约 2 年 前同步成功
通知
285
Star
12117
Fork
2091
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
200
列表
看板
标记
里程碑
合并请求
4
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleHub
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
200
Issue
200
列表
看板
标记
里程碑
合并请求
4
合并请求
4
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
014f4f40
编写于
3月 29, 2019
作者:
Z
Zeyu Chen
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
change bert task reader interface
上级
34868288
变更
4
隐藏空白更改
内联
并排
Showing
4 changed file
with
47 addition
and
15 deletion
+47
-15
demo/bert-cls/finetune_with_hub.py
demo/bert-cls/finetune_with_hub.py
+4
-4
demo/bert-cls/reader/cls.py
demo/bert-cls/reader/cls.py
+41
-5
demo/bert-cls/run_fintune_with_hub.sh
demo/bert-cls/run_fintune_with_hub.sh
+2
-5
paddle_hub/module/module.py
paddle_hub/module/module.py
+0
-1
未找到文件。
demo/bert-cls/finetune_with_hub.py
浏览文件 @
014f4f40
...
...
@@ -73,11 +73,11 @@ if __name__ == '__main__':
warmup_proportion
=
args
.
warmup_proportion
)
# loading paddlehub BERT
#
module = hub.Module(
#
module_dir="./hub_module/chinese_L-12_H-768_A-12.hub_module")
module
=
hub
.
Module
(
module_dir
=
"./hub_module/ernie-stable.hub_module"
)
module
=
hub
.
Module
(
module_dir
=
"./hub_module/chinese_L-12_H-768_A-12.hub_module"
)
#
module = hub.Module(module_dir="./hub_module/ernie-stable.hub_module")
processor
=
reader
.
ChnsenticorpProcesso
r
(
processor
=
reader
.
BERTClassifyReade
r
(
data_dir
=
args
.
data_dir
,
vocab_path
=
module
.
get_vocab_path
(),
max_seq_len
=
args
.
max_seq_len
)
...
...
demo/bert-cls/reader/cls.py
浏览文件 @
014f4f40
...
...
@@ -109,9 +109,10 @@ class DataProcessor(object):
def
get_num_examples
(
self
,
phase
):
"""Get number of examples for train, dev or test."""
if
phase
not
in
[
'train'
,
'
dev
'
,
'test'
]:
if
phase
not
in
[
'train'
,
'
validate
'
,
'test'
]:
raise
ValueError
(
"Unknown phase, which should be in ['train', 'dev', 'test']."
)
"Unknown phase, which should be in ['train', 'validate, 'test']."
)
return
self
.
num_examples
[
phase
]
def
get_train_progress
(
self
):
...
...
@@ -131,9 +132,9 @@ class DataProcessor(object):
if
phase
==
'train'
:
examples
=
self
.
get_train_examples
(
self
.
data_dir
)
self
.
num_examples
[
'train'
]
=
len
(
examples
)
elif
phase
==
'
dev
'
:
elif
phase
==
'
validate
'
:
examples
=
self
.
get_dev_examples
(
self
.
data_dir
)
self
.
num_examples
[
'
dev
'
]
=
len
(
examples
)
self
.
num_examples
[
'
validate
'
]
=
len
(
examples
)
elif
phase
==
'test'
:
examples
=
self
.
get_test_examples
(
self
.
data_dir
)
self
.
num_examples
[
'test'
]
=
len
(
examples
)
...
...
@@ -190,7 +191,7 @@ class DataProcessor(object):
return_input_mask
=
True
,
return_max_len
=
True
,
return_num_token
=
False
)
yield
batch_data
yield
[
batch_data
]
return
wrapper
...
...
@@ -473,6 +474,41 @@ class ChnsenticorpProcessor(DataProcessor):
return
examples
class
BERTClassifyReader
(
DataProcessor
):
"""Processor for the Chnsenticorp data set."""
def
get_train_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"train.tsv"
)),
"train"
)
def
get_dev_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"dev.tsv"
)),
"dev"
)
def
get_test_examples
(
self
,
data_dir
):
"""See base class."""
return
self
.
_create_examples
(
self
.
_read_tsv
(
os
.
path
.
join
(
data_dir
,
"test.tsv"
)),
"test"
)
def
get_labels
(
self
):
"""See base class."""
return
[
"0"
,
"1"
]
def
_create_examples
(
self
,
lines
,
set_type
):
"""Creates examples for the training and dev sets."""
examples
=
[]
for
(
i
,
line
)
in
enumerate
(
lines
):
guid
=
"%s-%s"
%
(
set_type
,
i
)
text_a
=
tokenization
.
convert_to_unicode
(
line
[
1
])
label
=
tokenization
.
convert_to_unicode
(
line
[
0
])
examples
.
append
(
InputExample
(
guid
=
guid
,
text_a
=
text_a
,
text_b
=
None
,
label
=
label
))
return
examples
def
convert_single_example_to_unicode
(
guid
,
single_example
):
text_a
=
tokenization
.
convert_to_unicode
(
single_example
[
0
])
text_b
=
tokenization
.
convert_to_unicode
(
single_example
[
1
])
...
...
demo/bert-cls/run_fintune_with_hub.sh
浏览文件 @
014f4f40
export
CUDA_VISIBLE_DEVICES
=
2
export
CUDA_VISIBLE_DEVICES
=
5
DATA_PATH
=
./chnsenticorp_data
rm
-rf
$CKPT_PATH
python
-u
finetune_with_hub.py
\
--use_cuda
true
\
--batch_size
32
\
--in_tokens
false
\
--data_dir
${
DATA_PATH
}
\
--weight_decay
0.01
\
--warmup_proportion
0.0
\
--validation_steps
50
\
--epoch
3
\
--max_seq_len
128
\
--learning_rate
5e-5
\
--skip_steps
10
--learning_rate
5e-5
paddle_hub/module/module.py
浏览文件 @
014f4f40
...
...
@@ -255,7 +255,6 @@ class Module:
def
get_vocab_path
(
self
):
for
assets_file
in
self
.
assets
:
print
(
assets_file
)
if
"vocab.txt"
in
assets_file
:
return
assets_file
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录