Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
hapi
提交
1373e294
H
hapi
项目概览
PaddlePaddle
/
hapi
通知
11
Star
2
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
4
列表
看板
标记
里程碑
合并请求
7
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
H
hapi
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
4
Issue
4
列表
看板
标记
里程碑
合并请求
7
合并请求
7
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
1373e294
编写于
4月 13, 2020
作者:
X
xyzhou-puck
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add leveldb reader for bert
上级
b2f94aa8
变更
6
展开全部
显示空白变更内容
内联
并排
Showing
6 changed files
with
647 additions
and
8 deletions
+647
-8
examples/bert_leveldb/bert.yaml
examples/bert_leveldb/bert.yaml
+27
-0
examples/bert_leveldb/bert_classifier.py
examples/bert_leveldb/bert_classifier.py
+116
-0
examples/bert_leveldb/cls.py
examples/bert_leveldb/cls.py
+73
-0
examples/bert_leveldb/nohup.out
examples/bert_leveldb/nohup.out
+312
-0
examples/bert_leveldb/run_classifier_single_gpu.sh
examples/bert_leveldb/run_classifier_single_gpu.sh
+29
-0
hapi/text/bert/dataloader.py
hapi/text/bert/dataloader.py
+90
-8
未找到文件。
examples/bert_leveldb/bert.yaml
0 → 100644
浏览文件 @
1373e294
bert_config_path
:
"
./config/bert_config.json"
init_checkpoint
:
None
init_pretraining_params
:
None
checkpoints
:
"
./saved_model"
epoch
:
3
learning_rate
:
0.0001
lr_scheduler
:
"
linear_warmup_decay"
weight_decay
:
0.01
warmup_proportion
:
0.1
save_steps
:
100000
validation_steps
:
100000
loss_scaling
:
1.0
skip_steps
:
100
data_dir
:
None
vocab_path
:
None
max_seq_len
:
512
batch_size
:
32
in_tokens
:
False
do_lower_case
:
True
random_seed
:
5512
use_cuda
:
False
shuffle
:
True
do_train
:
True
do_test
:
True
use_data_parallel
:
False
verbose
:
False
examples/bert_leveldb/bert_classifier.py
0 → 100755
浏览文件 @
1373e294
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT fine-tuning in Paddle Dygraph Mode."""
import
paddle.fluid
as
fluid
from
hapi.metrics
import
Accuracy
from
hapi.configure
import
Config
from
hapi.model
import
set_device
,
Model
,
SoftmaxWithCrossEntropy
,
Input
from
cls
import
ClsModelLayer
import
hapi.text.tokenizer.tokenization
as
tokenization
from
hapi.text.bert
import
Optimizer
,
BertConfig
,
BertDataLoader
,
BertInputExample
def train():
    """Fine-tune a BERT classifier on MNLI with the leveldb-backed loader.

    Settings are read from ``./bert.yaml`` through ``Config``. The training
    data path (``./data/glue_data/MNLI/train.tsv``) and the maximum sequence
    length (64) are fixed in this example script.

    Returns:
        The ``ClsModelLayer`` instance after ``fit`` has completed.
    """
    # Single source of truth for the label set: it is used by the line
    # processor, the data loader and the size of the classification head.
    mnli_labels = ("contradiction", "entailment", "neutral")

    config = Config(yaml_file="./bert.yaml")
    config.build()
    config.Print()

    device = set_device("gpu" if config.use_cuda else "cpu")
    fluid.enable_dygraph(device)

    bert_config = BertConfig(config.bert_config_path)
    bert_config.print_config()

    trainer_count = fluid.dygraph.parallel.Env().nranks

    tokenizer = tokenization.FullTokenizer(
        vocab_file=config.vocab_path, do_lower_case=config.do_lower_case)

    def mnli_line_processor(line_id, line):
        """Turn one split TSV row into a ``BertInputExample``.

        Returns ``None`` for line id "0" so that the caller can skip that
        row (presumably the TSV header -- confirm against the data file).
        """
        if line_id == "0":
            return None
        uid = tokenization.convert_to_unicode(line[0])
        text_a = tokenization.convert_to_unicode(line[8])
        text_b = tokenization.convert_to_unicode(line[9])
        label = tokenization.convert_to_unicode(line[-1])
        # Rows whose last column is not a known label fall back to a fixed
        # label instead of failing.
        if label not in mnli_labels:
            label = "contradiction"
        return BertInputExample(
            uid=uid, text_a=text_a, text_b=text_b, label=label)

    # batch_size comes from the config so that the loader agrees with the
    # value used below to derive max_train_steps (and therefore the warmup
    # and decay schedule). It was previously hard-coded to 32.
    bert_dataloader = BertDataLoader(
        "./data/glue_data/MNLI/train.tsv",
        tokenizer,
        list(mnli_labels),
        max_seq_length=64,
        batch_size=config.batch_size,
        line_processor=mnli_line_processor,
        mode="leveldb")

    num_train_examples = len(bert_dataloader.dataset)
    max_train_steps = (config.epoch * num_train_examples //
                       config.batch_size // trainer_count)
    warmup_steps = int(max_train_steps * config.warmup_proportion)

    print("Trainer count: %d" % trainer_count)
    print("Num train examples: %d" % num_train_examples)
    print("Max train steps: %d" % max_train_steps)
    print("Num warmup steps: %d" % warmup_steps)

    inputs = [
        Input([None, None], 'int64', name='src_ids'),
        Input([None, None], 'int64', name='pos_ids'),
        Input([None, None], 'int64', name='sent_ids'),
        Input([None, None], 'float32', name='input_mask'),
    ]
    labels = [Input([None, 1], 'int64', name='label')]

    cls_model = ClsModelLayer(
        config,
        bert_config,
        len(mnli_labels),
        is_training=True,
        return_pooled_out=True)

    optimizer = Optimizer(
        warmup_steps=warmup_steps,
        num_train_steps=max_train_steps,
        learning_rate=config.learning_rate,
        model_cls=cls_model,
        weight_decay=config.weight_decay,
        scheduler=config.lr_scheduler,
        loss_scaling=config.loss_scaling,
        parameter_list=cls_model.parameters())

    cls_model.prepare(
        optimizer,
        SoftmaxWithCrossEntropy(),
        Accuracy(topk=(1, 2)),
        inputs,
        labels,
        device=device)

    cls_model.bert_layer.init_parameters(
        config.init_pretraining_params, verbose=config.verbose)

    cls_model.fit(train_data=bert_dataloader.dataloader, epochs=config.epoch)

    return cls_model
# Script entry point: run fine-tuning and keep the returned model in `cls_model`.
if
__name__
==
'__main__'
:
cls_model
=
train
()
examples/bert_leveldb/cls.py
0 → 100644
浏览文件 @
1373e294
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"dygraph transformer layers"
import
six
import
json
import
numpy
as
np
import
paddle
import
paddle.fluid
as
fluid
from
paddle.fluid.dygraph
import
Linear
,
Layer
from
hapi.text.bert
import
BertEncoder
from
hapi.model
import
Model
class ClsModelLayer(Model):
    """BERT encoder followed by dropout and a linear classification layer."""

    def __init__(self,
                 args,
                 config,
                 num_labels,
                 is_training=True,
                 return_pooled_out=True,
                 use_fp16=False):
        """Build the encoder and the classification head.

        Args:
            args: object exposing ``loss_scaling``.
            config: BERT configuration, indexed by ``"hidden_size"`` and
                passed on to ``BertEncoder``.
            num_labels: output dimension of the classification layer.
            is_training: stored on the instance; not otherwise used here.
            return_pooled_out: accepted but not used here; the encoder is
                always built with ``return_pooled_out=True``.
            use_fp16: forwarded to ``BertEncoder``.
        """
        super(ClsModelLayer, self).__init__()

        self.config = config
        self.is_training = is_training
        self.use_fp16 = use_fp16
        self.loss_scaling = args.loss_scaling

        self.bert_layer = BertEncoder(
            config=self.config,
            return_pooled_out=True,
            use_fp16=self.use_fp16)

        hidden_size = self.config["hidden_size"]
        weight_attr = fluid.ParamAttr(
            name="cls_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02))
        bias_attr = fluid.ParamAttr(
            name="cls_out_b", initializer=fluid.initializer.Constant(0.))
        self.cls_fc = Linear(
            input_dim=hidden_size,
            output_dim=num_labels,
            param_attr=weight_attr,
            bias_attr=bias_attr)

    def forward(self, src_ids, position_ids, sentence_ids, input_mask):
        """Return classification logits for one batch.

        The second output of the encoder is passed through dropout
        (probability 0.1, "upscale_in_train") and then through the linear
        classification layer.
        """
        _, pooled_feat = self.bert_layer(src_ids, position_ids, sentence_ids,
                                         input_mask)
        dropped_feat = fluid.layers.dropout(
            x=pooled_feat,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train")
        return self.cls_fc(dropped_feat)
examples/bert_leveldb/nohup.out
0 → 100644
浏览文件 @
1373e294
此差异已折叠。
点击以展开。
examples/bert_leveldb/run_classifier_single_gpu.sh
0 → 100755
浏览文件 @
1373e294
#!/bin/bash
# Launch bert_classifier.py for MNLI fine-tuning with only GPU 7 visible.
# Path expansions are quoted so that a path containing spaces or glob
# characters is passed to Python as a single argument.

BERT_BASE_PATH="./data/pretrained_models/uncased_L-12_H-768_A-12/"
# NOTE(review): TASK_NAME is not referenced anywhere else in this script.
TASK_NAME='MNLI'
DATA_PATH="./data/glue_data/MNLI/"
CKPT_PATH="./data/saved_model/mnli_models"

export CUDA_VISIBLE_DEVICES=7

# start fine-tuning
python3.7 bert_classifier.py \
    --use_cuda true \
    --do_train true \
    --do_test true \
    --batch_size 64 \
    --init_pretraining_params "${BERT_BASE_PATH}/dygraph_params/" \
    --data_dir "${DATA_PATH}" \
    --vocab_path "${BERT_BASE_PATH}/vocab.txt" \
    --checkpoints "${CKPT_PATH}" \
    --save_steps 1000 \
    --weight_decay 0.01 \
    --warmup_proportion 0.1 \
    --validation_steps 100 \
    --epoch 3 \
    --max_seq_len 128 \
    --bert_config_path "${BERT_BASE_PATH}/bert_config.json" \
    --learning_rate 5e-5 \
    --skip_steps 10 \
    --shuffle true
hapi/text/bert/dataloader.py
浏览文件 @
1373e294
...
@@ -19,6 +19,7 @@ import csv
...
@@ -19,6 +19,7 @@ import csv
import
glob
import
glob
import
tarfile
import
tarfile
import
itertools
import
itertools
import
leveldb
from
functools
import
partial
from
functools
import
partial
import
numpy
as
np
import
numpy
as
np
...
@@ -167,10 +168,14 @@ class SingleSentenceDataset(Dataset):
...
@@ -167,10 +168,14 @@ class SingleSentenceDataset(Dataset):
assert
isinstance
(
mode
,
assert
isinstance
(
mode
,
str
),
"mode of SingleSentenceDataset should be str"
str
),
"mode of SingleSentenceDataset should be str"
assert
mode
in
[
assert
mode
in
[
"all_in_memory"
,
"leveldb"
"all_in_memory"
,
"leveldb"
,
"streaming"
],
"mode of SingleSentenceDataset should be in [all_in_memory, leveldb], but get"
%
mode
],
"mode of SingleSentenceDataset should be in [all_in_memory, leveldb
, streaming
], but get"
%
mode
self
.
delimiter
=
None
self
.
mode
=
mode
self
.
examples
=
[]
self
.
examples
=
[]
self
.
_db
=
None
self
.
_line_processor
=
None
def
load_all_data_in_memory
(
self
,
def
load_all_data_in_memory
(
self
,
input_file
,
input_file
,
...
@@ -202,14 +207,88 @@ class SingleSentenceDataset(Dataset):
...
@@ -202,14 +207,88 @@ class SingleSentenceDataset(Dataset):
tokenizer
)
tokenizer
)
self
.
examples
.
append
(
input_feature
)
self
.
examples
.
append
(
input_feature
)
def prepare_leveldb(self,
                    input_file,
                    leveldb_file,
                    label_list,
                    max_seq_length,
                    tokenizer,
                    line_processor=None,
                    delimiter="\t",
                    quotechar=None):
    """Open, or build on first use, the leveldb store backing this dataset.

    If ``leveldb_file`` already exists it is opened as-is and ``input_file``
    is not read. Otherwise ``input_file`` is parsed with ``csv.reader``;
    every row for which ``line_processor`` does not return ``None`` is
    re-joined with ``delimiter`` and stored under consecutive integer keys
    starting at 0. The number of stored rows is written under the key
    ``_example_num_``.

    The label list, sequence length, tokenizer, delimiter, database handle
    and line processor are then saved on the instance.
    """

    def default_line_processor(line_id, line):
        # Fallback for two-column rows: text followed by label.
        assert len(line) == 2
        return BertInputExample(
            str(line_id), text_a=line[0], text_b=None, label=line[1])

    if line_processor is None:
        line_processor = default_line_processor

    if os.path.exists(leveldb_file):
        database = leveldb.LevelDB(leveldb_file, create_if_missing=False)
    else:
        print("putting data %s into leveldb %s" % (input_file, leveldb_file))
        database = leveldb.LevelDB(leveldb_file, create_if_missing=True)
        # One counter serves as both the next record key and the running
        # total of stored examples.
        stored = 0
        with io.open(input_file, "r", encoding="utf8") as handle:
            rows = csv.reader(handle, delimiter=delimiter, quotechar=quotechar)
            for row_no, fields in enumerate(rows):
                # The processor is called only to decide whether to keep the
                # row; the raw fields are what gets stored.
                if line_processor(str(row_no), fields) is not None:
                    record = delimiter.join(fields)
                    database.Put(
                        str(stored).encode("utf8"), record.encode("utf8"))
                    stored += 1
        database.Put("_example_num_".encode("utf8"),
                     str(stored).encode("utf8"))

    self.label_list = label_list
    self.max_seq_length = max_seq_length
    self.tokenizer = tokenizer
    self.delimiter = delimiter
    self._db = database
    self._line_processor = line_processor
def
__getitem__
(
self
,
idx
):
def
__getitem__
(
self
,
idx
):
if
self
.
mode
==
"all_in_memory"
:
return
self
.
examples
[
idx
].
input_ids
,
self
.
examples
[
return
self
.
examples
[
idx
].
input_ids
,
self
.
examples
[
idx
].
pos_ids
,
self
.
examples
[
idx
].
segment_ids
,
self
.
examples
[
idx
].
pos_ids
,
self
.
examples
[
idx
].
segment_ids
,
self
.
examples
[
idx
].
label_id
idx
].
label_id
if
self
.
mode
==
"leveldb"
:
assert
self
.
_db
is
not
None
,
"you shold call prepare_leveldb before you run dataloader"
line_str
=
self
.
_db
.
Get
(
str
(
idx
).
encode
(
"utf8"
))
line_str
=
line_str
.
decode
(
"utf8"
)
line
=
line_str
.
split
(
self
.
delimiter
)
input_example
=
self
.
_line_processor
(
str
(
idx
+
1
),
line
)
input_example
=
convert_single_example
(
str
(
idx
+
1
),
input_example
,
self
.
label_list
,
self
.
max_seq_length
,
self
.
tokenizer
)
return
input_example
.
input_ids
,
input_example
.
pos_ids
,
input_example
.
segment_ids
,
input_example
.
label_id
def
__len__
(
self
):
def
__len__
(
self
):
if
self
.
mode
==
"all_in_memory"
:
return
len
(
self
.
examples
)
return
len
(
self
.
examples
)
if
self
.
mode
==
"leveldb"
:
assert
self
.
_db
is
not
None
,
"you shold call prepare_leveldb before you run dataloader"
exmaple_num
=
self
.
_db
.
Get
(
"_example_num_"
.
encode
(
"utf8"
))
exmaple_num
=
exmaple_num
.
decode
(
"utf8"
)
return
int
(
exmaple_num
)
class
SentencePairDataset
(
Dataset
):
class
SentencePairDataset
(
Dataset
):
def
__init__
(
self
,
def
__init__
(
self
,
...
@@ -299,6 +378,7 @@ class BertDataLoader(object):
...
@@ -299,6 +378,7 @@ class BertDataLoader(object):
shuffle
=
False
,
shuffle
=
False
,
drop_last
=
False
,
drop_last
=
False
,
mode
=
"all_in_memory"
,
mode
=
"all_in_memory"
,
leveldb_file
=
"./leveldb"
,
line_processor
=
None
,
line_processor
=
None
,
delimiter
=
"
\t
"
,
delimiter
=
"
\t
"
,
quotechar
=
None
,
quotechar
=
None
,
...
@@ -314,8 +394,10 @@ class BertDataLoader(object):
...
@@ -314,8 +394,10 @@ class BertDataLoader(object):
input_file
,
label_list
,
max_seq_length
,
tokenizer
,
input_file
,
label_list
,
max_seq_length
,
tokenizer
,
line_processor
,
delimiter
,
quotechar
)
line_processor
,
delimiter
,
quotechar
)
elif
mode
==
"leveldb"
:
elif
mode
==
"leveldb"
:
#TODO add leveldb reader
#prepare_leveldb(self, input_file, leveldb_file, label_list, max_seq_length, tokenizer, line_processor=None, delimiter="\t", quotechar=None):
pass
self
.
dataset
.
prepare_leveldb
(
input_file
,
leveldb_file
,
label_list
,
max_seq_length
,
tokenizer
,
line_processor
,
delimiter
,
quotechar
)
else
:
else
:
raise
ValueError
(
"mode should be in [all_in_memory, leveldb]"
)
raise
ValueError
(
"mode should be in [all_in_memory, leveldb]"
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录