PaddlePaddle / PaddleHub
Commit b45479ee
Authored on Mar 29, 2019 by Zeyu Chen
add get_vocab_path api for bert module
Parent: fc72dc60
Showing 4 changed files with 13 additions and 216 deletions (+13 -216):

demo/bert-cls/create_module.py        +0    -182
demo/bert-cls/finetune_with_hub.py    +5    -2
demo/bert-cls/run_create_module.sh    +0    -29
paddle_hub/module/module.py           +8    -3
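Taken together, the change lets downstream code obtain the vocabulary file from the packaged module itself instead of a separate --vocab_path argument. A minimal usage sketch, assuming the module directory was built with vocab.txt included in its assets (the path is illustrative):

import paddle_hub as hub

# Load a packaged BERT module; the directory name is illustrative.
module = hub.Module(module_dir="./chinese_L-12_H-768_A-12.hub_module")

# New in this commit: ask the module for the vocabulary file it ships with.
vocab_path = module.get_vocab_path()
print("vocab_path = {}".format(vocab_path))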
demo/bert-cls/create_module.py
Deleted (mode 100644 → 0)
#   Copyright (c) 2019  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetuning on classification tasks."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import time
import argparse
import numpy as np
import multiprocessing

import paddle
import paddle.fluid as fluid
import paddle_hub as hub

import reader.cls as reader
from model.bert import BertConfig
from model.classifier import create_bert_module
from optimization import optimization
from utils.args import ArgumentGroup, print_arguments
from utils.init import init_pretraining_params, init_checkpoint

# yapf: disable
parser = argparse.ArgumentParser(__doc__)
model_g = ArgumentGroup(parser, "model", "model configuration and paths.")
model_g.add_arg("bert_config_path", str, None, "Path to the json file for bert model config.")
model_g.add_arg("init_checkpoint", str, None, "Init checkpoint to resume training from.")
model_g.add_arg("init_pretraining_params", str, None,
                "Init pre-training params which preforms fine-tuning from. If the "
                "arg 'init_checkpoint' has been set, this argument wouldn't be valid.")
model_g.add_arg("checkpoints", str, "checkpoints", "Path to save checkpoints.")

train_g = ArgumentGroup(parser, "training", "training options.")
train_g.add_arg("epoch", int, 3, "Number of epoches for fine-tuning.")
train_g.add_arg("learning_rate", float, 5e-5, "Learning rate used to train with warmup.")
train_g.add_arg("lr_scheduler", str, "linear_warmup_decay", "scheduler of learning rate.",
                choices=['linear_warmup_decay', 'noam_decay'])
train_g.add_arg("weight_decay", float, 0.01, "Weight decay rate for L2 regularizer.")
train_g.add_arg("warmup_proportion", float, 0.1,
                "Proportion of training steps to perform linear learning rate warmup for.")
train_g.add_arg("save_steps", int, 10000, "The steps interval to save checkpoints.")
train_g.add_arg("validation_steps", int, 1000, "The steps interval to evaluate model performance.")
train_g.add_arg("use_fp16", bool, False, "Whether to use fp16 mixed precision training.")
train_g.add_arg("loss_scaling", float, 1.0,
                "Loss scaling factor for mixed precision training, only valid when use_fp16 is enabled.")

log_g = ArgumentGroup(parser, "logging", "logging related.")
log_g.add_arg("skip_steps", int, 10, "The steps interval to print loss.")
log_g.add_arg("verbose", bool, False, "Whether to output verbose log.")

data_g = ArgumentGroup(parser, "data", "Data paths, vocab paths and data processing options")
data_g.add_arg("data_dir", str, None, "Path to training data.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("in_tokens", bool, False,
               "If set, the batch size will be the maximum number of tokens in one batch. "
               "Otherwise, it will be the maximum number of examples in one batch.")
data_g.add_arg("do_lower_case", bool, True,
               "Whether to lower case the input text. Should be True for uncased models and False for cased models.")
data_g.add_arg("random_seed", int, 0, "Random seed.")

run_type_g = ArgumentGroup(parser, "run_type", "running type options.")
run_type_g.add_arg("use_cuda", bool, True, "If set, use GPU for training.")
run_type_g.add_arg("use_fast_executor", bool, False, "If set, use fast parallel executor (in experiment).")
run_type_g.add_arg("num_iteration_per_drop_scope", int, 1, "Ihe iteration intervals to clean up temporary variables.")
run_type_g.add_arg("task_name", str, None,
                   "The name of task to perform fine-tuning, should be in {'xnli', 'mnli', 'cola', 'mrpc'}.")
run_type_g.add_arg("do_train", bool, True, "Whether to perform training.")
run_type_g.add_arg("do_val", bool, True, "Whether to perform evaluation on dev data set.")
run_type_g.add_arg("do_test", bool, True, "Whether to perform evaluation on test data set.")

args = parser.parse_args()
# yapf: enable.


def evaluate(exe, test_program, test_pyreader, fetch_list, eval_phase):
    test_pyreader.start()
    total_cost, total_acc, total_num_seqs = [], [], []
    time_begin = time.time()
    while True:
        try:
            np_loss, np_acc, np_num_seqs = exe.run(
                program=test_program, fetch_list=fetch_list)
            total_cost.extend(np_loss * np_num_seqs)
            total_acc.extend(np_acc * np_num_seqs)
            total_num_seqs.extend(np_num_seqs)
        except fluid.core.EOFException:
            test_pyreader.reset()
            break
    time_end = time.time()
    print("[%s evaluation] ave loss: %f, ave acc: %f, elapsed time: %f s" %
          (eval_phase, np.sum(total_cost) / np.sum(total_num_seqs),
           np.sum(total_acc) / np.sum(total_num_seqs), time_end - time_begin))


def main(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
        'chnsenticorp': reader.ChnsenticorpProcessor
    }

    processor = processors[task_name](
        data_dir=args.data_dir,
        vocab_path=args.vocab_path,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    startup_prog = fluid.Program()
    train_program = fluid.Program()

    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            src_ids, pos_ids, sent_ids, input_mask, pooled_output, sequence_output = create_bert_module(
                args,
                pyreader_name='train_reader',
                bert_config=bert_config,
                num_labels=num_labels)

            exe = fluid.Executor(place)
            exe.run(startup_prog)

            init_pretraining_params(
                exe,
                args.init_pretraining_params,
                main_program=startup_prog,
                use_fp16=args.use_fp16)

            pooled_output_sign = hub.create_signature(
                "pooled_output",
                inputs=[src_ids, pos_ids, sent_ids, input_mask],
                outputs=[pooled_output],
                feed_names=["src_ids", "pos_ids", "sent_ids", "input_mask"],
                fetch_names=["pooled_output"])

            sequence_output_sign = hub.create_signature(
                "sequence_output",
                inputs=[src_ids, pos_ids, sent_ids, input_mask],
                outputs=[sequence_output],
                feed_names=["src_ids", "pos_ids", "sent_ids", "input_mask"],
                fetch_names=["sequence_output"])

            hub.create_module(
                sign_arr=[pooled_output_sign, sequence_output_sign],
                module_dir="./chinese_L-12_H-768_A-12.hub_module",
                exe=exe,
                assets=[])


if __name__ == '__main__':
    print_arguments(args)
    main(args)
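Stripped of the BERT specifics, the packaging flow used by the deleted demo above boils down to: build a fluid program, wrap the tensors you want to expose in hub.create_signature, and write everything out with hub.create_module. A toy sketch under the same API usage as the demo (the network, names, and paths are illustrative, not part of this commit):

import paddle.fluid as fluid
import paddle_hub as hub

# A toy one-layer network standing in for the BERT graph above.
startup_prog = fluid.Program()
main_prog = fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.layers.data(name="x", shape=[10], dtype="float32")
    y = fluid.layers.fc(input=x, size=1)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup_prog)

# One signature per computation the module should expose to callers.
sign = hub.create_signature(
    "default", inputs=[x], outputs=[y], feed_names=["x"], fetch_names=["y"])

# Package the program, signature, and (optionally) asset files into a module directory.
hub.create_module(
    sign_arr=[sign], module_dir="./toy.hub_module", exe=exe, assets=[])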
demo/bert-cls/finetune_with_hub.py
@@ -89,9 +89,13 @@ if __name__ == '__main__':
         optimizer=None,
         warmup_proportion=args.warmup_proportion)
 
+    module = hub.Module(
+        module_dir="./hub_module/chinese_L-12_H-768_A-12.hub_module")
+    print("vocab_path = {}".format(module.get_vocab_path()))
+
     processor = reader.ChnsenticorpProcessor(
         data_dir=args.data_dir,
-        vocab_path=args.vocab_path,
+        vocab_path=module.get_vocab_path(),
         max_seq_len=args.max_seq_len,
         do_lower_case=args.do_lower_case,
         in_tokens=args.in_tokens,
@@ -100,7 +104,6 @@ if __name__ == '__main__':
     num_labels = len(processor.get_labels())
 
     # loading paddlehub BERT
-    module = hub.Module(module_dir="./chinese_L-12_H-768_A-12.hub_module")
 
     # bert's input tensor, output tensor and forward graph
     # If you want to fine-tune the pretrain model parameter, please set
demo/bert-cls/run_create_module.sh
Deleted (mode 100644 → 0)
export FLAGS_enable_parallel_graph=1
export FLAGS_sync_nccl_allreduce=1
export CUDA_VISIBLE_DEVICES=0

BERT_BASE_PATH="chinese_L-12_H-768_A-12"
TASK_NAME='chnsenticorp'
DATA_PATH=chnsenticorp_data
CKPT_PATH=chn_checkpoints

python -u create_module.py --task_name ${TASK_NAME} \
                   --use_cuda true \
                   --do_train true \
                   --do_val true \
                   --do_test true \
                   --batch_size 4096 \
                   --in_tokens true \
                   --init_pretraining_params ${BERT_BASE_PATH}/params \
                   --data_dir ${DATA_PATH} \
                   --vocab_path ${BERT_BASE_PATH}/vocab.txt \
                   --checkpoints ${CKPT_PATH} \
                   --save_steps 100 \
                   --weight_decay 0.01 \
                   --warmup_proportion 0.0 \
                   --validation_steps 50 \
                   --epoch 3 \
                   --max_seq_len 128 \
                   --bert_config_path ${BERT_BASE_PATH}/bert_config.json \
                   --learning_rate 5e-5 \
                   --skip_steps 10
paddle_hub/module/module.py
@@ -98,7 +98,6 @@ class Module:
         self.default_signature = None
         self.module_info = None
         self.processor = None
         self.assets = []
         self.name = "temp"
         if url:
             self._init_with_url(url=url)
@@ -111,8 +110,8 @@ class Module:
         ), "processor should be sub class of hub.BaseProcessor"
         if assets:
             self.assets = utils.to_list(assets)
-            for asset in assets:
-                utils.check_path(assets)
+            # for asset in assets:
+            #     utils.check_path(assets)
         self.processor = processor
         self._generate_module_info(module_info)
         self._init_with_signature(signatures=signatures)
@@ -254,6 +253,12 @@ class Module:
             self.__dict__[sign] = functools.partial(
                 self.__call__, sign_name=sign)
 
+    def get_vocab_path(self):
+        for assets_file in self.assets:
+            print(assets_file)
+            if "vocab.txt" in assets_file:
+                return assets_file
+
     def _recover_from_desc(self):
         # recover signature
         for sign, module_var in self.desc.sign2var.items():
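The new get_vocab_path simply scans the module's packaged assets for a path containing "vocab.txt"; if the module was created with assets=[] (as in the deleted bert-cls demo above), there is nothing to find and the method returns None. A self-contained sketch mirroring that lookup logic (the sample paths are illustrative):

def find_vocab(assets):
    # Mirrors Module.get_vocab_path(): return the first packaged asset whose
    # path contains "vocab.txt"; implicitly returns None when none matches.
    for assets_file in assets:
        if "vocab.txt" in assets_file:
            return assets_file

print(find_vocab([]))                                       # None (no assets packaged)
print(find_vocab(["./chinese_L-12_H-768_A-12/vocab.txt"]))  # ./chinese_L-12_H-768_A-12/vocab.txt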