Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleHub
提交
41f6d967
P
PaddleHub
项目概览
PaddlePaddle
/
PaddleHub
大约 1 年 前同步成功
通知
282
Star
12117
Fork
2091
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
200
列表
看板
标记
里程碑
合并请求
4
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleHub
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
200
Issue
200
列表
看板
标记
里程碑
合并请求
4
合并请求
4
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
41f6d967
编写于
3月 10, 2020
作者:
K
kinghuin
提交者:
GitHub
3月 10, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add BERTEmbeddingTask and BERTModule (#424)
* add bertmodule
上级
b547f728
变更
4
显示空白变更内容
内联
并排
Showing
4 changed file
with
258 addition
and
8 deletion
+258
-8
demo/text_classification/text_classifier.py
demo/text_classification/text_classifier.py
+0
-1
paddlehub/__init__.py
paddlehub/__init__.py
+2
-0
paddlehub/module/module.py
paddlehub/module/module.py
+0
-7
paddlehub/module/nlp_module.py
paddlehub/module/nlp_module.py
+256
-0
未找到文件。
demo/text_classification/text_classifier.py
浏览文件 @
41f6d967
...
...
@@ -16,7 +16,6 @@
import
argparse
import
ast
import
paddle.fluid
as
fluid
import
paddlehub
as
hub
# yapf: disable
...
...
paddlehub/__init__.py
浏览文件 @
41f6d967
...
...
@@ -63,3 +63,5 @@ from .finetune.strategy import ULMFiTStrategy
from
.finetune.strategy
import
CombinedStrategy
from
.autofinetune.evaluator
import
report_final_result
from
.module.nlp_module
import
BERTModule
paddlehub/module/module.py
浏览文件 @
41f6d967
...
...
@@ -23,8 +23,6 @@ import sys
import
functools
import
inspect
import
importlib
import
tarfile
import
six
import
shutil
import
paddle
...
...
@@ -36,15 +34,10 @@ from paddlehub.common.dir import CACHE_HOME
from
paddlehub.common.lock
import
lock
from
paddlehub.common.logger
import
logger
from
paddlehub.common.hub_server
import
CacheUpdater
from
paddlehub.common
import
tmp_dir
from
paddlehub.common.downloader
import
progress
from
paddlehub.module
import
module_desc_pb2
from
paddlehub.module.manager
import
default_module_manager
from
paddlehub.module.checker
import
ModuleChecker
from
paddlehub.module.signature
import
Signature
,
create_signature
from
paddlehub.module.base_processor
import
BaseProcessor
from
paddlehub.io.parser
import
yaml_parser
from
paddlehub
import
version
# PaddleHub module dir name
ASSETS_DIRNAME
=
"assets"
...
...
paddlehub/module/nlp_module.py
0 → 100644
浏览文件 @
41f6d967
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
os
import
re
import
paddlehub
as
hub
import
paddle.fluid
as
fluid
from
paddlehub
import
logger
class _BERTEmbeddingTask(hub.BaseTask):
    """Internal task that exposes BERT features as prediction results.

    The feature tensors are produced by an already-built program (the
    module's ``context()``), so this task reuses that program rather than
    constructing a network of its own, and its post-processing simply
    pairs up the pooled and sequence features per sample.
    """

    def __init__(self,
                 pooled_feature,
                 seq_feature,
                 feed_list,
                 data_reader,
                 config=None):
        # The feature variables carry a reference back to the program that
        # created them; adopt it as this task's main program.
        owning_program = pooled_feature.block.program
        super(_BERTEmbeddingTask, self).__init__(
            main_program=owning_program,
            data_reader=data_reader,
            feed_list=feed_list,
            config=config,
            metrics_choices=[])
        self.pooled_feature = pooled_feature
        self.seq_feature = seq_feature

    def _build_net(self):
        # Network already exists — just hand back its two output tensors.
        return [self.pooled_feature, self.seq_feature]

    def _postprocessing(self, run_states):
        """Flatten batched run results into one [pooled, seq] pair per sample."""
        results = []
        for state in run_states:
            pooled_batch = state.run_results[0]
            seq_batch = state.run_results[1]
            for idx in range(len(pooled_batch)):
                results.append([pooled_batch[idx], seq_batch[idx]])
        return results
class BERTModule(hub.Module):
    """Base Module for BERT-style pre-trained models.

    A concrete subclass must override ``_initialize`` to fill in the
    configuration attributes listed there, and must provide a ``net``
    method (called by ``context``) that builds the transformer and
    returns ``(pooled_output, sequence_output)``.
    """

    def _initialize(self):
        """
        Must override this method.
        Some member variables are required, others are optional.
        """
        # required config
        self.MAX_SEQ_LEN = None  # maximum sequence length the model supports
        self.params_path = None  # directory containing pre-trained parameter files
        self.vocab_path = None  # vocabulary file path

        # optional config
        self.spm_path = None  # sentencepiece model path, if the model uses one
        self.word_dict_path = None  # word dict path, if the model uses one
        # Base class is abstract: subclasses must set the attributes above
        # and must NOT propagate this raise.
        raise NotImplementedError

    def init_pretraining_params(self, exe, pretraining_params_path,
                                main_program):
        """Load pre-trained parameters from disk into ``main_program``.

        Only variables that are (a) Parameters of the program and
        (b) present as files under ``pretraining_params_path`` are loaded;
        everything else keeps its initialized value.

        Args:
            exe: a fluid Executor used to run the load op.
            pretraining_params_path (str): directory of saved parameters.
            main_program: the fluid Program whose parameters are loaded
                (the startup program in ``context``).
        """
        assert os.path.exists(
            pretraining_params_path
        ), "[%s] cann't be found." % pretraining_params_path

        def existed_params(var):
            # Load a variable only if it is a Parameter and a file with
            # the same name exists in the checkpoint directory.
            if not isinstance(var, fluid.framework.Parameter):
                return False
            return os.path.exists(
                os.path.join(pretraining_params_path, var.name))

        fluid.io.load_vars(
            exe,
            pretraining_params_path,
            main_program=main_program,
            predicate=existed_params)
        logger.info("Load pretraining parameters from {}.".format(
            pretraining_params_path))

    def context(
            self,
            max_seq_len=128,
            trainable=True,
    ):
        """
        Get inputs, outputs and program from the pre-trained module.

        Args:
            max_seq_len (int): the max sequence length, in [1, self.MAX_SEQ_LEN].
            trainable (bool): whether the pre-trained parameters are optimized
                during fine-tuning.

        Returns: inputs, outputs, program.
            inputs is a dict with keys input_ids, position_ids, segment_ids
            and input_mask.
            outputs is a dict with keys pooled_output and sequence_output
            (also aliased under integer keys 0 and 1).
        """
        assert max_seq_len <= self.MAX_SEQ_LEN and max_seq_len >= 1, "max_seq_len({}) should be in the range of [1, {}]".format(
            max_seq_len, self.MAX_SEQ_LEN)

        module_program = fluid.Program()
        startup_program = fluid.Program()
        with fluid.program_guard(module_program, startup_program):
            # Prefix all variable names so several modules can coexist in
            # one program without name collisions.
            with fluid.unique_name.guard("@HUB_%s@" % self.name):
                input_ids = fluid.layers.data(
                    name='input_ids',
                    shape=[-1, max_seq_len, 1],
                    dtype='int64',
                    lod_level=0)
                position_ids = fluid.layers.data(
                    name='position_ids',
                    shape=[-1, max_seq_len, 1],
                    dtype='int64',
                    lod_level=0)
                segment_ids = fluid.layers.data(
                    name='segment_ids',
                    shape=[-1, max_seq_len, 1],
                    dtype='int64',
                    lod_level=0)
                input_mask = fluid.layers.data(
                    name='input_mask',
                    shape=[-1, max_seq_len, 1],
                    dtype='float32',
                    lod_level=0)
                # ``net`` is supplied by the concrete subclass and builds
                # the actual transformer.
                pooled_output, sequence_output = self.net(
                    input_ids, position_ids, segment_ids, input_mask)

        inputs = {
            'input_ids': input_ids,
            'position_ids': position_ids,
            'segment_ids': segment_ids,
            'input_mask': input_mask,
        }
        outputs = {
            "pooled_output": pooled_output,
            "sequence_output": sequence_output,
            0: pooled_output,
            1: sequence_output
        }

        # Run the startup program on CPU once to create the parameters,
        # then overwrite them with the pre-trained checkpoint.
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_program)
        self.init_pretraining_params(
            exe, self.params_path, main_program=startup_program)

        # Map each parameter name to its transformer layer index, so that
        # strategies like ULMFiT can apply per-layer learning rates.
        self.params_layer = {}
        for param in module_program.global_block().iter_parameters():
            param.trainable = trainable
            match = re.match(r'.*layer_(\d+).*', param.name)
            if match:
                # layer num begins from 0
                layer = match.group(1)
                self.params_layer[param.name] = int(layer)

        return inputs, outputs, module_program

    def get_embedding(self, texts, use_gpu=False, batch_size=1):
        """
        Get pooled_output and sequence_output for input texts.
        Warnings: this method depends on Paddle Inference Library, it may not work properly in PaddlePaddle < 1.6.2.

        Args:
            texts (list): each element is a text sample, each sample include text_a and text_b where text_b can be omitted.
                for example: [[sample0_text_a, sample0_text_b], [sample1_text_a, sample1_text_b], ...]
            use_gpu (bool): use gpu or not, default False.
            batch_size (int): the data batch size, default 1.

        Returns:
            pooled_outputs(list): its element is a numpy array, the first feature of each text sample.
            sequence_outputs(list): its element is a numpy array, the whole features of each text sample.
        """
        # Lazily build (and cache) the embedding task; rebuild only when
        # batch_size or the device choice changes.
        if not hasattr(
                self, "emb_job"
        ) or self.emb_job["batch_size"] != batch_size or self.emb_job[
                "use_gpu"] != use_gpu:
            inputs, outputs, program = self.context(
                trainable=True, max_seq_len=self.MAX_SEQ_LEN)
            # NOTE(review): the two hasattr guards below are asymmetric —
            # one tests the method name "get_spm_path", the other the
            # attribute name "word_dict_path"; confirm this is intentional.
            reader = hub.reader.ClassifyReader(
                dataset=None,
                vocab_path=self.get_vocab_path(),
                max_seq_len=self.MAX_SEQ_LEN,
                sp_model_path=self.get_spm_path() if hasattr(
                    self, "get_spm_path") else None,
                word_dict_path=self.get_word_dict_path() if hasattr(
                    self, "word_dict_path") else None)
            feed_list = [
                inputs["input_ids"].name,
                inputs["position_ids"].name,
                inputs["segment_ids"].name,
                inputs["input_mask"].name,
            ]
            pooled_feature, seq_feature = outputs["pooled_output"], outputs[
                "sequence_output"]
            config = hub.RunConfig(
                use_data_parallel=False, use_cuda=use_gpu,
                batch_size=batch_size)
            self.emb_job = {}
            self.emb_job["task"] = _BERTEmbeddingTask(
                pooled_feature=pooled_feature,
                seq_feature=seq_feature,
                feed_list=feed_list,
                data_reader=reader,
                config=config,
            )
            self.emb_job["batch_size"] = batch_size
            self.emb_job["use_gpu"] = use_gpu

        return self.emb_job["task"].predict(
            data=texts, return_result=True, accelerate_mode=True)

    def get_vocab_path(self):
        """Return the path of the module's vocabulary file."""
        return self.vocab_path

    def get_spm_path(self):
        """Return the sentencepiece model path, or None if not configured."""
        if hasattr(self, "spm_path"):
            return self.spm_path
        else:
            return None

    def get_word_dict_path(self):
        """Return the word dict path, or None if not configured."""
        if hasattr(self, "word_dict_path"):
            return self.word_dict_path
        else:
            return None

    def get_params_layer(self):
        """Return the parameter-name -> layer-index map built by ``context``.

        Raises:
            AttributeError: if ``context()`` has not been called yet.
        """
        if not hasattr(self, "params_layer"):
            raise AttributeError(
                "The module context has not been initialized. "
                "Please call context() before using get_params_layer")
        return self.params_layer
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录