未验证 提交 41f6d967 编写于 作者: K kinghuin 提交者: GitHub

add BERTEmbeddingTask and BERTModule (#424)

* add bertmodule
上级 b547f728
...@@ -16,7 +16,6 @@ ...@@ -16,7 +16,6 @@
import argparse import argparse
import ast import ast
import paddle.fluid as fluid
import paddlehub as hub import paddlehub as hub
# yapf: disable # yapf: disable
......
...@@ -63,3 +63,5 @@ from .finetune.strategy import ULMFiTStrategy ...@@ -63,3 +63,5 @@ from .finetune.strategy import ULMFiTStrategy
from .finetune.strategy import CombinedStrategy from .finetune.strategy import CombinedStrategy
from .autofinetune.evaluator import report_final_result from .autofinetune.evaluator import report_final_result
from .module.nlp_module import BERTModule
...@@ -23,8 +23,6 @@ import sys ...@@ -23,8 +23,6 @@ import sys
import functools import functools
import inspect import inspect
import importlib import importlib
import tarfile
import six
import shutil import shutil
import paddle import paddle
...@@ -36,15 +34,10 @@ from paddlehub.common.dir import CACHE_HOME ...@@ -36,15 +34,10 @@ from paddlehub.common.dir import CACHE_HOME
from paddlehub.common.lock import lock from paddlehub.common.lock import lock
from paddlehub.common.logger import logger from paddlehub.common.logger import logger
from paddlehub.common.hub_server import CacheUpdater from paddlehub.common.hub_server import CacheUpdater
from paddlehub.common import tmp_dir
from paddlehub.common.downloader import progress
from paddlehub.module import module_desc_pb2 from paddlehub.module import module_desc_pb2
from paddlehub.module.manager import default_module_manager from paddlehub.module.manager import default_module_manager
from paddlehub.module.checker import ModuleChecker from paddlehub.module.checker import ModuleChecker
from paddlehub.module.signature import Signature, create_signature from paddlehub.module.signature import Signature, create_signature
from paddlehub.module.base_processor import BaseProcessor
from paddlehub.io.parser import yaml_parser
from paddlehub import version
# PaddleHub module dir name # PaddleHub module dir name
ASSETS_DIRNAME = "assets" ASSETS_DIRNAME = "assets"
......
#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import re
import paddlehub as hub
import paddle.fluid as fluid
from paddlehub import logger
class _BERTEmbeddingTask(hub.BaseTask):
def __init__(self,
pooled_feature,
seq_feature,
feed_list,
data_reader,
config=None):
main_program = pooled_feature.block.program
super(_BERTEmbeddingTask, self).__init__(
main_program=main_program,
data_reader=data_reader,
feed_list=feed_list,
config=config,
metrics_choices=[])
self.pooled_feature = pooled_feature
self.seq_feature = seq_feature
def _build_net(self):
return [self.pooled_feature, self.seq_feature]
def _postprocessing(self, run_states):
results = []
for batch_state in run_states:
batch_result = batch_state.run_results
batch_pooled_features = batch_result[0]
batch_seq_features = batch_result[1]
for i in range(len(batch_pooled_features)):
results.append(
[batch_pooled_features[i], batch_seq_features[i]])
return results
class BERTModule(hub.Module):
def _initialize(self):
"""
Must override this method.
some member variables are required, others are optional.
"""
# required config
self.MAX_SEQ_LEN = None
self.params_path = None
self.vocab_path = None
# optional config
self.spm_path = None
self.word_dict_path = None
raise NotImplementedError
def init_pretraining_params(self, exe, pretraining_params_path,
main_program):
assert os.path.exists(
pretraining_params_path
), "[%s] cann't be found." % pretraining_params_path
def existed_params(var):
if not isinstance(var, fluid.framework.Parameter):
return False
return os.path.exists(
os.path.join(pretraining_params_path, var.name))
fluid.io.load_vars(
exe,
pretraining_params_path,
main_program=main_program,
predicate=existed_params)
logger.info("Load pretraining parameters from {}.".format(
pretraining_params_path))
def context(
self,
max_seq_len=128,
trainable=True,
):
"""
get inputs, outputs and program from pre-trained module
Args:
max_seq_len (int): the max sequence length
trainable (bool): optimizing the pre-trained module params during training or not
Returns: inputs, outputs, program.
The inputs is a dict with keys named input_ids, position_ids, segment_ids, input_mask and task_ids
The outputs is a dict with two keys named pooled_output and sequence_output.
"""
assert max_seq_len <= self.MAX_SEQ_LEN and max_seq_len >= 1, "max_seq_len({}) should be in the range of [1, {}]".format(
max_seq_len, self.MAX_SEQ_LEN)
module_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(module_program, startup_program):
with fluid.unique_name.guard("@HUB_%s@" % self.name):
input_ids = fluid.layers.data(
name='input_ids',
shape=[-1, max_seq_len, 1],
dtype='int64',
lod_level=0)
position_ids = fluid.layers.data(
name='position_ids',
shape=[-1, max_seq_len, 1],
dtype='int64',
lod_level=0)
segment_ids = fluid.layers.data(
name='segment_ids',
shape=[-1, max_seq_len, 1],
dtype='int64',
lod_level=0)
input_mask = fluid.layers.data(
name='input_mask',
shape=[-1, max_seq_len, 1],
dtype='float32',
lod_level=0)
pooled_output, sequence_output = self.net(
input_ids, position_ids, segment_ids, input_mask)
inputs = {
'input_ids': input_ids,
'position_ids': position_ids,
'segment_ids': segment_ids,
'input_mask': input_mask,
}
outputs = {
"pooled_output": pooled_output,
"sequence_output": sequence_output,
0: pooled_output,
1: sequence_output
}
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup_program)
self.init_pretraining_params(
exe, self.params_path, main_program=startup_program)
self.params_layer = {}
for param in module_program.global_block().iter_parameters():
param.trainable = trainable
match = re.match(r'.*layer_(\d+).*', param.name)
if match:
# layer num begins from 0
layer = match.group(1)
self.params_layer[param.name] = int(layer)
return inputs, outputs, module_program
def get_embedding(self, texts, use_gpu=False, batch_size=1):
"""
get pooled_output and sequence_output for input texts.
Warnings: this method depends on Paddle Inference Library, it may not work properly in PaddlePaddle < 1.6.2.
Args:
texts (list): each element is a text sample, each sample include text_a and text_b where text_b can be omitted.
for example: [[sample0_text_a, sample0_text_b], [sample1_text_a, sample1_text_b], ...]
use_gpu (bool): use gpu or not, default False.
batch_size (int): the data batch size, default 1.
Returns:
pooled_outputs(list): its element is a numpy array, the first feature of each text sample.
sequence_outputs(list): its element is a numpy array, the whole features of each text sample.
"""
if not hasattr(
self, "emb_job"
) or self.emb_job["batch_size"] != batch_size or self.emb_job[
"use_gpu"] != use_gpu:
inputs, outputs, program = self.context(
trainable=True, max_seq_len=self.MAX_SEQ_LEN)
reader = hub.reader.ClassifyReader(
dataset=None,
vocab_path=self.get_vocab_path(),
max_seq_len=self.MAX_SEQ_LEN,
sp_model_path=self.get_spm_path() if hasattr(
self, "get_spm_path") else None,
word_dict_path=self.get_word_dict_path() if hasattr(
self, "word_dict_path") else None)
feed_list = [
inputs["input_ids"].name,
inputs["position_ids"].name,
inputs["segment_ids"].name,
inputs["input_mask"].name,
]
pooled_feature, seq_feature = outputs["pooled_output"], outputs[
"sequence_output"]
config = hub.RunConfig(
use_data_parallel=False,
use_cuda=use_gpu,
batch_size=batch_size)
self.emb_job = {}
self.emb_job["task"] = _BERTEmbeddingTask(
pooled_feature=pooled_feature,
seq_feature=seq_feature,
feed_list=feed_list,
data_reader=reader,
config=config,
)
self.emb_job["batch_size"] = batch_size
self.emb_job["use_gpu"] = use_gpu
return self.emb_job["task"].predict(
data=texts, return_result=True, accelerate_mode=True)
def get_vocab_path(self):
return self.vocab_path
def get_spm_path(self):
if hasattr(self, "spm_path"):
return self.spm_path
else:
return None
def get_word_dict_path(self):
if hasattr(self, "word_dict_path"):
return self.word_dict_path
else:
return None
def get_params_layer(self):
if not hasattr(self, "params_layer"):
raise AttributeError(
"The module context has not been initialized. "
"Please call context() before using get_params_layer")
return self.params_layer
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册