Unverified commit afba7194, authored by jm_12138, committed by GitHub

update lac (#2025)

Co-authored-by: wuzewu <wuzewu@baidu.com>
Co-authored-by: chenjian <chenjian26@baidu.com>
Parent: cfd8f7f5
@@ -283,10 +283,10 @@
   升级自定义词典功能,支持增加不属于lac默认提供的词性
 
-* 2.2.1
+* 2.3.0
 
   移除 fluid api
 
 - ```shell
-  $ hub install lac==2.2.1
+  $ hub install lac==2.3.0
   ```
@@ -6,25 +6,20 @@ from __future__ import print_function
 import argparse
 import ast
 import io
-import json
 import math
 import os
 
 import numpy as np
-import paddle
 import six
-from lac.custom import Customization
-from lac.processor import load_kv_dict
-from lac.processor import parse_result
-from lac.processor import word_to_ids
+from .custom import Customization
+from .processor import load_kv_dict
+from .processor import parse_result
+from .processor import word_to_ids
 from paddle.inference import Config
 from paddle.inference import create_predictor
 
-import paddlehub as hub
-from paddlehub.common.logger import logger
-from paddlehub.common.paddle_helper import add_vars_prefix
-from paddlehub.common.utils import sys_stdin_encoding
-from paddlehub.io.parser import txt_parser
+from paddlehub.utils.utils import sys_stdin_encoding
+from paddlehub.utils.parser import txt_parser
 from paddlehub.module.module import moduleinfo
 from paddlehub.module.module import runnable
 from paddlehub.module.module import serving
@@ -38,19 +33,18 @@ class DataFormatError(Exception):
 @moduleinfo(
     name="lac",
-    version="2.2.1",
+    version="2.3.0",
     summary=
     "Baidu's open-source lexical analysis tool for Chinese, including word segmentation, part-of-speech tagging & named entity recognition",
     author="baidu-nlp",
     author_email="paddle-dev@baidu.com",
     type="nlp/lexical_analysis")
-class LAC(hub.Module):
-
-    def _initialize(self, user_dict=None):
+class LAC:
+    def __init__(self, user_dict=None):
         """
         initialize with the necessary elements
         """
-        self.pretrained_model_path = os.path.join(self.directory, "infer_model")
+        self.default_pretrained_model_path = os.path.join(self.directory, "infer_model", "model")
         self.word2id_dict = load_kv_dict(os.path.join(self.directory, "assets/word.dic"), reverse=True, value_func=int)
         self.id2word_dict = load_kv_dict(os.path.join(self.directory, "assets/word.dic"))
         self.label2id_dict = load_kv_dict(os.path.join(self.directory, "assets/tag.dic"), reverse=True, value_func=int)
@@ -72,7 +66,9 @@ class LAC(hub.Module):
         """
         predictor config setting
         """
-        cpu_config = Config(self.pretrained_model_path)
+        model = self.default_pretrained_model_path+'.pdmodel'
+        params = self.default_pretrained_model_path+'.pdiparams'
+        cpu_config = Config(model, params)
         cpu_config.disable_glog_info()
         cpu_config.disable_gpu()
         self.cpu_predictor = create_predictor(cpu_config)
@@ -84,7 +80,7 @@ class LAC(hub.Module):
         except:
             use_gpu = False
         if use_gpu:
-            gpu_config = Config(self.pretrained_model_path)
+            gpu_config = Config(model, params)
             gpu_config.disable_glog_info()
             gpu_config.enable_use_gpu(memory_pool_init_size_mb=500, device_id=0)
             self.gpu_predictor = create_predictor(gpu_config)
......
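For readers unfamiliar with the API change in the hunks above: the newer `paddle.inference` entry point takes the model and parameter files separately (`.pdmodel` / `.pdiparams`) rather than a single model directory. A minimal sketch of the same predictor setup, with a placeholder path prefix:

```python
from paddle.inference import Config, create_predictor

# Placeholder prefix; the module derives its own as
# os.path.join(self.directory, "infer_model", "model").
model_prefix = "infer_model/model"

config = Config(model_prefix + ".pdmodel", model_prefix + ".pdiparams")
config.disable_glog_info()  # suppress inference logging, as the module does
config.disable_gpu()        # CPU variant; the GPU path calls enable_use_gpu(...) instead
predictor = create_predictor(config)
```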
# -*- coding:utf-8 -*-
import paddle.fluid as fluid


def lex_net(word_dict_len, label_dict_len):
    """
    define the lexical analysis network structure
    """
    word_emb_dim = 128
    grnn_hidden_dim = 128
    emb_lr = 2
    crf_lr = 0.2
    bigru_num = 2
    init_bound = 0.1
    IS_SPARSE = True

    def _bigru_layer(input_feature):
        """
        define the bidirectional gru layer
        """
        pre_gru = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))
        gru = fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden_dim,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))
        pre_gru_r = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))
        gru_r = fluid.layers.dynamic_gru(
            input=pre_gru_r,
            size=grnn_hidden_dim,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))
        bi_merge = fluid.layers.concat(input=[gru, gru_r], axis=1)
        return bi_merge

    def _net_conf(word):
        """
        Configure the network
        """
        word_embedding = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_emb_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(
                learning_rate=emb_lr,
                name="word_emb",
                initializer=fluid.initializer.Uniform(low=-init_bound, high=init_bound)))

        input_feature = word_embedding
        for i in range(bigru_num):
            bigru_output = _bigru_layer(input_feature)
            input_feature = bigru_output

        emission = fluid.layers.fc(
            size=label_dict_len,
            input=bigru_output,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))

        size = emission.shape[1]
        fluid.layers.create_parameter(shape=[size + 2, size], dtype=emission.dtype, name='crfw')
        crf_decode = fluid.layers.crf_decoding(input=emission, param_attr=fluid.ParamAttr(name='crfw'))
        return crf_decode, emission

    word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
    crf_decode, emission = _net_conf(word)
    return crf_decode, word, emission
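For historical context, the `lex_net` definition above (dropped in this release along with the rest of the fluid code) stacks two bidirectional GRU layers over a word embedding and decodes with a CRF. A hedged sketch of how such a graph would have been instantiated under the legacy fluid static-graph API; the dictionary sizes below are placeholders, not the module's real vocabulary sizes:

```python
import paddle.fluid as fluid

# Hypothetical sizes; the real values come from assets/word.dic and assets/tag.dic.
word_dict_len, label_dict_len = 20940, 57

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    # Build the BiGRU-CRF graph defined above inside this program pair.
    crf_decode, word, emission = lex_net(word_dict_len, label_dict_len)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup_prog)  # initialize parameters before running inference
```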
 # -*- coding:utf-8 -*-
 import io
-import os
 import numpy as np
 import six
......
import os
import shutil
import unittest

import paddlehub as hub

os.environ['CUDA_VISIBLE_DEVICES'] = '0'


class TestHubModule(unittest.TestCase):

    @classmethod
    def setUpClass(cls) -> None:
        cls.text = "今天是个好日子"
        cls.texts = ["今天是个好日子", "天气预报说今天要下雨", "下一班地铁马上就要到了"]
        cls.module = hub.Module(name="lac")

    @classmethod
    def tearDownClass(cls) -> None:
        shutil.rmtree('inference')

    def test_cut1(self):
        results = self.module.cut(
            text=self.text,
            use_gpu=False,
            batch_size=1,
            return_tag=False
        )
        self.assertEqual(results, ['今天', '是', '个', '好日子'])

    def test_cut2(self):
        results = self.module.cut(
            text=self.texts,
            use_gpu=False,
            batch_size=1,
            return_tag=False
        )
        self.assertEqual(results, [
            {'word': ['今天', '是', '个', '好日子']},
            {'word': ['天气预报', '说', '今天', '要', '下雨']},
            {'word': ['下', '一班', '地铁', '马上', '就要', '到', '了']}
        ])

    def test_cut3(self):
        results = self.module.cut(
            text=self.texts,
            use_gpu=False,
            batch_size=2,
            return_tag=False
        )
        self.assertEqual(results, [
            {'word': ['今天', '是', '个', '好日子']},
            {'word': ['天气预报', '说', '今天', '要', '下雨']},
            {'word': ['下', '一班', '地铁', '马上', '就要', '到', '了']}
        ])

    def test_cut4(self):
        results = self.module.cut(
            text=self.texts,
            use_gpu=True,
            batch_size=2,
            return_tag=False
        )
        self.assertEqual(results, [
            {'word': ['今天', '是', '个', '好日子']},
            {'word': ['天气预报', '说', '今天', '要', '下雨']},
            {'word': ['下', '一班', '地铁', '马上', '就要', '到', '了']}
        ])

    def test_cut5(self):
        results = self.module.cut(
            text=self.texts,
            use_gpu=True,
            batch_size=2,
            return_tag=True
        )
        self.assertEqual(results, [
            {
                'word': ['今天', '是', '个', '好日子'],
                'tag': ['TIME', 'v', 'q', 'n']
            },
            {
                'word': ['天气预报', '说', '今天', '要', '下雨'],
                'tag': ['n', 'v', 'TIME', 'v', 'v']
            },
            {
                'word': ['下', '一班', '地铁', '马上', '就要', '到', '了'],
                'tag': ['f', 'm', 'n', 'd', 'v', 'v', 'xc']
            }
        ])

    def test_save_inference_model(self):
        self.module.save_inference_model('./inference/model')
        self.assertTrue(os.path.exists('./inference/model.pdmodel'))
        self.assertTrue(os.path.exists('./inference/model.pdiparams'))


if __name__ == '__main__':
    unittest.main()
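Outside the test harness, the same `cut` API can be called directly. A minimal sketch, assuming the module is installed via `hub install lac==2.3.0`:

```python
import paddlehub as hub

lac = hub.Module(name="lac")

# A single string returns a flat token list (cf. test_cut1).
print(lac.cut(text="今天是个好日子", use_gpu=False, batch_size=1, return_tag=False))

# A list of strings returns one dict per sentence; return_tag=True adds
# part-of-speech/NER tags to each dict (cf. test_cut5).
print(lac.cut(text=["天气预报说今天要下雨"], use_gpu=False, batch_size=1, return_tag=True))
```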
春天/SEASON
花/n 开/v
秋天的风
落 阳
\ No newline at end of file
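Judging from the entries themselves, the interception file above follows LAC's custom-dictionary format: one phrase per line, each word optionally followed by `/tag`, with spaces marking forced word boundaries. A hedged sketch of wiring such a file in through the `user_dict` parameter seen in `__init__` above; the file name `user.dict` is an assumption:

```python
import paddlehub as hub

# "user.dict" is a hypothetical path to a file containing the entries above.
lac = hub.Module(name="lac", user_dict="user.dict")
print(lac.cut(text="今天是个好日子", use_gpu=False, batch_size=1, return_tag=True))
```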