Unverified commit afba7194, authored by jm_12138, committed by GitHub

update lac (#2025)

Co-authored-by: wuzewu <wuzewu@baidu.com>
Co-authored-by: chenjian <chenjian26@baidu.com>
Parent cfd8f7f5
@@ -283,10 +283,10 @@
  Upgraded the custom dictionary feature to support adding part-of-speech tags that are not among LAC's defaults
- * 2.2.1
+ * 2.3.0
+   Removed the fluid APIs
- ```shell
- $ hub install lac==2.2.1
+ $ hub install lac==2.3.0
```
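A minimal usage sketch of the custom-dictionary feature noted in the changelog above. The file name `user.dict` is illustrative, and passing `user_dict` through `hub.Module` assumes the keyword is forwarded to the module's constructor, as the `__init__(self, user_dict=None)` signature later in this diff suggests:

```python
import paddlehub as hub

# Load LAC with a user dictionary; each line of user.dict is a word plus an
# optional tag (e.g. "春天/SEASON") or a multi-word pattern (e.g. "花/n 开/v"),
# matching the test fixture added at the end of this commit.
lac = hub.Module(name="lac", user_dict="user.dict")

# Segment a sentence and return both the tokens and their tags.
print(lac.cut(text="今天是个好日子", use_gpu=False, batch_size=1, return_tag=True))
```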
@@ -6,25 +6,20 @@ from __future__ import print_function
import argparse
import ast
import io
import json
import math
import os
import numpy as np
import paddle
import six
-from lac.custom import Customization
-from lac.processor import load_kv_dict
-from lac.processor import parse_result
-from lac.processor import word_to_ids
+from .custom import Customization
+from .processor import load_kv_dict
+from .processor import parse_result
+from .processor import word_to_ids
from paddle.inference import Config
from paddle.inference import create_predictor
import paddlehub as hub
-from paddlehub.common.logger import logger
-from paddlehub.common.paddle_helper import add_vars_prefix
-from paddlehub.common.utils import sys_stdin_encoding
-from paddlehub.io.parser import txt_parser
+from paddlehub.utils.utils import sys_stdin_encoding
+from paddlehub.utils.parser import txt_parser
from paddlehub.module.module import moduleinfo
from paddlehub.module.module import runnable
from paddlehub.module.module import serving
@@ -38,19 +33,18 @@ class DataFormatError(Exception):
@moduleinfo(
    name="lac",
-   version="2.2.1",
+   version="2.3.0",
    summary=
    "Baidu's open-source lexical analysis tool for Chinese, including word segmentation, part-of-speech tagging & named entity recognition",
    author="baidu-nlp",
    author_email="paddle-dev@baidu.com",
    type="nlp/lexical_analysis")
-class LAC(hub.Module):
-    def _initialize(self, user_dict=None):
+class LAC:
+    def __init__(self, user_dict=None):
        """
        initialize with the necessary elements
        """
-       self.pretrained_model_path = os.path.join(self.directory, "infer_model")
+       self.default_pretrained_model_path = os.path.join(self.directory, "infer_model", "model")
        self.word2id_dict = load_kv_dict(os.path.join(self.directory, "assets/word.dic"), reverse=True, value_func=int)
        self.id2word_dict = load_kv_dict(os.path.join(self.directory, "assets/word.dic"))
        self.label2id_dict = load_kv_dict(os.path.join(self.directory, "assets/tag.dic"), reverse=True, value_func=int)
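The `load_kv_dict` helper lives in `processor.py` and is not part of this diff; the following is only a hedged sketch of a conventional implementation, inferred from how it is called above (tab-separated `id<TAB>word` lines, with `reverse=True` swapping the columns to map word to id):

```python
import io

def load_kv_dict(dict_path, reverse=False, delimiter="\t", value_func=str):
    """Load a key/value dictionary file; reverse=True swaps the two columns."""
    result = {}
    with io.open(dict_path, "r", encoding="utf8") as fin:
        for line in fin:
            terms = line.strip("\n").split(delimiter)
            if len(terms) != 2:
                continue  # skip malformed lines
            key, value = (terms[1], terms[0]) if reverse else (terms[0], terms[1])
            result[key] = value_func(value)
    return result
```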
@@ -72,7 +66,9 @@ class LAC(hub.Module):
"""
predictor config setting
"""
cpu_config = Config(self.pretrained_model_path)
model = self.default_pretrained_model_path+'.pdmodel'
params = self.default_pretrained_model_path+'.pdiparams'
cpu_config = Config(model, params)
cpu_config.disable_glog_info()
cpu_config.disable_gpu()
self.cpu_predictor = create_predictor(cpu_config)
@@ -84,7 +80,7 @@ class LAC(hub.Module):
        except:
            use_gpu = False
        if use_gpu:
-           gpu_config = Config(self.pretrained_model_path)
+           gpu_config = Config(model, params)
            gpu_config.disable_glog_info()
            gpu_config.enable_use_gpu(memory_pool_init_size_mb=500, device_id=0)
            self.gpu_predictor = create_predictor(gpu_config)
......
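The hunks above replace the legacy directory-based model loading with Paddle 2.x's two-file inference format. A self-contained sketch of the same pattern (paths are illustrative):

```python
from paddle.inference import Config, create_predictor

# Since Paddle 2.x, Config takes the serialized program (.pdmodel) and the
# weights (.pdiparams) as two separate files instead of a model directory.
config = Config("infer_model/model.pdmodel", "infer_model/model.pdiparams")
config.disable_glog_info()  # suppress verbose inference logging
config.disable_gpu()        # build a CPU-only predictor
predictor = create_predictor(config)
```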
# -*- coding:utf-8 -*-
import paddle.fluid as fluid


def lex_net(word_dict_len, label_dict_len):
    """
    define the lexical analysis network structure
    """
    word_emb_dim = 128
    grnn_hidden_dim = 128
    emb_lr = 2
    crf_lr = 0.2  # unused in this inference-time graph
    bigru_num = 2
    init_bound = 0.1
    IS_SPARSE = True

    def _bigru_layer(input_feature):
        """
        define the bidirectional gru layer
        """
        pre_gru = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))
        gru = fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden_dim,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))
        pre_gru_r = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))
        gru_r = fluid.layers.dynamic_gru(
            input=pre_gru_r,
            size=grnn_hidden_dim,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))
        # concatenate the forward and backward GRU states
        bi_merge = fluid.layers.concat(input=[gru, gru_r], axis=1)
        return bi_merge

    def _net_conf(word):
        """
        Configure the network
        """
        word_embedding = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_emb_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(
                learning_rate=emb_lr,
                name="word_emb",
                initializer=fluid.initializer.Uniform(low=-init_bound, high=init_bound)))
        input_feature = word_embedding
        for i in range(bigru_num):
            bigru_output = _bigru_layer(input_feature)
            input_feature = bigru_output
        emission = fluid.layers.fc(
            size=label_dict_len,
            input=bigru_output,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(regularization_coeff=1e-4)))
        size = emission.shape[1]
        # transition parameters for the CRF, shared with crf_decoding via the name 'crfw'
        fluid.layers.create_parameter(shape=[size + 2, size], dtype=emission.dtype, name='crfw')
        crf_decode = fluid.layers.crf_decoding(input=emission, param_attr=fluid.ParamAttr(name='crfw'))
        return crf_decode, emission

    word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)  # variable-length word ids (LoD tensor)
    crf_decode, emission = _net_conf(word)
    return crf_decode, word, emission
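This fluid-based network definition is what the "Removed the fluid APIs" changelog entry deletes. For comparison, a hedged sketch of the same BiGRU encoder in the paddle.nn dynamic-graph API: hyperparameters mirror the code above, `nn.GRU` fuses the input projection that fluid expressed as a separate `fc` layer, and the CRF decoding step is omitted because the shipped module now loads a pre-exported inference model instead of rebuilding the graph:

```python
import paddle.nn as nn


class BiGRUEncoder(nn.Layer):
    """Stacked bidirectional GRU over word embeddings (paddle.nn sketch)."""

    def __init__(self, word_dict_len, word_emb_dim=128, grnn_hidden_dim=128, bigru_num=2):
        super().__init__()
        self.embedding = nn.Embedding(word_dict_len, word_emb_dim)
        self.bigru = nn.GRU(word_emb_dim, grnn_hidden_dim,
                            num_layers=bigru_num, direction="bidirect")

    def forward(self, word_ids):
        emb = self.embedding(word_ids)  # [batch, seq_len, word_emb_dim]
        encoded, _ = self.bigru(emb)    # [batch, seq_len, 2 * grnn_hidden_dim]
        return encoded
```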
# -*- coding:utf-8 -*-
import io
import os
import numpy as np
import six
......
import os
import shutil
import unittest

import paddlehub as hub

os.environ['CUDA_VISIBLE_DEVICES'] = '0'


class TestHubModule(unittest.TestCase):

    @classmethod
    def setUpClass(cls) -> None:
        cls.text = "今天是个好日子"
        cls.texts = ["今天是个好日子", "天气预报说今天要下雨", "下一班地铁马上就要到了"]
        cls.module = hub.Module(name="lac")

    @classmethod
    def tearDownClass(cls) -> None:
        shutil.rmtree('inference')

    def test_cut1(self):
        results = self.module.cut(
            text=self.text,
            use_gpu=False,
            batch_size=1,
            return_tag=False
        )
        self.assertEqual(results, ['今天', '是', '个', '好日子'])

    def test_cut2(self):
        results = self.module.cut(
            text=self.texts,
            use_gpu=False,
            batch_size=1,
            return_tag=False
        )
        self.assertEqual(results, [
            {'word': ['今天', '是', '个', '好日子']},
            {'word': ['天气预报', '说', '今天', '要', '下雨']},
            {'word': ['下', '一班', '地铁', '马上', '就要', '到', '了']}
        ])

    def test_cut3(self):
        results = self.module.cut(
            text=self.texts,
            use_gpu=False,
            batch_size=2,
            return_tag=False
        )
        self.assertEqual(results, [
            {'word': ['今天', '是', '个', '好日子']},
            {'word': ['天气预报', '说', '今天', '要', '下雨']},
            {'word': ['下', '一班', '地铁', '马上', '就要', '到', '了']}
        ])

    def test_cut4(self):
        results = self.module.cut(
            text=self.texts,
            use_gpu=True,
            batch_size=2,
            return_tag=False
        )
        self.assertEqual(results, [
            {'word': ['今天', '是', '个', '好日子']},
            {'word': ['天气预报', '说', '今天', '要', '下雨']},
            {'word': ['下', '一班', '地铁', '马上', '就要', '到', '了']}
        ])

    def test_cut5(self):
        results = self.module.cut(
            text=self.texts,
            use_gpu=True,
            batch_size=2,
            return_tag=True
        )
        self.assertEqual(results, [
            {
                'word': ['今天', '是', '个', '好日子'],
                'tag': ['TIME', 'v', 'q', 'n']
            },
            {
                'word': ['天气预报', '说', '今天', '要', '下雨'],
                'tag': ['n', 'v', 'TIME', 'v', 'v']
            },
            {
                'word': ['下', '一班', '地铁', '马上', '就要', '到', '了'],
                'tag': ['f', 'm', 'n', 'd', 'v', 'v', 'xc']
            }
        ])

    def test_save_inference_model(self):
        self.module.save_inference_model('./inference/model')
        self.assertTrue(os.path.exists('./inference/model.pdmodel'))
        self.assertTrue(os.path.exists('./inference/model.pdiparams'))


if __name__ == '__main__':
    unittest.main()
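In the expected results of `test_cut5`, the tags follow LAC's tag set: `n` common noun, `v` verb, `q` quantifier, `m` numeral, `f` locality word, `d` adverb, `xc` other function word, and `TIME` time expression.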
春天/SEASON
花/n 开/v
秋天的风
落 阳
\ No newline at end of file