Commit fd34ef4f authored by: W wuzewu

update lac demo

Parent 19e2a4fa
create_module.py:

import os

import numpy as np
import paddle
import paddle.fluid as fluid
import paddle_hub as hub

import processor
import reader
from network import lex_net


def create_module():
    # dictionaries and pretrained LAC model shipped as resources
    word_dict_path = "resources/word.dic"
    label_dict_path = "resources/tag.dic"
    word_rep_dict_path = "resources/q2b.dic"
    pretrained_model = "resources/model"

    word2id_dict = reader.load_reverse_dict(word_dict_path)
    label2id_dict = reader.load_reverse_dict(label_dict_path)
    word_rep_dict = reader.load_dict(word_rep_dict_path)
    # ids are dense and 0-based, so the table size is max id + 1
    word_dict_len = max(map(int, word2id_dict.values())) + 1
    label_dict_len = max(map(int, label2id_dict.values())) + 1

    avg_cost, crf_decode, word, target = lex_net(word_dict_len, label_dict_len)

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # load the LAC pretrained model, restoring only variables present on disk
    def if_exist(var):
        return os.path.exists(os.path.join(pretrained_model, var.name))

    fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    # assets packaged into the module alongside the program and parameters
    assets = [word_dict_path, label_dict_path, word_rep_dict_path]

    # create a module and save it as hub_module_lac
    sign = hub.create_signature(
        name="lexical_analysis", inputs=[word], outputs=[crf_decode])
    hub.create_module(
        sign_arr=[sign],
        module_dir="hub_module_lac",
        exe=exe,
        module_info="resources/module_info.yml",
        processor=processor.Processor,
        assets=assets)


if __name__ == "__main__":
    create_module()
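create_module derives each table size from the largest id in the corresponding dictionary. A minimal illustration of that arithmetic with a toy token-to-id mapping (not the real word.dic contents):

# Toy reverse dict (token -> id as strings), mirroring what
# reader.load_reverse_dict returns.
word2id_dict = {u"今": "0", u"天": "1", u"OOV": "2"}
word_dict_len = max(map(int, word2id_dict.values())) + 1
assert word_dict_len == 3  # ids 0..2 need an embedding table with 3 rows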
#!/bin/bash
set -o nounset
set -o errexit

script_path=$(cd "$(dirname "$0")"; pwd)
cd "$script_path"

# build hub_module_lac, then smoke-test it through the hub CLI
python create_module.py
python ../../paddle_hub/commands/hub.py run hub_module_lac/ --signature lexical_analysis --config resources/test/test.yml --dataset resources/test/test.csv
network.py:

import math
import os
import sys

import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer


def lex_net(word_dict_len, label_dict_len):
    """
    Define the lexical analysis network structure.
    """
    word_emb_dim = 128
    grnn_hidden_dim = 256
    emb_lr = 5
    crf_lr = 0.2
    bigru_num = 2
    init_bound = 0.1
    IS_SPARSE = True

    def _bigru_layer(input_feature):
        """
        Define the bidirectional GRU layer.
        """
        # dynamic_gru expects its input pre-projected to 3 * hidden_size
        # (one slice per gate), hence the fc layers of size grnn_hidden_dim * 3
        pre_gru = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru = fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden_dim,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        pre_gru_r = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_r = fluid.layers.dynamic_gru(
            input=pre_gru_r,
            size=grnn_hidden_dim,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        # concatenate forward and backward GRU outputs feature-wise
        bi_merge = fluid.layers.concat(input=[gru, gru_r], axis=1)
        return bi_merge

    def _net_conf(word, target):
        """
        Configure the network.
        """
        word_embedding = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_emb_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(
                learning_rate=emb_lr,
                name="word_emb",
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound)))

        # stack bigru_num bidirectional GRU layers
        input_feature = word_embedding
        for i in range(bigru_num):
            bigru_output = _bigru_layer(input_feature)
            input_feature = bigru_output

        # project to per-tag emission scores, then train/decode with a
        # linear-chain CRF sharing the 'crfw' transition parameters
        emission = fluid.layers.fc(
            size=label_dict_len,
            input=bigru_output,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        crf_cost = fluid.layers.linear_chain_crf(
            input=emission,
            label=target,
            param_attr=fluid.ParamAttr(name='crfw', learning_rate=crf_lr))
        crf_decode = fluid.layers.crf_decoding(
            input=emission, param_attr=fluid.ParamAttr(name='crfw'))
        avg_cost = fluid.layers.mean(x=crf_cost)
        return avg_cost, crf_decode

    word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
    target = fluid.layers.data(
        name="target", shape=[1], dtype='int64', lod_level=1)
    avg_cost, crf_decode = _net_conf(word, target)

    return avg_cost, crf_decode, word, target
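For a quick sanity check that the graph builds, lex_net can be called with toy sizes; a minimal sketch, assuming a 1.x PaddlePaddle with the fluid API and this demo's network.py on the import path (word_dict_len=100 is a made-up placeholder, while 57 matches the 57 entries in tag.dic below):

from network import lex_net

# Build the BiGRU-CRF program into fluid's default main program.
avg_cost, crf_decode, word, target = lex_net(word_dict_len=100, label_dict_len=57)
print(word.name, target.name)  # the two feed variables: 'word' and 'target'
print(avg_cost.shape)          # mean CRF cost, a single value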
processor.py:

import io
import os

import numpy as np
import paddle
import paddle.fluid as fluid
import paddle_hub as hub
from paddle_hub import BaseProcessor


class Processor(BaseProcessor):
    def __init__(self, module):
        self.module = module
        assets_path = self.module.helper.assets_path()
        word_dict_path = os.path.join(assets_path, "word.dic")
        label_dict_path = os.path.join(assets_path, "tag.dic")
        word_rep_dict_path = os.path.join(assets_path, "q2b.dic")
        self.id2word_dict = self.load_dict(word_dict_path)
        self.word2id_dict = self.load_reverse_dict(word_dict_path)
        self.id2label_dict = self.load_dict(label_dict_path)
        self.label2id_dict = self.load_reverse_dict(label_dict_path)
        self.q2b_dict = self.load_dict(word_rep_dict_path)

    def load_dict(self, dict_path):
        # first column -> second column
        result_dict = {}
        for line in io.open(dict_path, "r", encoding='utf8'):
            terms = line.strip("\n").split("\t")
            if len(terms) != 2:
                continue
            result_dict[terms[0]] = terms[1]
        return result_dict

    def load_reverse_dict(self, dict_path):
        # second column -> first column
        result_dict = {}
        for line in io.open(dict_path, "r", encoding='utf8'):
            terms = line.strip("\n").split("\t")
            if len(terms) != 2:
                continue
            result_dict[terms[1]] = terms[0]
        return result_dict

    def preprocess(self, sign_name, data_dict):
        result = {'text': []}
        for sentence in data_dict['text']:
            result_i = {}
            result_i['origin'] = sentence
            line = sentence.strip()
            word_idx = []
            for word in line:
                # map control characters to spaces, normalize full-width
                # characters via q2b.dic, then look up the id (OOV fallback)
                if ord(word) < 0x20:
                    word = ' '
                if word in self.q2b_dict:
                    word = self.q2b_dict[word]
                if word in self.word2id_dict:
                    word_idx.append(int(self.word2id_dict[word]))
                else:
                    word_idx.append(int(self.word2id_dict["OOV"]))
            result_i['attach'] = line
            result_i['processed'] = [x for x in word_idx]
            result['text'].append(result_i)
        return result

    def postprocess(self, sign_name, data_out, data_info, **kwargs):
        if sign_name == "lexical_analysis":
            result = []
            crf_decode = data_out[0]
            lod_info = (crf_decode.lod())[0]
            np_data = np.array(crf_decode)
            for index in range(len(lod_info) - 1):
                seg_result = {"word": [], "tag": []}
                word_index = 0
                cur_full_word = ""
                cur_full_tag = ""
                words = data_info['text'][index]['attach']
                for tag_index in range(lod_info[index], lod_info[index + 1]):
                    cur_word = words[word_index]
                    cur_tag = self.id2label_dict[str(np_data[tag_index][0])]
                    # a "-B" tag or "O" starts a new span; flush the previous one
                    if cur_tag.endswith("-B") or cur_tag.endswith("O"):
                        if len(cur_full_word) != 0:
                            seg_result['word'].append(cur_full_word)
                            seg_result['tag'].append(cur_full_tag)
                        cur_full_word = cur_word
                        cur_full_tag = self.get_real_tag(cur_tag)
                    else:
                        cur_full_word += cur_word
                    word_index += 1
                seg_result['word'].append(cur_full_word)
                seg_result['tag'].append(cur_full_tag)
                result.append(seg_result)
            return result

    def get_real_tag(self, origin_tag):
        # strip the "-B"/"-I" suffix; "O" stays as-is
        if origin_tag == "O":
            return "O"
        return origin_tag[0:len(origin_tag) - 2]

    def data_format(self, sign_name):
        if sign_name == "lexical_analysis":
            return {
                "text": {
                    "type": hub.DataType.TEXT,
                    "feed_key": self.module.signatures[sign_name].inputs[0].name
                }
            }
        return None
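Both loaders expect tab-separated, two-column files and silently skip malformed lines. A minimal sketch of the round trip, using a two-line excerpt in the format of tag.dic (the file name tag_excerpt.dic is made up, and load_dict is re-implemented inline so the sketch runs standalone):

import io

# Write a two-line excerpt (id<TAB>tag per line) for illustration.
with io.open("tag_excerpt.dic", "w", encoding="utf8") as f:
    f.write(u"0\ta-B\n1\ta-I\n")

def load_dict(path):
    # Mirrors Processor.load_dict: first column -> second column.
    result = {}
    for line in io.open(path, "r", encoding="utf8"):
        terms = line.strip("\n").split("\t")
        if len(terms) == 2:
            result[terms[0]] = terms[1]
    return result

id2tag = load_dict("tag_excerpt.dic")
assert id2tag == {u"0": u"a-B", u"1": u"a-I"}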
"""
The file_reader converts raw corpus to input.
"""
import os
import __future__
import io
def file_reader(file_dir,
word2id_dict,
label2id_dict,
word_replace_dict,
filename_feature=""):
"""
define the reader to read files in file_dir
"""
word_dict_len = max(map(int, word2id_dict.values())) + 1
label_dict_len = max(map(int, label2id_dict.values())) + 1
def reader():
"""
the data generator
"""
index = 0
for root, dirs, files in os.walk(file_dir):
for filename in files:
if not filename.startswith(filename_feature):
continue
for line in io.open(
os.path.join(root, filename), 'r', encoding='utf8'):
index += 1
bad_line = False
line = line.strip("\n")
if len(line) == 0:
continue
seg_tag = line.rfind("\t")
word_part = line[0:seg_tag]
label_part = line[seg_tag + 1:]
word_idx = []
words = word_part
for word in words:
if ord(word) < 0x20:
word = ' '
if word in word_replace_dict:
word = word_replace_dict[word]
if word in word2id_dict:
word_idx.append(int(word2id_dict[word]))
else:
word_idx.append(int(word2id_dict["OOV"]))
target_idx = []
labels = label_part.strip().split(" ")
for label in labels:
if label in label2id_dict:
target_idx.append(int(label2id_dict[label]))
else:
target_idx.append(int(label2id_dict["O"]))
if len(word_idx) != len(target_idx):
continue
yield word_idx, target_idx
return reader
def test_reader(file_dir,
word2id_dict,
label2id_dict,
word_replace_dict,
filename_feature=""):
"""
define the reader to read test files in file_dir
"""
word_dict_len = max(map(int, word2id_dict.values())) + 1
label_dict_len = max(map(int, label2id_dict.values())) + 1
def reader():
"""
the data generator
"""
index = 0
for root, dirs, files in os.walk(file_dir):
for filename in files:
if not filename.startswith(filename_feature):
continue
for line in io.open(
os.path.join(root, filename), 'r', encoding='utf8'):
index += 1
bad_line = False
line = line.strip("\n")
if len(line) == 0:
continue
seg_tag = line.rfind("\t")
if seg_tag == -1:
seg_tag = len(line)
word_part = line[0:seg_tag]
label_part = line[seg_tag + 1:]
word_idx = []
words = word_part
for word in words:
if ord(word) < 0x20:
word = ' '
if word in word_replace_dict:
word = word_replace_dict[word]
if word in word2id_dict:
word_idx.append(int(word2id_dict[word]))
else:
word_idx.append(int(word2id_dict["OOV"]))
yield word_idx, words
return reader
def load_dict(dict_path):
"""
Load a dict. The first column is the key and the second column is the value.
"""
result_dict = {}
for line in io.open(dict_path, "r", encoding='utf8'):
terms = line.strip("\n").split("\t")
if len(terms) != 2:
continue
result_dict[terms[0]] = terms[1]
return result_dict
def load_reverse_dict(dict_path):
"""
Load a dict. The first column is the value and the second column is the key.
"""
result_dict = {}
for line in io.open(dict_path, "r", encoding='utf8'):
terms = line.strip("\n").split("\t")
if len(terms) != 2:
continue
result_dict[terms[1]] = terms[0]
return result_dict
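file_reader expects one sample per line: the raw text, a tab, then space-separated tags, one per character. A minimal sketch of that layout (the sentence and its tags are illustrative; rsplit is used here, equivalent to the rfind-based split above):

# One training line: characters on the left, one tag per character on the right.
sample = u"今天是个好日子\tt-B t-I v-B q-B a-B n-B n-I"
word_part, label_part = sample.rsplit("\t", 1)
assert len(word_part) == len(label_part.split(" "))  # 7 characters, 7 tags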
#!/bin/bash
set -o nounset
set -o errexit

script_path=$(cd "$(dirname "$0")"; pwd)
cd "$script_path"

# fetch and unpack the pretrained LAC model and dictionaries
wget --no-check-certificate https://paddlehub.bj.bcebos.com/paddle_model/lac.tar.gz
tar xvzf lac.tar.gz
rm lac.tar.gz
resources/module_info.yml:

name: lac
type: nlp/lexical_analysis
author: paddlepaddle
author_email: paddle-dev@baidu.com
version: 1.0.0

resources/q2b.dic (each line maps a full-width or variant character to its half-width equivalent; the first line, which renders as blank, appears to map the full-width space to a plain space):
 
、 ,
。 .
— -
~ ~
‖ |
… .
‘ '
’ '
“ "
” "
〔 (
〕 )
〈 <
〉 >
「 '
」 '
『 "
』 "
〖 [
〗 ]
【 [
】 ]
∶ :
$ $
! !
" "
# #
% %
& &
' '
( (
) )
* *
+ +
, ,
- -
. .
/ /
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
: :
; ;
< <
= =
> >
? ?
@ @
A a
B b
C c
D d
E e
F f
G g
H h
I i
J j
K k
L l
M m
N n
O o
P p
Q q
R r
S s
T t
U u
V v
W w
X x
Y y
Z z
[ [
\ \
] ]
^ ^
_ _
` `
a a
b b
c c
d d
e e
f f
g g
h h
i i
j j
k k
l l
m m
n n
o o
p p
q q
r r
s s
t t
u u
v v
w w
x x
y y
z z
{ {
| |
} }
 ̄ ~
〝 "
〞 "
﹐ ,
﹑ ,
﹒ .
﹔ ;
﹕ :
﹖ ?
﹗ !
﹙ (
﹚ )
﹛ {
﹜ {
﹝ [
﹞ ]
﹟ #
﹠ &
﹡ *
﹢ +
﹣ -
﹤ <
﹥ >
﹦ =
﹨ \
﹩ $
﹪ %
﹫ @
,
A a
B b
C c
D d
E e
F f
G g
H h
I i
J j
K k
L l
M m
N n
O o
P p
Q q
R r
S s
T t
U u
V v
W w
X x
Y y
Z z
resources/tag.dic (one "id<TAB>tag" pair per line; "-B" marks the beginning of a span, "-I" its continuation, and "O" is outside any span):

0 a-B
1 a-I
2 ad-B
3 ad-I
4 an-B
5 an-I
6 c-B
7 c-I
8 d-B
9 d-I
10 f-B
11 f-I
12 m-B
13 m-I
14 n-B
15 n-I
16 nr-B
17 nr-I
18 ns-B
19 ns-I
20 nt-B
21 nt-I
22 nw-B
23 nw-I
24 nz-B
25 nz-I
26 p-B
27 p-I
28 q-B
29 q-I
30 r-B
31 r-I
32 s-B
33 s-I
34 t-B
35 t-I
36 u-B
37 u-I
38 v-B
39 v-I
40 vd-B
41 vd-I
42 vn-B
43 vn-I
44 w-B
45 w-I
46 xc-B
47 xc-I
48 PER-B
49 PER-I
50 LOC-B
51 LOC-I
52 ORG-B
53 ORG-I
54 TIME-B
55 TIME-I
56 O
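The "-B"/"-I" suffixes mark the beginning and continuation of a span; postprocess in processor.py merges characters until the next "-B" or "O" tag and strips the suffix via get_real_tag. A standalone sketch of that merge, using three real ids from the table above (the sentence and its tagging are illustrative):

id2label = {"34": "t-B", "35": "t-I", "38": "v-B"}  # entries from tag.dic
text = u"今天是"
tag_ids = ["34", "35", "38"]

words, tags = [], []
cur_word, cur_tag = "", ""
for ch, tid in zip(text, tag_ids):
    tag = id2label[tid]
    if tag.endswith("-B") or tag == "O":
        if cur_word:  # flush the previous span
            words.append(cur_word)
            tags.append(cur_tag)
        cur_word = ch
        cur_tag = "O" if tag == "O" else tag[:-2]  # strip "-B"/"-I"
    else:
        cur_word += ch
words.append(cur_word)
tags.append(cur_tag)
assert words == [u"今天", u"是"] and tags == ["t", "v"]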
resources/test/test.csv (single-column dataset; the header matches the key declared in test.yml below):

TEXT_INPUT
今天是个好日子
resources/test/test.yml:

input_data:
    text:
        type : TEXT
        key : TEXT_INPUT
(This diff is collapsed.)