Commit fd34ef4f authored by: W wuzewu

update lac demo

Parent 19e2a4fa
create_module.py:

import os

import numpy as np
import paddle
import paddle.fluid as fluid
import paddle_hub as hub

import processor
import reader
from network import lex_net


def create_module():
    # dictionaries and pretrained LAC model shipped as resources
    word_dict_path = "resources/word.dic"
    label_dict_path = "resources/tag.dic"
    word_rep_dict_path = "resources/q2b.dic"
    pretrained_model = "resources/model"

    word2id_dict = reader.load_reverse_dict(word_dict_path)
    label2id_dict = reader.load_reverse_dict(label_dict_path)
    word_rep_dict = reader.load_dict(word_rep_dict_path)
    # ids are dense and 0-based, so the table size is max id + 1
    word_dict_len = max(map(int, word2id_dict.values())) + 1
    label_dict_len = max(map(int, label2id_dict.values())) + 1

    avg_cost, crf_decode, word, target = lex_net(word_dict_len, label_dict_len)

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # load the LAC pretrained model, restoring only variables present on disk
    def if_exist(var):
        return os.path.exists(os.path.join(pretrained_model, var.name))

    fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    # assets packaged into the module alongside the program and parameters
    assets = [word_dict_path, label_dict_path, word_rep_dict_path]

    # create a module and save it as hub_module_lac
    sign = hub.create_signature(
        name="lexical_analysis", inputs=[word], outputs=[crf_decode])
    hub.create_module(
        sign_arr=[sign],
        module_dir="hub_module_lac",
        exe=exe,
        module_info="resources/module_info.yml",
        processor=processor.Processor,
        assets=assets)


if __name__ == "__main__":
    create_module()
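create_module derives each table size from the largest id in the corresponding dictionary. A minimal illustration of that arithmetic with a toy token-to-id mapping (not the real word.dic contents):

# Toy reverse dict (token -> id as strings), mirroring what
# reader.load_reverse_dict returns.
word2id_dict = {u"今": "0", u"天": "1", u"OOV": "2"}
word_dict_len = max(map(int, word2id_dict.values())) + 1
assert word_dict_len == 3  # ids 0..2 need an embedding table with 3 rows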
#!/bin/bash
set -o nounset
set -o errexit

script_path=$(cd "$(dirname "$0")"; pwd)
cd "$script_path"

# build hub_module_lac, then smoke-test it through the hub CLI
python create_module.py
python ../../paddle_hub/commands/hub.py run hub_module_lac/ --signature lexical_analysis --config resources/test/test.yml --dataset resources/test/test.csv
network.py:

import math
import os
import sys

import paddle.fluid as fluid
from paddle.fluid.initializer import NormalInitializer


def lex_net(word_dict_len, label_dict_len):
    """
    Define the lexical analysis network structure.
    """
    word_emb_dim = 128
    grnn_hidden_dim = 256
    emb_lr = 5
    crf_lr = 0.2
    bigru_num = 2
    init_bound = 0.1
    IS_SPARSE = True

    def _bigru_layer(input_feature):
        """
        Define the bidirectional GRU layer.
        """
        # dynamic_gru expects its input pre-projected to 3 * hidden_size
        # (one slice per gate), hence the fc layers of size grnn_hidden_dim * 3
        pre_gru = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru = fluid.layers.dynamic_gru(
            input=pre_gru,
            size=grnn_hidden_dim,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        pre_gru_r = fluid.layers.fc(
            input=input_feature,
            size=grnn_hidden_dim * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))
        gru_r = fluid.layers.dynamic_gru(
            input=pre_gru_r,
            size=grnn_hidden_dim,
            is_reverse=True,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        # concatenate forward and backward GRU outputs feature-wise
        bi_merge = fluid.layers.concat(input=[gru, gru_r], axis=1)
        return bi_merge

    def _net_conf(word, target):
        """
        Configure the network.
        """
        word_embedding = fluid.layers.embedding(
            input=word,
            size=[word_dict_len, word_emb_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(
                learning_rate=emb_lr,
                name="word_emb",
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound)))

        # stack bigru_num bidirectional GRU layers
        input_feature = word_embedding
        for i in range(bigru_num):
            bigru_output = _bigru_layer(input_feature)
            input_feature = bigru_output

        # project to per-tag emission scores, then train/decode with a
        # linear-chain CRF sharing the 'crfw' transition parameters
        emission = fluid.layers.fc(
            size=label_dict_len,
            input=bigru_output,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Uniform(
                    low=-init_bound, high=init_bound),
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=1e-4)))

        crf_cost = fluid.layers.linear_chain_crf(
            input=emission,
            label=target,
            param_attr=fluid.ParamAttr(name='crfw', learning_rate=crf_lr))
        crf_decode = fluid.layers.crf_decoding(
            input=emission, param_attr=fluid.ParamAttr(name='crfw'))
        avg_cost = fluid.layers.mean(x=crf_cost)
        return avg_cost, crf_decode

    word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
    target = fluid.layers.data(
        name="target", shape=[1], dtype='int64', lod_level=1)
    avg_cost, crf_decode = _net_conf(word, target)

    return avg_cost, crf_decode, word, target
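For a quick sanity check that the graph builds, lex_net can be called with toy sizes; a minimal sketch, assuming a 1.x PaddlePaddle with the fluid API and this demo's network.py on the import path (word_dict_len=100 is a made-up placeholder, while 57 matches the 57 entries in tag.dic below):

from network import lex_net

# Build the BiGRU-CRF program into fluid's default main program.
avg_cost, crf_decode, word, target = lex_net(word_dict_len=100, label_dict_len=57)
print(word.name, target.name)  # the two feed variables: 'word' and 'target'
print(avg_cost.shape)          # mean CRF cost, a single value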
processor.py:

import io
import os

import numpy as np
import paddle
import paddle.fluid as fluid
import paddle_hub as hub
from paddle_hub import BaseProcessor


class Processor(BaseProcessor):
    def __init__(self, module):
        self.module = module
        assets_path = self.module.helper.assets_path()
        word_dict_path = os.path.join(assets_path, "word.dic")
        label_dict_path = os.path.join(assets_path, "tag.dic")
        word_rep_dict_path = os.path.join(assets_path, "q2b.dic")
        self.id2word_dict = self.load_dict(word_dict_path)
        self.word2id_dict = self.load_reverse_dict(word_dict_path)
        self.id2label_dict = self.load_dict(label_dict_path)
        self.label2id_dict = self.load_reverse_dict(label_dict_path)
        self.q2b_dict = self.load_dict(word_rep_dict_path)

    def load_dict(self, dict_path):
        # first column -> second column
        result_dict = {}
        for line in io.open(dict_path, "r", encoding='utf8'):
            terms = line.strip("\n").split("\t")
            if len(terms) != 2:
                continue
            result_dict[terms[0]] = terms[1]
        return result_dict

    def load_reverse_dict(self, dict_path):
        # second column -> first column
        result_dict = {}
        for line in io.open(dict_path, "r", encoding='utf8'):
            terms = line.strip("\n").split("\t")
            if len(terms) != 2:
                continue
            result_dict[terms[1]] = terms[0]
        return result_dict

    def preprocess(self, sign_name, data_dict):
        result = {'text': []}
        for sentence in data_dict['text']:
            result_i = {}
            result_i['origin'] = sentence
            line = sentence.strip()
            word_idx = []
            for word in line:
                # map control characters to spaces, normalize full-width
                # characters via q2b.dic, then look up the id (OOV fallback)
                if ord(word) < 0x20:
                    word = ' '
                if word in self.q2b_dict:
                    word = self.q2b_dict[word]
                if word in self.word2id_dict:
                    word_idx.append(int(self.word2id_dict[word]))
                else:
                    word_idx.append(int(self.word2id_dict["OOV"]))
            result_i['attach'] = line
            result_i['processed'] = [x for x in word_idx]
            result['text'].append(result_i)
        return result

    def postprocess(self, sign_name, data_out, data_info, **kwargs):
        if sign_name == "lexical_analysis":
            result = []
            crf_decode = data_out[0]
            lod_info = (crf_decode.lod())[0]
            np_data = np.array(crf_decode)
            for index in range(len(lod_info) - 1):
                seg_result = {"word": [], "tag": []}
                word_index = 0
                cur_full_word = ""
                cur_full_tag = ""
                words = data_info['text'][index]['attach']
                for tag_index in range(lod_info[index], lod_info[index + 1]):
                    cur_word = words[word_index]
                    cur_tag = self.id2label_dict[str(np_data[tag_index][0])]
                    # a "-B" tag or "O" starts a new span; flush the previous one
                    if cur_tag.endswith("-B") or cur_tag.endswith("O"):
                        if len(cur_full_word) != 0:
                            seg_result['word'].append(cur_full_word)
                            seg_result['tag'].append(cur_full_tag)
                        cur_full_word = cur_word
                        cur_full_tag = self.get_real_tag(cur_tag)
                    else:
                        cur_full_word += cur_word
                    word_index += 1
                seg_result['word'].append(cur_full_word)
                seg_result['tag'].append(cur_full_tag)
                result.append(seg_result)
            return result

    def get_real_tag(self, origin_tag):
        # strip the "-B"/"-I" suffix; "O" stays as-is
        if origin_tag == "O":
            return "O"
        return origin_tag[0:len(origin_tag) - 2]

    def data_format(self, sign_name):
        if sign_name == "lexical_analysis":
            return {
                "text": {
                    "type": hub.DataType.TEXT,
                    "feed_key": self.module.signatures[sign_name].inputs[0].name
                }
            }
        return None
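Both loaders expect tab-separated, two-column files and silently skip malformed lines. A minimal sketch of the round trip, using a two-line excerpt in the format of tag.dic (the file name tag_excerpt.dic is made up, and load_dict is re-implemented inline so the sketch runs standalone):

import io

# Write a two-line excerpt (id<TAB>tag per line) for illustration.
with io.open("tag_excerpt.dic", "w", encoding="utf8") as f:
    f.write(u"0\ta-B\n1\ta-I\n")

def load_dict(path):
    # Mirrors Processor.load_dict: first column -> second column.
    result = {}
    for line in io.open(path, "r", encoding="utf8"):
        terms = line.strip("\n").split("\t")
        if len(terms) == 2:
            result[terms[0]] = terms[1]
    return result

id2tag = load_dict("tag_excerpt.dic")
assert id2tag == {u"0": u"a-B", u"1": u"a-I"}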
"""
The file_reader converts raw corpus to input.
"""
import os
import __future__
import io
def file_reader(file_dir,
word2id_dict,
label2id_dict,
word_replace_dict,
filename_feature=""):
"""
define the reader to read files in file_dir
"""
word_dict_len = max(map(int, word2id_dict.values())) + 1
label_dict_len = max(map(int, label2id_dict.values())) + 1
def reader():
"""
the data generator
"""
index = 0
for root, dirs, files in os.walk(file_dir):
for filename in files:
if not filename.startswith(filename_feature):
continue
for line in io.open(
os.path.join(root, filename), 'r', encoding='utf8'):
index += 1
bad_line = False
line = line.strip("\n")
if len(line) == 0:
continue
seg_tag = line.rfind("\t")
word_part = line[0:seg_tag]
label_part = line[seg_tag + 1:]
word_idx = []
words = word_part
for word in words:
if ord(word) < 0x20:
word = ' '
if word in word_replace_dict:
word = word_replace_dict[word]
if word in word2id_dict:
word_idx.append(int(word2id_dict[word]))
else:
word_idx.append(int(word2id_dict["OOV"]))
target_idx = []
labels = label_part.strip().split(" ")
for label in labels:
if label in label2id_dict:
target_idx.append(int(label2id_dict[label]))
else:
target_idx.append(int(label2id_dict["O"]))
if len(word_idx) != len(target_idx):
continue
yield word_idx, target_idx
return reader
def test_reader(file_dir,
word2id_dict,
label2id_dict,
word_replace_dict,
filename_feature=""):
"""
define the reader to read test files in file_dir
"""
word_dict_len = max(map(int, word2id_dict.values())) + 1
label_dict_len = max(map(int, label2id_dict.values())) + 1
def reader():
"""
the data generator
"""
index = 0
for root, dirs, files in os.walk(file_dir):
for filename in files:
if not filename.startswith(filename_feature):
continue
for line in io.open(
os.path.join(root, filename), 'r', encoding='utf8'):
index += 1
bad_line = False
line = line.strip("\n")
if len(line) == 0:
continue
seg_tag = line.rfind("\t")
if seg_tag == -1:
seg_tag = len(line)
word_part = line[0:seg_tag]
label_part = line[seg_tag + 1:]
word_idx = []
words = word_part
for word in words:
if ord(word) < 0x20:
word = ' '
if word in word_replace_dict:
word = word_replace_dict[word]
if word in word2id_dict:
word_idx.append(int(word2id_dict[word]))
else:
word_idx.append(int(word2id_dict["OOV"]))
yield word_idx, words
return reader
def load_dict(dict_path):
"""
Load a dict. The first column is the key and the second column is the value.
"""
result_dict = {}
for line in io.open(dict_path, "r", encoding='utf8'):
terms = line.strip("\n").split("\t")
if len(terms) != 2:
continue
result_dict[terms[0]] = terms[1]
return result_dict
def load_reverse_dict(dict_path):
"""
Load a dict. The first column is the value and the second column is the key.
"""
result_dict = {}
for line in io.open(dict_path, "r", encoding='utf8'):
terms = line.strip("\n").split("\t")
if len(terms) != 2:
continue
result_dict[terms[1]] = terms[0]
return result_dict
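file_reader expects one sample per line: the raw text, a tab, then space-separated tags, one per character. A minimal sketch of that layout (the sentence and its tags are illustrative; rsplit is used here, equivalent to the rfind-based split above):

# One training line: characters on the left, one tag per character on the right.
sample = u"今天是个好日子\tt-B t-I v-B q-B a-B n-B n-I"
word_part, label_part = sample.rsplit("\t", 1)
assert len(word_part) == len(label_part.split(" "))  # 7 characters, 7 tags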
#!/bin/bash
set -o nounset
set -o errexit

script_path=$(cd "$(dirname "$0")"; pwd)
cd "$script_path"

# fetch and unpack the pretrained LAC model and dictionaries
wget --no-check-certificate https://paddlehub.bj.bcebos.com/paddle_model/lac.tar.gz
tar xvzf lac.tar.gz
rm lac.tar.gz
resources/module_info.yml:

name: lac
type: nlp/lexical_analysis
author: paddlepaddle
author_email: paddle-dev@baidu.com
version: 1.0.0

resources/q2b.dic (each line maps a full-width or variant character to its half-width equivalent; the first line, which renders as blank, appears to map the full-width space to a plain space):
 
、 ,
。 .
— -
~ ~
‖ |
… .
‘ '
’ '
“ "
” "
〔 (
〕 )
〈 <
〉 >
「 '
」 '
『 "
』 "
〖 [
〗 ]
【 [
】 ]
∶ :
$ $
! !
" "
# #
% %
& &
' '
( (
) )
* *
+ +
, ,
- -
. .
/ /
0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
: :
; ;
< <
= =
> >
? ?
@ @
A a
B b
C c
D d
E e
F f
G g
H h
I i
J j
K k
L l
M m
N n
O o
P p
Q q
R r
S s
T t
U u
V v
W w
X x
Y y
Z z
[ [
\ \
] ]
^ ^
_ _
` `
a a
b b
c c
d d
e e
f f
g g
h h
i i
j j
k k
l l
m m
n n
o o
p p
q q
r r
s s
t t
u u
v v
w w
x x
y y
z z
{ {
| |
} }
 ̄ ~
〝 "
〞 "
﹐ ,
﹑ ,
﹒ .
﹔ ;
﹕ :
﹖ ?
﹗ !
﹙ (
﹚ )
﹛ {
﹜ {
﹝ [
﹞ ]
﹟ #
﹠ &
﹡ *
﹢ +
﹣ -
﹤ <
﹥ >
﹦ =
﹨ \
﹩ $
﹪ %
﹫ @
,
A a
B b
C c
D d
E e
F f
G g
H h
I i
J j
K k
L l
M m
N n
O o
P p
Q q
R r
S s
T t
U u
V v
W w
X x
Y y
Z z
resources/tag.dic (one "id<TAB>tag" pair per line; "-B" marks the beginning of a span, "-I" its continuation, and "O" is outside any span):

0 a-B
1 a-I
2 ad-B
3 ad-I
4 an-B
5 an-I
6 c-B
7 c-I
8 d-B
9 d-I
10 f-B
11 f-I
12 m-B
13 m-I
14 n-B
15 n-I
16 nr-B
17 nr-I
18 ns-B
19 ns-I
20 nt-B
21 nt-I
22 nw-B
23 nw-I
24 nz-B
25 nz-I
26 p-B
27 p-I
28 q-B
29 q-I
30 r-B
31 r-I
32 s-B
33 s-I
34 t-B
35 t-I
36 u-B
37 u-I
38 v-B
39 v-I
40 vd-B
41 vd-I
42 vn-B
43 vn-I
44 w-B
45 w-I
46 xc-B
47 xc-I
48 PER-B
49 PER-I
50 LOC-B
51 LOC-I
52 ORG-B
53 ORG-I
54 TIME-B
55 TIME-I
56 O
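The "-B"/"-I" suffixes mark the beginning and continuation of a span; postprocess in processor.py merges characters until the next "-B" or "O" tag and strips the suffix via get_real_tag. A standalone sketch of that merge, using three real ids from the table above (the sentence and its tagging are illustrative):

id2label = {"34": "t-B", "35": "t-I", "38": "v-B"}  # entries from tag.dic
text = u"今天是"
tag_ids = ["34", "35", "38"]

words, tags = [], []
cur_word, cur_tag = "", ""
for ch, tid in zip(text, tag_ids):
    tag = id2label[tid]
    if tag.endswith("-B") or tag == "O":
        if cur_word:  # flush the previous span
            words.append(cur_word)
            tags.append(cur_tag)
        cur_word = ch
        cur_tag = "O" if tag == "O" else tag[:-2]  # strip "-B"/"-I"
    else:
        cur_word += ch
words.append(cur_word)
tags.append(cur_tag)
assert words == [u"今天", u"是"] and tags == ["t", "v"]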
resources/test/test.csv (single-column dataset; the header matches the key declared in test.yml below):

TEXT_INPUT
今天是个好日子
resources/test/test.yml:

input_data:
    text:
        type : TEXT
        key : TEXT_INPUT
(This diff is collapsed.)