Unverified commit 77696b06, authored by 片刻小哥哥, committed by GitHub

Merge pull request #604 from jiangzhonglian/master

Move the named-entity recognition (NER) code to its new location
@@ -3,7 +3,6 @@ __pycache__/
 *.py[cod]
 *$py.class
 .vscode
-zh-NER
 
 # C extensions
 *.so
......
# -*- coding: utf-8 -*-
'''
@author: 片刻
@date: 20200901 22:02
'''


class TextNER(object):
    DEBUG = False
    path_root = "/home/apachecn/jiangzhonglian"
    if DEBUG:
        path_root = "/Users/jiangzhonglian/data/nlp/命名实体识别/data"
    path_train = '%s/train_data.data' % path_root
    path_test = '%s/test_data.data' % path_root
    path_config = '%s/config.pkl' % path_root
    path_model = '%s/model.h5' % path_root
    # number of training epochs
    EPOCHS = 3
    # embedding dimension (width of the embedding vectors)
    EMBED_DIM = 128
    # number of units in the BiLSTM
    BiLSTM_UNITS = 128


class Config(object):
    nlp_ner = TextNER()
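For reference, the refactored NER script later in this commit reads its paths and hyperparameters through this wrapper. A minimal usage sketch, assuming the class above is saved as config/setting.py (the import path is taken from the "from config.setting import Config" line further down); with DEBUG = False the printed values follow directly from the class:

# Minimal usage sketch; module path config/setting.py is assumed from the import used below.
from config.setting import Config

cfg = Config.nlp_ner
print(cfg.path_train)   # /home/apachecn/jiangzhonglian/train_data.data
print(cfg.EPOCHS)       # 3
print(cfg.EMBED_DIM)    # 128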
numpy
pandas
scikit-learn
keras
tensorflow
git+https://www.github.com/keras-team/keras-contrib.git
\ No newline at end of file
import tutorials.keras.text_NER as ft


def main():
    ft.main()


if __name__ == "__main__":
    main()
@@ -21,8 +21,8 @@ from keras.models import load_model
 from keras.layers.normalization import BatchNormalization
 from keras.layers import Dropout, Dense, Flatten, Bidirectional, Embedding, GRU, Input, multiply
 """
-# padding: pre pads with 0 at the front, post pads with 0 at the end
-# truncating: if the text exceeds pad_num, pre drops from the front, post drops from the end
+# padding: pre (default) pads with 0 at the front, post pads with 0 at the end
+# truncating: if the text exceeds pad_num, pre (default) drops from the front, post drops from the end
 # x_train = pad_sequences(x, maxlen=pad_num, value=0, padding='post', truncating="post")
 # print("--- ", x_train[0][:20])
 """
......
 import pickle
 import numpy as np
+import pandas as pd
 import platform
 from collections import Counter
 from keras.models import Sequential
 from keras.layers import Embedding, Bidirectional, LSTM
 from keras_contrib.layers import CRF
+"""
+# padding: pre (default) pads with 0 at the front, post pads with 0 at the end
+# truncating: if the text exceeds pad_num, pre (default) drops from the front, post drops from the end
+# x_train = pad_sequences(x, maxlen=pad_num, value=0, padding='post', truncating="post")
+# print("--- ", x_train[0][:20])
+
+Bug note for the CRF layer when using keras_bert / keras_contrib:
+TypeError: Tensors in list passed to 'values' of 'ConcatV2' Op have types [bool, float32] that don't all match
+Fix: change line 516 of crf.py from
+    mask2 = K.cast(K.concatenate([mask, K.zeros_like(mask[:, :1])], axis=1),
+to:
+    mask2 = K.cast(K.concatenate([mask, K.cast(K.zeros_like(mask[:, :1]), mask.dtype)], axis=1),
+"""
 from keras.preprocessing.sequence import pad_sequences
+from config.setting import Config
 
-EMBED_DIM = 200
-BiRNN_UNITS = 200
 
 def load_data():
-    train = _parse_data(open('zh-NER/data/train_data.data', 'rb'))
-    test = _parse_data(open('zh-NER/data/test_data.data', 'rb'))
+    train = _parse_data(Config.nlp_ner.path_train)
+    test = _parse_data(Config.nlp_ner.path_test)
+    print("--- init: data loaded and parsed ---")
 
+    # Counter({'的': 8, '中': 7, '致': 7, '党': 7})
     word_counts = Counter(row[0].lower() for sample in train for row in sample)
     vocab = [w for w, f in iter(word_counts.items()) if f >= 2]
     chunk_tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', "B-ORG", "I-ORG"]
 
-    # save initial config data
-    with open('zh-NER/model/config.pkl', 'wb') as outp:
+    # save the retained vocab and the corresponding chunk_tags
+    with open(Config.nlp_ner.path_config, 'wb') as outp:
         pickle.dump((vocab, chunk_tags), outp)
+    print("--- init: config file saved ---")
 
     train = _process_data(train, vocab, chunk_tags)
     test = _process_data(test, vocab, chunk_tags)
+    print("--- init: data encoded into the format required for training ---")
     return train, test, (vocab, chunk_tags)
 
-def _parse_data(fh):
-    # in windows the newline is '\r\n\r\n' and the space is '\r\n', so on Windows
-    # you have to use the corresponding instructions
-    if platform.system() == 'Windows':
-        split_text = '\r\n'
-    else:
-        split_text = '\n'
-    string = fh.read().decode('utf-8')
-    data = [[row.split() for row in sample.split(split_text)] for
-            sample in
-            string.strip().split(split_text + split_text)]
-    fh.close()
+def _parse_data(filename):
+    """
+    A name with a leading single underscore (_foo) marks a member that should not be accessed directly.
+    Parses the raw data used for model training.
+    :param filename: path to the data file
+    :return: data: the parsed result, e.g.
+        [[['中', 'B-ORG'], ['共', 'I-ORG']], [['中', 'B-ORG'], ['国', 'I-ORG']]]
+    """
+    with open(filename, 'rb') as fn:
+        split_text = '\n'
+        # split into sentences: each row ends with split_text, so a sentence boundary is two consecutive split_text
+        texts = fn.read().decode('utf-8').strip().split(split_text + split_text)
+        # each character sits on its own row; the character and its tag are separated by a space
+        data = [[row.split() for row in text.split(split_text)] for text in texts]
     return data
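As the new docstring states, the raw files hold one character and its tag per line, with a blank line between sentences. A small illustrative check of the refactored _parse_data (the file name sample.data and its contents are made up for this example; it assumes the function above is importable):

# Write a tiny file in the "char TAG" per-line format, then parse it.
with open('sample.data', 'w', encoding='utf-8') as f:
    f.write('中 B-ORG\n共 I-ORG\n\n中 B-ORG\n国 I-ORG\n')
print(_parse_data('sample.data'))
# -> [[['中', 'B-ORG'], ['共', 'I-ORG']], [['中', 'B-ORG'], ['国', 'I-ORG']]]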
 def _process_data(data, vocab, chunk_tags, maxlen=None, onehot=False):
     if maxlen is None:
         maxlen = max(len(s) for s in data)
+    # build an integer index for every character
     word2idx = dict((w, i) for i, w in enumerate(vocab))
-    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]  # set to <unk> (index 1) if not in vocab
+    # characters not in the vocab fall back to <unk> (index 1)
+    x = [[word2idx.get(w[0].lower(), 1) for w in s] for s in data]
     y_chunk = [[chunk_tags.index(w[1]) for w in s] for s in data]
     x = pad_sequences(x, maxlen)  # left padding
     y_chunk = pad_sequences(y_chunk, maxlen, value=-1)
     if onehot:
+        # return a one-hot encoded multi-dimensional array
         y_chunk = np.eye(len(chunk_tags), dtype='float32')[y_chunk]
     else:
+        # np.expand_dims: expands the array's shape (adds an axis)
+        # https://blog.csdn.net/hong615771420/article/details/83448878
         y_chunk = np.expand_dims(y_chunk, 2)
     return x, y_chunk
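The two label layouts produced above differ only in shape: the one-hot branch indexes np.eye by the tag ids, while the default branch just adds a trailing axis so the labels match CRF(sparse_target=True). A minimal NumPy sketch (illustrative values only, outside this diff):

import numpy as np

chunk_tags = ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ORG', 'I-ORG']
y = np.array([[0, 1, 2]])                                   # one padded sentence of tag indices
print(np.eye(len(chunk_tags), dtype='float32')[y].shape)    # (1, 3, 7)  one-hot labels
print(np.expand_dims(y, 2).shape)                           # (1, 3, 1)  sparse labels for the CRF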
@@ -74,38 +92,33 @@ def process_data(data, vocab, maxlen=100):
     return x, length
 
-def create_model(train=True):
-    if train:
-        (train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data()
-    else:
-        with open('model/config.pkl', 'rb') as inp:
-            (vocab, chunk_tags) = pickle.load(inp)
+def create_model(len_vocab, len_chunk_tags):
     model = Sequential()
-    model.add(Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
-    model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)))
-    crf = CRF(len(chunk_tags), sparse_target=True)
+    model.add(Embedding(len_vocab, Config.nlp_ner.EMBED_DIM, mask_zero=True))  # Random embedding
+    model.add(Bidirectional(LSTM(Config.nlp_ner.BiLSTM_UNITS // 2, return_sequences=True)))
+    crf = CRF(len_chunk_tags, sparse_target=True)
     model.add(crf)
     model.summary()
     model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
-    if train:
-        return model, (train_x, train_y), (test_x, test_y)
-    else:
-        return model, (vocab, chunk_tags)
+    return model
 def train():
-    EPOCHS = 10
-    model, (train_x, train_y), (test_x, test_y) = create_model()
+    (train_x, train_y), (test_x, test_y), (vocab, chunk_tags) = load_data()
+    model = create_model(len(vocab), len(chunk_tags))
     # train model
-    model.fit(train_x, train_y, batch_size=16, epochs=EPOCHS, validation_data=[test_x, test_y])
-    model.save('model/crf.h5')
+    model.fit(train_x, train_y, batch_size=16, epochs=Config.nlp_ner.EPOCHS, validation_data=[test_x, test_y])
+    model.save(Config.nlp_ner.path_model)
 def test():
-    model, (vocab, chunk_tags) = create_model(train=False)
+    with open(Config.nlp_ner.path_config, 'rb') as inp:
+        (vocab, chunk_tags) = pickle.load(inp)
+    model = create_model(len(vocab), len(chunk_tags))
     predict_text = '中华人民共和国国务院总理周恩来在外交部长陈毅的陪同下,连续访问了埃塞俄比亚等非洲10国以及阿尔巴尼亚'
-    str, length = process_data(predict_text, vocab)
-    model.load_weights('model/crf.h5')
-    raw = model.predict(str)[0][-length:]
+    text_EMBED, length = process_data(predict_text, vocab)
+    model.load_weights(Config.nlp_ner.path_model)
+    raw = model.predict(text_EMBED)[0][-length:]
     result = [np.argmax(row) for row in raw]
     result_tags = [chunk_tags[i] for i in result]
@@ -122,5 +135,9 @@ def test():
     print(['person:' + per, 'location:' + loc, 'organization:' + org])
 
-if __name__ == "__main__":
+def main():
+    # print("--")
     train()
+
+# if __name__ == "__main__":
+#     train()