Unverified commit 9b70b1f3 authored by SiMing Dai, committed by GitHub

Add topic model (#759)

Parent 3eece7b5
...@@ -48,10 +48,7 @@ class ChineseTextDetectionDB(hub.Module):
         try:
             import shapely, pyclipper
         except:
-            print(
-                'This module requires the shapely, pyclipper tools. The running enviroment does not meet the requirments. Please install the two packages.'
-            )
-            exit()
+            raise ImportError('This module requires the shapely, pyclipper tools. The running environment does not meet the requirements. Please install the two packages.')

     def _set_config(self):
         """
......
...@@ -48,10 +48,7 @@ class ChineseTextDetectionDBServer(hub.Module):
         try:
             import shapely, pyclipper
         except:
-            print(
-                'This module requires the shapely, pyclipper tools. The running enviroment does not meet the requirments. Please install the two packages.'
-            )
-            exit()
+            raise ImportError('This module requires the shapely, pyclipper tools. The running environment does not meet the requirements. Please install the two packages.')

     def _set_config(self):
         """
......
## Model Overview
A topic model is a statistical model that clusters the latent semantic structure of documents in an unsupervised way, and LDA (Latent Dirichlet Allocation) is one such topic model. Based on word co-occurrence statistics, LDA fits a word-document-topic distribution and thereby maps words and texts into a common semantic space. This Module is based on a news-domain dataset built by Baidu.
<p align="center">
<img src="https://bj.bcebos.com/paddlehub/model/nlp/semantic_model/lda.png" hspace='10'/> <br />
</p>
For more details, please refer to the [LDA paper](http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf).
Note: this Module is contributed by the third-party developer DesmonDay.
## LDA Model API
### cal_doc_distance(doc_text1, doc_text2)
Calculates the distance between two input documents, returning both the Jensen-Shannon divergence and the Hellinger distance.
**Parameters**
- doc_text1(str): the first input document.
- doc_text2(str): the second input document.
**Returns**
- jsd(float): the [Jensen-Shannon divergence](https://blog.csdn.net/FrankieHello/article/details/80614422?utm_source=copy) between the two documents.
- hd(float): the [Hellinger distance](http://blog.sina.com.cn/s/blog_85f1ffb70101e65d.html) between the two documents.
### cal_doc_keywords_similarity(document, top_k=10)
Finds the top k keywords of the input document together with their similarity to the original document.
**Parameters**
- document(str): the input document.
- top_k(int): the number of keywords to return.
**Returns**
- results(list): the keywords and their similarity to the original document. Each element of the list is a dict whose key is a keyword and whose value is that keyword's similarity to the original document.
### cal_query_doc_similarity(query, document)
Calculates the similarity between a short document (query) and a long document.
**Parameters**
- query(str): the input short document.
- document(str): the input long document.
**Returns**
- lda_sim(float): the similarity between the short document and the long document.
### infer_doc_topic_distribution(document)
Infers the topic distribution of a document.
**Parameters**
- document(str): the input document.
**Returns**
- results(list): the topic IDs and their probabilities in the document's topic distribution. Each element of the list is a dict with the keys "topic id" and "distribution".
### show_topic_keywords(topic_id, k=10)
Shows the keywords under a given topic; it can be combined with the topic-distribution inference API.
**Parameters**
- topic_id(int): the topic ID.
- k(int): the number of top keywords to return for the topic.
**Returns**
- results(dict): the top k keywords of the given topic and each keyword's probability under that topic.
### Code Example
Below are usage examples for some of the APIs; a further sketch combining the remaining APIs follows the block.
``` python
import paddlehub as hub
lda_news = hub.Module(name="lda_news")
jsd, hd = lda_news.cal_doc_distance(doc_text1="今天的天气如何,适合出去游玩吗", doc_text2="感觉今天的天气不错,可以出去玩一玩了")
# jsd = 0.003109, hd = 0.0573171
lda_sim = lda_news.cal_query_doc_similarity(query='百度搜索引擎', document='百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。百度超过千亿的中文网页数据库,可以瞬间找到相关的搜索结果。')
# LDA similarity = 0.06826
results = lda_news.cal_doc_keywords_similarity('百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。百度超过千亿的中文网页数据库,可以瞬间找到相关的搜索结果。')
# [{'word': '百度', 'similarity': 0.12943492762349573},
# {'word': '信息', 'similarity': 0.06139783578769882},
# {'word': '找到', 'similarity': 0.055296603463188265},
# {'word': '搜索', 'similarity': 0.04270794098349327},
# {'word': '全球', 'similarity': 0.03773627056367886},
# {'word': '超过', 'similarity': 0.03478658388202199},
# {'word': '相关', 'similarity': 0.026295857219683725},
# {'word': '获取', 'similarity': 0.021313585287833996},
# {'word': '中文', 'similarity': 0.020187103312009513},
# {'word': '搜索引擎', 'similarity': 0.007092890537169911}]
```
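The topic-distribution and topic-keyword APIs can be chained as in the minimal sketch below. It only uses the calls documented above; the concrete numbers depend on the released model files, so no output is listed.
``` python
import paddlehub as hub

lda_news = hub.Module(name="lda_news")

# Infer the topic distribution of a document. Each element of the result is a
# dict with the keys "topic id" and "distribution", sorted by probability.
topic_dist = lda_news.infer_doc_topic_distribution("百度是全球最大的中文搜索引擎")

if topic_dist:
    # Inspect the top 10 keywords of the most probable topic.
    top_topic = topic_dist[0]["topic id"]
    keywords = lda_news.show_topic_keywords(topic_id=top_topic, k=10)
    print(keywords)  # dict: keyword -> probability under this topic
```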
## View the Code
https://github.com/baidu/Familia
## Dependencies
paddlepaddle >= 1.8.2
paddlehub >= 1.8.0
## Release History
* 1.0.0
  Initial release
"""
This file defines the basic config information of LDA/SLDA model.
"""
class ModelType:
LDA = 0
SLDA = 1
class ModelConfig:
type = None
num_topics = None
alpha = None
beta = None
word_topic_file = None
vocab_file = None
import numpy as np
class Topic(object):
"""Basic data structure of topic, contains topic id and
corresponding probability.
"""
def __init__(self, tid, prob):
self.tid = tid # topic id
self.prob = prob # topic probability
class Token(object):
"""Basic storage unit of LDA documents, contains word id
and corresponding topic.
"""
def __init__(self, topic, id):
self.topic = topic
self.id = id
class Sentence(object):
"""Basic storage unit of SentenceLDA documents, contains word ids
of the sentence and its corresponding topic id.
"""
def __init__(self, topic, tokens):
self.topic = topic
self.tokens = tokens
class LDADoc(object):
"""The storage structure of LDA model's inference result.
"""
def __init__(self):
self._num_topics = None # Number of topics.
self._num_accum = None # Number of accumulated sample rounds.
self._alpha = None # Document prior parameter.
self._tokens = None # Storage structure of inference results.
self._topic_sum = None # Document's topic sum in one round samples.
self._accum_topic_sum = None # Accumulated results of topic sum.
def init(self, num_topics):
"""Initialize the LDADoc according to num_topics.
"""
self._num_topics = num_topics
self._num_accum = 0
self._tokens = []
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_token(self, token):
"""Add new word to current LDADoc.
Arg:
token: Token class object.
"""
assert token.topic >= 0, "Topic %d out of range!" % token.topic
assert token.topic < self._num_topics, "Topic %d out of range!" % token.topic
self._tokens.append(token)
self._topic_sum[token.topic] += 1
def token(self, index):
return self._tokens[index]
def set_topic(self, index, new_topic):
"""Set the index word's topic to new_topic, and update the corresponding
topic distribution.
"""
assert new_topic >= 0, "Topic %d out of range!" % new_topic
assert new_topic < self._num_topics, "Topic %d out of range!" % new_topic
old_topic = self._tokens[index].topic
if new_topic == old_topic:
return
self._tokens[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def set_alpha(self, alpha):
self._alpha = alpha
def size(self):
"""Return number of words in LDADoc.
"""
return len(self._tokens)
def topic_sum(self, topic_id):
return self._topic_sum[topic_id]
def sparse_topic_dist(self, sort=True):
"""Return the topic distribution of documents in sparse format.
By default, it is sorted according to the topic probability
under the descending order.
"""
topic_dist = []
sum_ = np.sum(self._accum_topic_sum)
if sum_ == 0:
return
for i in range(0, self._num_topics):
if self._accum_topic_sum[i] == 0:
continue
topic_dist.append(Topic(i, self._accum_topic_sum[i] * 1.0 / sum_))
if sort:
def take_elem(topic):
return topic.prob
topic_dist.sort(key=take_elem, reverse=True)
if topic_dist is None:
topic_dist = []
return topic_dist
def dense_topic_dist(self):
"""Return the distribution of document topics in dense format,
taking into account the prior parameter alpha.
"""
dense_dist = np.zeros(self._num_topics)
if self.size() == 0:
return dense_dist
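        # Smoothed estimate: average the accumulated topic counts over the
        # sample rounds, add the alpha prior, and normalize by
        # (doc length + alpha * num_topics).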
dense_dist = (
self._accum_topic_sum * 1.0 / self._num_accum + self._alpha) / (
self.size() + self._alpha * self._num_topics)
return dense_dist
def accumulate_topic_num(self):
self._accum_topic_sum += self._topic_sum
self._num_accum += 1
class SLDADoc(LDADoc):
"""Sentence LDA Document, inherited from LDADoc.
Add add_sentence interface.
"""
def __init__(self):
super().__init__()
self.__sentences = None
def init(self, num_topics):
"""Initialize the SLDADoc according to num_topics.
"""
self._num_topics = num_topics
self.__sentences = []
self._num_accum = 0
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_sentence(self, sent):
"""Add new sentence to current SLDADoc.
Arg:
sent: Sentence class object.
"""
assert sent.topic >= 0, "Topic %d out of range!" % (sent.topic)
assert sent.topic < self._num_topics, "Topic %d out of range!" % (
sent.topic)
self.__sentences.append(sent)
self._topic_sum[sent.topic] += 1
def set_topic(self, index, new_topic):
assert new_topic >= 0, "Topic %d out of range!" % (new_topic)
assert new_topic < self._num_topics, "Topic %d out of range!" % (
new_topic)
old_topic = self.__sentences[index].topic
if new_topic == old_topic:
return
self.__sentences[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def size(self):
"""Return number of sentences in SLDADoc.
"""
return len(self.__sentences)
def sent(self, index):
return self.__sentences[index]
import os
from paddlehub.common.logger import logger
from lda_news.config import ModelConfig
from lda_news.util import load_prototxt, fix_random_seed, rand_k
from lda_news.model import TopicModel
from lda_news.sampler import GibbsSampler, MHSampler
from lda_news.document import LDADoc, SLDADoc, Token, Sentence
from lda_news.vocab import OOV
class SamplerType:
GibbsSampling = 0
MetropolisHastings = 1
class InferenceEngine(object):
def __init__(self,
model_dir,
conf_file,
type=SamplerType.MetropolisHastings):
# Read model configuration.
config = ModelConfig()
conf_file_path = os.path.join(model_dir, conf_file)
load_prototxt(conf_file_path, config)
self.__model = TopicModel(model_dir, config)
self.__config = config
# Initialize the sampler according to the configuration.
if type == SamplerType.GibbsSampling:
self.__sampler = GibbsSampler(self.__model)
elif type == SamplerType.MetropolisHastings:
self.__sampler = MHSampler(self.__model)
def infer(self, input, doc):
"""Perform LDA topic inference on input, and store the results in doc.
Args:
input: a list of strings after tokenization.
doc: LDADoc type or SLDADoc type.
"""
fix_random_seed()
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for token in input:
id_ = self.__model.term_id(token)
if id_ != OOV:
init_topic = rand_k(self.__model.num_topics())
doc.add_token(Token(init_topic, id_))
self.lda_infer(doc, 20, 50)
elif isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for sent in input:
words = []
for token in sent:
id_ = self.__model.term_id(token)
if id_ != OOV:
words.append(id_)
init_topic = rand_k(self.__model.num_topics())
doc.add_sentence(Sentence(init_topic, words))
self.slda_infer(doc, 20, 50)
else:
logger.error("Wrong Doc Type!")
def lda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def slda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def model_type(self):
return self.__model.type()
def get_model(self):
return self.__model
def get_config(self):
return self.__config
import os
from collections import OrderedDict
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_news.vocab import Vocab, WordCount
class TopicModel(object):
"""Storage Structure of Topic model, including vocabulary and word topic count.
"""
def __init__(self, model_dir, config):
"""
Args:
model_dir: the path of model directory
config: ModelConfig class.
"""
self.__word_topic = None # Model parameter of word topic.
self.__vocab = Vocab() # Vocab data structure of model.
self.__num_topics = config.num_topics # Number of topics.
self.__alpha = config.alpha
self.__alpha_sum = self.__alpha * self.__num_topics
self.__beta = config.beta
self.__beta_sum = None
self.__type = config.type # Model type.
self.__topic_sum = np.zeros(
self.__num_topics,
dtype="int64") # Accum sum of each topic in word topic.
self.__topic_words = [[] for _ in range(self.__num_topics)]
word_topic_path = os.path.join(model_dir, config.word_topic_file)
vocab_path = os.path.join(model_dir, config.vocab_file)
self.load_model(word_topic_path, vocab_path)
def term_id(self, term):
return self.__vocab.get_id(term)
def load_model(self, word_topic_path, vocab_path):
# Loading vocabulary
self.__vocab.load(vocab_path)
self.__beta_sum = self.__beta * self.__vocab.size()
        self.__word_topic = [{} for _ in range(self.__vocab.size())]  # List of dicts, one per word.
self.__load_word_dict(word_topic_path)
logger.info(
"Model Info: #num_topics=%d #vocab_size=%d alpha=%f beta=%f" %
(self.num_topics(), self.vocab_size(), self.alpha(), self.beta()))
def word_topic_value(self, word_id, topic_id):
"""Return value of specific word under specific topic in the model.
"""
word_dict = self.__word_topic[word_id]
if topic_id not in word_dict:
return 0
return word_dict[topic_id]
def word_topic(self, term_id):
"""Return the topic distribution of a word.
"""
return self.__word_topic[term_id]
def topic_sum_value(self, topic_id):
return self.__topic_sum[topic_id]
def topic_sum(self):
return self.__topic_sum
def num_topics(self):
return self.__num_topics
def vocab_size(self):
return self.__vocab.size()
def alpha(self):
return self.__alpha
def alpha_sum(self):
return self.__alpha_sum
def beta(self):
return self.__beta
def beta_sum(self):
return self.__beta_sum
def type(self):
return self.__type
def __load_word_dict(self, word_dict_path):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
with open(word_dict_path, 'r') as f:
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
term_id = int(fields[0])
assert term_id < self.vocab_size(), "Term id out of range!"
assert term_id >= 0, "Term id out of range!"
for i in range(1, len(fields)):
topic_count = fields[i].split(":")
assert len(topic_count) == 2, "Topic count format error!"
topic_id = int(topic_count[0])
assert topic_id >= 0, "Topic out of range!"
assert topic_id < self.__num_topics, "Topic out of range!"
count = int(topic_count[1])
assert count >= 0, "Topic count error!"
self.__word_topic[term_id][topic_id] = count
self.__topic_sum[topic_id] += count
self.__topic_words[topic_id].append(
WordCount(term_id, count))
new_dict = OrderedDict()
for key in sorted(self.__word_topic[term_id]):
new_dict[key] = self.__word_topic[term_id][key]
self.__word_topic[term_id] = new_dict
def get_vocab(self):
return self.__vocab.vocabulary()
def topic_words(self):
return self.__topic_words
import os
import paddlehub as hub
from paddlehub.module.module import moduleinfo
from paddlehub.common.logger import logger
from lda_news.inference_engine import InferenceEngine
from lda_news.document import LDADoc, SLDADoc
from lda_news.semantic_matching import SemanticMatching, WordAndDis
from lda_news.tokenizer import LACTokenizer, SimpleTokenizer
from lda_news.config import ModelType
from lda_news.vocab import Vocab, WordCount
@moduleinfo(
name="lda_news",
version="1.0.0",
summary=
"This is a PaddleHub Module for LDA topic model in news dataset, where we can calculate doc distance, calculate the similarity between query and document, etc",
author="DesmonDay",
author_email="",
type="nlp/semantic_model")
class TopicModel(hub.Module):
def _initialize(self):
"""
Initialize with the necessary elements.
"""
self.model_dir = os.path.join(self.directory, 'news')
self.conf_file = 'lda.conf'
self.__engine = InferenceEngine(self.model_dir, self.conf_file)
self.vocab_path = os.path.join(self.model_dir, 'vocab_info.txt')
lac = hub.Module(name="lac")
# self.__tokenizer = SimpleTokenizer(self.vocab_path)
self.__tokenizer = LACTokenizer(self.vocab_path, lac)
self.vocabulary = self.__engine.get_model().get_vocab()
self.config = self.__engine.get_config()
self.topic_words = self.__engine.get_model().topic_words()
self.topic_sum_table = self.__engine.get_model().topic_sum()
def take_elem(word_count):
return word_count.count
for i in range(self.config.num_topics):
self.topic_words[i].sort(key=take_elem, reverse=True)
logger.info("Finish initialization.")
def cal_doc_distance(self, doc_text1, doc_text2):
"""
This interface calculates the distance between documents.
Args:
doc_text1(str): the input document text 1.
doc_text2(str): the input document text 2.
Returns:
jsd(float): Jensen-Shannon Divergence distance of two documents.
hd(float): Hellinger Distance of two documents.
"""
doc1_tokens = self.__tokenizer.tokenize(doc_text1)
doc2_tokens = self.__tokenizer.tokenize(doc_text2)
# Document topic inference.
doc1, doc2 = LDADoc(), LDADoc()
self.__engine.infer(doc1_tokens, doc1)
self.__engine.infer(doc2_tokens, doc2)
# To calculate jsd, we need dense document topic distribution.
dense_dict1 = doc1.dense_topic_dist()
dense_dict2 = doc2.dense_topic_dist()
# Calculate the distance between distributions.
# The smaller the distance, the higher the document semantic similarity.
sm = SemanticMatching()
jsd = sm.jensen_shannon_divergence(dense_dict1, dense_dict2)
hd = sm.hellinger_distance(dense_dict1, dense_dict2)
return jsd, hd
def cal_doc_keywords_similarity(self, document, top_k=10):
"""
        This interface can be used to find the top k keywords of a document.
Args:
document(str): the input document text.
top_k(int): top k keywords of this document.
Returns:
results(list): contains top_k keywords and their corresponding
similarity compared to document.
"""
d_tokens = self.__tokenizer.tokenize(document)
# Do topic inference on documents to obtain topic distribution.
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
items = []
words = set()
for word in d_tokens:
if word in words:
continue
words.add(word)
wd = WordAndDis()
wd.word = word
sm = SemanticMatching()
wd.distance = sm.likelihood_based_similarity(
terms=[word],
doc_topic_dist=doc_topic_dist,
model=self.__engine.get_model())
items.append(wd)
def take_elem(word_dis):
return word_dis.distance
items.sort(key=take_elem, reverse=True)
results = []
size = len(items)
for i in range(top_k):
if i >= size:
break
results.append({
"word": items[i].word,
"similarity": items[i].distance
})
return results
def cal_query_doc_similarity(self, query, document):
"""
This interface calculates the similarity between query and document.
Args:
query(str): the input query text.
document(str): the input document text.
Returns:
lda_sim(float): likelihood based similarity between query and document
based on LDA.
"""
q_tokens = self.__tokenizer.tokenize(query)
d_tokens = self.__tokenizer.tokenize(document)
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
sm = SemanticMatching()
lda_sim = sm.likelihood_based_similarity(q_tokens, doc_topic_dist,
self.__engine.get_model())
return lda_sim
def infer_doc_topic_distribution(self, document):
"""
This interface infers the topic distribution of document.
Args:
document(str): the input document text.
Returns:
results(list): returns the topic distribution of document.
"""
tokens = self.__tokenizer.tokenize(document)
if tokens == []:
return []
results = []
doc = LDADoc()
self.__engine.infer(tokens, doc)
topics = doc.sparse_topic_dist()
for topic in topics:
results.append({"topic id": topic.tid, "distribution": topic.prob})
return results
def show_topic_keywords(self, topic_id, k=10):
"""
        This interface returns the first k keywords under a specific topic.
Args:
topic_id(int): topic information we want to know.
k(int): top k keywords.
Returns:
results(dict): contains specific topic's keywords and corresponding
probability.
"""
EPS = 1e-8
results = {}
if 0 <= topic_id < self.config.num_topics:
k = min(k, len(self.topic_words[topic_id]))
for i in range(k):
prob = self.topic_words[topic_id][i].count / \
(self.topic_sum_table[topic_id] + EPS)
results[self.vocabulary[self.topic_words[topic_id]
[i].word_id]] = prob
return results
else:
logger.error("%d is out of range!" % topic_id)
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_news.document import LDADoc, SLDADoc, Token, Sentence
from lda_news.vose_alias import VoseAlias
from lda_news.util import rand, rand_k
class Sampler(object):
def __init__(self):
pass
def sample_doc(self, doc):
"""Sample LDA or SLDA topics for documents.
"""
raise NotImplementedError
class MHSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
self.__topic_indexes = None
self.__alias_tables = None
self.__prob_sum = None
self.__beta_alias = VoseAlias()
self.__beta_prior_sum = None
self.__mh_steps = 2
self.__construct_alias_table()
def __construct_alias_table(self):
"""Construct alias table for all words.
"""
logger.info("Construct alias table for alias sampling method.")
vocab_size = self.__model.vocab_size()
self.__topic_indexes = [[] for _ in range(vocab_size)]
self.__alias_tables = [VoseAlias() for _ in range(vocab_size)]
self.__prob_sum = np.zeros(vocab_size)
# Construct each word's alias table (prior is not included).
for i in tqdm(range(vocab_size)):
dist = []
prob_sum = 0
for key in self.__model.word_topic(i):
topic_id = key
word_topic_count = self.__model.word_topic(i)[key]
topic_sum = self.__model.topic_sum_value(topic_id)
self.__topic_indexes[i].append(topic_id)
q = word_topic_count / (topic_sum + self.__model.beta_sum())
dist.append(q)
prob_sum += q
self.__prob_sum[i] = prob_sum
if len(dist) > 0:
                dist = np.array(dist, dtype=np.float64)
self.__alias_tables[i].initialize(dist)
# Build prior parameter beta's alias table.
beta_dist = self.__model.beta() / (
self.__model.topic_sum() + self.__model.beta_sum())
self.__beta_prior_sum = np.sum(beta_dist)
self.__beta_alias.initialize(beta_dist)
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
new_topic = token.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, token)
new_topic = self.__word_proposal(doc, token, doc_proposed_topic)
return new_topic
def __sample_sentence(self, doc, sent):
new_topic = sent.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, sent)
new_topic = self.__word_proposal(doc, sent, doc_proposed_topic)
return new_topic
def __doc_proposal(self, doc, token):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
if dart < doc.size():
token_index = int(dart)
new_topic = doc.token(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
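                # Branchless accept/reject: mask is -1 (all bits set) when the
                # proposal is accepted and 0 otherwise, so the bitwise select
                # below returns new_topic on accept and old_topic on reject.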
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
if dart < doc.size():
token_index = int(dart)
new_topic = doc.sent(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __word_proposal(self, doc, token, old_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
new_topic = self.__propose(token.id)
if new_topic != old_topic:
proposal_old = self.__word_proposal_distribution(
token.id, old_topic)
proposal_new = self.__word_proposal_distribution(
token.id, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
new_topic = old_topic
for word_id in sent.tokens:
new_topic = self.__propose(word_id)
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__word_proposal_distribution(
word_id, old_topic)
proposal_new = self.__word_proposal_distribution(
word_id, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
new_topic = (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __proportional_function(self, doc, token, new_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dt_alpha = doc.topic_sum(new_topic) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(
token.id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
return dt_alpha * wt_beta / t_sum_beta_sum
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
result = doc.topic_sum(new_topic) + self.__model.alpha()
if new_topic == old_topic:
result -= 1
for word_id in sent.tokens:
wt_beta = self.__model.word_topic_value(
word_id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
wt_beta -= 1
t_sum_beta_sum -= 1
result *= wt_beta / t_sum_beta_sum
return result
else:
logger.error("Wrong input argument type!")
def __word_proposal_distribution(self, word_id, topic):
wt_beta = self.__model.word_topic_value(word_id,
topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
topic) + self.__model.beta_sum()
return wt_beta / t_sum_beta_sum
def __doc_proposal_distribution(self, doc, topic):
return doc.topic_sum(topic) + self.__model.alpha()
def __propose(self, word_id):
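        # Choose between the word's own alias table and the beta-prior alias
        # table in proportion to their probability mass, then draw a topic.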
dart = rand() * (self.__prob_sum[word_id] + self.__beta_prior_sum)
if dart < self.__prob_sum[word_id]:
idx = self.__alias_tables[word_id].generate()
topic = self.__topic_indexes[word_id][idx]
else:
topic = self.__beta_alias.generate()
return topic
class GibbsSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
old_topic = token.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
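        # Build the unnormalized full conditional probability of each topic and
        # its running cumulative sum for the inverse-CDF sampling below.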
for i in range(num_topics):
dt_alpha = doc.topic_sum(i) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(token.id,
i) + self.__model.beta()
            t_sum_beta_sum = self.__model.topic_sum_value(i) + self.__model.beta_sum()
if i == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
prob[i] = dt_alpha * wt_beta / t_sum_beta_sum
sum_ += prob[i]
accum_prob[i] = prob[i] if i == 0 else accum_prob[i - 1] + prob[i]
dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for i in range(1, num_topics):
if accum_prob[i - 1] < dart <= accum_prob[i]:
return i
return num_topics - 1
def __sample_sentence(self, doc, sent):
old_topic = sent.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
for t in range(num_topics):
dt_alpha = doc.topic_sum(t) + self.__model.alpha()
            t_sum_beta_sum = self.__model.topic_sum_value(t) + self.__model.beta_sum()
if t == old_topic:
if dt_alpha > 1:
dt_alpha -= 1
if t_sum_beta_sum > 1:
t_sum_beta_sum -= 1
prob[t] = dt_alpha
for i in range(len(sent.tokens)):
w = sent.tokens[i]
wt_beta = self.__model.word_topic_value(
w, t) + self.__model.beta()
if t == old_topic and wt_beta > 1:
wt_beta -= 1
                # Note: if the sentence is very long, multiplying many small
                # factors makes the probability underflow and lose precision.
prob[t] *= wt_beta / t_sum_beta_sum
sum_ += prob[t]
accum_prob[t] = prob[t] if t == 0 else accum_prob[t - 1] + prob[t]
        dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for t in range(1, num_topics):
if accum_prob[t - 1] < dart <= accum_prob[t]:
return t
return num_topics - 1
import numpy as np
from lda_news.vocab import OOV
EPS = 1e-06
class WordAndDis(object):
def __init__(self):
self.word = None
self.distance = None
class SemanticMatching(object):
def __init__(self):
pass
def l2_norm(self, vec):
"""Calculate the length of vector.
"""
result = np.sqrt(np.sum(vec**2))
return result
def cosine_similarity(self, vec1, vec2):
"""Calculate the cosine similarity between two vectors.
"""
norm1 = self.l2_norm(vec1)
norm2 = self.l2_norm(vec2)
result = np.sum(vec1 * vec2) / norm1 / norm2
return result
def likelihood_based_similarity(self, terms, doc_topic_dist, model):
"""Calculate the likelihood based similarity.
Args:
terms: list of strings
doc_topic_dist: list of Topic class
model: TopicModel class
"""
num_of_term_in_vocab = 0
result = 0
for i in range(len(terms)):
term_id = model.term_id(terms[i])
if term_id == OOV:
continue
num_of_term_in_vocab += 1
for j in range(len(doc_topic_dist)):
topic_id = doc_topic_dist[j].tid
prob = doc_topic_dist[j].prob
result += model.word_topic_value(term_id, topic_id) * 1.0 / \
model.topic_sum_value(topic_id) * prob
if num_of_term_in_vocab == 0:
return result
return result / num_of_term_in_vocab
def kullback_leibler_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist2[dist2 < EPS] = EPS
result = np.sum(dist1 * np.log(dist1 / dist2))
return result
def jensen_shannon_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist1[dist1 < EPS] = EPS
dist2[dist2 < EPS] = EPS
mean = (dist1 + dist2) * 0.5
jsd = self.kullback_leibler_divergence(dist1, mean) * 0.5 + \
self.kullback_leibler_divergence(dist2, mean) * 0.5
return jsd
def hellinger_distance(self, dist1, dist2):
assert dist1.shape == dist2.shape
result = np.sum((np.sqrt(dist1) - np.sqrt(dist2))**2)
result = np.sqrt(result) * 0.7071067812
return result
"""This file defines tokenizer class object.
"""
class Tokenizer(object):
"""Base tokenizer class.
"""
def __init__(self):
pass
def tokenize(self, text):
raise NotImplementedError
class SimpleTokenizer(Tokenizer):
"""Simple version FMM(Forward Maximun Matching) word tokenizer. This tokenizer can only
be used in topic model demo, but not in real business application scenarios.
Notes: This tokenizer can only recognize the words in the corresponding vocab file.
"""
def __init__(self, vocab_path):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__load_vocab(vocab_path)
def tokenize(self, text):
"""Tokenize the input string `text`, and return the tokenize result.
"""
text_len = len(text)
result = []
i = 0
while i < text_len:
word = found_word = ""
# Deal with English characters.
if self.__is_eng_char(text[i]):
for j in range(i, text_len + 1):
if j < text_len and self.__is_eng_char(text[j]):
word += self.__tolower(text[j])
else:
# Forward matching by character granularity.
if word in self.__vocab:
result.append(word)
i = j - 1
break
else:
for j in range(i, min(i + self.__max_word_len, text_len)):
word += text[j]
if word in self.__vocab:
found_word = word
if len(found_word) > 0:
result.append(found_word)
i += len(found_word) - 1
i += 1
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def __is_eng_char(self, c):
"""Check whether char c is an English character.
"""
return (c >= 'A' and c <= 'Z') or (c >= 'a' and c <= 'z')
def __tolower(self, c):
"""Return the lowercase character of the corresponding character, or return
the original character if there is no corresponding lowercase character.
"""
return c.lower()
class LACTokenizer(Tokenizer):
def __init__(self, vocab_path, lac):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__lac = lac
self.__load_vocab(vocab_path)
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def tokenize(self, text):
results = self.__lac.lexical_analysis(
texts=[text], use_gpu=True, batch_size=1, return_tag=True)
# Change English words to lower case.
# And just preserve the word in vocab.
words = results[0]["word"]
result = []
for word in words:
word = word.lower()
if word in self.__vocab:
result.append(word)
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
import time
import yaml
import numpy as np
from paddlehub.common.logger import logger
from lda_news.config import ModelType
def load_prototxt(config_file, config):
"""
Args:
config_file: model configuration file.
config: ModelConfig class
"""
logger.info("Loading LDA config.")
with open(config_file, 'r') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
if yaml_dict["type"] == "LDA":
config.type = ModelType.LDA
else:
config.type = ModelType.SLDA
config.num_topics = yaml_dict["num_topics"]
config.alpha = yaml_dict["alpha"]
config.beta = yaml_dict["beta"]
config.word_topic_file = yaml_dict["word_topic_file"]
config.vocab_file = yaml_dict["vocab_file"]
def fix_random_seed(seed=2147483647):
np.random.seed(seed)
def rand(min_=0, max_=1):
return np.random.uniform(low=min_, high=max_)
def rand_k(k):
"""Returns an integer float number between [0, k - 1].
"""
return int(rand() * k)
def timeit(f):
"""Return time cost of function f.
"""
def timed(*args, **kwargs):
start_time = time.time()
result = f(*args, **kwargs)
end_time = time.time()
print(" [-] %s : %2.5f sec" % (f.__name__, end_time - start_time))
return result
return timed
from paddlehub.common.logger import logger
OOV = -1
class WordCount(object):
def __init__(self, word_id, count):
self.word_id = word_id
self.count = count
class Vocab(object):
def __init__(self):
self.__term2id = {}
self.__id2term = {}
def get_id(self, word):
if word not in self.__term2id:
return OOV
return self.__term2id[word]
def load(self, vocab_file):
self.__term2id = {}
self.__id2term = {}
with open(vocab_file, 'r') as fin:
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(
fields) == 5, "Vocabulary file [%s] format error!" % (
vocab_file)
term = fields[1]
id_ = int(fields[2])
if term in self.__term2id:
logger.error("Duplicate word [%s] in vocab file!" % (term))
continue
self.__term2id[term] = id_
self.__id2term[id_] = term
def size(self):
return len(self.__term2id)
def vocabulary(self):
return self.__id2term
import numpy as np
from lda_news.util import rand, rand_k
class VoseAlias(object):
"""Vose's Alias Method.
"""
def __init__(self):
self.__alias = None
self.__prob = None # np.array
def initialize(self, distribution):
"""Initialize the alias table according to the input distribution
Arg:
distribution: the input distribution.
"""
size = distribution.shape[0]
self.__alias = np.zeros(size, dtype=np.int64)
self.__prob = np.zeros(size)
sum_ = np.sum(distribution)
p = distribution / sum_ * size # Scale up probability.
large, small = [], []
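        # Partition the scaled probabilities: entries below 1.0 go to small,
        # the rest to large (standard Vose alias construction).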
for i, p_ in enumerate(p):
if p_ < 1.0:
small.append(i)
else:
large.append(i)
while large and small:
l = small[0]
g = large[0]
small.pop(0)
large.pop(0)
self.__prob[l] = p[l]
self.__alias[l] = g
p[g] = p[g] + p[l] - 1 # A more numerically stable option.
if p[g] < 1.0:
small.append(g)
else:
large.append(g)
while large:
g = large[0]
large.pop(0)
self.__prob[g] = 1.0
while small:
l = small[0]
small.pop(0)
self.__prob[l] = 1.0
def generate(self):
"""Generate samples from given distribution.
"""
dart1 = rand_k(self.size())
        dart2 = rand()
        # Keep dart1 with probability prob[dart1], otherwise jump to its alias.
        return dart1 if dart2 < self.__prob[dart1] else self.__alias[dart1]
def size(self):
return self.__prob.shape[0]
## Model Overview
A topic model is a statistical model that clusters the latent semantic structure of documents in an unsupervised way, and LDA (Latent Dirichlet Allocation) is one such topic model. Based on word co-occurrence statistics, LDA fits a word-document-topic distribution and thereby maps words and texts into a common semantic space. This Module is based on a novel-domain dataset built by Baidu.
<p align="center">
<img src="https://bj.bcebos.com/paddlehub/model/nlp/semantic_model/lda.png" hspace='10'/> <br />
</p>
For more details, please refer to the [LDA paper](http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf).
Note: this Module is contributed by the third-party developer DesmonDay.
## LDA Model API
### cal_doc_distance(doc_text1, doc_text2)
Calculates the distance between two input documents, returning both the Jensen-Shannon divergence and the Hellinger distance.
**Parameters**
- doc_text1(str): the first input document.
- doc_text2(str): the second input document.
**Returns**
- jsd(float): the [Jensen-Shannon divergence](https://blog.csdn.net/FrankieHello/article/details/80614422?utm_source=copy) between the two documents.
- hd(float): the [Hellinger distance](http://blog.sina.com.cn/s/blog_85f1ffb70101e65d.html) between the two documents.
### cal_doc_keywords_similarity(document, top_k=10)
Finds the top k keywords of the input document together with their similarity to the original document.
**Parameters**
- document(str): the input document.
- top_k(int): the number of keywords to return.
**Returns**
- results(list): the keywords and their similarity to the original document. Each element of the list is a dict whose key is a keyword and whose value is that keyword's similarity to the original document.
### cal_query_doc_similarity(query, document)
Calculates the similarity between a short document (query) and a long document.
**Parameters**
- query(str): the input short document.
- document(str): the input long document.
**Returns**
- lda_sim(float): the similarity between the short document and the long document.
### infer_doc_topic_distribution(document)
Infers the topic distribution of a document.
**Parameters**
- document(str): the input document.
**Returns**
- results(list): the topic IDs and their probabilities in the document's topic distribution. Each element of the list is a dict with the keys "topic id" and "distribution".
### show_topic_keywords(topic_id, k=10)
Shows the keywords under a given topic; it can be combined with the topic-distribution inference API.
**Parameters**
- topic_id(int): the topic ID.
- k(int): the number of top keywords to return for the topic.
**Returns**
- results(dict): the top k keywords of the given topic and each keyword's probability under that topic.
### Code Example
Below are usage examples for some of the APIs; a further sketch combining the remaining APIs follows the block.
``` python
import paddlehub as hub
lda_novel = hub.Module(name="lda_novel")
jsd, hd = lda_novel.cal_doc_distance(doc_text1="老人幸福地看着自己的儿子,露出了欣慰的笑容。", doc_text2="老奶奶看着自己的儿子,幸福地笑了。")
# jsd = 0.01292, hd = 0.11893
lda_sim = lda_novel.cal_query_doc_similarity(query='亲孙女', document='老人激动地打量着面前的女孩,似乎找到了自己的亲孙女一般,双手止不住地颤抖着。')
# LDA similarity = 0.0
```
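As a minimal sketch of the remaining APIs, the topic distribution of a document can be inferred and the keywords of its most probable topic inspected; the concrete values depend on the released model files, so no output is listed.
``` python
import paddlehub as hub

lda_novel = hub.Module(name="lda_novel")

# Infer the topic distribution of a document. Each element of the result is a
# dict with the keys "topic id" and "distribution", sorted by probability.
topic_dist = lda_novel.infer_doc_topic_distribution("老人幸福地看着自己的儿子,露出了欣慰的笑容。")

if topic_dist:
    # Inspect the top 10 keywords of the most probable topic.
    keywords = lda_novel.show_topic_keywords(topic_id=topic_dist[0]["topic id"], k=10)
    print(keywords)  # dict: keyword -> probability under this topic
```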
## View the Code
https://github.com/baidu/Familia
## Dependencies
paddlepaddle >= 1.8.2
paddlehub >= 1.8.0
## Release History
* 1.0.0
  Initial release
"""
This file defines the basic config information of LDA/SLDA model.
"""
class ModelType:
LDA = 0
SLDA = 1
class ModelConfig:
type = None
num_topics = None
alpha = None
beta = None
word_topic_file = None
vocab_file = None
import numpy as np
class Topic(object):
"""Basic data structure of topic, contains topic id and
corresponding probability.
"""
def __init__(self, tid, prob):
self.tid = tid # topic id
self.prob = prob # topic probability
class Token(object):
"""Basic storage unit of LDA documents, contains word id
and corresponding topic.
"""
def __init__(self, topic, id):
self.topic = topic
self.id = id
class Sentence(object):
"""Basic storage unit of SentenceLDA documents, contains word ids
of the sentence and its corresponding topic id.
"""
def __init__(self, topic, tokens):
self.topic = topic
self.tokens = tokens
class LDADoc(object):
"""The storage structure of LDA model's inference result.
"""
def __init__(self):
self._num_topics = None # Number of topics.
self._num_accum = None # Number of accumulated sample rounds.
self._alpha = None # Document prior parameter.
self._tokens = None # Storage structure of inference results.
self._topic_sum = None # Document's topic sum in one round samples.
self._accum_topic_sum = None # Accumulated results of topic sum.
def init(self, num_topics):
"""Initialize the LDADoc according to num_topics.
"""
self._num_topics = num_topics
self._num_accum = 0
self._tokens = []
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_token(self, token):
"""Add new word to current LDADoc.
Arg:
token: Token class object.
"""
assert token.topic >= 0, "Topic %d out of range!" % token.topic
assert token.topic < self._num_topics, "Topic %d out of range!" % token.topic
self._tokens.append(token)
self._topic_sum[token.topic] += 1
def token(self, index):
return self._tokens[index]
def set_topic(self, index, new_topic):
"""Set the index word's topic to new_topic, and update the corresponding
topic distribution.
"""
assert new_topic >= 0, "Topic %d out of range!" % new_topic
assert new_topic < self._num_topics, "Topic %d out of range!" % new_topic
old_topic = self._tokens[index].topic
if new_topic == old_topic:
return
self._tokens[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def set_alpha(self, alpha):
self._alpha = alpha
def size(self):
"""Return number of words in LDADoc.
"""
return len(self._tokens)
def topic_sum(self, topic_id):
return self._topic_sum[topic_id]
def sparse_topic_dist(self, sort=True):
"""Return the topic distribution of documents in sparse format.
By default, it is sorted according to the topic probability
under the descending order.
"""
topic_dist = []
sum_ = np.sum(self._accum_topic_sum)
if sum_ == 0:
return
for i in range(0, self._num_topics):
if self._accum_topic_sum[i] == 0:
continue
topic_dist.append(Topic(i, self._accum_topic_sum[i] * 1.0 / sum_))
if sort:
def take_elem(topic):
return topic.prob
topic_dist.sort(key=take_elem, reverse=True)
if topic_dist is None:
topic_dist = []
return topic_dist
def dense_topic_dist(self):
"""Return the distribution of document topics in dense format,
taking into account the prior parameter alpha.
"""
dense_dist = np.zeros(self._num_topics)
if self.size() == 0:
return dense_dist
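        # Smoothed estimate: average the accumulated topic counts over the
        # sample rounds, add the alpha prior, and normalize by
        # (doc length + alpha * num_topics).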
dense_dist = (
self._accum_topic_sum * 1.0 / self._num_accum + self._alpha) / (
self.size() + self._alpha * self._num_topics)
return dense_dist
def accumulate_topic_num(self):
self._accum_topic_sum += self._topic_sum
self._num_accum += 1
class SLDADoc(LDADoc):
"""Sentence LDA Document, inherited from LDADoc.
Add add_sentence interface.
"""
def __init__(self):
super().__init__()
self.__sentences = None
def init(self, num_topics):
"""Initialize the SLDADoc according to num_topics.
"""
self._num_topics = num_topics
self.__sentences = []
self._num_accum = 0
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_sentence(self, sent):
"""Add new sentence to current SLDADoc.
Arg:
sent: Sentence class object.
"""
assert sent.topic >= 0, "Topic %d out of range!" % (sent.topic)
assert sent.topic < self._num_topics, "Topic %d out of range!" % (
sent.topic)
self.__sentences.append(sent)
self._topic_sum[sent.topic] += 1
def set_topic(self, index, new_topic):
assert new_topic >= 0, "Topic %d out of range!" % (new_topic)
assert new_topic < self._num_topics, "Topic %d out of range!" % (
new_topic)
old_topic = self.__sentences[index].topic
if new_topic == old_topic:
return
self.__sentences[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def size(self):
"""Return number of sentences in SLDADoc.
"""
return len(self.__sentences)
def sent(self, index):
return self.__sentences[index]
import os
from paddlehub.common.logger import logger
from lda_novel.config import ModelConfig
from lda_novel.util import load_prototxt, fix_random_seed, rand_k
from lda_novel.model import TopicModel
from lda_novel.sampler import GibbsSampler, MHSampler
from lda_novel.document import LDADoc, SLDADoc, Token, Sentence
from lda_novel.vocab import OOV
class SamplerType:
GibbsSampling = 0
MetropolisHastings = 1
class InferenceEngine(object):
def __init__(self,
model_dir,
conf_file,
type=SamplerType.MetropolisHastings):
# Read model configuration.
config = ModelConfig()
conf_file_path = os.path.join(model_dir, conf_file)
load_prototxt(conf_file_path, config)
self.__model = TopicModel(model_dir, config)
self.__config = config
# Initialize the sampler according to the configuration.
if type == SamplerType.GibbsSampling:
self.__sampler = GibbsSampler(self.__model)
elif type == SamplerType.MetropolisHastings:
self.__sampler = MHSampler(self.__model)
def infer(self, input, doc):
"""Perform LDA topic inference on input, and store the results in doc.
Args:
input: a list of strings after tokenization.
doc: LDADoc type or SLDADoc type.
"""
fix_random_seed()
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for token in input:
id_ = self.__model.term_id(token)
if id_ != OOV:
init_topic = rand_k(self.__model.num_topics())
doc.add_token(Token(init_topic, id_))
self.lda_infer(doc, 20, 50)
elif isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for sent in input:
words = []
for token in sent:
id_ = self.__model.term_id(token)
if id_ != OOV:
words.append(id_)
init_topic = rand_k(self.__model.num_topics())
doc.add_sentence(Sentence(init_topic, words))
self.slda_infer(doc, 20, 50)
else:
logger.error("Wrong Doc Type!")
def lda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def slda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def model_type(self):
return self.__model.type()
def get_model(self):
return self.__model
def get_config(self):
return self.__config
import os
from collections import OrderedDict
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_novel.vocab import Vocab, WordCount
class TopicModel(object):
"""Storage Structure of Topic model, including vocabulary and word topic count.
"""
def __init__(self, model_dir, config):
"""
Args:
model_dir: the path of model directory
config: ModelConfig class.
"""
self.__word_topic = None # Model parameter of word topic.
self.__vocab = Vocab() # Vocab data structure of model.
self.__num_topics = config.num_topics # Number of topics.
self.__alpha = config.alpha
self.__alpha_sum = self.__alpha * self.__num_topics
self.__beta = config.beta
self.__beta_sum = None
self.__type = config.type # Model type.
self.__topic_sum = np.zeros(
self.__num_topics,
dtype="int64") # Accum sum of each topic in word topic.
self.__topic_words = [[] for _ in range(self.__num_topics)]
word_topic_path = os.path.join(model_dir, config.word_topic_file)
vocab_path = os.path.join(model_dir, config.vocab_file)
self.load_model(word_topic_path, vocab_path)
def term_id(self, term):
return self.__vocab.get_id(term)
def load_model(self, word_topic_path, vocab_path):
# Loading vocabulary
self.__vocab.load(vocab_path)
self.__beta_sum = self.__beta * self.__vocab.size()
        self.__word_topic = [{} for _ in range(self.__vocab.size())]  # List of dicts, one per word.
self.__load_word_dict(word_topic_path)
logger.info(
"Model Info: #num_topics=%d #vocab_size=%d alpha=%f beta=%f" %
(self.num_topics(), self.vocab_size(), self.alpha(), self.beta()))
def word_topic_value(self, word_id, topic_id):
"""Return value of specific word under specific topic in the model.
"""
word_dict = self.__word_topic[word_id]
if topic_id not in word_dict:
return 0
return word_dict[topic_id]
def word_topic(self, term_id):
"""Return the topic distribution of a word.
"""
return self.__word_topic[term_id]
def topic_sum_value(self, topic_id):
return self.__topic_sum[topic_id]
def topic_sum(self):
return self.__topic_sum
def num_topics(self):
return self.__num_topics
def vocab_size(self):
return self.__vocab.size()
def alpha(self):
return self.__alpha
def alpha_sum(self):
return self.__alpha_sum
def beta(self):
return self.__beta
def beta_sum(self):
return self.__beta_sum
def type(self):
return self.__type
def __load_word_dict(self, word_dict_path):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
with open(word_dict_path, 'r') as f:
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
term_id = int(fields[0])
assert term_id < self.vocab_size(), "Term id out of range!"
assert term_id >= 0, "Term id out of range!"
for i in range(1, len(fields)):
topic_count = fields[i].split(":")
assert len(topic_count) == 2, "Topic count format error!"
topic_id = int(topic_count[0])
assert topic_id >= 0, "Topic out of range!"
assert topic_id < self.__num_topics, "Topic out of range!"
count = int(topic_count[1])
assert count >= 0, "Topic count error!"
self.__word_topic[term_id][topic_id] = count
self.__topic_sum[topic_id] += count
self.__topic_words[topic_id].append(
WordCount(term_id, count))
new_dict = OrderedDict()
for key in sorted(self.__word_topic[term_id]):
new_dict[key] = self.__word_topic[term_id][key]
self.__word_topic[term_id] = new_dict
def get_vocab(self):
return self.__vocab.vocabulary()
def topic_words(self):
return self.__topic_words
import os
import paddlehub as hub
from paddlehub.module.module import moduleinfo
from paddlehub.common.logger import logger
from lda_novel.inference_engine import InferenceEngine
from lda_novel.document import LDADoc, SLDADoc
from lda_novel.semantic_matching import SemanticMatching, WordAndDis
from lda_novel.tokenizer import LACTokenizer, SimpleTokenizer
from lda_novel.config import ModelType
from lda_novel.vocab import Vocab, WordCount
@moduleinfo(
name="lda_novel",
version="1.0.0",
summary=
"This is a PaddleHub Module for LDA topic model in novel dataset, where we can calculate doc distance, calculate the similarity between query and document, etc.",
author="DesmonDay",
author_email="",
type="nlp/semantic_model")
class TopicModel(hub.Module):
def _initialize(self):
"""
Initialize with the necessary elements.
"""
self.model_dir = os.path.join(self.directory, 'novel')
self.conf_file = 'lda.conf'
self.__engine = InferenceEngine(self.model_dir, self.conf_file)
self.vocab_path = os.path.join(self.model_dir, 'vocab_info.txt')
lac = hub.Module(name="lac")
# self.__tokenizer = SimpleTokenizer(self.vocab_path)
self.__tokenizer = LACTokenizer(self.vocab_path, lac)
self.vocabulary = self.__engine.get_model().get_vocab()
self.config = self.__engine.get_config()
self.topic_words = self.__engine.get_model().topic_words()
self.topic_sum_table = self.__engine.get_model().topic_sum()
def take_elem(word_count):
return word_count.count
for i in range(self.config.num_topics):
self.topic_words[i].sort(key=take_elem, reverse=True)
logger.info("Finish initialization.")
def cal_doc_distance(self, doc_text1, doc_text2):
"""
This interface calculates the distance between documents.
Args:
doc_text1(str): the input document text 1.
doc_text2(str): the input document text 2.
Returns:
jsd(float): Jensen-Shannon Divergence distance of two documents.
hd(float): Hellinger Distance of two documents.
"""
doc1_tokens = self.__tokenizer.tokenize(doc_text1)
doc2_tokens = self.__tokenizer.tokenize(doc_text2)
# Document topic inference.
doc1, doc2 = LDADoc(), LDADoc()
self.__engine.infer(doc1_tokens, doc1)
self.__engine.infer(doc2_tokens, doc2)
# To calculate jsd, we need dense document topic distribution.
dense_dict1 = doc1.dense_topic_dist()
dense_dict2 = doc2.dense_topic_dist()
# Calculate the distance between distributions.
# The smaller the distance, the higher the document semantic similarity.
sm = SemanticMatching()
jsd = sm.jensen_shannon_divergence(dense_dict1, dense_dict2)
hd = sm.hellinger_distance(dense_dict1, dense_dict2)
return jsd, hd
def cal_doc_keywords_similarity(self, document, top_k=10):
"""
        This interface can be used to find the top k keywords of a document.
Args:
document(str): the input document text.
top_k(int): top k keywords of this document.
Returns:
results(list): contains top_k keywords and their corresponding
similarity compared to document.
"""
d_tokens = self.__tokenizer.tokenize(document)
# Do topic inference on documents to obtain topic distribution.
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
items = []
words = set()
for word in d_tokens:
if word in words:
continue
words.add(word)
wd = WordAndDis()
wd.word = word
sm = SemanticMatching()
wd.distance = sm.likelihood_based_similarity(
terms=[word],
doc_topic_dist=doc_topic_dist,
model=self.__engine.get_model())
items.append(wd)
def take_elem(word_dis):
return word_dis.distance
items.sort(key=take_elem, reverse=True)
results = []
size = len(items)
for i in range(top_k):
if i >= size:
break
results.append({
"word": items[i].word,
"similarity": items[i].distance
})
return results
def cal_query_doc_similarity(self, query, document):
"""
This interface calculates the similarity between query and document.
Args:
query(str): the input query text.
document(str): the input document text.
Returns:
lda_sim(float): likelihood based similarity between query and document
based on LDA.
"""
q_tokens = self.__tokenizer.tokenize(query)
d_tokens = self.__tokenizer.tokenize(document)
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
sm = SemanticMatching()
lda_sim = sm.likelihood_based_similarity(q_tokens, doc_topic_dist,
self.__engine.get_model())
return lda_sim
def infer_doc_topic_distribution(self, document):
"""
This interface infers the topic distribution of document.
Args:
document(str): the input document text.
Returns:
results(list): returns the topic distribution of document.
"""
tokens = self.__tokenizer.tokenize(document)
if tokens == []:
return []
results = []
doc = LDADoc()
self.__engine.infer(tokens, doc)
topics = doc.sparse_topic_dist()
for topic in topics:
results.append({"topic id": topic.tid, "distribution": topic.prob})
return results
def show_topic_keywords(self, topic_id, k=10):
"""
        This interface returns the first k keywords under a specific topic.
Args:
topic_id(int): topic information we want to know.
k(int): top k keywords.
Returns:
results(dict): contains specific topic's keywords and corresponding
probability.
"""
EPS = 1e-8
results = {}
if 0 <= topic_id < self.config.num_topics:
k = min(k, len(self.topic_words[topic_id]))
for i in range(k):
prob = self.topic_words[topic_id][i].count / \
(self.topic_sum_table[topic_id] + EPS)
results[self.vocabulary[self.topic_words[topic_id]
[i].word_id]] = prob
return results
else:
logger.error("%d is out of range!" % topic_id)
import os
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_novel.document import LDADoc, SLDADoc, Token, Sentence
from lda_novel.vose_alias import VoseAlias
from lda_novel.util import rand, rand_k
class Sampler(object):
def __init__(self):
pass
def sample_doc(self, doc):
"""Sample LDA or SLDA topics for documents.
"""
raise NotImplementedError
class MHSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
self.__topic_indexes = None
self.__alias_tables = None
self.__prob_sum = None
self.__beta_alias = VoseAlias()
self.__beta_prior_sum = None
self.__mh_steps = 2
self.__construct_alias_table()
def __construct_alias_table(self):
"""Construct alias table for all words.
"""
logger.info("Construct alias table for alias sampling method.")
vocab_size = self.__model.vocab_size()
self.__topic_indexes = [[] for _ in range(vocab_size)]
self.__alias_tables = [VoseAlias() for _ in range(vocab_size)]
self.__prob_sum = np.zeros(vocab_size)
# Construct each word's alias table (prior is not included).
for i in tqdm(range(vocab_size)):
dist = []
prob_sum = 0
for key in self.__model.word_topic(i):
topic_id = key
word_topic_count = self.__model.word_topic(i)[key]
topic_sum = self.__model.topic_sum_value(topic_id)
self.__topic_indexes[i].append(topic_id)
q = word_topic_count / (topic_sum + self.__model.beta_sum())
dist.append(q)
prob_sum += q
self.__prob_sum[i] = prob_sum
if len(dist) > 0:
dist = np.array(dist, dtype=np.float)
self.__alias_tables[i].initialize(dist)
# Build prior parameter beta's alias table.
beta_dist = self.__model.beta() / (
self.__model.topic_sum() + self.__model.beta_sum())
self.__beta_prior_sum = np.sum(beta_dist)
self.__beta_alias.initialize(beta_dist)
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
new_topic = token.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, token)
new_topic = self.__word_proposal(doc, token, doc_proposed_topic)
return new_topic
def __sample_sentence(self, doc, sent):
new_topic = sent.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, sent)
new_topic = self.__word_proposal(doc, sent, doc_proposed_topic)
return new_topic
def __doc_proposal(self, doc, token):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
if dart < doc.size():
token_index = int(dart)
new_topic = doc.token(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
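                # -(True) is -1 (all bits set) and -(False) is 0, so the bitwise
                # select below keeps new_topic when the move is accepted and
                # old_topic when it is rejected.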
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
if dart < doc.size():
token_index = int(dart)
new_topic = doc.sent(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __word_proposal(self, doc, token, old_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
new_topic = self.__propose(token.id)
if new_topic != old_topic:
proposal_old = self.__word_proposal_distribution(
token.id, old_topic)
proposal_new = self.__word_proposal_distribution(
token.id, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
new_topic = old_topic
for word_id in sent.tokens:
new_topic = self.__propose(word_id)
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__word_proposal_distribution(
word_id, old_topic)
proposal_new = self.__word_proposal_distribution(
word_id, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
new_topic = (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __proportional_function(self, doc, token, new_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dt_alpha = doc.topic_sum(new_topic) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(
token.id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
return dt_alpha * wt_beta / t_sum_beta_sum
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
result = doc.topic_sum(new_topic) + self.__model.alpha()
if new_topic == old_topic:
result -= 1
for word_id in sent.tokens:
wt_beta = self.__model.word_topic_value(
word_id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
wt_beta -= 1
t_sum_beta_sum -= 1
result *= wt_beta / t_sum_beta_sum
return result
else:
logger.error("Wrong input argument type!")
def __word_proposal_distribution(self, word_id, topic):
wt_beta = self.__model.word_topic_value(word_id,
topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
topic) + self.__model.beta_sum()
return wt_beta / t_sum_beta_sum
def __doc_proposal_distribution(self, doc, topic):
return doc.topic_sum(topic) + self.__model.alpha()
def __propose(self, word_id):
dart = rand() * (self.__prob_sum[word_id] + self.__beta_prior_sum)
if dart < self.__prob_sum[word_id]:
idx = self.__alias_tables[word_id].generate()
topic = self.__topic_indexes[word_id][idx]
else:
topic = self.__beta_alias.generate()
return topic
class GibbsSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
old_topic = token.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
for i in range(num_topics):
dt_alpha = doc.topic_sum(i) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(token.id,
i) + self.__model.beta()
            t_sum_beta_sum = self.__model.topic_sum_value(i) + self.__model.beta_sum()
if i == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
prob[i] = dt_alpha * wt_beta / t_sum_beta_sum
sum_ += prob[i]
accum_prob[i] = prob[i] if i == 0 else accum_prob[i - 1] + prob[i]
dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for i in range(1, num_topics):
if accum_prob[i - 1] < dart <= accum_prob[i]:
return i
return num_topics - 1
def __sample_sentence(self, doc, sent):
old_topic = sent.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
for t in range(num_topics):
dt_alpha = doc.topic_sum(t) + self.__model.alpha()
            t_sum_beta_sum = self.__model.topic_sum_value(t) + self.__model.beta_sum()
if t == old_topic:
if dt_alpha > 1:
dt_alpha -= 1
if t_sum_beta_sum > 1:
t_sum_beta_sum -= 1
prob[t] = dt_alpha
for i in range(len(sent.tokens)):
w = sent.tokens[i]
wt_beta = self.__model.word_topic_value(
w, t) + self.__model.beta()
if t == old_topic and wt_beta > 1:
wt_beta -= 1
                # Note: if the sentence is very long, the product of many factors can
                # become extremely small and numerical precision may be lost.
prob[t] *= wt_beta / t_sum_beta_sum
sum_ += prob[t]
accum_prob[t] = prob[t] if t == 0 else accum_prob[t - 1] + prob[t]
        dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for t in range(1, num_topics):
if accum_prob[t - 1] < dart <= accum_prob[t]:
return t
return num_topics - 1
import os
import numpy as np
from paddlehub.common.logger import logger
from lda_novel.vocab import OOV
EPS = 1e-06
class WordAndDis(object):
def __init__(self):
self.word = None
self.distance = None
class SemanticMatching(object):
def __init__(self):
pass
def l2_norm(self, vec):
"""Calculate the length of vector.
"""
result = np.sqrt(np.sum(vec**2))
return result
def cosine_similarity(self, vec1, vec2):
norm1 = self.l2_norm(vec1)
norm2 = self.l2_norm(vec2)
result = np.sum(vec1 * vec2) / norm1 / norm2
return result
def likelihood_based_similarity(self, terms, doc_topic_dist, model):
"""
Args:
terms: list of strings
doc_topic_dist: list of Topic class
model: TopicModel class
"""
num_of_term_in_vocab = 0
result = 0
for i in range(len(terms)):
term_id = model.term_id(terms[i])
if term_id == OOV:
continue
num_of_term_in_vocab += 1
for j in range(len(doc_topic_dist)):
topic_id = doc_topic_dist[j].tid
prob = doc_topic_dist[j].prob
result += model.word_topic_value(term_id, topic_id) * 1.0 / \
model.topic_sum_value(topic_id) * prob
if num_of_term_in_vocab == 0:
return result
return result / num_of_term_in_vocab
def kullback_leibler_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist2[dist2 < EPS] = EPS
result = np.sum(dist1 * np.log(dist1 / dist2))
return result
def jensen_shannon_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist1[dist1 < EPS] = EPS
dist2[dist2 < EPS] = EPS
mean = (dist1 + dist2) * 0.5
jsd = self.kullback_leibler_divergence(dist1, mean) * 0.5 + \
self.kullback_leibler_divergence(dist2, mean) * 0.5
return jsd
def hellinger_distance(self, dist1, dist2):
assert dist1.shape == dist2.shape
result = np.sum((np.sqrt(dist1) - np.sqrt(dist2))**2)
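        # 0.7071067812 is 1 / sqrt(2), the normalization constant of the Hellinger distance.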
result = np.sqrt(result) * 0.7071067812
return result
import os
import numpy as np
from paddlehub.common.logger import logger
class Tokenizer(object):
"""Base tokenizer class.
"""
def __init__(self):
pass
def tokenize(self, text):
raise NotImplementedError
class SimpleTokenizer(Tokenizer):
"""Simple version FMM(Forward Maximun Matching) word tokenizer. This tokenizer can only
be used in topic model demo, but not in real business application scenarios.
Notes: This tokenizer can only recognize the words in the corresponding vocab file.
"""
def __init__(self, vocab_path):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__load_vocab(vocab_path)
def tokenize(self, text):
"""Tokenize the input string `text`, and return the tokenize result.
"""
text_len = len(text)
result = []
i = 0
while i < text_len:
word = found_word = ""
# Deal with English characters.
if self.__is_eng_char(text[i]):
for j in range(i, text_len + 1):
if j < text_len and self.__is_eng_char(text[j]):
word += self.__tolower(text[j])
else:
# Forward matching by character granularity.
if word in self.__vocab:
result.append(word)
i = j - 1
break
else:
for j in range(i, min(i + self.__max_word_len, text_len)):
word += text[j]
if word in self.__vocab:
found_word = word
if len(found_word) > 0:
result.append(found_word)
i += len(found_word) - 1
i += 1
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def __is_eng_char(self, c):
"""Check whether char c is an English character.
"""
return (c >= 'A' and c <= 'Z') or (c >= 'a' and c <= 'z')
def __tolower(self, c):
"""Return the lowercase character of the corresponding character, or return
the original character if there is no corresponding lowercase character.
"""
return c.lower()
class LACTokenizer(Tokenizer):
def __init__(self, vocab_path, lac):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__lac = lac
self.__load_vocab(vocab_path)
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def tokenize(self, text):
results = self.__lac.lexical_analysis(
texts=[text], use_gpu=True, batch_size=1, return_tag=True)
        # Lowercase English words and keep only the words that appear in the vocab.
words = results[0]["word"]
result = []
for word in words:
word = word.lower()
if word in self.__vocab:
result.append(word)
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
import time
import yaml
import numpy as np
from paddlehub.common.logger import logger
from lda_novel.config import ModelType
def load_prototxt(config_file, config):
"""
Args:
config_file: model configuration file.
config: ModelConfig class
"""
logger.info("Loading LDA config.")
with open(config_file, 'r') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
if yaml_dict["type"] == "LDA":
config.type = ModelType.LDA
else:
config.type = ModelType.SLDA
config.num_topics = yaml_dict["num_topics"]
config.alpha = yaml_dict["alpha"]
config.beta = yaml_dict["beta"]
config.word_topic_file = yaml_dict["word_topic_file"]
config.vocab_file = yaml_dict["vocab_file"]
def fix_random_seed(seed=2147483647):
np.random.seed(seed)
def rand(min_=0, max_=1):
return np.random.uniform(low=min_, high=max_)
def rand_k(k):
"""Returns an integer float number between [0, k - 1].
"""
return int(rand() * k)
def timeit(f):
"""Return time cost of function f.
"""
def timed(*args, **kwargs):
start_time = time.time()
result = f(*args, **kwargs)
end_time = time.time()
print(" [-] %s : %2.5f sec" % (f.__name__, end_time - start_time))
return result
return timed
from paddlehub.common.logger import logger
OOV = -1
class WordCount(object):
def __init__(self, word_id, count):
self.word_id = word_id
self.count = count
class Vocab(object):
def __init__(self):
self.__term2id = {}
self.__id2term = {}
def get_id(self, word):
if word not in self.__term2id:
return OOV
return self.__term2id[word]
def load(self, vocab_file):
self.__term2id = {}
self.__id2term = {}
with open(vocab_file, 'r') as fin:
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(
fields) == 5, "Vocabulary file [%s] format error!" % (
vocab_file)
term = fields[1]
id_ = int(fields[2])
if term in self.__term2id:
logger.error("Duplicate word [%s] in vocab file!" % (term))
continue
self.__term2id[term] = id_
self.__id2term[id_] = term
def size(self):
return len(self.__term2id)
def vocabulary(self):
return self.__id2term
import os
import numpy as np
from paddlehub.common.logger import logger
from lda_novel.util import rand, rand_k
class VoseAlias(object):
"""Vose's Alias Method.
"""
def __init__(self):
self.__alias = None
self.__prob = None # np.array
def initialize(self, distribution):
"""Initialize the alias table according to the input distribution
Arg:
distribution: Numpy array.
"""
size = distribution.shape[0]
self.__alias = np.zeros(size, dtype=np.int64)
self.__prob = np.zeros(size)
sum_ = np.sum(distribution)
p = distribution / sum_ * size # Scale up probability.
large, small = [], []
for i, p_ in enumerate(p):
if p_ < 1.0:
small.append(i)
else:
large.append(i)
while large and small:
l = small[0]
g = large[0]
small.pop(0)
large.pop(0)
self.__prob[l] = p[l]
self.__alias[l] = g
p[g] = p[g] + p[l] - 1 # A more numerically stable option.
if p[g] < 1.0:
small.append(g)
else:
large.append(g)
while large:
g = large[0]
large.pop(0)
self.__prob[g] = 1.0
while small:
l = small[0]
small.pop(0)
self.__prob[l] = 1.0
def generate(self):
"""Generate samples from given distribution.
"""
dart1 = rand_k(self.size())
        # Keep the uniformly chosen column with probability prob[dart1];
        # otherwise return its alias.
        dart2 = rand()
        return dart1 if dart2 < self.__prob[dart1] else self.__alias[dart1]
def size(self):
return self.__prob.shape[0]
## Model Overview
A topic model is a statistical model that clusters the latent semantic structure of documents in an unsupervised way, and LDA (Latent Dirichlet Allocation) is one such model. By analyzing word co-occurrence information, LDA fits a word-document-topic distribution and thus maps words and texts into a common semantic space. This Module is trained on a webpage-domain dataset built by Baidu.
<p align="center">
<img src="https://bj.bcebos.com/paddlehub/model/nlp/semantic_model/lda.png" hspace='10'/> <br />
</p>
For more details, please refer to the [LDA paper](http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf).
Note: this Module is contributed by the third-party developer DesmonDay.
## LDA Model API Description
### cal_doc_distance(doc_text1, doc_text2)
Calculates the distance between two input documents, including the Jensen-Shannon divergence (JSD) and the Hellinger distance (HD).
**Parameters**
- doc_text1(str): the first input document.
- doc_text2(str): the second input document.
**Returns**
- jsd(float): the [Jensen-Shannon divergence](https://blog.csdn.net/FrankieHello/article/details/80614422?utm_source=copy) between the two documents.
- hd(float): the [Hellinger Distance](http://blog.sina.com.cn/s/blog_85f1ffb70101e65d.html) between the two documents.
### cal_doc_keywords_similarity(document, top_k=10)
Finds the top k keywords of the input document together with their similarity to the original document.
**Parameters**
- document(str): the input document.
- top_k(int): number of keywords to extract from the input document.
**Returns**
- results(list): contains each keyword and its similarity to the original document. Each element of the list is a dict whose key is the keyword and whose value is the similarity to the original document.
### cal_query_doc_similarity(query, document)
Calculates the similarity between a short text and a long document.
**Parameters**
- query(str): the input short text.
- document(str): the input long document.
**Returns**
- lda_sim(float): the similarity between the short text and the long document.
### infer_doc_topic_distribution(document)
Infers the topic distribution of a document.
**Parameters**
- document(str): the input document.
**Returns**
- results(list): contains the topic IDs and the corresponding probabilities of the topic distribution. Each element of the list is a dict whose key is the topic ID and whose value is the probability of that topic.
### show_topic_keywords(topic_id, k=10)
Shows the keywords under a given topic; it can be used together with the topic-distribution inference API.
**Parameters**
- topic_id(int): topic ID.
- k(int): number of top keywords to return for the topic.
**Returns**
- results(dict): the top k keywords of the given topic and the probability of each keyword.
### Code Example
Below are usage examples for some of the APIs.
``` python
import paddlehub as hub
lda_webpage = hub.Module(name="lda_webpage")
jsd, hd = lda_webpage.cal_doc_distance(doc_text1="百度的网页上有着各种新闻的推荐,内容丰富多彩。", doc_text2="百度首页推荐着各种新闻,还提供了强大的搜索引擎功能。")
# jsd = 0.00249, hd = 0.0510
results = lda_webpage.cal_doc_keywords_similarity('百度首页推荐着各种新闻,还提供了强大的搜索引擎功能。')
# [{'word': '强大', 'similarity': 0.0838851256627093},
# {'word': '推荐', 'similarity': 0.06295345182499558},
# {'word': '新闻', 'similarity': 0.05894049247832139},
# {'word': '提供', 'similarity': 0.04179908620523299},
# {'word': '百度', 'similarity': 0.033778847361833536},
# {'word': '首页', 'similarity': 0.018429949496365026},
# {'word': '功能', 'similarity': 0.011409342579361237},
# {'word': '搜索引擎', 'similarity': 0.010392479335778413}]
```
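The remaining APIs follow the same pattern. The snippet below is a minimal illustrative sketch: the input strings are arbitrary examples and the return values depend on the loaded model, so no outputs are shown.
``` python
import paddlehub as hub
lda_webpage = hub.Module(name="lda_webpage")
# Similarity between a short query text and a longer document.
lda_sim = lda_webpage.cal_query_doc_similarity(
    query='搜索引擎', document='百度首页推荐着各种新闻,还提供了强大的搜索引擎功能。')
# Topic distribution of a document, sorted by probability in descending order.
topic_dist = lda_webpage.infer_doc_topic_distribution('百度首页推荐着各种新闻,还提供了强大的搜索引擎功能。')
# Keywords of the most probable topic, if any topic was inferred.
if topic_dist:
    keywords = lda_webpage.show_topic_keywords(topic_id=topic_dist[0]["topic id"], k=10)
```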
## View the Code
https://github.com/baidu/Familia
## Dependencies
paddlepaddle >= 1.8.2
paddlehub >= 1.8.0
## Release History
* 1.0.0
  First release
"""
This file defines the basic config information of LDA/SLDA model.
"""
class ModelType:
LDA = 0
SLDA = 1
class ModelConfig:
type = None
num_topics = None
alpha = None
beta = None
word_topic_file = None
vocab_file = None
import numpy as np
class Topic(object):
"""Basic data structure of topic, contains topic id and
corresponding probability.
"""
def __init__(self, tid, prob):
self.tid = tid # topic id
self.prob = prob # topic probability
class Token(object):
"""Basic storage unit of LDA documents, contains word id
and corresponding topic.
"""
def __init__(self, topic, id):
self.topic = topic
self.id = id
class Sentence(object):
"""Basic storage unit of SentenceLDA documents, contains word ids
of the sentence and its corresponding topic id.
"""
def __init__(self, topic, tokens):
self.topic = topic
self.tokens = tokens
class LDADoc(object):
"""The storage structure of LDA model's inference result.
"""
def __init__(self):
self._num_topics = None # Number of topics.
self._num_accum = None # Number of accumulated sample rounds.
self._alpha = None # Document prior parameter.
self._tokens = None # Storage structure of inference results.
self._topic_sum = None # Document's topic sum in one round samples.
self._accum_topic_sum = None # Accumulated results of topic sum.
def init(self, num_topics):
"""Initialize the LDADoc according to num_topics.
"""
self._num_topics = num_topics
self._num_accum = 0
self._tokens = []
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_token(self, token):
"""Add new word to current LDADoc.
Arg:
token: Token class object.
"""
assert token.topic >= 0, "Topic %d out of range!" % token.topic
assert token.topic < self._num_topics, "Topic %d out of range!" % token.topic
self._tokens.append(token)
self._topic_sum[token.topic] += 1
def token(self, index):
return self._tokens[index]
def set_topic(self, index, new_topic):
"""Set the index word's topic to new_topic, and update the corresponding
topic distribution.
"""
assert new_topic >= 0, "Topic %d out of range!" % new_topic
assert new_topic < self._num_topics, "Topic %d out of range!" % new_topic
old_topic = self._tokens[index].topic
if new_topic == old_topic:
return
self._tokens[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def set_alpha(self, alpha):
self._alpha = alpha
def size(self):
"""Return number of words in LDADoc.
"""
return len(self._tokens)
def topic_sum(self, topic_id):
return self._topic_sum[topic_id]
def sparse_topic_dist(self, sort=True):
"""Return the topic distribution of documents in sparse format.
By default, it is sorted according to the topic probability
under the descending order.
"""
topic_dist = []
sum_ = np.sum(self._accum_topic_sum)
        if sum_ == 0:
            return topic_dist
for i in range(0, self._num_topics):
if self._accum_topic_sum[i] == 0:
continue
topic_dist.append(Topic(i, self._accum_topic_sum[i] * 1.0 / sum_))
if sort:
def take_elem(topic):
return topic.prob
topic_dist.sort(key=take_elem, reverse=True)
if topic_dist is None:
topic_dist = []
return topic_dist
def dense_topic_dist(self):
"""Return the distribution of document topics in dense format,
taking into account the prior parameter alpha.
"""
dense_dist = np.zeros(self._num_topics)
if self.size() == 0:
return dense_dist
dense_dist = (
self._accum_topic_sum * 1.0 / self._num_accum + self._alpha) / (
self.size() + self._alpha * self._num_topics)
return dense_dist
def accumulate_topic_num(self):
self._accum_topic_sum += self._topic_sum
self._num_accum += 1
class SLDADoc(LDADoc):
"""Sentence LDA Document, inherited from LDADoc.
Add add_sentence interface.
"""
def __init__(self):
super().__init__()
self.__sentences = None
def init(self, num_topics):
"""Initialize the SLDADoc according to num_topics.
"""
self._num_topics = num_topics
self.__sentences = []
self._num_accum = 0
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_sentence(self, sent):
"""Add new sentence to current SLDADoc.
Arg:
sent: Sentence class object.
"""
assert sent.topic >= 0, "Topic %d out of range!" % (sent.topic)
assert sent.topic < self._num_topics, "Topic %d out of range!" % (
sent.topic)
self.__sentences.append(sent)
self._topic_sum[sent.topic] += 1
def set_topic(self, index, new_topic):
assert new_topic >= 0, "Topic %d out of range!" % (new_topic)
assert new_topic < self._num_topics, "Topic %d out of range!" % (
new_topic)
old_topic = self.__sentences[index].topic
if new_topic == old_topic:
return
self.__sentences[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def size(self):
"""Return number of sentences in SLDADoc.
"""
return len(self.__sentences)
def sent(self, index):
return self.__sentences[index]
import os
from paddlehub.common.logger import logger
from lda_webpage.config import ModelConfig
from lda_webpage.util import load_prototxt, fix_random_seed, rand_k
from lda_webpage.model import TopicModel
from lda_webpage.sampler import GibbsSampler, MHSampler
from lda_webpage.document import LDADoc, SLDADoc, Token, Sentence
from lda_webpage.vocab import OOV
class SamplerType:
GibbsSampling = 0
MetropolisHastings = 1
class InferenceEngine(object):
def __init__(self,
model_dir,
conf_file,
type=SamplerType.MetropolisHastings):
# Read model configuration.
config = ModelConfig()
conf_file_path = os.path.join(model_dir, conf_file)
load_prototxt(conf_file_path, config)
self.__model = TopicModel(model_dir, config)
self.__config = config
# Initialize the sampler according to the configuration.
if type == SamplerType.GibbsSampling:
self.__sampler = GibbsSampler(self.__model)
elif type == SamplerType.MetropolisHastings:
self.__sampler = MHSampler(self.__model)
def infer(self, input, doc):
"""Perform LDA topic inference on input, and store the results in doc.
Args:
input: a list of strings after tokenization.
doc: LDADoc type or SLDADoc type.
"""
fix_random_seed()
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for token in input:
id_ = self.__model.term_id(token)
if id_ != OOV:
init_topic = rand_k(self.__model.num_topics())
doc.add_token(Token(init_topic, id_))
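            # 20 burn-in iterations are discarded; 50 sampling iterations in total.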
self.lda_infer(doc, 20, 50)
elif isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for sent in input:
words = []
for token in sent:
id_ = self.__model.term_id(token)
if id_ != OOV:
words.append(id_)
init_topic = rand_k(self.__model.num_topics())
doc.add_sentence(Sentence(init_topic, words))
self.slda_infer(doc, 20, 50)
else:
logger.error("Wrong Doc Type!")
def lda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def slda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def model_type(self):
return self.__model.type()
def get_model(self):
return self.__model
def get_config(self):
return self.__config
import os
from collections import OrderedDict
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_webpage.vocab import Vocab, WordCount
class TopicModel(object):
"""Storage Structure of Topic model, including vocabulary and word topic count.
"""
def __init__(self, model_dir, config):
"""
Args:
model_dir: the path of model directory
config: ModelConfig class.
"""
self.__word_topic = None # Model parameter of word topic.
self.__vocab = Vocab() # Vocab data structure of model.
self.__num_topics = config.num_topics # Number of topics.
self.__alpha = config.alpha
self.__alpha_sum = self.__alpha * self.__num_topics
self.__beta = config.beta
self.__beta_sum = None
self.__type = config.type # Model type.
self.__topic_sum = np.zeros(
self.__num_topics,
dtype="int64") # Accum sum of each topic in word topic.
self.__topic_words = [[] for _ in range(self.__num_topics)]
word_topic_path = os.path.join(model_dir, config.word_topic_file)
vocab_path = os.path.join(model_dir, config.vocab_file)
self.load_model(word_topic_path, vocab_path)
def term_id(self, term):
return self.__vocab.get_id(term)
def load_model(self, word_topic_path, vocab_path):
# Loading vocabulary
self.__vocab.load(vocab_path)
self.__beta_sum = self.__beta * self.__vocab.size()
        self.__word_topic = [{} for _ in range(self.__vocab.size())]  # List of per-word {topic_id: count} dicts.
self.__load_word_dict(word_topic_path)
logger.info(
"Model Info: #num_topics=%d #vocab_size=%d alpha=%f beta=%f" %
(self.num_topics(), self.vocab_size(), self.alpha(), self.beta()))
def word_topic_value(self, word_id, topic_id):
"""Return value of specific word under specific topic in the model.
"""
word_dict = self.__word_topic[word_id]
if topic_id not in word_dict:
return 0
return word_dict[topic_id]
def word_topic(self, term_id):
"""Return the topic distribution of a word.
"""
return self.__word_topic[term_id]
def topic_sum_value(self, topic_id):
return self.__topic_sum[topic_id]
def topic_sum(self):
return self.__topic_sum
def num_topics(self):
return self.__num_topics
def vocab_size(self):
return self.__vocab.size()
def alpha(self):
return self.__alpha
def alpha_sum(self):
return self.__alpha_sum
def beta(self):
return self.__beta
def beta_sum(self):
return self.__beta_sum
def type(self):
return self.__type
def __load_word_dict(self, word_dict_path):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
with open(word_dict_path, 'r') as f:
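            # Each line has the format: "<term_id> <topic_id>:<count> <topic_id>:<count> ...".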
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
term_id = int(fields[0])
assert term_id < self.vocab_size(), "Term id out of range!"
assert term_id >= 0, "Term id out of range!"
for i in range(1, len(fields)):
topic_count = fields[i].split(":")
assert len(topic_count) == 2, "Topic count format error!"
topic_id = int(topic_count[0])
assert topic_id >= 0, "Topic out of range!"
assert topic_id < self.__num_topics, "Topic out of range!"
count = int(topic_count[1])
assert count >= 0, "Topic count error!"
self.__word_topic[term_id][topic_id] = count
self.__topic_sum[topic_id] += count
self.__topic_words[topic_id].append(
WordCount(term_id, count))
new_dict = OrderedDict()
for key in sorted(self.__word_topic[term_id]):
new_dict[key] = self.__word_topic[term_id][key]
self.__word_topic[term_id] = new_dict
def get_vocab(self):
return self.__vocab.vocabulary()
def topic_words(self):
return self.__topic_words
import os
import paddlehub as hub
from paddlehub.module.module import moduleinfo
from paddlehub.common.logger import logger
from lda_webpage.inference_engine import InferenceEngine
from lda_webpage.document import LDADoc
from lda_webpage.semantic_matching import SemanticMatching, WordAndDis
from lda_webpage.tokenizer import LACTokenizer, SimpleTokenizer
from lda_webpage.config import ModelType
from lda_webpage.vocab import Vocab, WordCount
@moduleinfo(
name="lda_webpage",
version="1.0.0",
summary=
"This is a PaddleHub Module for LDA topic model in webpage dataset, where we can calculate doc distance, calculate the similarity between query and document, etc.",
author="DesmonDay",
author_email="",
type="nlp/semantic_model")
class TopicModel(hub.Module):
def _initialize(self):
"""
Initialize with the necessary elements.
"""
self.model_dir = os.path.join(self.directory, 'webpage')
self.conf_file = 'lda.conf'
self.__engine = InferenceEngine(self.model_dir, self.conf_file)
self.vocab_path = os.path.join(self.model_dir, 'vocab_info.txt')
lac = hub.Module(name="lac")
# self.__tokenizer = SimpleTokenizer(self.vocab_path)
self.__tokenizer = LACTokenizer(self.vocab_path, lac)
self.vocabulary = self.__engine.get_model().get_vocab()
self.config = self.__engine.get_config()
self.topic_words = self.__engine.get_model().topic_words()
self.topic_sum_table = self.__engine.get_model().topic_sum()
def take_elem(word_count):
return word_count.count
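        # Pre-sort each topic's words by count so show_topic_keywords can take the top k directly.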
for i in range(self.config.num_topics):
self.topic_words[i].sort(key=take_elem, reverse=True)
logger.info("Finish initialization.")
def cal_doc_distance(self, doc_text1, doc_text2):
"""
This interface calculates the distance between documents.
Args:
doc_text1(str): the input document text 1.
doc_text2(str): the input document text 2.
Returns:
jsd(float): Jensen-Shannon Divergence distance of two documents.
hd(float): Hellinger Distance of two documents.
"""
doc1_tokens = self.__tokenizer.tokenize(doc_text1)
doc2_tokens = self.__tokenizer.tokenize(doc_text2)
# Document topic inference.
doc1, doc2 = LDADoc(), LDADoc()
self.__engine.infer(doc1_tokens, doc1)
self.__engine.infer(doc2_tokens, doc2)
# To calculate jsd, we need dense document topic distribution.
dense_dict1 = doc1.dense_topic_dist()
dense_dict2 = doc2.dense_topic_dist()
# Calculate the distance between distributions.
# The smaller the distance, the higher the document semantic similarity.
sm = SemanticMatching()
jsd = sm.jensen_shannon_divergence(dense_dict1, dense_dict2)
hd = sm.hellinger_distance(dense_dict1, dense_dict2)
return jsd, hd
def cal_doc_keywords_similarity(self, document, top_k=10):
"""
This interface can be used to find topk keywords of document.
Args:
document(str): the input document text.
top_k(int): top k keywords of this document.
Returns:
results(list): contains top_k keywords and their
corresponding similarity compared to document.
"""
d_tokens = self.__tokenizer.tokenize(document)
# Do topic inference on documents to obtain topic distribution.
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
items = []
words = set()
for word in d_tokens:
if word in words:
continue
words.add(word)
wd = WordAndDis()
wd.word = word
sm = SemanticMatching()
wd.distance = sm.likelihood_based_similarity(
terms=[word],
doc_topic_dist=doc_topic_dist,
model=self.__engine.get_model())
items.append(wd)
def take_elem(word_dis):
return word_dis.distance
items.sort(key=take_elem, reverse=True)
results = []
size = len(items)
for i in range(top_k):
if i >= size:
break
results.append({
"word": items[i].word,
"similarity": items[i].distance
})
return results
def cal_query_doc_similarity(self, query, document):
"""
This interface calculates the similarity between query and document.
Args:
query(str): the input query text.
document(str): the input document text.
Returns:
lda_sim(float): likelihood based similarity between query and document based on LDA.
"""
q_tokens = self.__tokenizer.tokenize(query)
d_tokens = self.__tokenizer.tokenize(document)
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
sm = SemanticMatching()
lda_sim = sm.likelihood_based_similarity(q_tokens, doc_topic_dist,
self.__engine.get_model())
return lda_sim
def infer_doc_topic_distribution(self, document):
"""
This interface infers the topic distribution of document.
Args:
document(str): the input document text.
Returns:
results(list): returns the topic distribution of document.
"""
tokens = self.__tokenizer.tokenize(document)
if tokens == []:
return []
results = []
doc = LDADoc()
self.__engine.infer(tokens, doc)
topics = doc.sparse_topic_dist()
for topic in topics:
results.append({"topic id": topic.tid, "distribution": topic.prob})
return results
def show_topic_keywords(self, topic_id, k=10):
"""
This interface returns the first k keywords under specific topic.
Args:
topic_id(int): topic information we want to know.
k(int): top k keywords.
Returns:
results(dict): contains specific topic's keywords and
corresponding probability.
"""
EPS = 1e-8
results = {}
if 0 <= topic_id < self.config.num_topics:
k = min(k, len(self.topic_words[topic_id]))
for i in range(k):
prob = self.topic_words[topic_id][i].count / \
(self.topic_sum_table[topic_id] + EPS)
results[self.vocabulary[self.topic_words[topic_id]
[i].word_id]] = prob
return results
else:
logger.error("%d is out of range!" % topic_id)
import os
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_webpage.document import LDADoc, SLDADoc, Token, Sentence
from lda_webpage.vose_alias import VoseAlias
from lda_webpage.util import rand, rand_k
class Sampler(object):
def __init__(self):
pass
def sample_doc(self, doc):
"""Sample LDA or SLDA topics for documents.
"""
raise NotImplementedError
class MHSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
self.__topic_indexes = None
self.__alias_tables = None
self.__prob_sum = None
self.__beta_alias = VoseAlias()
self.__beta_prior_sum = None
self.__mh_steps = 2
self.__construct_alias_table()
def __construct_alias_table(self):
"""Construct alias table for all words.
"""
logger.info("Construct alias table for alias sampling method.")
vocab_size = self.__model.vocab_size()
self.__topic_indexes = [[] for _ in range(vocab_size)]
self.__alias_tables = [VoseAlias() for _ in range(vocab_size)]
self.__prob_sum = np.zeros(vocab_size)
# Construct each word's alias table (prior is not included).
for i in tqdm(range(vocab_size)):
dist = []
prob_sum = 0
for key in self.__model.word_topic(i):
topic_id = key
word_topic_count = self.__model.word_topic(i)[key]
topic_sum = self.__model.topic_sum_value(topic_id)
self.__topic_indexes[i].append(topic_id)
q = word_topic_count / (topic_sum + self.__model.beta_sum())
dist.append(q)
prob_sum += q
self.__prob_sum[i] = prob_sum
if len(dist) > 0:
dist = np.array(dist, dtype=np.float)
self.__alias_tables[i].initialize(dist)
# Build prior parameter beta's alias table.
beta_dist = self.__model.beta() / (
self.__model.topic_sum() + self.__model.beta_sum())
self.__beta_prior_sum = np.sum(beta_dist)
self.__beta_alias.initialize(beta_dist)
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
new_topic = token.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, token)
new_topic = self.__word_proposal(doc, token, doc_proposed_topic)
return new_topic
def __sample_sentence(self, doc, sent):
new_topic = sent.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, sent)
new_topic = self.__word_proposal(doc, sent, doc_proposed_topic)
return new_topic
def __doc_proposal(self, doc, token):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
if dart < doc.size():
token_index = int(dart)
new_topic = doc.token(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
if dart < doc.size():
token_index = int(dart)
new_topic = doc.sent(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __word_proposal(self, doc, token, old_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
new_topic = self.__propose(token.id)
if new_topic != old_topic:
proposal_old = self.__word_proposal_distribution(
token.id, old_topic)
proposal_new = self.__word_proposal_distribution(
token.id, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
new_topic = old_topic
for word_id in sent.tokens:
new_topic = self.__propose(word_id)
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__word_proposal_distribution(
word_id, old_topic)
proposal_new = self.__word_proposal_distribution(
word_id, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
new_topic = (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __proportional_function(self, doc, token, new_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dt_alpha = doc.topic_sum(new_topic) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(
token.id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
return dt_alpha * wt_beta / t_sum_beta_sum
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
result = doc.topic_sum(new_topic) + self.__model.alpha()
if new_topic == old_topic:
result -= 1
for word_id in sent.tokens:
wt_beta = self.__model.word_topic_value(
word_id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
wt_beta -= 1
t_sum_beta_sum -= 1
result *= wt_beta / t_sum_beta_sum
return result
else:
logger.error("Wrong input argument type!")
def __word_proposal_distribution(self, word_id, topic):
wt_beta = self.__model.word_topic_value(word_id,
topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
topic) + self.__model.beta_sum()
return wt_beta / t_sum_beta_sum
def __doc_proposal_distribution(self, doc, topic):
return doc.topic_sum(topic) + self.__model.alpha()
def __propose(self, word_id):
dart = rand() * (self.__prob_sum[word_id] + self.__beta_prior_sum)
if dart < self.__prob_sum[word_id]:
idx = self.__alias_tables[word_id].generate()
topic = self.__topic_indexes[word_id][idx]
else:
topic = self.__beta_alias.generate()
return topic
class GibbsSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
old_topic = token.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
for i in range(num_topics):
dt_alpha = doc.topic_sum(i) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(token.id,
i) + self.__model.beta()
            t_sum_beta_sum = self.__model.topic_sum_value(i) + self.__model.beta_sum()
if i == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
prob[i] = dt_alpha * wt_beta / t_sum_beta_sum
sum_ += prob[i]
accum_prob[i] = prob[i] if i == 0 else accum_prob[i - 1] + prob[i]
dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for i in range(1, num_topics):
if accum_prob[i - 1] < dart <= accum_prob[i]:
return i
return num_topics - 1
def __sample_sentence(self, doc, sent):
old_topic = sent.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
for t in range(num_topics):
dt_alpha = doc.topic_sum(t) + self.__model.alpha()
            t_sum_beta_sum = self.__model.topic_sum_value(t) + self.__model.beta_sum()
if t == old_topic:
if dt_alpha > 1:
dt_alpha -= 1
if t_sum_beta_sum > 1:
t_sum_beta_sum -= 1
prob[t] = dt_alpha
for i in range(len(sent.tokens)):
w = sent.tokens[i]
wt_beta = self.__model.word_topic_value(
w, t) + self.__model.beta()
if t == old_topic and wt_beta > 1:
wt_beta -= 1
                # Note: if the sentence is very long, the product of many factors can
                # become extremely small and numerical precision may be lost.
prob[t] *= wt_beta / t_sum_beta_sum
sum_ += prob[t]
accum_prob[t] = prob[t] if t == 0 else accum_prob[t - 1] + prob[t]
        dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for t in range(1, num_topics):
if accum_prob[t - 1] < dart <= accum_prob[t]:
return t
return num_topics - 1
import os
import numpy as np
from paddlehub.common.logger import logger
from lda_webpage.vocab import OOV
EPS = 1e-06
class WordAndDis(object):
def __init__(self):
self.word = None
self.distance = None
class SemanticMatching(object):
def __init__(self):
pass
def l2_norm(self, vec):
"""Calculate the length of vector.
"""
result = np.sqrt(np.sum(vec**2))
return result
def cosine_similarity(self, vec1, vec2):
norm1 = self.l2_norm(vec1)
norm2 = self.l2_norm(vec2)
result = np.sum(vec1 * vec2) / norm1 / norm2
return result
def likelihood_based_similarity(self, terms, doc_topic_dist, model):
"""
Args:
terms: list of strings
doc_topic_dist: list of Topic class
model: TopicModel class
"""
num_of_term_in_vocab = 0
result = 0
for i in range(len(terms)):
term_id = model.term_id(terms[i])
if term_id == OOV:
continue
num_of_term_in_vocab += 1
for j in range(len(doc_topic_dist)):
topic_id = doc_topic_dist[j].tid
prob = doc_topic_dist[j].prob
result += model.word_topic_value(term_id, topic_id) * 1.0 / \
model.topic_sum_value(topic_id) * prob
if num_of_term_in_vocab == 0:
return result
return result / num_of_term_in_vocab
def kullback_leibler_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist2[dist2 < EPS] = EPS
result = np.sum(dist1 * np.log(dist1 / dist2))
return result
def jensen_shannon_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist1[dist1 < EPS] = EPS
dist2[dist2 < EPS] = EPS
mean = (dist1 + dist2) * 0.5
jsd = self.kullback_leibler_divergence(dist1, mean) * 0.5 + \
self.kullback_leibler_divergence(dist2, mean) * 0.5
return jsd
def hellinger_distance(self, dist1, dist2):
assert dist1.shape == dist2.shape
result = np.sum((np.sqrt(dist1) - np.sqrt(dist2))**2)
result = np.sqrt(result) * 0.7071067812
return result
import os
import numpy as np
from paddlehub.common.logger import logger
class Tokenizer(object):
"""Base tokenizer class.
"""
def __init__(self):
pass
def tokenize(self, text):
raise NotImplementedError
class SimpleTokenizer(Tokenizer):
"""Simple version FMM(Forward Maximun Matching) word tokenizer. This tokenizer can only
be used in topic model demo, but not in real business application scenarios.
Notes: This tokenizer can only recognize the words in the corresponding vocab file.
"""
def __init__(self, vocab_path):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__load_vocab(vocab_path)
def tokenize(self, text):
"""Tokenize the input string `text`, and return the tokenize result.
"""
text_len = len(text)
result = []
i = 0
while i < text_len:
word = found_word = ""
# Deal with English characters.
if self.__is_eng_char(text[i]):
for j in range(i, text_len + 1):
if j < text_len and self.__is_eng_char(text[j]):
word += self.__tolower(text[j])
else:
# Forward matching by character granularity.
if word in self.__vocab:
result.append(word)
i = j - 1
break
else:
for j in range(i, min(i + self.__max_word_len, text_len)):
word += text[j]
if word in self.__vocab:
found_word = word
if len(found_word) > 0:
result.append(found_word)
i += len(found_word) - 1
i += 1
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def __is_eng_char(self, c):
"""Check whether char c is an English character.
"""
return (c >= 'A' and c <= 'Z') or (c >= 'a' and c <= 'z')
def __tolower(self, c):
"""Return the lowercase character of the corresponding character, or return
the original character if there is no corresponding lowercase character.
"""
return c.lower()
class LACTokenizer(Tokenizer):
def __init__(self, vocab_path, lac):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__lac = lac
self.__load_vocab(vocab_path)
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def tokenize(self, text):
results = self.__lac.lexical_analysis(
texts=[text], use_gpu=True, batch_size=1, return_tag=True)
        # Lowercase English words and keep only the words that appear in the vocab.
words = results[0]["word"]
result = []
for word in words:
word = word.lower()
if word in self.__vocab:
result.append(word)
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
import time
import yaml
import numpy as np
from paddlehub.common.logger import logger
from lda_webpage.config import ModelType
def load_prototxt(config_file, config):
"""
Args:
config_file: model configuration file.
config: ModelConfig class
"""
logger.info("Loading LDA config.")
with open(config_file, 'r') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
if yaml_dict["type"] == "LDA":
config.type = ModelType.LDA
else:
config.type = ModelType.SLDA
config.num_topics = yaml_dict["num_topics"]
config.alpha = yaml_dict["alpha"]
config.beta = yaml_dict["beta"]
config.word_topic_file = yaml_dict["word_topic_file"]
config.vocab_file = yaml_dict["vocab_file"]
def fix_random_seed(seed=2147483647):
np.random.seed(seed)
def rand(min_=0, max_=1):
return np.random.uniform(low=min_, high=max_)
def rand_k(k):
"""Returns an integer float number between [0, k - 1].
"""
return int(rand() * k)
def timeit(f):
"""Return time cost of function f.
"""
def timed(*args, **kwargs):
start_time = time.time()
result = f(*args, **kwargs)
end_time = time.time()
print(" [-] %s : %2.5f sec" % (f.__name__, end_time - start_time))
return result
return timed
from paddlehub.common.logger import logger
OOV = -1
class WordCount(object):
def __init__(self, word_id, count):
self.word_id = word_id
self.count = count
class Vocab(object):
def __init__(self):
self.__term2id = {}
self.__id2term = {}
def get_id(self, word):
if word not in self.__term2id:
return OOV
return self.__term2id[word]
def load(self, vocab_file):
self.__term2id = {}
self.__id2term = {}
with open(vocab_file, 'r') as fin:
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(
fields) == 5, "Vocabulary file [%s] format error!" % (
vocab_file)
term = fields[1]
id_ = int(fields[2])
if term in self.__term2id:
logger.error("Duplicate word [%s] in vocab file!" % (term))
continue
self.__term2id[term] = id_
self.__id2term[id_] = term
def size(self):
return len(self.__term2id)
def vocabulary(self):
return self.__id2term
import os
import numpy as np
from paddlehub.common.logger import logger
from lda_webpage.util import rand, rand_k
class VoseAlias(object):
"""Vose's Alias Method.
"""
def __init__(self):
self.__alias = None
self.__prob = None # np.array
def initialize(self, distribution):
"""Initialize the alias table according to the input distribution
Arg:
distribution: Numpy array.
"""
size = distribution.shape[0]
self.__alias = np.zeros(size, dtype=np.int64)
self.__prob = np.zeros(size)
sum_ = np.sum(distribution)
p = distribution / sum_ * size # Scale up probability.
large, small = [], []
for i, p_ in enumerate(p):
if p_ < 1.0:
small.append(i)
else:
large.append(i)
while large and small:
l = small[0]
g = large[0]
small.pop(0)
large.pop(0)
self.__prob[l] = p[l]
self.__alias[l] = g
p[g] = p[g] + p[l] - 1 # A more numerically stable option.
if p[g] < 1.0:
small.append(g)
else:
large.append(g)
while large:
g = large[0]
large.pop(0)
self.__prob[g] = 1.0
while small:
l = small[0]
small.pop(0)
self.__prob[l] = 1.0
def generate(self):
"""Generate samples from given distribution.
"""
dart1 = rand_k(self.size())
        # Keep the uniformly chosen column with probability prob[dart1];
        # otherwise return its alias.
        dart2 = rand()
        return dart1 if dart2 < self.__prob[dart1] else self.__alias[dart1]
def size(self):
return self.__prob.shape[0]
## Model Overview
A topic model is a statistical model that clusters the latent semantic structure of documents in an unsupervised way, and SLDA (Sentence-LDA) is one such model. SLDA extends LDA: LDA assumes that each word corresponds to a topic, whereas SLDA assumes that each sentence corresponds to a topic. This Module is trained on a news-domain dataset built by Baidu.
<p align="center">
<img src="https://bj.bcebos.com/paddlehub/model/nlp/semantic_model/slda.png" hspace='10'/> <br />
</p>
For more details, please refer to the [SLDA paper](https://pdfs.semanticscholar.org/c311/778adb9484c86250e915aecd9714f4206050.pdf).
Note: this Module is contributed by the third-party developer DesmonDay.
## SLDA Model API Description
### infer_doc_topic_distribution(document)
Infers the topic distribution of a document.
**Parameters**
- document(str): the input document.
**Returns**
- results(list): contains the topic IDs and the corresponding probabilities of the topic distribution. Each element of the list is a dict whose key is the topic ID and whose value is the probability of that topic.
### show_topic_keywords(topic_id, k=10)
Shows the keywords under a given topic; it can be used together with the topic-distribution inference API.
**Parameters**
- topic_id(int): topic ID.
- k(int): number of top keywords to return for the topic.
**Returns**
- results(dict): the top k keywords of the given topic and the probability of each keyword.
### Code Example
Below is a usage example of the APIs.
``` python
import paddlehub as hub
slda_news = hub.Module(name="slda_news")
topic_dist = slda_news.infer_doc_topic_distribution("百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。")
# {378: 0.5, 804: 0.5}
keywords = slda_news.show_topic_keywords(topic_id=804, k=10)
# {'百度': 0.08269021676897842,
# '搜索': 0.04154762385123992,
# '推广': 0.026193527138926424,
# '贴吧': 0.02125616298078334,
# '排名': 0.019595252609963018,
# '关键词': 0.015173719446828477,
# '广告': 0.013552941381750894,
# '搜索引擎': 0.010038529194616577,
# '公司': 0.009388342219512786,
# '网站': 0.009173721627932065}
```
## View the Code
https://github.com/baidu/Familia
## Dependencies
paddlepaddle >= 1.8.2
paddlehub >= 1.8.0
## Release History
* 1.0.0
  First release
"""
This file defines the basic config information of LDA/SLDA model.
"""
class ModelType:
LDA = 0
SLDA = 1
class ModelConfig:
type = None
num_topics = None
alpha = None
beta = None
word_topic_file = None
vocab_file = None
import numpy as np
class Topic(object):
"""Basic data structure of topic, contains topic id and
corresponding probability.
"""
def __init__(self, tid, prob):
self.tid = tid # topic id
self.prob = prob # topic probability
class Token(object):
"""Basic storage unit of LDA documents, contains word id
and corresponding topic.
"""
def __init__(self, topic, id):
self.topic = topic
self.id = id
class Sentence(object):
"""Basic storage unit of SentenceLDA documents, contains word ids
of the sentence and its corresponding topic id.
"""
def __init__(self, topic, tokens):
self.topic = topic
self.tokens = tokens
class LDADoc(object):
"""The storage structure of LDA model's inference result.
"""
def __init__(self):
self._num_topics = None # Number of topics.
self._num_accum = None # Number of accumulated sample rounds.
self._alpha = None # Document prior parameter.
self._tokens = None # Storage structure of inference results.
self._topic_sum = None # Document's topic sum in one round samples.
self._accum_topic_sum = None # Accumulated results of topic sum.
def init(self, num_topics):
"""Initialize the LDADoc according to num_topics.
"""
self._num_topics = num_topics
self._num_accum = 0
self._tokens = []
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_token(self, token):
"""Add new word to current LDADoc.
Arg:
token: Token class object.
"""
assert token.topic >= 0, "Topic %d out of range!" % token.topic
assert token.topic < self._num_topics, "Topic %d out of range!" % token.topic
self._tokens.append(token)
self._topic_sum[token.topic] += 1
def token(self, index):
return self._tokens[index]
def set_topic(self, index, new_topic):
"""Set the index word's topic to new_topic, and update the corresponding
topic distribution.
"""
assert new_topic >= 0, "Topic %d out of range!" % new_topic
assert new_topic < self._num_topics, "Topic %d out of range!" % new_topic
old_topic = self._tokens[index].topic
if new_topic == old_topic:
return
self._tokens[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def set_alpha(self, alpha):
self._alpha = alpha
def size(self):
"""Return number of words in LDADoc.
"""
return len(self._tokens)
def topic_sum(self, topic_id):
return self._topic_sum[topic_id]
def sparse_topic_dist(self, sort=True):
"""Return the topic distribution of documents in sparse format.
By default, it is sorted according to the topic probability
under the descending order.
"""
topic_dist = []
sum_ = np.sum(self._accum_topic_sum)
        if sum_ == 0:
            return topic_dist
for i in range(0, self._num_topics):
if self._accum_topic_sum[i] == 0:
continue
topic_dist.append(Topic(i, self._accum_topic_sum[i] * 1.0 / sum_))
if sort:
def take_elem(topic):
return topic.prob
topic_dist.sort(key=take_elem, reverse=True)
if topic_dist is None:
topic_dist = []
return topic_dist
def dense_topic_dist(self):
"""Return the distribution of document topics in dense format,
taking into account the prior parameter alpha.
"""
dense_dist = np.zeros(self._num_topics)
if self.size() == 0:
return dense_dist
dense_dist = (
self._accum_topic_sum * 1.0 / self._num_accum + self._alpha) / (
self.size() + self._alpha * self._num_topics)
return dense_dist
def accumulate_topic_num(self):
self._accum_topic_sum += self._topic_sum
self._num_accum += 1
class SLDADoc(LDADoc):
"""Sentence LDA Document, inherited from LDADoc.
Add add_sentence interface.
"""
def __init__(self):
super().__init__()
self.__sentences = None
def init(self, num_topics):
"""Initialize the SLDADoc according to num_topics.
"""
self._num_topics = num_topics
self.__sentences = []
self._num_accum = 0
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_sentence(self, sent):
"""Add new sentence to current SLDADoc.
Arg:
sent: Sentence class object.
"""
assert sent.topic >= 0, "Topic %d out of range!" % (sent.topic)
assert sent.topic < self._num_topics, "Topic %d out of range!" % (
sent.topic)
self.__sentences.append(sent)
self._topic_sum[sent.topic] += 1
def set_topic(self, index, new_topic):
assert new_topic >= 0, "Topic %d out of range!" % (new_topic)
assert new_topic < self._num_topics, "Topic %d out of range!" % (
new_topic)
old_topic = self.__sentences[index].topic
if new_topic == old_topic:
return
self.__sentences[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def size(self):
"""Return number of sentences in SLDADoc.
"""
return len(self.__sentences)
def sent(self, index):
return self.__sentences[index]
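The document classes above are plain containers with explicit bookkeeping. Here is a minimal sketch of how an LDADoc accumulates topic counts across sampling rounds, using only the classes defined above; the word ids, topic ids, and alpha value are illustrative.

``` python
from slda_news.document import LDADoc, Token

doc = LDADoc()
doc.init(num_topics=5)
doc.set_alpha(0.1)
doc.add_token(Token(topic=2, id=7))    # word 7 initially assigned to topic 2
doc.add_token(Token(topic=4, id=11))   # word 11 initially assigned to topic 4
doc.accumulate_topic_num()             # fold one sampling round into the accumulator
print([(t.tid, round(t.prob, 2)) for t in doc.sparse_topic_dist()])  # [(2, 0.5), (4, 0.5)]
print(doc.dense_topic_dist())          # dense distribution smoothed by alpha
```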
import os
from paddlehub.common.logger import logger
from slda_news.config import ModelConfig
from slda_news.util import load_prototxt, fix_random_seed, rand_k
from slda_news.model import TopicModel
from slda_news.sampler import GibbsSampler, MHSampler
from slda_news.document import LDADoc, SLDADoc, Token, Sentence
from slda_news.vocab import OOV
class SamplerType:
GibbsSampling = 0
MetropolisHastings = 1
class InferenceEngine(object):
def __init__(self,
model_dir,
conf_file,
type=SamplerType.MetropolisHastings):
# Read model configuration.
config = ModelConfig()
conf_file_path = os.path.join(model_dir, conf_file)
load_prototxt(conf_file_path, config)
self.__model = TopicModel(model_dir, config)
self.__config = config
# Initialize the sampler according to the configuration.
if type == SamplerType.GibbsSampling:
self.__sampler = GibbsSampler(self.__model)
elif type == SamplerType.MetropolisHastings:
self.__sampler = MHSampler(self.__model)
def infer(self, input, doc):
"""Perform LDA topic inference on input, and store the results in doc.
Args:
input: a list of strings after tokenization.
doc: LDADoc type or SLDADoc type.
"""
fix_random_seed()
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for token in input:
id_ = self.__model.term_id(token)
if id_ != OOV:
init_topic = rand_k(self.__model.num_topics())
doc.add_token(Token(init_topic, id_))
self.lda_infer(doc, 20, 50)
elif isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for sent in input:
words = []
for token in sent:
id_ = self.__model.term_id(token)
if id_ != OOV:
words.append(id_)
init_topic = rand_k(self.__model.num_topics())
doc.add_sentence(Sentence(init_topic, words))
self.slda_infer(doc, 20, 50)
else:
logger.error("Wrong Doc Type!")
def lda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def slda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def model_type(self):
return self.__model.type()
def get_model(self):
return self.__model
def get_config(self):
return self.__config
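A minimal sketch of driving the InferenceEngine directly on pre-tokenized input follows. The model directory, config file name, and tokens are illustrative; for an SLDA model the input is a list of sentences, each a list of tokens, as the infer docstring describes.

``` python
from slda_news.document import SLDADoc
from slda_news.inference_engine import InferenceEngine

engine = InferenceEngine(model_dir="news", conf_file="slda.conf")
doc = SLDADoc()
# Two pseudo-sentences of tokens; out-of-vocabulary tokens are silently skipped.
engine.infer([["百度", "搜索"], ["新闻", "资讯"]], doc)  # 50 iterations, accumulated after 20 burn-in rounds
for topic in doc.sparse_topic_dist():
    print(topic.tid, topic.prob)
```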
import os
from collections import OrderedDict
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from slda_news.vocab import Vocab, WordCount
class TopicModel(object):
"""Storage Structure of Topic model, including vocabulary and word topic count.
"""
def __init__(self, model_dir, config):
"""
Args:
model_dir: the path of model directory
config: ModelConfig class.
"""
self.__word_topic = None # Model parameter of word topic.
self.__vocab = Vocab() # Vocab data structure of model.
self.__num_topics = config.num_topics # Number of topics.
self.__alpha = config.alpha
self.__alpha_sum = self.__alpha * self.__num_topics
self.__beta = config.beta
self.__beta_sum = None
self.__type = config.type # Model type.
self.__topic_sum = np.zeros(
self.__num_topics,
dtype="int64") # Accum sum of each topic in word topic.
self.__topic_words = [[] for _ in range(self.__num_topics)]
word_topic_path = os.path.join(model_dir, config.word_topic_file)
vocab_path = os.path.join(model_dir, config.vocab_file)
self.load_model(word_topic_path, vocab_path)
def term_id(self, term):
return self.__vocab.get_id(term)
def load_model(self, word_topic_path, vocab_path):
# Loading vocabulary
self.__vocab.load(vocab_path)
self.__beta_sum = self.__beta * self.__vocab.size()
        self.__word_topic = [{} for _ in range(self.__vocab.size())] # List of per-word topic-count dicts.
self.__load_word_dict(word_topic_path)
logger.info(
"Model Info: #num_topics=%d #vocab_size=%d alpha=%f beta=%f" %
(self.num_topics(), self.vocab_size(), self.alpha(), self.beta()))
def word_topic_value(self, word_id, topic_id):
"""Return value of specific word under specific topic in the model.
"""
word_dict = self.__word_topic[word_id]
if topic_id not in word_dict:
return 0
return word_dict[topic_id]
def word_topic(self, term_id):
"""Return the topic distribution of a word.
"""
return self.__word_topic[term_id]
def topic_sum_value(self, topic_id):
return self.__topic_sum[topic_id]
def topic_sum(self):
return self.__topic_sum
def num_topics(self):
return self.__num_topics
def vocab_size(self):
return self.__vocab.size()
def alpha(self):
return self.__alpha
def alpha_sum(self):
return self.__alpha_sum
def beta(self):
return self.__beta
def beta_sum(self):
return self.__beta_sum
def type(self):
return self.__type
def __load_word_dict(self, word_dict_path):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
with open(word_dict_path, 'r') as f:
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
term_id = int(fields[0])
assert term_id < self.vocab_size(), "Term id out of range!"
assert term_id >= 0, "Term id out of range!"
for i in range(1, len(fields)):
topic_count = fields[i].split(":")
assert len(topic_count) == 2, "Topic count format error!"
topic_id = int(topic_count[0])
assert topic_id >= 0, "Topic out of range!"
assert topic_id < self.__num_topics, "Topic out of range!"
count = int(topic_count[1])
assert count >= 0, "Topic count error!"
self.__word_topic[term_id][topic_id] = count
self.__topic_sum[topic_id] += count
self.__topic_words[topic_id].append(
WordCount(term_id, count))
new_dict = OrderedDict()
for key in sorted(self.__word_topic[term_id]):
new_dict[key] = self.__word_topic[term_id][key]
self.__word_topic[term_id] = new_dict
def get_vocab(self):
return self.__vocab.vocabulary()
def topic_words(self):
return self.__topic_words
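__load_word_dict fixes the on-disk format of the word-topic file: one term per line, followed by space-separated topic_id:count pairs. A minimal sketch of that parsing step on an illustrative line (the ids and counts are made up):

``` python
line = "7 12:3 95:1"    # term 7 occurs 3 times under topic 12 and once under topic 95
fields = line.strip().split(" ")
term_id = int(fields[0])
topic_counts = {int(t): int(c) for t, c in (pair.split(":") for pair in fields[1:])}
# topic_counts == {12: 3, 95: 1}; these counts back word_topic_value() and are
# summed per topic to give the denominators used when ranking topic keywords.
```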
import os
import paddlehub as hub
from paddlehub.module.module import moduleinfo
from paddlehub.common.logger import logger
from slda_news.inference_engine import InferenceEngine
from slda_news.document import SLDADoc
from slda_news.semantic_matching import SemanticMatching, WordAndDis
from slda_news.tokenizer import LACTokenizer, SimpleTokenizer
from slda_news.config import ModelType
from slda_news.vocab import Vocab, WordCount
@moduleinfo(
name="slda_news",
version="1.0.0",
summary=
"This is a PaddleHub Module for SLDA topic model in news dataset, where we can infer the topic distribution of document.",
author="DesmonDay",
author_email="",
type="nlp/semantic_model")
class TopicModel(hub.Module):
def _initialize(self):
"""Initialize with the necessary elements.
"""
self.model_dir = os.path.join(self.directory, 'news')
self.conf_file = 'slda.conf'
self.__engine = InferenceEngine(self.model_dir, self.conf_file)
self.vocab_path = os.path.join(self.model_dir, 'vocab_info.txt')
lac = hub.Module(name="lac")
# self.__tokenizer = SimpleTokenizer(self.vocab_path)
self.__tokenizer = LACTokenizer(self.vocab_path, lac)
self.vocabulary = self.__engine.get_model().get_vocab()
self.config = self.__engine.get_config()
self.topic_words = self.__engine.get_model().topic_words()
self.topic_sum_table = self.__engine.get_model().topic_sum()
def take_elem(word_count):
return word_count.count
for i in range(self.config.num_topics):
self.topic_words[i].sort(key=take_elem, reverse=True)
logger.info("Finish Initialization.")
def infer_doc_topic_distribution(self, document):
"""
This interface infers the topic distribution of document.
Args:
document(str): the input document text.
Returns:
results(list): returns the topic distribution of document.
"""
tokens = self.__tokenizer.tokenize(document)
if tokens == []:
return []
results = []
sentences = []
sent = []
for i in range(len(tokens)):
sent.append(tokens[i])
if len(sent) % 5 == 0:
sentences.append(sent)
sent = []
if len(sent) > 0:
sentences.append(sent)
doc = SLDADoc()
self.__engine.infer(sentences, doc)
topics = doc.sparse_topic_dist()
for topic in topics:
results.append({"topic id": topic.tid, "distribution": topic.prob})
return results
def show_topic_keywords(self, topic_id, k=10):
"""
        This interface returns the top k keywords under a specific topic.
        Args:
            topic_id(int): id of the topic to inspect.
k(int): top k keywords.
Returns:
results(dict): contains specific topic's keywords and corresponding
probability.
"""
EPS = 1e-8
results = {}
if 0 <= topic_id < self.config.num_topics:
k = min(k, len(self.topic_words[topic_id]))
for i in range(k):
prob = self.topic_words[topic_id][i].count / \
(self.topic_sum_table[topic_id] + EPS)
results[self.vocabulary[self.topic_words[topic_id]
[i].word_id]] = prob
return results
        else:
            logger.error("%d is out of range!" % topic_id)
            return results
import os
import numpy as np
from paddlehub.common.logger import logger
from slda_news.vocab import OOV
EPS = 1e-06
class WordAndDis(object):
def __init__(self):
self.word = None
self.distance = None
class SemanticMatching(object):
def __init__(self):
pass
def l2_norm(self, vec):
"""Calculate the length of vector.
"""
result = np.sqrt(np.sum(vec**2))
return result
def cosine_similarity(self, vec1, vec2):
norm1 = self.l2_norm(vec1)
norm2 = self.l2_norm(vec2)
result = np.sum(vec1 * vec2) / norm1 / norm2
return result
def likelihood_based_similarity(self, terms, doc_topic_dist, model):
"""
Args:
terms: list of strings
doc_topic_dist: list of Topic class
model: TopicModel class
"""
num_of_term_in_vocab = 0
result = 0
for i in range(len(terms)):
term_id = model.term_id(terms[i])
if term_id == OOV:
continue
num_of_term_in_vocab += 1
for j in range(len(doc_topic_dist)):
topic_id = doc_topic_dist[j].tid
prob = doc_topic_dist[j].prob
result += model.word_topic_value(term_id, topic_id) * 1.0 / \
model.topic_sum_value(topic_id) * prob
if num_of_term_in_vocab == 0:
return result
return result / num_of_term_in_vocab
def kullback_leibler_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist2[dist2 < EPS] = EPS
result = np.sum(dist1 * np.log(dist1 / dist2))
return result
def jensen_shannon_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist1[dist1 < EPS] = EPS
dist2[dist2 < EPS] = EPS
mean = (dist1 + dist2) * 0.5
jsd = self.kullback_leibler_divergence(dist1, mean) * 0.5 + \
self.kullback_leibler_divergence(dist2, mean) * 0.5
return jsd
def hellinger_distance(self, dist1, dist2):
assert dist1.shape == dist2.shape
result = np.sum((np.sqrt(dist1) - np.sqrt(dist2))**2)
result = np.sqrt(result) * 0.7071067812
return result
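A minimal sketch of the distance helpers above on two toy distributions. Note that jensen_shannon_divergence clips its inputs in place, so copies are passed here to leave the originals untouched.

``` python
import numpy as np

from slda_news.semantic_matching import SemanticMatching

sm = SemanticMatching()
p = np.array([0.7, 0.2, 0.1])
q = np.array([0.5, 0.3, 0.2])
print(sm.cosine_similarity(p, q))
print(sm.jensen_shannon_divergence(p.copy(), q.copy()))  # symmetric, at most ln(2)
print(sm.hellinger_distance(p, q))                       # bounded by 1
```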
import time
import yaml
import numpy as np
from paddlehub.common.logger import logger
from slda_news.config import ModelType
def load_prototxt(config_file, config):
"""
Args:
config_file: model configuration file.
config: ModelConfig class
"""
logger.info("Loading SLDA config.")
with open(config_file, 'r') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
if yaml_dict["type"] == "LDA":
config.type = ModelType.LDA
else:
config.type = ModelType.SLDA
config.num_topics = yaml_dict["num_topics"]
config.alpha = yaml_dict["alpha"]
config.beta = yaml_dict["beta"]
config.word_topic_file = yaml_dict["word_topic_file"]
config.vocab_file = yaml_dict["vocab_file"]
def fix_random_seed(seed=2147483647):
np.random.seed(seed)
def rand(min_=0, max_=1):
return np.random.uniform(low=min_, high=max_)
def rand_k(k):
"""Returns an integer float number between [0, k - 1].
"""
return int(rand() * k)
def timeit(f):
"""Return time cost of function f.
"""
def timed(*args, **kwargs):
start_time = time.time()
result = f(*args, **kwargs)
end_time = time.time()
print(" [-] %s : %2.5f sec" % (f.__name__, end_time - start_time))
return result
return timed
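Despite its name, load_prototxt reads the config file with yaml.load, so the config is plain YAML. A minimal sketch of a file it could parse; the field names match the keys accessed above, while the values are illustrative only.

``` python
import yaml

example_conf = """
type: "SLDA"
num_topics: 2000
alpha: 0.05
beta: 0.01
word_topic_file: "word_topic.model"
vocab_file: "vocab_info.txt"
"""
# Parsed exactly as load_prototxt does before copying fields into ModelConfig.
print(yaml.load(example_conf, Loader=yaml.FullLoader))
```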
"""
This file defines the basic config information of LDA/SLDA model.
"""
class ModelType:
LDA = 0
SLDA = 1
class ModelConfig:
type = None
num_topics = None
alpha = None
beta = None
word_topic_file = None
vocab_file = None