Unverified commit 9b70b1f3, authored by SiMing Dai, committed by GitHub

Add topic model (#759)

Parent 3eece7b5
......@@ -48,10 +48,7 @@ class ChineseTextDetectionDB(hub.Module):
try:
import shapely, pyclipper
except:
print(
'This module requires the shapely, pyclipper tools. The running enviroment does not meet the requirments. Please install the two packages.'
)
exit()
raise ImportError('This module requires the shapely, pyclipper tools. The running environment does not meet the requirements. Please install the two packages.')
def _set_config(self):
"""
......
......@@ -48,10 +48,7 @@ class ChineseTextDetectionDBServer(hub.Module):
try:
import shapely, pyclipper
except:
print(
'This module requires the shapely, pyclipper tools. The running enviroment does not meet the requirments. Please install the two packages.'
)
exit()
raise ImportError('This module requires the shapely, pyclipper tools. The running environment does not meet the requirements. Please install the two packages.')
def _set_config(self):
"""
......
## Model Overview
A topic model is a statistical model that clusters the latent semantic structure of documents in an unsupervised way; LDA (Latent Dirichlet Allocation) is one such algorithm. By analyzing word co-occurrence information, LDA fits a word-document-topic distribution and thereby maps words and texts into a common semantic space. This Module is trained on a news-domain dataset built by Baidu.
<p align="center">
<img src="https://bj.bcebos.com/paddlehub/model/nlp/semantic_model/lda.png" hspace='10'/> <br />
</p>
For more details, please refer to the [LDA paper](http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf).
Note: this Module is contributed by the third-party developer DesmonDay.
## LDA Model API
### cal_doc_distance(doc_text1, doc_text2)
Calculates the distance between two input documents, including the Jensen-Shannon divergence and the Hellinger distance.
**Parameters**
- doc_text1(str): the first input document.
- doc_text2(str): the second input document.
**Returns**
- jsd(float): the [Jensen-Shannon divergence](https://blog.csdn.net/FrankieHello/article/details/80614422?utm_source=copy) between the two documents.
- hd(float): the [Hellinger distance](http://blog.sina.com.cn/s/blog_85f1ffb70101e65d.html) between the two documents.
### cal_doc_keywords_similarity(document, top_k=10)
Finds the top k keywords of the input document together with their similarity to the original document.
**Parameters**
- document(str): the input document.
- top_k(int): the number of keywords to extract from the input document.
**Returns**
- results(list): the keywords and their similarity to the original document. Each element of the list is a dict whose key is a keyword and whose value is that keyword's similarity to the original document.
### cal_query_doc_similarity(query, document)
Calculates the similarity between a short document (query) and a long document.
**Parameters**
- query(str): the input short document.
- document(str): the input long document.
**Returns**
- lda_sim(float): the similarity between the short document and the long document.
### infer_doc_topic_distribution(document)
Infers the topic distribution of a document.
**Parameters**
- document(str): the input document.
**Returns**
- results(list): the topic IDs and corresponding probabilities of the inferred topic distribution. Each element of the list is a dict whose key is a topic ID and whose value is that topic's probability.
### show_topic_keywords(topic_id, k=10)
Shows the keywords under a given topic; it can be used together with the topic-distribution inference API.
**Parameters**
- topic_id(int): the topic ID.
- k(int): the number of top keywords to show for the topic.
**Returns**
- results(dict): the top k keywords under the given topic and each keyword's probability under that topic.
### Code Example
Usage examples for some of the APIs are shown below.
``` python
import paddlehub as hub
lda_news = hub.Module(name="lda_news")
jsd, hd = lda_news.cal_doc_distance(doc_text1="今天的天气如何,适合出去游玩吗", doc_text2="感觉今天的天气不错,可以出去玩一玩了")
# jsd = 0.003109, hd = 0.0573171
lda_sim = lda_news.cal_query_doc_similarity(query='百度搜索引擎', document='百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。百度超过千亿的中文网页数据库,可以瞬间找到相关的搜索结果。')
# LDA similarity = 0.06826
results = lda_news.cal_doc_keywords_similarity('百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。百度超过千亿的中文网页数据库,可以瞬间找到相关的搜索结果。')
# [{'word': '百度', 'similarity': 0.12943492762349573},
# {'word': '信息', 'similarity': 0.06139783578769882},
# {'word': '找到', 'similarity': 0.055296603463188265},
# {'word': '搜索', 'similarity': 0.04270794098349327},
# {'word': '全球', 'similarity': 0.03773627056367886},
# {'word': '超过', 'similarity': 0.03478658388202199},
# {'word': '相关', 'similarity': 0.026295857219683725},
# {'word': '获取', 'similarity': 0.021313585287833996},
# {'word': '中文', 'similarity': 0.020187103312009513},
# {'word': '搜索引擎', 'similarity': 0.007092890537169911}]
```
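The topic-distribution and topic-keyword APIs follow the same pattern. Below is a minimal, illustrative sketch; the concrete numbers depend on the trained news-domain model, so no outputs are shown.
``` python
import paddlehub as hub
lda_news = hub.Module(name="lda_news")
# Infer the document's topic distribution: a list of {"topic id": ..., "distribution": ...}
# dicts, sorted by probability in descending order.
topic_dist = lda_news.infer_doc_topic_distribution("百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。")
if topic_dist:
    # Look up the top keywords of the most probable topic; the result is a dict
    # mapping each keyword to its probability under that topic.
    top_topic_id = topic_dist[0]["topic id"]
    keywords = lda_news.show_topic_keywords(topic_id=top_topic_id, k=10)
```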
## Source Code
https://github.com/baidu/Familia
## Dependencies
paddlepaddle >= 1.8.2
paddlehub >= 1.8.0
## Release History
* 1.0.0
  First release
"""
This file defines the basic config information of LDA/SLDA model.
"""
class ModelType:
LDA = 0
SLDA = 1
class ModelConfig:
type = None
num_topics = None
alpha = None
beta = None
word_topic_file = None
vocab_file = None
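    # Note: these fields are populated at runtime by util.load_prototxt, which
    # reads the module's YAML configuration file (lda.conf) and expects the keys
    # type, num_topics, alpha, beta, word_topic_file and vocab_file.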
import numpy as np
class Topic(object):
"""Basic data structure of topic, contains topic id and
corresponding probability.
"""
def __init__(self, tid, prob):
self.tid = tid # topic id
self.prob = prob # topic probability
class Token(object):
"""Basic storage unit of LDA documents, contains word id
and corresponding topic.
"""
def __init__(self, topic, id):
self.topic = topic
self.id = id
class Sentence(object):
"""Basic storage unit of SentenceLDA documents, contains word ids
of the sentence and its corresponding topic id.
"""
def __init__(self, topic, tokens):
self.topic = topic
self.tokens = tokens
class LDADoc(object):
"""The storage structure of LDA model's inference result.
"""
def __init__(self):
self._num_topics = None # Number of topics.
self._num_accum = None # Number of accumulated sample rounds.
self._alpha = None # Document prior parameter.
self._tokens = None # Storage structure of inference results.
self._topic_sum = None # Document's topic sum in one round samples.
self._accum_topic_sum = None # Accumulated results of topic sum.
def init(self, num_topics):
"""Initialize the LDADoc according to num_topics.
"""
self._num_topics = num_topics
self._num_accum = 0
self._tokens = []
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_token(self, token):
"""Add new word to current LDADoc.
Arg:
token: Token class object.
"""
assert token.topic >= 0, "Topic %d out of range!" % token.topic
assert token.topic < self._num_topics, "Topic %d out of range!" % token.topic
self._tokens.append(token)
self._topic_sum[token.topic] += 1
def token(self, index):
return self._tokens[index]
def set_topic(self, index, new_topic):
"""Set the index word's topic to new_topic, and update the corresponding
topic distribution.
"""
assert new_topic >= 0, "Topic %d out of range!" % new_topic
assert new_topic < self._num_topics, "Topic %d out of range!" % new_topic
old_topic = self._tokens[index].topic
if new_topic == old_topic:
return
self._tokens[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def set_alpha(self, alpha):
self._alpha = alpha
def size(self):
"""Return number of words in LDADoc.
"""
return len(self._tokens)
def topic_sum(self, topic_id):
return self._topic_sum[topic_id]
def sparse_topic_dist(self, sort=True):
"""Return the topic distribution of documents in sparse format.
        By default, it is sorted in descending order of topic probability.
"""
topic_dist = []
sum_ = np.sum(self._accum_topic_sum)
if sum_ == 0:
            return topic_dist
for i in range(0, self._num_topics):
if self._accum_topic_sum[i] == 0:
continue
topic_dist.append(Topic(i, self._accum_topic_sum[i] * 1.0 / sum_))
if sort:
def take_elem(topic):
return topic.prob
topic_dist.sort(key=take_elem, reverse=True)
if topic_dist is None:
topic_dist = []
return topic_dist
def dense_topic_dist(self):
"""Return the distribution of document topics in dense format,
taking into account the prior parameter alpha.
"""
dense_dist = np.zeros(self._num_topics)
if self.size() == 0:
return dense_dist
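        # Posterior-mean estimate with Dirichlet smoothing:
        #   theta_k = (accum_topic_sum_k / num_accum + alpha) / (doc_len + alpha * num_topics),
        # where accum_topic_sum_k / num_accum is the topic count averaged over sampling rounds.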
dense_dist = (
self._accum_topic_sum * 1.0 / self._num_accum + self._alpha) / (
self.size() + self._alpha * self._num_topics)
return dense_dist
def accumulate_topic_num(self):
self._accum_topic_sum += self._topic_sum
self._num_accum += 1
class SLDADoc(LDADoc):
"""Sentence LDA Document, inherited from LDADoc.
Add add_sentence interface.
"""
def __init__(self):
super().__init__()
self.__sentences = None
def init(self, num_topics):
"""Initialize the SLDADoc according to num_topics.
"""
self._num_topics = num_topics
self.__sentences = []
self._num_accum = 0
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_sentence(self, sent):
"""Add new sentence to current SLDADoc.
Arg:
sent: Sentence class object.
"""
assert sent.topic >= 0, "Topic %d out of range!" % (sent.topic)
assert sent.topic < self._num_topics, "Topic %d out of range!" % (
sent.topic)
self.__sentences.append(sent)
self._topic_sum[sent.topic] += 1
def set_topic(self, index, new_topic):
assert new_topic >= 0, "Topic %d out of range!" % (new_topic)
assert new_topic < self._num_topics, "Topic %d out of range!" % (
new_topic)
old_topic = self.__sentences[index].topic
if new_topic == old_topic:
return
self.__sentences[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def size(self):
"""Return number of sentences in SLDADoc.
"""
return len(self.__sentences)
def sent(self, index):
return self.__sentences[index]
import os
from paddlehub.common.logger import logger
from lda_news.config import ModelConfig
from lda_news.util import load_prototxt, fix_random_seed, rand_k
from lda_news.model import TopicModel
from lda_news.sampler import GibbsSampler, MHSampler
from lda_news.document import LDADoc, SLDADoc, Token, Sentence
from lda_news.vocab import OOV
class SamplerType:
GibbsSampling = 0
MetropolisHastings = 1
class InferenceEngine(object):
def __init__(self,
model_dir,
conf_file,
type=SamplerType.MetropolisHastings):
# Read model configuration.
config = ModelConfig()
conf_file_path = os.path.join(model_dir, conf_file)
load_prototxt(conf_file_path, config)
self.__model = TopicModel(model_dir, config)
self.__config = config
# Initialize the sampler according to the configuration.
if type == SamplerType.GibbsSampling:
self.__sampler = GibbsSampler(self.__model)
elif type == SamplerType.MetropolisHastings:
self.__sampler = MHSampler(self.__model)
def infer(self, input, doc):
"""Perform LDA topic inference on input, and store the results in doc.
Args:
input: a list of strings after tokenization.
doc: LDADoc type or SLDADoc type.
"""
fix_random_seed()
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for token in input:
id_ = self.__model.term_id(token)
if id_ != OOV:
init_topic = rand_k(self.__model.num_topics())
doc.add_token(Token(init_topic, id_))
self.lda_infer(doc, 20, 50)
elif isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for sent in input:
words = []
for token in sent:
id_ = self.__model.term_id(token)
if id_ != OOV:
words.append(id_)
init_topic = rand_k(self.__model.num_topics())
doc.add_sentence(Sentence(init_topic, words))
self.slda_infer(doc, 20, 50)
else:
logger.error("Wrong Doc Type!")
def lda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def slda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def model_type(self):
return self.__model.type()
def get_model(self):
return self.__model
def get_config(self):
return self.__config
import os
from collections import OrderedDict
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_news.vocab import Vocab, WordCount
class TopicModel(object):
"""Storage Structure of Topic model, including vocabulary and word topic count.
"""
def __init__(self, model_dir, config):
"""
Args:
model_dir: the path of model directory
config: ModelConfig class.
"""
self.__word_topic = None # Model parameter of word topic.
self.__vocab = Vocab() # Vocab data structure of model.
self.__num_topics = config.num_topics # Number of topics.
self.__alpha = config.alpha
self.__alpha_sum = self.__alpha * self.__num_topics
self.__beta = config.beta
self.__beta_sum = None
self.__type = config.type # Model type.
self.__topic_sum = np.zeros(
self.__num_topics,
dtype="int64") # Accum sum of each topic in word topic.
self.__topic_words = [[] for _ in range(self.__num_topics)]
word_topic_path = os.path.join(model_dir, config.word_topic_file)
vocab_path = os.path.join(model_dir, config.vocab_file)
self.load_model(word_topic_path, vocab_path)
def term_id(self, term):
return self.__vocab.get_id(term)
def load_model(self, word_topic_path, vocab_path):
# Loading vocabulary
self.__vocab.load(vocab_path)
self.__beta_sum = self.__beta * self.__vocab.size()
        self.__word_topic = [{} for _ in range(self.__vocab.size())]  # List of dicts, one per word.
self.__load_word_dict(word_topic_path)
logger.info(
"Model Info: #num_topics=%d #vocab_size=%d alpha=%f beta=%f" %
(self.num_topics(), self.vocab_size(), self.alpha(), self.beta()))
def word_topic_value(self, word_id, topic_id):
"""Return value of specific word under specific topic in the model.
"""
word_dict = self.__word_topic[word_id]
if topic_id not in word_dict:
return 0
return word_dict[topic_id]
def word_topic(self, term_id):
"""Return the topic distribution of a word.
"""
return self.__word_topic[term_id]
def topic_sum_value(self, topic_id):
return self.__topic_sum[topic_id]
def topic_sum(self):
return self.__topic_sum
def num_topics(self):
return self.__num_topics
def vocab_size(self):
return self.__vocab.size()
def alpha(self):
return self.__alpha
def alpha_sum(self):
return self.__alpha_sum
def beta(self):
return self.__beta
def beta_sum(self):
return self.__beta_sum
def type(self):
return self.__type
def __load_word_dict(self, word_dict_path):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
with open(word_dict_path, 'r') as f:
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
term_id = int(fields[0])
assert term_id < self.vocab_size(), "Term id out of range!"
assert term_id >= 0, "Term id out of range!"
for i in range(1, len(fields)):
topic_count = fields[i].split(":")
assert len(topic_count) == 2, "Topic count format error!"
topic_id = int(topic_count[0])
assert topic_id >= 0, "Topic out of range!"
assert topic_id < self.__num_topics, "Topic out of range!"
count = int(topic_count[1])
assert count >= 0, "Topic count error!"
self.__word_topic[term_id][topic_id] = count
self.__topic_sum[topic_id] += count
self.__topic_words[topic_id].append(
WordCount(term_id, count))
new_dict = OrderedDict()
for key in sorted(self.__word_topic[term_id]):
new_dict[key] = self.__word_topic[term_id][key]
self.__word_topic[term_id] = new_dict
def get_vocab(self):
return self.__vocab.vocabulary()
def topic_words(self):
return self.__topic_words
import os
import paddlehub as hub
from paddlehub.module.module import moduleinfo
from paddlehub.common.logger import logger
from lda_news.inference_engine import InferenceEngine
from lda_news.document import LDADoc, SLDADoc
from lda_news.semantic_matching import SemanticMatching, WordAndDis
from lda_news.tokenizer import LACTokenizer, SimpleTokenizer
from lda_news.config import ModelType
from lda_news.vocab import Vocab, WordCount
@moduleinfo(
name="lda_news",
version="1.0.0",
summary=
"This is a PaddleHub Module for LDA topic model in news dataset, where we can calculate doc distance, calculate the similarity between query and document, etc",
author="DesmonDay",
author_email="",
type="nlp/semantic_model")
class TopicModel(hub.Module):
def _initialize(self):
"""
Initialize with the necessary elements.
"""
self.model_dir = os.path.join(self.directory, 'news')
self.conf_file = 'lda.conf'
self.__engine = InferenceEngine(self.model_dir, self.conf_file)
self.vocab_path = os.path.join(self.model_dir, 'vocab_info.txt')
lac = hub.Module(name="lac")
# self.__tokenizer = SimpleTokenizer(self.vocab_path)
self.__tokenizer = LACTokenizer(self.vocab_path, lac)
self.vocabulary = self.__engine.get_model().get_vocab()
self.config = self.__engine.get_config()
self.topic_words = self.__engine.get_model().topic_words()
self.topic_sum_table = self.__engine.get_model().topic_sum()
def take_elem(word_count):
return word_count.count
for i in range(self.config.num_topics):
self.topic_words[i].sort(key=take_elem, reverse=True)
logger.info("Finish initialization.")
def cal_doc_distance(self, doc_text1, doc_text2):
"""
This interface calculates the distance between documents.
Args:
doc_text1(str): the input document text 1.
doc_text2(str): the input document text 2.
Returns:
jsd(float): Jensen-Shannon Divergence distance of two documents.
hd(float): Hellinger Distance of two documents.
"""
doc1_tokens = self.__tokenizer.tokenize(doc_text1)
doc2_tokens = self.__tokenizer.tokenize(doc_text2)
# Document topic inference.
doc1, doc2 = LDADoc(), LDADoc()
self.__engine.infer(doc1_tokens, doc1)
self.__engine.infer(doc2_tokens, doc2)
# To calculate jsd, we need dense document topic distribution.
dense_dict1 = doc1.dense_topic_dist()
dense_dict2 = doc2.dense_topic_dist()
# Calculate the distance between distributions.
# The smaller the distance, the higher the document semantic similarity.
sm = SemanticMatching()
jsd = sm.jensen_shannon_divergence(dense_dict1, dense_dict2)
hd = sm.hellinger_distance(dense_dict1, dense_dict2)
return jsd, hd
def cal_doc_keywords_similarity(self, document, top_k=10):
"""
This interface can be used to find top k keywords of document.
Args:
document(str): the input document text.
top_k(int): top k keywords of this document.
Returns:
results(list): contains top_k keywords and their corresponding
similarity compared to document.
"""
d_tokens = self.__tokenizer.tokenize(document)
# Do topic inference on documents to obtain topic distribution.
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
items = []
words = set()
for word in d_tokens:
if word in words:
continue
words.add(word)
wd = WordAndDis()
wd.word = word
sm = SemanticMatching()
wd.distance = sm.likelihood_based_similarity(
terms=[word],
doc_topic_dist=doc_topic_dist,
model=self.__engine.get_model())
items.append(wd)
def take_elem(word_dis):
return word_dis.distance
items.sort(key=take_elem, reverse=True)
results = []
size = len(items)
for i in range(top_k):
if i >= size:
break
results.append({
"word": items[i].word,
"similarity": items[i].distance
})
return results
def cal_query_doc_similarity(self, query, document):
"""
This interface calculates the similarity between query and document.
Args:
query(str): the input query text.
document(str): the input document text.
Returns:
lda_sim(float): likelihood based similarity between query and document
based on LDA.
"""
q_tokens = self.__tokenizer.tokenize(query)
d_tokens = self.__tokenizer.tokenize(document)
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
sm = SemanticMatching()
lda_sim = sm.likelihood_based_similarity(q_tokens, doc_topic_dist,
self.__engine.get_model())
return lda_sim
def infer_doc_topic_distribution(self, document):
"""
This interface infers the topic distribution of document.
Args:
document(str): the input document text.
Returns:
results(list): returns the topic distribution of document.
"""
tokens = self.__tokenizer.tokenize(document)
if tokens == []:
return []
results = []
doc = LDADoc()
self.__engine.infer(tokens, doc)
topics = doc.sparse_topic_dist()
for topic in topics:
results.append({"topic id": topic.tid, "distribution": topic.prob})
return results
def show_topic_keywords(self, topic_id, k=10):
"""
        This interface returns the top k keywords under the given topic.
Args:
topic_id(int): topic information we want to know.
k(int): top k keywords.
Returns:
results(dict): contains specific topic's keywords and corresponding
probability.
"""
EPS = 1e-8
results = {}
if 0 <= topic_id < self.config.num_topics:
k = min(k, len(self.topic_words[topic_id]))
for i in range(k):
prob = self.topic_words[topic_id][i].count / \
(self.topic_sum_table[topic_id] + EPS)
results[self.vocabulary[self.topic_words[topic_id]
[i].word_id]] = prob
return results
else:
logger.error("%d is out of range!" % topic_id)
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_news.document import LDADoc, SLDADoc, Token, Sentence
from lda_news.vose_alias import VoseAlias
from lda_news.util import rand, rand_k
class Sampler(object):
def __init__(self):
pass
def sample_doc(self, doc):
"""Sample LDA or SLDA topics for documents.
"""
raise NotImplementedError
class MHSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
self.__topic_indexes = None
self.__alias_tables = None
self.__prob_sum = None
self.__beta_alias = VoseAlias()
self.__beta_prior_sum = None
self.__mh_steps = 2
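        # Each Metropolis-Hastings step draws a doc-proposal followed by a
        # word-proposal (see __sample_token / __sample_sentence below).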
self.__construct_alias_table()
def __construct_alias_table(self):
"""Construct alias table for all words.
"""
logger.info("Construct alias table for alias sampling method.")
vocab_size = self.__model.vocab_size()
self.__topic_indexes = [[] for _ in range(vocab_size)]
self.__alias_tables = [VoseAlias() for _ in range(vocab_size)]
self.__prob_sum = np.zeros(vocab_size)
# Construct each word's alias table (prior is not included).
for i in tqdm(range(vocab_size)):
dist = []
prob_sum = 0
for key in self.__model.word_topic(i):
topic_id = key
word_topic_count = self.__model.word_topic(i)[key]
topic_sum = self.__model.topic_sum_value(topic_id)
self.__topic_indexes[i].append(topic_id)
q = word_topic_count / (topic_sum + self.__model.beta_sum())
dist.append(q)
prob_sum += q
self.__prob_sum[i] = prob_sum
if len(dist) > 0:
dist = np.array(dist, dtype=np.float)
self.__alias_tables[i].initialize(dist)
# Build prior parameter beta's alias table.
beta_dist = self.__model.beta() / (
self.__model.topic_sum() + self.__model.beta_sum())
self.__beta_prior_sum = np.sum(beta_dist)
self.__beta_alias.initialize(beta_dist)
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
new_topic = token.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, token)
new_topic = self.__word_proposal(doc, token, doc_proposed_topic)
return new_topic
def __sample_sentence(self, doc, sent):
new_topic = sent.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, sent)
new_topic = self.__word_proposal(doc, sent, doc_proposed_topic)
return new_topic
def __doc_proposal(self, doc, token):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
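            # Doc-proposal: with probability size / (size + alpha_sum), reuse the topic of
            # a token already in the document; otherwise draw a topic uniformly at random.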
if dart < doc.size():
token_index = int(dart)
new_topic = doc.token(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
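                # mask is -1 (all bits set) when the proposal is accepted and 0 otherwise,
                # so the bitwise expression below selects new_topic or old_topic without branching.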
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
if dart < doc.size():
token_index = int(dart)
new_topic = doc.sent(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __word_proposal(self, doc, token, old_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
new_topic = self.__propose(token.id)
if new_topic != old_topic:
proposal_old = self.__word_proposal_distribution(
token.id, old_topic)
proposal_new = self.__word_proposal_distribution(
token.id, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
new_topic = old_topic
for word_id in sent.tokens:
new_topic = self.__propose(word_id)
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__word_proposal_distribution(
word_id, old_topic)
proposal_new = self.__word_proposal_distribution(
word_id, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
new_topic = (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __proportional_function(self, doc, token, new_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dt_alpha = doc.topic_sum(new_topic) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(
token.id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
return dt_alpha * wt_beta / t_sum_beta_sum
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
result = doc.topic_sum(new_topic) + self.__model.alpha()
if new_topic == old_topic:
result -= 1
for word_id in sent.tokens:
wt_beta = self.__model.word_topic_value(
word_id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
wt_beta -= 1
t_sum_beta_sum -= 1
result *= wt_beta / t_sum_beta_sum
return result
else:
logger.error("Wrong input argument type!")
def __word_proposal_distribution(self, word_id, topic):
wt_beta = self.__model.word_topic_value(word_id,
topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
topic) + self.__model.beta_sum()
return wt_beta / t_sum_beta_sum
def __doc_proposal_distribution(self, doc, topic):
return doc.topic_sum(topic) + self.__model.alpha()
def __propose(self, word_id):
dart = rand() * (self.__prob_sum[word_id] + self.__beta_prior_sum)
if dart < self.__prob_sum[word_id]:
idx = self.__alias_tables[word_id].generate()
topic = self.__topic_indexes[word_id][idx]
else:
topic = self.__beta_alias.generate()
return topic
class GibbsSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
old_topic = token.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
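        # Collapsed Gibbs full conditional for the current token:
        #   p(z = k) is proportional to (n_dk + alpha) * (n_kw + beta) / (n_k + beta * V),
        # with the token's own current assignment discounted when k == old_topic.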
for i in range(num_topics):
dt_alpha = doc.topic_sum(i) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(token.id,
i) + self.__model.beta()
            t_sum_beta_sum = self.__model.topic_sum_value(i) + self.__model.beta_sum()
if i == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
prob[i] = dt_alpha * wt_beta / t_sum_beta_sum
sum_ += prob[i]
accum_prob[i] = prob[i] if i == 0 else accum_prob[i - 1] + prob[i]
dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for i in range(1, num_topics):
if accum_prob[i - 1] < dart <= accum_prob[i]:
return i
return num_topics - 1
def __sample_sentence(self, doc, sent):
old_topic = sent.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
for t in range(num_topics):
dt_alpha = doc.topic_sum(t) + self.__model.alpha()
            t_sum_beta_sum = self.__model.topic_sum_value(t) + self.__model.beta_sum()
if t == old_topic:
if dt_alpha > 1:
dt_alpha -= 1
if t_sum_beta_sum > 1:
t_sum_beta_sum -= 1
prob[t] = dt_alpha
for i in range(len(sent.tokens)):
w = sent.tokens[i]
wt_beta = self.__model.word_topic_value(
w, t) + self.__model.beta()
if t == old_topic and wt_beta > 1:
wt_beta -= 1
                # Note: if the sentence is very long, the product of many small
                # factors can underflow and lose numerical precision.
prob[t] *= wt_beta / t_sum_beta_sum
sum_ += prob[t]
accum_prob[t] = prob[t] if t == 0 else accum_prob[t - 1] + prob[t]
        dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for t in range(1, num_topics):
if accum_prob[t - 1] < dart <= accum_prob[t]:
return t
return num_topics - 1
import numpy as np
from lda_news.vocab import OOV
EPS = 1e-06
class WordAndDis(object):
def __init__(self):
self.word = None
self.distance = None
class SemanticMatching(object):
def __init__(self):
pass
def l2_norm(self, vec):
"""Calculate the length of vector.
"""
result = np.sqrt(np.sum(vec**2))
return result
def cosine_similarity(self, vec1, vec2):
"""Calculate the cosine similarity between two vectors.
"""
norm1 = self.l2_norm(vec1)
norm2 = self.l2_norm(vec2)
result = np.sum(vec1 * vec2) / norm1 / norm2
return result
def likelihood_based_similarity(self, terms, doc_topic_dist, model):
"""Calculate the likelihood based similarity.
Args:
terms: list of strings
doc_topic_dist: list of Topic class
model: TopicModel class
"""
num_of_term_in_vocab = 0
result = 0
for i in range(len(terms)):
term_id = model.term_id(terms[i])
if term_id == OOV:
continue
num_of_term_in_vocab += 1
for j in range(len(doc_topic_dist)):
topic_id = doc_topic_dist[j].tid
prob = doc_topic_dist[j].prob
result += model.word_topic_value(term_id, topic_id) * 1.0 / \
model.topic_sum_value(topic_id) * prob
if num_of_term_in_vocab == 0:
return result
return result / num_of_term_in_vocab
def kullback_leibler_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist2[dist2 < EPS] = EPS
result = np.sum(dist1 * np.log(dist1 / dist2))
return result
def jensen_shannon_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist1[dist1 < EPS] = EPS
dist2[dist2 < EPS] = EPS
mean = (dist1 + dist2) * 0.5
jsd = self.kullback_leibler_divergence(dist1, mean) * 0.5 + \
self.kullback_leibler_divergence(dist2, mean) * 0.5
return jsd
def hellinger_distance(self, dist1, dist2):
assert dist1.shape == dist2.shape
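        # Hellinger distance: (1 / sqrt(2)) * ||sqrt(dist1) - sqrt(dist2)||_2;
        # the constant 0.7071067812 below is 1 / sqrt(2).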
result = np.sum((np.sqrt(dist1) - np.sqrt(dist2))**2)
result = np.sqrt(result) * 0.7071067812
return result
"""This file defines tokenizer class object.
"""
class Tokenizer(object):
"""Base tokenizer class.
"""
def __init__(self):
pass
def tokenize(self, text):
raise NotImplementedError
class SimpleTokenizer(Tokenizer):
"""Simple version FMM(Forward Maximun Matching) word tokenizer. This tokenizer can only
be used in topic model demo, but not in real business application scenarios.
Notes: This tokenizer can only recognize the words in the corresponding vocab file.
"""
def __init__(self, vocab_path):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__load_vocab(vocab_path)
def tokenize(self, text):
"""Tokenize the input string `text`, and return the tokenize result.
"""
text_len = len(text)
result = []
i = 0
while i < text_len:
word = found_word = ""
# Deal with English characters.
if self.__is_eng_char(text[i]):
for j in range(i, text_len + 1):
if j < text_len and self.__is_eng_char(text[j]):
word += self.__tolower(text[j])
else:
# Forward matching by character granularity.
if word in self.__vocab:
result.append(word)
i = j - 1
break
else:
for j in range(i, min(i + self.__max_word_len, text_len)):
word += text[j]
if word in self.__vocab:
found_word = word
if len(found_word) > 0:
result.append(found_word)
i += len(found_word) - 1
i += 1
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def __is_eng_char(self, c):
"""Check whether char c is an English character.
"""
return (c >= 'A' and c <= 'Z') or (c >= 'a' and c <= 'z')
def __tolower(self, c):
"""Return the lowercase character of the corresponding character, or return
the original character if there is no corresponding lowercase character.
"""
return c.lower()
class LACTokenizer(Tokenizer):
def __init__(self, vocab_path, lac):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__lac = lac
self.__load_vocab(vocab_path)
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def tokenize(self, text):
results = self.__lac.lexical_analysis(
texts=[text], use_gpu=True, batch_size=1, return_tag=True)
# Change English words to lower case.
# And just preserve the word in vocab.
words = results[0]["word"]
result = []
for word in words:
word = word.lower()
if word in self.__vocab:
result.append(word)
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
import time
import yaml
import numpy as np
from paddlehub.common.logger import logger
from lda_news.config import ModelType
def load_prototxt(config_file, config):
"""
Args:
config_file: model configuration file.
config: ModelConfig class
"""
logger.info("Loading LDA config.")
with open(config_file, 'r') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
if yaml_dict["type"] == "LDA":
config.type = ModelType.LDA
else:
config.type = ModelType.SLDA
config.num_topics = yaml_dict["num_topics"]
config.alpha = yaml_dict["alpha"]
config.beta = yaml_dict["beta"]
config.word_topic_file = yaml_dict["word_topic_file"]
config.vocab_file = yaml_dict["vocab_file"]
def fix_random_seed(seed=2147483647):
np.random.seed(seed)
def rand(min_=0, max_=1):
return np.random.uniform(low=min_, high=max_)
def rand_k(k):
"""Returns an integer float number between [0, k - 1].
"""
return int(rand() * k)
def timeit(f):
"""Return time cost of function f.
"""
def timed(*args, **kwargs):
start_time = time.time()
result = f(*args, **kwargs)
end_time = time.time()
print(" [-] %s : %2.5f sec" % (f.__name__, end_time - start_time))
return result
return timed
from paddlehub.common.logger import logger
OOV = -1
class WordCount(object):
def __init__(self, word_id, count):
self.word_id = word_id
self.count = count
class Vocab(object):
def __init__(self):
self.__term2id = {}
self.__id2term = {}
def get_id(self, word):
if word not in self.__term2id:
return OOV
return self.__term2id[word]
def load(self, vocab_file):
self.__term2id = {}
self.__id2term = {}
with open(vocab_file, 'r') as fin:
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(
fields) == 5, "Vocabulary file [%s] format error!" % (
vocab_file)
term = fields[1]
id_ = int(fields[2])
if term in self.__term2id:
logger.error("Duplicate word [%s] in vocab file!" % (term))
continue
self.__term2id[term] = id_
self.__id2term[id_] = term
def size(self):
return len(self.__term2id)
def vocabulary(self):
return self.__id2term
import numpy as np
from lda_news.util import rand, rand_k
class VoseAlias(object):
"""Vose's Alias Method.
"""
def __init__(self):
self.__alias = None
self.__prob = None # np.array
def initialize(self, distribution):
"""Initialize the alias table according to the input distribution
Arg:
distribution: the input distribution.
"""
size = distribution.shape[0]
self.__alias = np.zeros(size, dtype=np.int64)
self.__prob = np.zeros(size)
sum_ = np.sum(distribution)
p = distribution / sum_ * size # Scale up probability.
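        # Vose construction: entries scaled below 1 go to `small`, the rest to `large`;
        # each small entry is paired with a large one so every column of the table sums
        # to 1, which makes generate() an O(1) draw from the distribution.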
large, small = [], []
for i, p_ in enumerate(p):
if p_ < 1.0:
small.append(i)
else:
large.append(i)
while large and small:
l = small[0]
g = large[0]
small.pop(0)
large.pop(0)
self.__prob[l] = p[l]
self.__alias[l] = g
p[g] = p[g] + p[l] - 1 # A more numerically stable option.
if p[g] < 1.0:
small.append(g)
else:
large.append(g)
while large:
g = large[0]
large.pop(0)
self.__prob[g] = 1.0
while small:
l = small[0]
small.pop(0)
self.__prob[l] = 1.0
def generate(self):
"""Generate samples from given distribution.
"""
dart1 = rand_k(self.size())
        dart2 = rand()
        return dart1 if dart2 <= self.__prob[dart1] else self.__alias[dart1]
def size(self):
return self.__prob.shape[0]
## Model Overview
A topic model is a statistical model that clusters the latent semantic structure of documents in an unsupervised way; LDA (Latent Dirichlet Allocation) is one such algorithm. By analyzing word co-occurrence information, LDA fits a word-document-topic distribution and thereby maps words and texts into a common semantic space. This Module is trained on a novel-domain (fiction) dataset built by Baidu.
<p align="center">
<img src="https://bj.bcebos.com/paddlehub/model/nlp/semantic_model/lda.png" hspace='10'/> <br />
</p>
For more details, please refer to the [LDA paper](http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf).
Note: this Module is contributed by the third-party developer DesmonDay.
## LDA Model API
### cal_doc_distance(doc_text1, doc_text2)
Calculates the distance between two input documents, including the Jensen-Shannon divergence and the Hellinger distance.
**Parameters**
- doc_text1(str): the first input document.
- doc_text2(str): the second input document.
**Returns**
- jsd(float): the [Jensen-Shannon divergence](https://blog.csdn.net/FrankieHello/article/details/80614422?utm_source=copy) between the two documents.
- hd(float): the [Hellinger distance](http://blog.sina.com.cn/s/blog_85f1ffb70101e65d.html) between the two documents.
### cal_doc_keywords_similarity(document, top_k=10)
Finds the top k keywords of the input document together with their similarity to the original document.
**Parameters**
- document(str): the input document.
- top_k(int): the number of keywords to extract from the input document.
**Returns**
- results(list): the keywords and their similarity to the original document. Each element of the list is a dict whose key is a keyword and whose value is that keyword's similarity to the original document.
### cal_query_doc_similarity(query, document)
Calculates the similarity between a short document (query) and a long document.
**Parameters**
- query(str): the input short document.
- document(str): the input long document.
**Returns**
- lda_sim(float): the similarity between the short document and the long document.
### infer_doc_topic_distribution(document)
Infers the topic distribution of a document.
**Parameters**
- document(str): the input document.
**Returns**
- results(list): the topic IDs and corresponding probabilities of the inferred topic distribution. Each element of the list is a dict whose key is a topic ID and whose value is that topic's probability.
### show_topic_keywords(topic_id, k=10)
Shows the keywords under a given topic; it can be used together with the topic-distribution inference API.
**Parameters**
- topic_id(int): the topic ID.
- k(int): the number of top keywords to show for the topic.
**Returns**
- results(dict): the top k keywords under the given topic and each keyword's probability under that topic.
### Code Example
Usage examples for some of the APIs are shown below.
``` python
import paddlehub as hub
lda_novel = hub.Module(name="lda_novel")
jsd, hd = lda_novel.cal_doc_distance(doc_text1="老人幸福地看着自己的儿子,露出了欣慰的笑容。", doc_text2="老奶奶看着自己的儿子,幸福地笑了。")
# jsd = 0.01292, hd = 0.11893
lda_sim = lda_novel.cal_query_doc_similarity(query='亲孙女', document='老人激动地打量着面前的女孩,似乎找到了自己的亲孙女一般,双手止不住地颤抖着。')
# LDA similarity = 0.0
```
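The keyword-extraction and topic-distribution APIs can be called in the same way. A minimal, illustrative sketch (outputs depend on the trained novel-domain model and are therefore not shown):
``` python
import paddlehub as hub
lda_novel = hub.Module(name="lda_novel")
# Top-10 keywords of a document together with their similarity to the document.
keywords = lda_novel.cal_doc_keywords_similarity('老人激动地打量着面前的女孩,似乎找到了自己的亲孙女一般,双手止不住地颤抖着。')
# Topic distribution of the same document: a list of {"topic id": ..., "distribution": ...} dicts.
topic_dist = lda_novel.infer_doc_topic_distribution('老人激动地打量着面前的女孩,似乎找到了自己的亲孙女一般,双手止不住地颤抖着。')
```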
## Source Code
https://github.com/baidu/Familia
## Dependencies
paddlepaddle >= 1.8.2
paddlehub >= 1.8.0
## Release History
* 1.0.0
  First release
"""
This file defines the basic config information of LDA/SLDA model.
"""
class ModelType:
LDA = 0
SLDA = 1
class ModelConfig:
type = None
num_topics = None
alpha = None
beta = None
word_topic_file = None
vocab_file = None
import numpy as np
class Topic(object):
"""Basic data structure of topic, contains topic id and
corresponding probability.
"""
def __init__(self, tid, prob):
self.tid = tid # topic id
self.prob = prob # topic probability
class Token(object):
"""Basic storage unit of LDA documents, contains word id
and corresponding topic.
"""
def __init__(self, topic, id):
self.topic = topic
self.id = id
class Sentence(object):
"""Basic storage unit of SentenceLDA documents, contains word ids
of the sentence and its corresponding topic id.
"""
def __init__(self, topic, tokens):
self.topic = topic
self.tokens = tokens
class LDADoc(object):
"""The storage structure of LDA model's inference result.
"""
def __init__(self):
self._num_topics = None # Number of topics.
self._num_accum = None # Number of accumulated sample rounds.
self._alpha = None # Document prior parameter.
self._tokens = None # Storage structure of inference results.
self._topic_sum = None # Document's topic sum in one round samples.
self._accum_topic_sum = None # Accumulated results of topic sum.
def init(self, num_topics):
"""Initialize the LDADoc according to num_topics.
"""
self._num_topics = num_topics
self._num_accum = 0
self._tokens = []
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_token(self, token):
"""Add new word to current LDADoc.
Arg:
token: Token class object.
"""
assert token.topic >= 0, "Topic %d out of range!" % token.topic
assert token.topic < self._num_topics, "Topic %d out of range!" % token.topic
self._tokens.append(token)
self._topic_sum[token.topic] += 1
def token(self, index):
return self._tokens[index]
def set_topic(self, index, new_topic):
"""Set the index word's topic to new_topic, and update the corresponding
topic distribution.
"""
assert new_topic >= 0, "Topic %d out of range!" % new_topic
assert new_topic < self._num_topics, "Topic %d out of range!" % new_topic
old_topic = self._tokens[index].topic
if new_topic == old_topic:
return
self._tokens[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def set_alpha(self, alpha):
self._alpha = alpha
def size(self):
"""Return number of words in LDADoc.
"""
return len(self._tokens)
def topic_sum(self, topic_id):
return self._topic_sum[topic_id]
def sparse_topic_dist(self, sort=True):
"""Return the topic distribution of documents in sparse format.
        By default, it is sorted in descending order of topic probability.
"""
topic_dist = []
sum_ = np.sum(self._accum_topic_sum)
if sum_ == 0:
            return topic_dist
for i in range(0, self._num_topics):
if self._accum_topic_sum[i] == 0:
continue
topic_dist.append(Topic(i, self._accum_topic_sum[i] * 1.0 / sum_))
if sort:
def take_elem(topic):
return topic.prob
topic_dist.sort(key=take_elem, reverse=True)
if topic_dist is None:
topic_dist = []
return topic_dist
def dense_topic_dist(self):
"""Return the distribution of document topics in dense format,
taking into account the prior parameter alpha.
"""
dense_dist = np.zeros(self._num_topics)
if self.size() == 0:
return dense_dist
dense_dist = (
self._accum_topic_sum * 1.0 / self._num_accum + self._alpha) / (
self.size() + self._alpha * self._num_topics)
return dense_dist
def accumulate_topic_num(self):
self._accum_topic_sum += self._topic_sum
self._num_accum += 1
class SLDADoc(LDADoc):
"""Sentence LDA Document, inherited from LDADoc.
Add add_sentence interface.
"""
def __init__(self):
super().__init__()
self.__sentences = None
def init(self, num_topics):
"""Initialize the SLDADoc according to num_topics.
"""
self._num_topics = num_topics
self.__sentences = []
self._num_accum = 0
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_sentence(self, sent):
"""Add new sentence to current SLDADoc.
Arg:
sent: Sentence class object.
"""
assert sent.topic >= 0, "Topic %d out of range!" % (sent.topic)
assert sent.topic < self._num_topics, "Topic %d out of range!" % (
sent.topic)
self.__sentences.append(sent)
self._topic_sum[sent.topic] += 1
def set_topic(self, index, new_topic):
assert new_topic >= 0, "Topic %d out of range!" % (new_topic)
assert new_topic < self._num_topics, "Topic %d out of range!" % (
new_topic)
old_topic = self.__sentences[index].topic
if new_topic == old_topic:
return
self.__sentences[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def size(self):
"""Return number of sentences in SLDADoc.
"""
return len(self.__sentences)
def sent(self, index):
return self.__sentences[index]
import os
from paddlehub.common.logger import logger
from lda_novel.config import ModelConfig
from lda_novel.util import load_prototxt, fix_random_seed, rand_k
from lda_novel.model import TopicModel
from lda_novel.sampler import GibbsSampler, MHSampler
from lda_novel.document import LDADoc, SLDADoc, Token, Sentence
from lda_novel.vocab import OOV
class SamplerType:
GibbsSampling = 0
MetropolisHastings = 1
class InferenceEngine(object):
def __init__(self,
model_dir,
conf_file,
type=SamplerType.MetropolisHastings):
# Read model configuration.
config = ModelConfig()
conf_file_path = os.path.join(model_dir, conf_file)
load_prototxt(conf_file_path, config)
self.__model = TopicModel(model_dir, config)
self.__config = config
# Initialize the sampler according to the configuration.
if type == SamplerType.GibbsSampling:
self.__sampler = GibbsSampler(self.__model)
elif type == SamplerType.MetropolisHastings:
self.__sampler = MHSampler(self.__model)
def infer(self, input, doc):
"""Perform LDA topic inference on input, and store the results in doc.
Args:
input: a list of strings after tokenization.
doc: LDADoc type or SLDADoc type.
"""
fix_random_seed()
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for token in input:
id_ = self.__model.term_id(token)
if id_ != OOV:
init_topic = rand_k(self.__model.num_topics())
doc.add_token(Token(init_topic, id_))
self.lda_infer(doc, 20, 50)
elif isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for sent in input:
words = []
for token in sent:
id_ = self.__model.term_id(token)
if id_ != OOV:
words.append(id_)
init_topic = rand_k(self.__model.num_topics())
doc.add_sentence(Sentence(init_topic, words))
self.slda_infer(doc, 20, 50)
else:
logger.error("Wrong Doc Type!")
def lda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def slda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def model_type(self):
return self.__model.type()
def get_model(self):
return self.__model
def get_config(self):
return self.__config
import os
from collections import OrderedDict
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_novel.vocab import Vocab, WordCount
class TopicModel(object):
"""Storage Structure of Topic model, including vocabulary and word topic count.
"""
def __init__(self, model_dir, config):
"""
Args:
model_dir: the path of model directory
config: ModelConfig class.
"""
self.__word_topic = None # Model parameter of word topic.
self.__vocab = Vocab() # Vocab data structure of model.
self.__num_topics = config.num_topics # Number of topics.
self.__alpha = config.alpha
self.__alpha_sum = self.__alpha * self.__num_topics
self.__beta = config.beta
self.__beta_sum = None
self.__type = config.type # Model type.
self.__topic_sum = np.zeros(
self.__num_topics,
dtype="int64") # Accum sum of each topic in word topic.
self.__topic_words = [[] for _ in range(self.__num_topics)]
word_topic_path = os.path.join(model_dir, config.word_topic_file)
vocab_path = os.path.join(model_dir, config.vocab_file)
self.load_model(word_topic_path, vocab_path)
def term_id(self, term):
return self.__vocab.get_id(term)
def load_model(self, word_topic_path, vocab_path):
# Loading vocabulary
self.__vocab.load(vocab_path)
self.__beta_sum = self.__beta * self.__vocab.size()
        self.__word_topic = [{} for _ in range(self.__vocab.size())]  # List of dicts, one per word.
self.__load_word_dict(word_topic_path)
logger.info(
"Model Info: #num_topics=%d #vocab_size=%d alpha=%f beta=%f" %
(self.num_topics(), self.vocab_size(), self.alpha(), self.beta()))
def word_topic_value(self, word_id, topic_id):
"""Return value of specific word under specific topic in the model.
"""
word_dict = self.__word_topic[word_id]
if topic_id not in word_dict:
return 0
return word_dict[topic_id]
def word_topic(self, term_id):
"""Return the topic distribution of a word.
"""
return self.__word_topic[term_id]
def topic_sum_value(self, topic_id):
return self.__topic_sum[topic_id]
def topic_sum(self):
return self.__topic_sum
def num_topics(self):
return self.__num_topics
def vocab_size(self):
return self.__vocab.size()
def alpha(self):
return self.__alpha
def alpha_sum(self):
return self.__alpha_sum
def beta(self):
return self.__beta
def beta_sum(self):
return self.__beta_sum
def type(self):
return self.__type
def __load_word_dict(self, word_dict_path):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
with open(word_dict_path, 'r') as f:
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
term_id = int(fields[0])
assert term_id < self.vocab_size(), "Term id out of range!"
assert term_id >= 0, "Term id out of range!"
for i in range(1, len(fields)):
topic_count = fields[i].split(":")
assert len(topic_count) == 2, "Topic count format error!"
topic_id = int(topic_count[0])
assert topic_id >= 0, "Topic out of range!"
assert topic_id < self.__num_topics, "Topic out of range!"
count = int(topic_count[1])
assert count >= 0, "Topic count error!"
self.__word_topic[term_id][topic_id] = count
self.__topic_sum[topic_id] += count
self.__topic_words[topic_id].append(
WordCount(term_id, count))
new_dict = OrderedDict()
for key in sorted(self.__word_topic[term_id]):
new_dict[key] = self.__word_topic[term_id][key]
self.__word_topic[term_id] = new_dict
def get_vocab(self):
return self.__vocab.vocabulary()
def topic_words(self):
return self.__topic_words
import os
import paddlehub as hub
from paddlehub.module.module import moduleinfo
from paddlehub.common.logger import logger
from lda_novel.inference_engine import InferenceEngine
from lda_novel.document import LDADoc, SLDADoc
from lda_novel.semantic_matching import SemanticMatching, WordAndDis
from lda_novel.tokenizer import LACTokenizer, SimpleTokenizer
from lda_novel.config import ModelType
from lda_novel.vocab import Vocab, WordCount
@moduleinfo(
name="lda_novel",
version="1.0.0",
summary=
"This is a PaddleHub Module for LDA topic model in novel dataset, where we can calculate doc distance, calculate the similarity between query and document, etc.",
author="DesmonDay",
author_email="",
type="nlp/semantic_model")
class TopicModel(hub.Module):
def _initialize(self):
"""
Initialize with the necessary elements.
"""
self.model_dir = os.path.join(self.directory, 'novel')
self.conf_file = 'lda.conf'
self.__engine = InferenceEngine(self.model_dir, self.conf_file)
self.vocab_path = os.path.join(self.model_dir, 'vocab_info.txt')
lac = hub.Module(name="lac")
# self.__tokenizer = SimpleTokenizer(self.vocab_path)
self.__tokenizer = LACTokenizer(self.vocab_path, lac)
self.vocabulary = self.__engine.get_model().get_vocab()
self.config = self.__engine.get_config()
self.topic_words = self.__engine.get_model().topic_words()
self.topic_sum_table = self.__engine.get_model().topic_sum()
def take_elem(word_count):
return word_count.count
for i in range(self.config.num_topics):
self.topic_words[i].sort(key=take_elem, reverse=True)
logger.info("Finish initialization.")
def cal_doc_distance(self, doc_text1, doc_text2):
"""
This interface calculates the distance between documents.
Args:
doc_text1(str): the input document text 1.
doc_text2(str): the input document text 2.
Returns:
jsd(float): Jensen-Shannon Divergence distance of two documents.
hd(float): Hellinger Distance of two documents.
"""
doc1_tokens = self.__tokenizer.tokenize(doc_text1)
doc2_tokens = self.__tokenizer.tokenize(doc_text2)
# Document topic inference.
doc1, doc2 = LDADoc(), LDADoc()
self.__engine.infer(doc1_tokens, doc1)
self.__engine.infer(doc2_tokens, doc2)
# To calculate jsd, we need dense document topic distribution.
dense_dict1 = doc1.dense_topic_dist()
dense_dict2 = doc2.dense_topic_dist()
# Calculate the distance between distributions.
# The smaller the distance, the higher the document semantic similarity.
sm = SemanticMatching()
jsd = sm.jensen_shannon_divergence(dense_dict1, dense_dict2)
hd = sm.hellinger_distance(dense_dict1, dense_dict2)
return jsd, hd
def cal_doc_keywords_similarity(self, document, top_k=10):
"""
        This interface can be used to find top k keywords of document.
Args:
document(str): the input document text.
top_k(int): top k keywords of this document.
Returns:
results(list): contains top_k keywords and their corresponding
similarity compared to document.
"""
d_tokens = self.__tokenizer.tokenize(document)
# Do topic inference on documents to obtain topic distribution.
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
items = []
words = set()
for word in d_tokens:
if word in words:
continue
words.add(word)
wd = WordAndDis()
wd.word = word
sm = SemanticMatching()
wd.distance = sm.likelihood_based_similarity(
terms=[word],
doc_topic_dist=doc_topic_dist,
model=self.__engine.get_model())
items.append(wd)
def take_elem(word_dis):
return word_dis.distance
items.sort(key=take_elem, reverse=True)
results = []
size = len(items)
for i in range(top_k):
if i >= size:
break
results.append({
"word": items[i].word,
"similarity": items[i].distance
})
return results
def cal_query_doc_similarity(self, query, document):
"""
This interface calculates the similarity between query and document.
Args:
query(str): the input query text.
document(str): the input document text.
Returns:
lda_sim(float): likelihood based similarity between query and document
based on LDA.
"""
q_tokens = self.__tokenizer.tokenize(query)
d_tokens = self.__tokenizer.tokenize(document)
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
sm = SemanticMatching()
lda_sim = sm.likelihood_based_similarity(q_tokens, doc_topic_dist,
self.__engine.get_model())
return lda_sim
def infer_doc_topic_distribution(self, document):
"""
This interface infers the topic distribution of document.
Args:
document(str): the input document text.
Returns:
results(list): returns the topic distribution of document.
"""
tokens = self.__tokenizer.tokenize(document)
if tokens == []:
return []
results = []
doc = LDADoc()
self.__engine.infer(tokens, doc)
topics = doc.sparse_topic_dist()
for topic in topics:
results.append({"topic id": topic.tid, "distribution": topic.prob})
return results
def show_topic_keywords(self, topic_id, k=10):
"""
        This interface returns the top k keywords under the given topic.
Args:
topic_id(int): topic information we want to know.
k(int): top k keywords.
Returns:
results(dict): contains specific topic's keywords and corresponding
probability.
"""
EPS = 1e-8
results = {}
if 0 <= topic_id < self.config.num_topics:
k = min(k, len(self.topic_words[topic_id]))
for i in range(k):
prob = self.topic_words[topic_id][i].count / \
(self.topic_sum_table[topic_id] + EPS)
results[self.vocabulary[self.topic_words[topic_id]
[i].word_id]] = prob
return results
else:
logger.error("%d is out of range!" % topic_id)
import os
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_novel.document import LDADoc, SLDADoc, Token, Sentence
from lda_novel.vose_alias import VoseAlias
from lda_novel.util import rand, rand_k
class Sampler(object):
def __init__(self):
pass
def sample_doc(self, doc):
"""Sample LDA or SLDA topics for documents.
"""
raise NotImplementedError
class MHSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
self.__topic_indexes = None
self.__alias_tables = None
self.__prob_sum = None
self.__beta_alias = VoseAlias()
self.__beta_prior_sum = None
self.__mh_steps = 2
self.__construct_alias_table()
def __construct_alias_table(self):
"""Construct alias table for all words.
"""
logger.info("Construct alias table for alias sampling method.")
vocab_size = self.__model.vocab_size()
self.__topic_indexes = [[] for _ in range(vocab_size)]
self.__alias_tables = [VoseAlias() for _ in range(vocab_size)]
self.__prob_sum = np.zeros(vocab_size)
# Construct each word's alias table (prior is not included).
for i in tqdm(range(vocab_size)):
dist = []
prob_sum = 0
for key in self.__model.word_topic(i):
topic_id = key
word_topic_count = self.__model.word_topic(i)[key]
topic_sum = self.__model.topic_sum_value(topic_id)
self.__topic_indexes[i].append(topic_id)
q = word_topic_count / (topic_sum + self.__model.beta_sum())
dist.append(q)
prob_sum += q
self.__prob_sum[i] = prob_sum
if len(dist) > 0:
dist = np.array(dist, dtype=np.float64)  # np.float is deprecated in NumPy; use np.float64.
self.__alias_tables[i].initialize(dist)
# Build prior parameter beta's alias table.
beta_dist = self.__model.beta() / (
self.__model.topic_sum() + self.__model.beta_sum())
self.__beta_prior_sum = np.sum(beta_dist)
self.__beta_alias.initialize(beta_dist)
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
new_topic = token.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, token)
new_topic = self.__word_proposal(doc, token, doc_proposed_topic)
return new_topic
def __sample_sentence(self, doc, sent):
new_topic = sent.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, sent)
new_topic = self.__word_proposal(doc, sent, doc_proposed_topic)
return new_topic
def __doc_proposal(self, doc, token):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
if dart < doc.size():
token_index = int(dart)
new_topic = doc.token(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
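# Branchless accept/reject: mask is -1 (all bits set) when the proposal is
# accepted (rejection < transition_prob) and 0 otherwise, so the expression
# below evaluates to new_topic on acceptance and to old_topic on rejection.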
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
if dart < doc.size():
token_index = int(dart)
new_topic = doc.sent(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __word_proposal(self, doc, token, old_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
new_topic = self.__propose(token.id)
if new_topic != old_topic:
proposal_old = self.__word_proposal_distribution(
token.id, old_topic)
proposal_new = self.__word_proposal_distribution(
token.id, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
new_topic = old_topic
for word_id in sent.tokens:
new_topic = self.__propose(word_id)
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__word_proposal_distribution(
word_id, old_topic)
proposal_new = self.__word_proposal_distribution(
word_id, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
new_topic = (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __proportional_function(self, doc, token, new_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dt_alpha = doc.topic_sum(new_topic) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(
token.id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
return dt_alpha * wt_beta / t_sum_beta_sum
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
result = doc.topic_sum(new_topic) + self.__model.alpha()
if new_topic == old_topic:
result -= 1
for word_id in sent.tokens:
wt_beta = self.__model.word_topic_value(
word_id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
wt_beta -= 1
t_sum_beta_sum -= 1
result *= wt_beta / t_sum_beta_sum
return result
else:
logger.error("Wrong input argument type!")
def __word_proposal_distribution(self, word_id, topic):
wt_beta = self.__model.word_topic_value(word_id,
topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
topic) + self.__model.beta_sum()
return wt_beta / t_sum_beta_sum
def __doc_proposal_distribution(self, doc, topic):
return doc.topic_sum(topic) + self.__model.alpha()
def __propose(self, word_id):
dart = rand() * (self.__prob_sum[word_id] + self.__beta_prior_sum)
if dart < self.__prob_sum[word_id]:
idx = self.__alias_tables[word_id].generate()
topic = self.__topic_indexes[word_id][idx]
else:
topic = self.__beta_alias.generate()
return topic
class GibbsSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
old_topic = token.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
for i in range(num_topics):
dt_alpha = doc.topic_sum(i) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(token.id,
i) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(i) + self.__model.beta_sum()
if i == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
prob[i] = dt_alpha * wt_beta / t_sum_beta_sum
sum_ += prob[i]
accum_prob[i] = prob[i] if i == 0 else accum_prob[i - 1] + prob[i]
dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for i in range(1, num_topics):
if accum_prob[i - 1] < dart <= accum_prob[i]:
return i
return num_topics - 1
def __sample_sentence(self, doc, sent):
old_topic = sent.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
for t in range(num_topics):
dt_alpha = doc.topic_sum(t) + self.__model.alpha()
t_sum_beta_sum = self.__model.topic_sum_value(t) + self.__model.beta_sum()
if t == old_topic:
if dt_alpha > 1:
dt_alpha -= 1
if t_sum_beta_sum > 1:
t_sum_beta_sum -= 1
prob[t] = dt_alpha
for i in range(len(sent.tokens)):
w = sent.tokens[i]
wt_beta = self.__model.word_topic_value(
w, t) + self.__model.beta()
if t == old_topic and wt_beta > 1:
wt_beta -= 1
# Note: if the sentence is very long, the repeated multiplications below
# can underflow and lose numerical precision.
prob[t] *= wt_beta / t_sum_beta_sum
sum_ += prob[t]
accum_prob[t] = prob[t] if t == 0 else accum_prob[t - 1] + prob[t]
dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for t in range(1, num_topics):
if accum_prob[t - 1] < dart <= accum_prob[t]:
return t
return num_topics - 1
import os
import numpy as np
from paddlehub.common.logger import logger
from lda_novel.vocab import OOV
EPS = 1e-06
class WordAndDis(object):
def __init__(self):
self.word = None
self.distance = None
class SemanticMatching(object):
def __init__(self):
pass
def l2_norm(self, vec):
"""Calculate the length of vector.
"""
result = np.sqrt(np.sum(vec**2))
return result
def cosine_similarity(self, vec1, vec2):
norm1 = self.l2_norm(vec1)
norm2 = self.l2_norm(vec2)
result = np.sum(vec1 * vec2) / norm1 / norm2
return result
def likelihood_based_similarity(self, terms, doc_topic_dist, model):
"""
Args:
terms: list of strings
doc_topic_dist: list of Topic class
model: TopicModel class
"""
num_of_term_in_vocab = 0
result = 0
for i in range(len(terms)):
term_id = model.term_id(terms[i])
if term_id == OOV:
continue
num_of_term_in_vocab += 1
for j in range(len(doc_topic_dist)):
topic_id = doc_topic_dist[j].tid
prob = doc_topic_dist[j].prob
result += model.word_topic_value(term_id, topic_id) * 1.0 / \
model.topic_sum_value(topic_id) * prob
if num_of_term_in_vocab == 0:
return result
return result / num_of_term_in_vocab
def kullback_leibler_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist2[dist2 < EPS] = EPS
result = np.sum(dist1 * np.log(dist1 / dist2))
return result
def jensen_shannon_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist1[dist1 < EPS] = EPS
dist2[dist2 < EPS] = EPS
mean = (dist1 + dist2) * 0.5
jsd = self.kullback_leibler_divergence(dist1, mean) * 0.5 + \
self.kullback_leibler_divergence(dist2, mean) * 0.5
return jsd
def hellinger_distance(self, dist1, dist2):
assert dist1.shape == dist2.shape
result = np.sum((np.sqrt(dist1) - np.sqrt(dist2))**2)
result = np.sqrt(result) * 0.7071067812
return result
import os
import numpy as np
from paddlehub.common.logger import logger
class Tokenizer(object):
"""Base tokenizer class.
"""
def __init__(self):
pass
def tokenize(self, text):
raise NotImplementedError
class SimpleTokenizer(Tokenizer):
"""Simple version FMM(Forward Maximun Matching) word tokenizer. This tokenizer can only
be used in topic model demo, but not in real business application scenarios.
Notes: This tokenizer can only recognize the words in the corresponding vocab file.
"""
def __init__(self, vocab_path):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__load_vocab(vocab_path)
def tokenize(self, text):
"""Tokenize the input string `text`, and return the tokenize result.
"""
text_len = len(text)
result = []
i = 0
while i < text_len:
word = found_word = ""
# Deal with English characters.
if self.__is_eng_char(text[i]):
for j in range(i, text_len + 1):
if j < text_len and self.__is_eng_char(text[j]):
word += self.__tolower(text[j])
else:
# Forward matching by character granularity.
if word in self.__vocab:
result.append(word)
i = j - 1
break
else:
for j in range(i, min(i + self.__max_word_len, text_len)):
word += text[j]
if word in self.__vocab:
found_word = word
if len(found_word) > 0:
result.append(found_word)
i += len(found_word) - 1
i += 1
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def __is_eng_char(self, c):
"""Check whether char c is an English character.
"""
return (c >= 'A' and c <= 'Z') or (c >= 'a' and c <= 'z')
def __tolower(self, c):
"""Return the lowercase character of the corresponding character, or return
the original character if there is no corresponding lowercase character.
"""
return c.lower()
class LACTokenizer(Tokenizer):
def __init__(self, vocab_path, lac):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__lac = lac
self.__load_vocab(vocab_path)
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def tokenize(self, text):
results = self.__lac.lexical_analysis(
texts=[text], use_gpu=True, batch_size=1, return_tag=True)
# Change English words to lower case.
# And just preserve the word in vocab.
words = results[0]["word"]
result = []
for word in words:
word = word.lower()
if word in self.__vocab:
result.append(word)
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
import time
import yaml
import numpy as np
from paddlehub.common.logger import logger
from lda_novel.config import ModelType
def load_prototxt(config_file, config):
"""
Args:
config_file: model configuration file.
config: ModelConfig class
"""
logger.info("Loading LDA config.")
with open(config_file, 'r') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
if yaml_dict["type"] == "LDA":
config.type = ModelType.LDA
else:
config.type = ModelType.SLDA
config.num_topics = yaml_dict["num_topics"]
config.alpha = yaml_dict["alpha"]
config.beta = yaml_dict["beta"]
config.word_topic_file = yaml_dict["word_topic_file"]
config.vocab_file = yaml_dict["vocab_file"]
def fix_random_seed(seed=2147483647):
np.random.seed(seed)
def rand(min_=0, max_=1):
return np.random.uniform(low=min_, high=max_)
def rand_k(k):
"""Returns an integer float number between [0, k - 1].
"""
return int(rand() * k)
def timeit(f):
"""Return time cost of function f.
"""
def timed(*args, **kwargs):
start_time = time.time()
result = f(*args, **kwargs)
end_time = time.time()
print(" [-] %s : %2.5f sec" % (f.__name__, end_time - start_time))
return result
return timed
from paddlehub.common.logger import logger
OOV = -1
class WordCount(object):
def __init__(self, word_id, count):
self.word_id = word_id
self.count = count
class Vocab(object):
def __init__(self):
self.__term2id = {}
self.__id2term = {}
def get_id(self, word):
if word not in self.__term2id:
return OOV
return self.__term2id[word]
def load(self, vocab_file):
self.__term2id = {}
self.__id2term = {}
with open(vocab_file, 'r') as fin:
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(
fields) == 5, "Vocabulary file [%s] format error!" % (
vocab_file)
term = fields[1]
id_ = int(fields[2])
if term in self.__term2id:
logger.error("Duplicate word [%s] in vocab file!" % (term))
continue
self.__term2id[term] = id_
self.__id2term[id_] = term
def size(self):
return len(self.__term2id)
def vocabulary(self):
return self.__id2term
import os
import numpy as np
from paddlehub.common.logger import logger
from lda_novel.util import rand, rand_k
class VoseAlias(object):
"""Vose's Alias Method.
"""
def __init__(self):
self.__alias = None
self.__prob = None # np.array
def initialize(self, distribution):
"""Initialize the alias table according to the input distribution
Arg:
distribution: Numpy array.
"""
size = distribution.shape[0]
self.__alias = np.zeros(size, dtype=np.int64)
self.__prob = np.zeros(size)
sum_ = np.sum(distribution)
p = distribution / sum_ * size # Scale up probability.
large, small = [], []
for i, p_ in enumerate(p):
if p_ < 1.0:
small.append(i)
else:
large.append(i)
while large and small:
l = small[0]
g = large[0]
small.pop(0)
large.pop(0)
self.__prob[l] = p[l]
self.__alias[l] = g
p[g] = p[g] + p[l] - 1 # A more numerically stable option.
if p[g] < 1.0:
small.append(g)
else:
large.append(g)
while large:
g = large[0]
large.pop(0)
self.__prob[g] = 1.0
while small:
l = small[0]
small.pop(0)
self.__prob[l] = 1.0
def generate(self):
"""Generate samples from given distribution.
"""
dart1 = rand_k(self.size())
dart2 = rand()  # Uniform in [0, 1); int(rand()) would always be 0.
return dart1 if dart2 < self.__prob[dart1] else self.__alias[dart1]
def size(self):
return self.__prob.shape[0]
## Model Overview
A topic model is a statistical model that clusters the latent semantic structure of documents in an unsupervised way, and LDA (Latent Dirichlet Allocation) is one such algorithm. By analyzing word co-occurrence information, LDA fits a word-document-topic distribution and thereby maps words and texts into a common semantic space. This Module is based on a webpage-domain dataset built by Baidu.
<p align="center">
<img src="https://bj.bcebos.com/paddlehub/model/nlp/semantic_model/lda.png" hspace='10'/> <br />
</p>
For more details, please refer to the [LDA paper](http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf).
Note: this Module is contributed by the third-party developer DesmonDay.
## LDA Model API
### cal_doc_distance(doc_text1, doc_text2)
Calculates the distance between two input documents, including the Jensen-Shannon divergence (JSD) and the Hellinger distance (HD).
**Parameters**
- doc_text1(str): the first input document.
- doc_text2(str): the second input document.
**Returns**
- jsd(float): the [Jensen-Shannon divergence](https://blog.csdn.net/FrankieHello/article/details/80614422?utm_source=copy) between the two documents.
- hd(float): the [Hellinger Distance](http://blog.sina.com.cn/s/blog_85f1ffb70101e65d.html) between the two documents (see the formulas below).
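With P and Q denoting the dense topic distributions of the two documents and M = (P + Q) / 2, the two returned values correspond to the formulas implemented in this Module's `semantic_matching.py`:
``` latex
\mathrm{JSD}(P, Q) = \tfrac{1}{2}\,\mathrm{KL}(P \parallel M) + \tfrac{1}{2}\,\mathrm{KL}(Q \parallel M), \qquad M = \tfrac{1}{2}(P + Q)
\mathrm{HD}(P, Q) = \frac{1}{\sqrt{2}}\,\sqrt{\sum_i \left(\sqrt{p_i} - \sqrt{q_i}\right)^{2}}
```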
### cal_doc_keywords_similarity(document, top_k=10)
Finds the top k keywords of the input document together with their similarity to the original document.
**Parameters**
- document(str): the input document.
- top_k(int): the number of keywords to return.
**Returns**
- results(list): the keywords and their similarity to the original document. Each element of the list is a dict containing a keyword and its similarity to the original document.
### cal_query_doc_similarity(query, document)
Calculates the similarity between a short text (query) and a long document.
**Parameters**
- query(str): the input short text.
- document(str): the input long document.
**Returns**
- lda_sim(float): the similarity between the short text and the long document.
### infer_doc_topic_distribution(document)
Infers the topic distribution of a document.
**Parameters**
- document(str): the input document.
**Returns**
- results(list): the topic distribution of the document. Each element of the list is a dict containing a topic id and the probability of that topic.
### show_topic_keywords(topic_id, k=10)
Shows the keywords under a given topic; it can be used together with the topic distribution inference API.
**Parameters**
- topic_id(int): the topic id.
- k(int): the number of top keywords of the topic to return.
**Returns**
- results(dict): the top k keywords of the topic and the probability of each keyword under that topic.
### Code Example
Usage examples for some of the APIs are shown below; a sketch covering the remaining APIs follows this example.
``` python
import paddlehub as hub
lda_webpage = hub.Module(name="lda_webpage")
jsd, hd = lda_webpage.cal_doc_distance(doc_text1="百度的网页上有着各种新闻的推荐,内容丰富多彩。", doc_text2="百度首页推荐着各种新闻,还提供了强大的搜索引擎功能。")
# jsd = 0.00249, hd = 0.0510
results = lda_webpage.cal_doc_keywords_similarity('百度首页推荐着各种新闻,还提供了强大的搜索引擎功能。')
# [{'word': '强大', 'similarity': 0.0838851256627093},
# {'word': '推荐', 'similarity': 0.06295345182499558},
# {'word': '新闻', 'similarity': 0.05894049247832139},
# {'word': '提供', 'similarity': 0.04179908620523299},
# {'word': '百度', 'similarity': 0.033778847361833536},
# {'word': '首页', 'similarity': 0.018429949496365026},
# {'word': '功能', 'similarity': 0.011409342579361237},
# {'word': '搜索引擎', 'similarity': 0.010392479335778413}]
```
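The remaining APIs follow the same pattern. The snippet below is a minimal, illustrative sketch (not an official sample); the input strings are arbitrary and the returned values depend on the trained model, so no outputs are listed.
``` python
import paddlehub as hub

lda_webpage = hub.Module(name="lda_webpage")

# Similarity between a short query text and a long document.
lda_sim = lda_webpage.cal_query_doc_similarity(
    query='百度搜索引擎',
    document='百度首页推荐着各种新闻,还提供了强大的搜索引擎功能。')

# Topic distribution of a document: a list of {"topic id": ..., "distribution": ...} dicts.
topic_dist = lda_webpage.infer_doc_topic_distribution('百度首页推荐着各种新闻,还提供了强大的搜索引擎功能。')

# Top 10 keywords of the most probable topic, with their probabilities under that topic.
if topic_dist:
    keywords = lda_webpage.show_topic_keywords(topic_id=topic_dist[0]["topic id"], k=10)
```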
## View the Code
https://github.com/baidu/Familia
## Dependencies
paddlepaddle >= 1.8.2
paddlehub >= 1.8.0
## Release Note
* 1.0.0
First release
"""
This file defines the basic config information of LDA/SLDA model.
"""
class ModelType:
LDA = 0
SLDA = 1
class ModelConfig:
type = None
num_topics = None
alpha = None
beta = None
word_topic_file = None
vocab_file = None
import numpy as np
class Topic(object):
"""Basic data structure of topic, contains topic id and
corresponding probability.
"""
def __init__(self, tid, prob):
self.tid = tid # topic id
self.prob = prob # topic probability
class Token(object):
"""Basic storage unit of LDA documents, contains word id
and corresponding topic.
"""
def __init__(self, topic, id):
self.topic = topic
self.id = id
class Sentence(object):
"""Basic storage unit of SentenceLDA documents, contains word ids
of the sentence and its corresponding topic id.
"""
def __init__(self, topic, tokens):
self.topic = topic
self.tokens = tokens
class LDADoc(object):
"""The storage structure of LDA model's inference result.
"""
def __init__(self):
self._num_topics = None # Number of topics.
self._num_accum = None # Number of accumulated sample rounds.
self._alpha = None # Document prior parameter.
self._tokens = None # Storage structure of inference results.
self._topic_sum = None # Document's topic sum in one round samples.
self._accum_topic_sum = None # Accumulated results of topic sum.
def init(self, num_topics):
"""Initialize the LDADoc according to num_topics.
"""
self._num_topics = num_topics
self._num_accum = 0
self._tokens = []
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_token(self, token):
"""Add new word to current LDADoc.
Arg:
token: Token class object.
"""
assert token.topic >= 0, "Topic %d out of range!" % token.topic
assert token.topic < self._num_topics, "Topic %d out of range!" % token.topic
self._tokens.append(token)
self._topic_sum[token.topic] += 1
def token(self, index):
return self._tokens[index]
def set_topic(self, index, new_topic):
"""Set the index word's topic to new_topic, and update the corresponding
topic distribution.
"""
assert new_topic >= 0, "Topic %d out of range!" % new_topic
assert new_topic < self._num_topics, "Topic %d out of range!" % new_topic
old_topic = self._tokens[index].topic
if new_topic == old_topic:
return
self._tokens[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def set_alpha(self, alpha):
self._alpha = alpha
def size(self):
"""Return number of words in LDADoc.
"""
return len(self._tokens)
def topic_sum(self, topic_id):
return self._topic_sum[topic_id]
def sparse_topic_dist(self, sort=True):
"""Return the topic distribution of documents in sparse format.
By default, it is sorted by topic probability in descending order.
"""
topic_dist = []
sum_ = np.sum(self._accum_topic_sum)
if sum_ == 0:
return topic_dist  # Empty or all-OOV document: return an empty list instead of None.
for i in range(0, self._num_topics):
if self._accum_topic_sum[i] == 0:
continue
topic_dist.append(Topic(i, self._accum_topic_sum[i] * 1.0 / sum_))
if sort:
def take_elem(topic):
return topic.prob
topic_dist.sort(key=take_elem, reverse=True)
if topic_dist is None:
topic_dist = []
return topic_dist
def dense_topic_dist(self):
"""Return the distribution of document topics in dense format,
taking into account the prior parameter alpha.
"""
dense_dist = np.zeros(self._num_topics)
if self.size() == 0:
return dense_dist
dense_dist = (
self._accum_topic_sum * 1.0 / self._num_accum + self._alpha) / (
self.size() + self._alpha * self._num_topics)
return dense_dist
def accumulate_topic_num(self):
self._accum_topic_sum += self._topic_sum
self._num_accum += 1
class SLDADoc(LDADoc):
"""Sentence LDA Document, inherited from LDADoc.
Add add_sentence interface.
"""
def __init__(self):
super().__init__()
self.__sentences = None
def init(self, num_topics):
"""Initialize the SLDADoc according to num_topics.
"""
self._num_topics = num_topics
self.__sentences = []
self._num_accum = 0
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_sentence(self, sent):
"""Add new sentence to current SLDADoc.
Arg:
sent: Sentence class object.
"""
assert sent.topic >= 0, "Topic %d out of range!" % (sent.topic)
assert sent.topic < self._num_topics, "Topic %d out of range!" % (
sent.topic)
self.__sentences.append(sent)
self._topic_sum[sent.topic] += 1
def set_topic(self, index, new_topic):
assert new_topic >= 0, "Topic %d out of range!" % (new_topic)
assert new_topic < self._num_topics, "Topic %d out of range!" % (
new_topic)
old_topic = self.__sentences[index].topic
if new_topic == old_topic:
return
self.__sentences[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def size(self):
"""Return number of sentences in SLDADoc.
"""
return len(self.__sentences)
def sent(self, index):
return self.__sentences[index]
import os
from paddlehub.common.logger import logger
from lda_webpage.config import ModelConfig
from lda_webpage.util import load_prototxt, fix_random_seed, rand_k
from lda_webpage.model import TopicModel
from lda_webpage.sampler import GibbsSampler, MHSampler
from lda_webpage.document import LDADoc, SLDADoc, Token, Sentence
from lda_webpage.vocab import OOV
class SamplerType:
GibbsSampling = 0
MetropolisHastings = 1
class InferenceEngine(object):
def __init__(self,
model_dir,
conf_file,
type=SamplerType.MetropolisHastings):
# Read model configuration.
config = ModelConfig()
conf_file_path = os.path.join(model_dir, conf_file)
load_prototxt(conf_file_path, config)
self.__model = TopicModel(model_dir, config)
self.__config = config
# Initialize the sampler according to the configuration.
if type == SamplerType.GibbsSampling:
self.__sampler = GibbsSampler(self.__model)
elif type == SamplerType.MetropolisHastings:
self.__sampler = MHSampler(self.__model)
def infer(self, input, doc):
"""Perform LDA topic inference on input, and store the results in doc.
Args:
input: a list of strings after tokenization.
doc: LDADoc type or SLDADoc type.
"""
fix_random_seed()
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for token in input:
id_ = self.__model.term_id(token)
if id_ != OOV:
init_topic = rand_k(self.__model.num_topics())
doc.add_token(Token(init_topic, id_))
self.lda_infer(doc, 20, 50)
elif isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for sent in input:
words = []
for token in sent:
id_ = self.__model.term_id(token)
if id_ != OOV:
words.append(id_)
init_topic = rand_k(self.__model.num_topics())
doc.add_sentence(Sentence(init_topic, words))
self.slda_infer(doc, 20, 50)
else:
logger.error("Wrong Doc Type!")
def lda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def slda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def model_type(self):
return self.__model.type()
def get_model(self):
return self.__model
def get_config(self):
return self.__config
import os
from collections import OrderedDict
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_webpage.vocab import Vocab, WordCount
class TopicModel(object):
"""Storage Structure of Topic model, including vocabulary and word topic count.
"""
def __init__(self, model_dir, config):
"""
Args:
model_dir: the path of model directory
config: ModelConfig class.
"""
self.__word_topic = None # Model parameter of word topic.
self.__vocab = Vocab() # Vocab data structure of model.
self.__num_topics = config.num_topics # Number of topics.
self.__alpha = config.alpha
self.__alpha_sum = self.__alpha * self.__num_topics
self.__beta = config.beta
self.__beta_sum = None
self.__type = config.type # Model type.
self.__topic_sum = np.zeros(
self.__num_topics,
dtype="int64") # Accum sum of each topic in word topic.
self.__topic_words = [[] for _ in range(self.__num_topics)]
word_topic_path = os.path.join(model_dir, config.word_topic_file)
vocab_path = os.path.join(model_dir, config.vocab_file)
self.load_model(word_topic_path, vocab_path)
def term_id(self, term):
return self.__vocab.get_id(term)
def load_model(self, word_topic_path, vocab_path):
# Loading vocabulary
self.__vocab.load(vocab_path)
self.__beta_sum = self.__beta * self.__vocab.size()
self.__word_topic = [{} for _ in range(self.__vocab.size())]  # One topic->count dict per word.
self.__load_word_dict(word_topic_path)
logger.info(
"Model Info: #num_topics=%d #vocab_size=%d alpha=%f beta=%f" %
(self.num_topics(), self.vocab_size(), self.alpha(), self.beta()))
def word_topic_value(self, word_id, topic_id):
"""Return value of specific word under specific topic in the model.
"""
word_dict = self.__word_topic[word_id]
if topic_id not in word_dict:
return 0
return word_dict[topic_id]
def word_topic(self, term_id):
"""Return the topic distribution of a word.
"""
return self.__word_topic[term_id]
def topic_sum_value(self, topic_id):
return self.__topic_sum[topic_id]
def topic_sum(self):
return self.__topic_sum
def num_topics(self):
return self.__num_topics
def vocab_size(self):
return self.__vocab.size()
def alpha(self):
return self.__alpha
def alpha_sum(self):
return self.__alpha_sum
def beta(self):
return self.__beta
def beta_sum(self):
return self.__beta_sum
def type(self):
return self.__type
def __load_word_dict(self, word_dict_path):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
with open(word_dict_path, 'r') as f:
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
term_id = int(fields[0])
assert term_id < self.vocab_size(), "Term id out of range!"
assert term_id >= 0, "Term id out of range!"
for i in range(1, len(fields)):
topic_count = fields[i].split(":")
assert len(topic_count) == 2, "Topic count format error!"
topic_id = int(topic_count[0])
assert topic_id >= 0, "Topic out of range!"
assert topic_id < self.__num_topics, "Topic out of range!"
count = int(topic_count[1])
assert count >= 0, "Topic count error!"
self.__word_topic[term_id][topic_id] = count
self.__topic_sum[topic_id] += count
self.__topic_words[topic_id].append(
WordCount(term_id, count))
new_dict = OrderedDict()
for key in sorted(self.__word_topic[term_id]):
new_dict[key] = self.__word_topic[term_id][key]
self.__word_topic[term_id] = new_dict
def get_vocab(self):
return self.__vocab.vocabulary()
def topic_words(self):
return self.__topic_words
import os
import paddlehub as hub
from paddlehub.module.module import moduleinfo
from paddlehub.common.logger import logger
from lda_webpage.inference_engine import InferenceEngine
from lda_webpage.document import LDADoc
from lda_webpage.semantic_matching import SemanticMatching, WordAndDis
from lda_webpage.tokenizer import LACTokenizer, SimpleTokenizer
from lda_webpage.config import ModelType
from lda_webpage.vocab import Vocab, WordCount
@moduleinfo(
name="lda_webpage",
version="1.0.0",
summary=
"This is a PaddleHub Module for LDA topic model in webpage dataset, where we can calculate doc distance, calculate the similarity between query and document, etc.",
author="DesmonDay",
author_email="",
type="nlp/semantic_model")
class TopicModel(hub.Module):
def _initialize(self):
"""
Initialize with the necessary elements.
"""
self.model_dir = os.path.join(self.directory, 'webpage')
self.conf_file = 'lda.conf'
self.__engine = InferenceEngine(self.model_dir, self.conf_file)
self.vocab_path = os.path.join(self.model_dir, 'vocab_info.txt')
lac = hub.Module(name="lac")
# self.__tokenizer = SimpleTokenizer(self.vocab_path)
self.__tokenizer = LACTokenizer(self.vocab_path, lac)
self.vocabulary = self.__engine.get_model().get_vocab()
self.config = self.__engine.get_config()
self.topic_words = self.__engine.get_model().topic_words()
self.topic_sum_table = self.__engine.get_model().topic_sum()
def take_elem(word_count):
return word_count.count
for i in range(self.config.num_topics):
self.topic_words[i].sort(key=take_elem, reverse=True)
logger.info("Finish initialization.")
def cal_doc_distance(self, doc_text1, doc_text2):
"""
This interface calculates the distance between documents.
Args:
doc_text1(str): the input document text 1.
doc_text2(str): the input document text 2.
Returns:
jsd(float): Jensen-Shannon Divergence distance of two documents.
hd(float): Hellinger Distance of two documents.
"""
doc1_tokens = self.__tokenizer.tokenize(doc_text1)
doc2_tokens = self.__tokenizer.tokenize(doc_text2)
# Document topic inference.
doc1, doc2 = LDADoc(), LDADoc()
self.__engine.infer(doc1_tokens, doc1)
self.__engine.infer(doc2_tokens, doc2)
# To calculate jsd, we need dense document topic distribution.
dense_dict1 = doc1.dense_topic_dist()
dense_dict2 = doc2.dense_topic_dist()
# Calculate the distance between distributions.
# The smaller the distance, the higher the document semantic similarity.
sm = SemanticMatching()
jsd = sm.jensen_shannon_divergence(dense_dict1, dense_dict2)
hd = sm.hellinger_distance(dense_dict1, dense_dict2)
return jsd, hd
def cal_doc_keywords_similarity(self, document, top_k=10):
"""
This interface can be used to find the top k keywords of a document.
Args:
document(str): the input document text.
top_k(int): top k keywords of this document.
Returns:
results(list): contains top_k keywords and their
corresponding similarity compared to document.
"""
d_tokens = self.__tokenizer.tokenize(document)
# Do topic inference on documents to obtain topic distribution.
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
items = []
words = set()
for word in d_tokens:
if word in words:
continue
words.add(word)
wd = WordAndDis()
wd.word = word
sm = SemanticMatching()
wd.distance = sm.likelihood_based_similarity(
terms=[word],
doc_topic_dist=doc_topic_dist,
model=self.__engine.get_model())
items.append(wd)
def take_elem(word_dis):
return word_dis.distance
items.sort(key=take_elem, reverse=True)
results = []
size = len(items)
for i in range(top_k):
if i >= size:
break
results.append({
"word": items[i].word,
"similarity": items[i].distance
})
return results
def cal_query_doc_similarity(self, query, document):
"""
This interface calculates the similarity between query and document.
Args:
query(str): the input query text.
document(str): the input document text.
Returns:
lda_sim(float): likelihood based similarity between query and document based on LDA.
"""
q_tokens = self.__tokenizer.tokenize(query)
d_tokens = self.__tokenizer.tokenize(document)
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
sm = SemanticMatching()
lda_sim = sm.likelihood_based_similarity(q_tokens, doc_topic_dist,
self.__engine.get_model())
return lda_sim
def infer_doc_topic_distribution(self, document):
"""
This interface infers the topic distribution of document.
Args:
document(str): the input document text.
Returns:
results(list): returns the topic distribution of document.
"""
tokens = self.__tokenizer.tokenize(document)
if tokens == []:
return []
results = []
doc = LDADoc()
self.__engine.infer(tokens, doc)
topics = doc.sparse_topic_dist()
for topic in topics:
results.append({"topic id": topic.tid, "distribution": topic.prob})
return results
def show_topic_keywords(self, topic_id, k=10):
"""
This interface returns the top k keywords under a specific topic.
Args:
topic_id(int): the id of the topic to inspect.
k(int): number of top keywords to return.
Returns:
results(dict): contains specific topic's keywords and
corresponding probability.
"""
EPS = 1e-8
results = {}
if 0 <= topic_id < self.config.num_topics:
k = min(k, len(self.topic_words[topic_id]))
for i in range(k):
prob = self.topic_words[topic_id][i].count / \
(self.topic_sum_table[topic_id] + EPS)
results[self.vocabulary[self.topic_words[topic_id]
[i].word_id]] = prob
return results
else:
logger.error("%d is out of range!" % topic_id)
import os
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_webpage.document import LDADoc, SLDADoc, Token, Sentence
from lda_webpage.vose_alias import VoseAlias
from lda_webpage.util import rand, rand_k
class Sampler(object):
def __init__(self):
pass
def sample_doc(self, doc):
"""Sample LDA or SLDA topics for documents.
"""
raise NotImplementedError
class MHSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
self.__topic_indexes = None
self.__alias_tables = None
self.__prob_sum = None
self.__beta_alias = VoseAlias()
self.__beta_prior_sum = None
self.__mh_steps = 2
self.__construct_alias_table()
def __construct_alias_table(self):
"""Construct alias table for all words.
"""
logger.info("Construct alias table for alias sampling method.")
vocab_size = self.__model.vocab_size()
self.__topic_indexes = [[] for _ in range(vocab_size)]
self.__alias_tables = [VoseAlias() for _ in range(vocab_size)]
self.__prob_sum = np.zeros(vocab_size)
# Construct each word's alias table (prior is not included).
for i in tqdm(range(vocab_size)):
dist = []
prob_sum = 0
for key in self.__model.word_topic(i):
topic_id = key
word_topic_count = self.__model.word_topic(i)[key]
topic_sum = self.__model.topic_sum_value(topic_id)
self.__topic_indexes[i].append(topic_id)
q = word_topic_count / (topic_sum + self.__model.beta_sum())
dist.append(q)
prob_sum += q
self.__prob_sum[i] = prob_sum
if len(dist) > 0:
dist = np.array(dist, dtype=np.float64)  # np.float is deprecated in NumPy; use np.float64.
self.__alias_tables[i].initialize(dist)
# Build prior parameter beta's alias table.
beta_dist = self.__model.beta() / (
self.__model.topic_sum() + self.__model.beta_sum())
self.__beta_prior_sum = np.sum(beta_dist)
self.__beta_alias.initialize(beta_dist)
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
new_topic = token.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, token)
new_topic = self.__word_proposal(doc, token, doc_proposed_topic)
return new_topic
def __sample_sentence(self, doc, sent):
new_topic = sent.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, sent)
new_topic = self.__word_proposal(doc, sent, doc_proposed_topic)
return new_topic
def __doc_proposal(self, doc, token):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
if dart < doc.size():
token_index = int(dart)
new_topic = doc.token(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
if dart < doc.size():
token_index = int(dart)
new_topic = doc.sent(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __word_proposal(self, doc, token, old_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
new_topic = self.__propose(token.id)
if new_topic != old_topic:
proposal_old = self.__word_proposal_distribution(
token.id, old_topic)
proposal_new = self.__word_proposal_distribution(
token.id, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
new_topic = old_topic
for word_id in sent.tokens:
new_topic = self.__propose(word_id)
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__word_proposal_distribution(
word_id, old_topic)
proposal_new = self.__word_proposal_distribution(
word_id, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
new_topic = (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __proportional_function(self, doc, token, new_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dt_alpha = doc.topic_sum(new_topic) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(
token.id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
return dt_alpha * wt_beta / t_sum_beta_sum
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
result = doc.topic_sum(new_topic) + self.__model.alpha()
if new_topic == old_topic:
result -= 1
for word_id in sent.tokens:
wt_beta = self.__model.word_topic_value(
word_id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
wt_beta -= 1
t_sum_beta_sum -= 1
result *= wt_beta / t_sum_beta_sum
return result
else:
logger.error("Wrong input argument type!")
def __word_proposal_distribution(self, word_id, topic):
wt_beta = self.__model.word_topic_value(word_id,
topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
topic) + self.__model.beta_sum()
return wt_beta / t_sum_beta_sum
def __doc_proposal_distribution(self, doc, topic):
return doc.topic_sum(topic) + self.__model.alpha()
def __propose(self, word_id):
dart = rand() * (self.__prob_sum[word_id] + self.__beta_prior_sum)
if dart < self.__prob_sum[word_id]:
idx = self.__alias_tables[word_id].generate()
topic = self.__topic_indexes[word_id][idx]
else:
topic = self.__beta_alias.generate()
return topic
class GibbsSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
old_topic = token.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
for i in range(num_topics):
dt_alpha = doc.topic_sum(i) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(token.id,
i) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(i) + self.__model.beta_sum()
if i == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
prob[i] = dt_alpha * wt_beta / t_sum_beta_sum
sum_ += prob[i]
accum_prob[i] = prob[i] if i == 0 else accum_prob[i - 1] + prob[i]
dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for i in range(1, num_topics):
if accum_prob[i - 1] < dart <= accum_prob[i]:
return i
return num_topics - 1
def __sample_sentence(self, doc, sent):
old_topic = sent.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
for t in range(num_topics):
dt_alpha = doc.topic_sum(t) + self.__model.alpha()
t_sum_beta_sum = self.__model.topic_sum_value(t) + self.__model.beta_sum()
if t == old_topic:
if dt_alpha > 1:
dt_alpha -= 1
if t_sum_beta_sum > 1:
t_sum_beta_sum -= 1
prob[t] = dt_alpha
for i in range(len(sent.tokens)):
w = sent.tokens[i]
wt_beta = self.__model.word_topic_value(
w, t) + self.__model.beta()
if t == old_topic and wt_beta > 1:
wt_beta -= 1
# Note: if the sentence is very long, the repeated multiplications below
# can underflow and lose numerical precision.
prob[t] *= wt_beta / t_sum_beta_sum
sum_ += prob[t]
accum_prob[t] = prob[t] if t == 0 else accum_prob[t - 1] + prob[t]
dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for t in range(1, num_topics):
if accum_prob[t - 1] < dart <= accum_prob[t]:
return t
return num_topics - 1
import os
import numpy as np
from paddlehub.common.logger import logger
from lda_webpage.vocab import OOV
EPS = 1e-06
class WordAndDis(object):
def __init__(self):
self.word = None
self.distance = None
class SemanticMatching(object):
def __init__(self):
pass
def l2_norm(self, vec):
"""Calculate the length of vector.
"""
result = np.sqrt(np.sum(vec**2))
return result
def cosine_similarity(self, vec1, vec2):
norm1 = self.l2_norm(vec1)
norm2 = self.l2_norm(vec2)
result = np.sum(vec1 * vec2) / norm1 / norm2
return result
def likelihood_based_similarity(self, terms, doc_topic_dist, model):
"""
Args:
terms: list of strings
doc_topic_dist: list of Topic class
model: TopicModel class
"""
num_of_term_in_vocab = 0
result = 0
for i in range(len(terms)):
term_id = model.term_id(terms[i])
if term_id == OOV:
continue
num_of_term_in_vocab += 1
for j in range(len(doc_topic_dist)):
topic_id = doc_topic_dist[j].tid
prob = doc_topic_dist[j].prob
result += model.word_topic_value(term_id, topic_id) * 1.0 / \
model.topic_sum_value(topic_id) * prob
if num_of_term_in_vocab == 0:
return result
return result / num_of_term_in_vocab
def kullback_leibler_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist2[dist2 < EPS] = EPS
result = np.sum(dist1 * np.log(dist1 / dist2))
return result
def jensen_shannon_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist1[dist1 < EPS] = EPS
dist2[dist2 < EPS] = EPS
mean = (dist1 + dist2) * 0.5
jsd = self.kullback_leibler_divergence(dist1, mean) * 0.5 + \
self.kullback_leibler_divergence(dist2, mean) * 0.5
return jsd
def hellinger_distance(self, dist1, dist2):
assert dist1.shape == dist2.shape
result = np.sum((np.sqrt(dist1) - np.sqrt(dist2))**2)
result = np.sqrt(result) * 0.7071067812
return result
import os
import numpy as np
from paddlehub.common.logger import logger
class Tokenizer(object):
"""Base tokenizer class.
"""
def __init__(self):
pass
def tokenize(self, text):
raise NotImplementedError
class SimpleTokenizer(Tokenizer):
"""Simple version FMM(Forward Maximun Matching) word tokenizer. This tokenizer can only
be used in topic model demo, but not in real business application scenarios.
Notes: This tokenizer can only recognize the words in the corresponding vocab file.
"""
def __init__(self, vocab_path):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__load_vocab(vocab_path)
def tokenize(self, text):
"""Tokenize the input string `text`, and return the tokenize result.
"""
text_len = len(text)
result = []
i = 0
while i < text_len:
word = found_word = ""
# Deal with English characters.
if self.__is_eng_char(text[i]):
for j in range(i, text_len + 1):
if j < text_len and self.__is_eng_char(text[j]):
word += self.__tolower(text[j])
else:
# Forward matching by character granularity.
if word in self.__vocab:
result.append(word)
i = j - 1
break
else:
for j in range(i, min(i + self.__max_word_len, text_len)):
word += text[j]
if word in self.__vocab:
found_word = word
if len(found_word) > 0:
result.append(found_word)
i += len(found_word) - 1
i += 1
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def __is_eng_char(self, c):
"""Check whether char c is an English character.
"""
return (c >= 'A' and c <= 'Z') or (c >= 'a' and c <= 'z')
def __tolower(self, c):
"""Return the lowercase character of the corresponding character, or return
the original character if there is no corresponding lowercase character.
"""
return c.lower()
class LACTokenizer(Tokenizer):
def __init__(self, vocab_path, lac):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__lac = lac
self.__load_vocab(vocab_path)
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def tokenize(self, text):
results = self.__lac.lexical_analysis(
texts=[text], use_gpu=True, batch_size=1, return_tag=True)
# Change English words to lower case.
# And just preserve the word in vocab.
words = results[0]["word"]
result = []
for word in words:
word = word.lower()
if word in self.__vocab:
result.append(word)
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
import time
import yaml
import numpy as np
from paddlehub.common.logger import logger
from lda_webpage.config import ModelType
def load_prototxt(config_file, config):
"""
Args:
config_file: model configuration file.
config: ModelConfig class
"""
logger.info("Loading LDA config.")
with open(config_file, 'r') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
if yaml_dict["type"] == "LDA":
config.type = ModelType.LDA
else:
config.type = ModelType.SLDA
config.num_topics = yaml_dict["num_topics"]
config.alpha = yaml_dict["alpha"]
config.beta = yaml_dict["beta"]
config.word_topic_file = yaml_dict["word_topic_file"]
config.vocab_file = yaml_dict["vocab_file"]
def fix_random_seed(seed=2147483647):
np.random.seed(seed)
def rand(min_=0, max_=1):
return np.random.uniform(low=min_, high=max_)
def rand_k(k):
"""Returns an integer float number between [0, k - 1].
"""
return int(rand() * k)
def timeit(f):
"""Return time cost of function f.
"""
def timed(*args, **kwargs):
start_time = time.time()
result = f(*args, **kwargs)
end_time = time.time()
print(" [-] %s : %2.5f sec" % (f.__name__, end_time - start_time))
return result
return timed
from paddlehub.common.logger import logger
OOV = -1
class WordCount(object):
def __init__(self, word_id, count):
self.word_id = word_id
self.count = count
class Vocab(object):
def __init__(self):
self.__term2id = {}
self.__id2term = {}
def get_id(self, word):
if word not in self.__term2id:
return OOV
return self.__term2id[word]
def load(self, vocab_file):
self.__term2id = {}
self.__id2term = {}
with open(vocab_file, 'r') as fin:
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(
fields) == 5, "Vocabulary file [%s] format error!" % (
vocab_file)
term = fields[1]
id_ = int(fields[2])
if term in self.__term2id:
logger.error("Duplicate word [%s] in vocab file!" % (term))
continue
self.__term2id[term] = id_
self.__id2term[id_] = term
def size(self):
return len(self.__term2id)
def vocabulary(self):
return self.__id2term
import os
import numpy as np
from paddlehub.common.logger import logger
from lda_webpage.util import rand, rand_k
class VoseAlias(object):
"""Vose's Alias Method.
"""
def __init__(self):
self.__alias = None
self.__prob = None # np.array
def initialize(self, distribution):
"""Initialize the alias table according to the input distribution
Arg:
distribution: Numpy array.
"""
size = distribution.shape[0]
self.__alias = np.zeros(size, dtype=np.int64)
self.__prob = np.zeros(size)
sum_ = np.sum(distribution)
p = distribution / sum_ * size # Scale up probability.
large, small = [], []
for i, p_ in enumerate(p):
if p_ < 1.0:
small.append(i)
else:
large.append(i)
while large and small:
l = small[0]
g = large[0]
small.pop(0)
large.pop(0)
self.__prob[l] = p[l]
self.__alias[l] = g
p[g] = p[g] + p[l] - 1 # A more numerically stable option.
if p[g] < 1.0:
small.append(g)
else:
large.append(g)
while large:
g = large[0]
large.pop(0)
self.__prob[g] = 1.0
while small:
l = small[0]
small.pop(0)
self.__prob[l] = 1.0
def generate(self):
"""Generate samples from given distribution.
"""
dart1 = rand_k(self.size())
dart2 = rand()  # Uniform in [0, 1); int(rand()) would always be 0.
return dart1 if dart2 < self.__prob[dart1] else self.__alias[dart1]
def size(self):
return self.__prob.shape[0]
## Model Overview
A topic model is a statistical model that clusters the latent semantic structure of documents in an unsupervised way, and SLDA (Sentence-LDA) is one such model. SLDA is an extension of LDA: LDA assumes that each word corresponds to a topic, whereas SLDA assumes that each sentence corresponds to a topic. This Module is based on a news-domain dataset built by Baidu.
<p align="center">
<img src="https://bj.bcebos.com/paddlehub/model/nlp/semantic_model/slda.png" hspace='10'/> <br />
</p>
For more details, please refer to the [SLDA paper](https://pdfs.semanticscholar.org/c311/778adb9484c86250e915aecd9714f4206050.pdf).
Note: this Module is contributed by the third-party developer DesmonDay.
## SLDA Model API
### infer_doc_topic_distribution(document)
Infers the topic distribution of a document.
**Parameters**
- document(str): the input document.
**Returns**
- results(list): the topic distribution of the document. Each element of the list is a dict containing a topic id and the probability of that topic.
### show_topic_keywords(topic_id, k=10)
Shows the keywords under a given topic; it can be used together with the topic distribution inference API.
**Parameters**
- topic_id(int): the topic id.
- k(int): the number of top keywords of the topic to return.
**Returns**
- results(dict): the top k keywords of the topic and the probability of each keyword under that topic.
### Code Example
Usage examples of the APIs are shown below.
``` python
import paddlehub as hub
slda_news = hub.Module(name="slda_news")
topic_dist = slda_news.infer_doc_topic_distribution("百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。")
# {378: 0.5, 804: 0.5}
keywords = slda_news.show_topic_keywords(topic_id=804, k=10)
# {'百度': 0.08269021676897842,
# '搜索': 0.04154762385123992,
# '推广': 0.026193527138926424,
# '贴吧': 0.02125616298078334,
# '排名': 0.019595252609963018,
# '关键词': 0.015173719446828477,
# '广告': 0.013552941381750894,
# '搜索引擎': 0.010038529194616577,
# '公司': 0.009388342219512786,
# '网站': 0.009173721627932065}
```
## View the Code
https://github.com/baidu/Familia
## Dependencies
paddlepaddle >= 1.8.2
paddlehub >= 1.8.0
## Release Note
* 1.0.0
First release
"""
This file defines the basic config information of LDA/SLDA model.
"""
class ModelType:
LDA = 0
SLDA = 1
class ModelConfig:
type = None
num_topics = None
alpha = None
beta = None
word_topic_file = None
vocab_file = None
import numpy as np
class Topic(object):
"""Basic data structure of topic, contains topic id and
corresponding probability.
"""
def __init__(self, tid, prob):
self.tid = tid # topic id
self.prob = prob # topic probability
class Token(object):
"""Basic storage unit of LDA documents, contains word id
and corresponding topic.
"""
def __init__(self, topic, id):
self.topic = topic
self.id = id
class Sentence(object):
"""Basic storage unit of SentenceLDA documents, contains word ids
of the sentence and its corresponding topic id.
"""
def __init__(self, topic, tokens):
self.topic = topic
self.tokens = tokens
class LDADoc(object):
"""The storage structure of LDA model's inference result.
"""
def __init__(self):
self._num_topics = None # Number of topics.
self._num_accum = None # Number of accumulated sample rounds.
self._alpha = None # Document prior parameter.
self._tokens = None # Storage structure of inference results.
self._topic_sum = None # Document's topic sum in one round samples.
self._accum_topic_sum = None # Accumulated results of topic sum.
def init(self, num_topics):
"""Initialize the LDADoc according to num_topics.
"""
self._num_topics = num_topics
self._num_accum = 0
self._tokens = []
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_token(self, token):
"""Add new word to current LDADoc.
Arg:
token: Token class object.
"""
assert token.topic >= 0, "Topic %d out of range!" % token.topic
assert token.topic < self._num_topics, "Topic %d out of range!" % token.topic
self._tokens.append(token)
self._topic_sum[token.topic] += 1
def token(self, index):
return self._tokens[index]
def set_topic(self, index, new_topic):
"""Set the index word's topic to new_topic, and update the corresponding
topic distribution.
"""
assert new_topic >= 0, "Topic %d out of range!" % new_topic
assert new_topic < self._num_topics, "Topic %d out of range!" % new_topic
old_topic = self._tokens[index].topic
if new_topic == old_topic:
return
self._tokens[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def set_alpha(self, alpha):
self._alpha = alpha
def size(self):
"""Return number of words in LDADoc.
"""
return len(self._tokens)
def topic_sum(self, topic_id):
return self._topic_sum[topic_id]
def sparse_topic_dist(self, sort=True):
"""Return the topic distribution of documents in sparse format.
By default, it is sorted by topic probability in descending order.
"""
topic_dist = []
sum_ = np.sum(self._accum_topic_sum)
if sum_ == 0:
return topic_dist  # Empty or all-OOV document: return an empty list instead of None.
for i in range(0, self._num_topics):
if self._accum_topic_sum[i] == 0:
continue
topic_dist.append(Topic(i, self._accum_topic_sum[i] * 1.0 / sum_))
if sort:
def take_elem(topic):
return topic.prob
topic_dist.sort(key=take_elem, reverse=True)
if topic_dist is None:
topic_dist = []
return topic_dist
def dense_topic_dist(self):
"""Return the distribution of document topics in dense format,
taking into account the prior parameter alpha.
"""
dense_dist = np.zeros(self._num_topics)
if self.size() == 0:
return dense_dist
dense_dist = (
self._accum_topic_sum * 1.0 / self._num_accum + self._alpha) / (
self.size() + self._alpha * self._num_topics)
return dense_dist
def accumulate_topic_num(self):
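        """Add this round's topic counts to the accumulated totals; called once
        per post-burn-in sampling iteration, so the final distribution is an
        average over rounds.
        """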
self._accum_topic_sum += self._topic_sum
self._num_accum += 1
class SLDADoc(LDADoc):
"""Sentence LDA Document, inherited from LDADoc.
Add add_sentence interface.
"""
def __init__(self):
super().__init__()
self.__sentences = None
def init(self, num_topics):
"""Initialize the SLDADoc according to num_topics.
"""
self._num_topics = num_topics
self.__sentences = []
self._num_accum = 0
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_sentence(self, sent):
"""Add new sentence to current SLDADoc.
Arg:
sent: Sentence class object.
"""
assert sent.topic >= 0, "Topic %d out of range!" % (sent.topic)
assert sent.topic < self._num_topics, "Topic %d out of range!" % (
sent.topic)
self.__sentences.append(sent)
self._topic_sum[sent.topic] += 1
def set_topic(self, index, new_topic):
assert new_topic >= 0, "Topic %d out of range!" % (new_topic)
assert new_topic < self._num_topics, "Topic %d out of range!" % (
new_topic)
old_topic = self.__sentences[index].topic
if new_topic == old_topic:
return
self.__sentences[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def size(self):
"""Return number of sentences in SLDADoc.
"""
return len(self.__sentences)
def sent(self, index):
return self.__sentences[index]
import os
from paddlehub.common.logger import logger
from slda_news.config import ModelConfig
from slda_news.util import load_prototxt, fix_random_seed, rand_k
from slda_news.model import TopicModel
from slda_news.sampler import GibbsSampler, MHSampler
from slda_news.document import LDADoc, SLDADoc, Token, Sentence
from slda_news.vocab import OOV
class SamplerType:
GibbsSampling = 0
MetropolisHastings = 1
class InferenceEngine(object):
def __init__(self,
model_dir,
conf_file,
type=SamplerType.MetropolisHastings):
# Read model configuration.
config = ModelConfig()
conf_file_path = os.path.join(model_dir, conf_file)
load_prototxt(conf_file_path, config)
self.__model = TopicModel(model_dir, config)
self.__config = config
# Initialize the sampler according to the configuration.
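        # Metropolis-Hastings sampling is the default; Gibbs sampling can be selected via `type`.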
if type == SamplerType.GibbsSampling:
self.__sampler = GibbsSampler(self.__model)
elif type == SamplerType.MetropolisHastings:
self.__sampler = MHSampler(self.__model)
def infer(self, input, doc):
"""Perform LDA topic inference on input, and store the results in doc.
Args:
input: a list of strings after tokenization.
doc: LDADoc type or SLDADoc type.
"""
fix_random_seed()
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for token in input:
id_ = self.__model.term_id(token)
if id_ != OOV:
init_topic = rand_k(self.__model.num_topics())
doc.add_token(Token(init_topic, id_))
self.lda_infer(doc, 20, 50)
elif isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for sent in input:
words = []
for token in sent:
id_ = self.__model.term_id(token)
if id_ != OOV:
words.append(id_)
init_topic = rand_k(self.__model.num_topics())
doc.add_sentence(Sentence(init_topic, words))
self.slda_infer(doc, 20, 50)
else:
logger.error("Wrong Doc Type!")
def lda_infer(self, doc, burn_in_iter, total_iter):
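        """Sample the document for `total_iter` rounds, accumulating topic counts
        only after the first `burn_in_iter` burn-in rounds.
        """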
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def slda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def model_type(self):
return self.__model.type()
def get_model(self):
return self.__model
def get_config(self):
return self.__config
import os
from collections import OrderedDict
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from slda_news.vocab import Vocab, WordCount
class TopicModel(object):
"""Storage Structure of Topic model, including vocabulary and word topic count.
"""
def __init__(self, model_dir, config):
"""
Args:
model_dir: the path of model directory
config: ModelConfig class.
"""
self.__word_topic = None # Model parameter of word topic.
self.__vocab = Vocab() # Vocab data structure of model.
self.__num_topics = config.num_topics # Number of topics.
self.__alpha = config.alpha
self.__alpha_sum = self.__alpha * self.__num_topics
self.__beta = config.beta
self.__beta_sum = None
self.__type = config.type # Model type.
self.__topic_sum = np.zeros(
self.__num_topics,
dtype="int64") # Accum sum of each topic in word topic.
self.__topic_words = [[] for _ in range(self.__num_topics)]
word_topic_path = os.path.join(model_dir, config.word_topic_file)
vocab_path = os.path.join(model_dir, config.vocab_file)
self.load_model(word_topic_path, vocab_path)
def term_id(self, term):
return self.__vocab.get_id(term)
def load_model(self, word_topic_path, vocab_path):
# Loading vocabulary
self.__vocab.load(vocab_path)
self.__beta_sum = self.__beta * self.__vocab.size()
        self.__word_topic = [{} for _ in range(self.__vocab.size())]  # one dict per word: topic_id -> count
self.__load_word_dict(word_topic_path)
logger.info(
"Model Info: #num_topics=%d #vocab_size=%d alpha=%f beta=%f" %
(self.num_topics(), self.vocab_size(), self.alpha(), self.beta()))
def word_topic_value(self, word_id, topic_id):
"""Return value of specific word under specific topic in the model.
"""
word_dict = self.__word_topic[word_id]
if topic_id not in word_dict:
return 0
return word_dict[topic_id]
def word_topic(self, term_id):
"""Return the topic distribution of a word.
"""
return self.__word_topic[term_id]
def topic_sum_value(self, topic_id):
return self.__topic_sum[topic_id]
def topic_sum(self):
return self.__topic_sum
def num_topics(self):
return self.__num_topics
def vocab_size(self):
return self.__vocab.size()
def alpha(self):
return self.__alpha
def alpha_sum(self):
return self.__alpha_sum
def beta(self):
return self.__beta
def beta_sum(self):
return self.__beta_sum
def type(self):
return self.__type
def __load_word_dict(self, word_dict_path):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
with open(word_dict_path, 'r') as f:
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
term_id = int(fields[0])
assert term_id < self.vocab_size(), "Term id out of range!"
assert term_id >= 0, "Term id out of range!"
for i in range(1, len(fields)):
topic_count = fields[i].split(":")
assert len(topic_count) == 2, "Topic count format error!"
topic_id = int(topic_count[0])
assert topic_id >= 0, "Topic out of range!"
assert topic_id < self.__num_topics, "Topic out of range!"
count = int(topic_count[1])
assert count >= 0, "Topic count error!"
self.__word_topic[term_id][topic_id] = count
self.__topic_sum[topic_id] += count
self.__topic_words[topic_id].append(
WordCount(term_id, count))
new_dict = OrderedDict()
for key in sorted(self.__word_topic[term_id]):
new_dict[key] = self.__word_topic[term_id][key]
self.__word_topic[term_id] = new_dict
def get_vocab(self):
return self.__vocab.vocabulary()
def topic_words(self):
return self.__topic_words
import os
import paddlehub as hub
from paddlehub.module.module import moduleinfo
from paddlehub.common.logger import logger
from slda_news.inference_engine import InferenceEngine
from slda_news.document import SLDADoc
from slda_news.semantic_matching import SemanticMatching, WordAndDis
from slda_news.tokenizer import LACTokenizer, SimpleTokenizer
from slda_news.config import ModelType
from slda_news.vocab import Vocab, WordCount
@moduleinfo(
name="slda_news",
version="1.0.0",
    summary=
    "This is a PaddleHub Module for the SLDA topic model trained on a news dataset, which can be used to infer the topic distribution of a document.",
author="DesmonDay",
author_email="",
type="nlp/semantic_model")
class TopicModel(hub.Module):
def _initialize(self):
"""Initialize with the necessary elements.
"""
self.model_dir = os.path.join(self.directory, 'news')
self.conf_file = 'slda.conf'
self.__engine = InferenceEngine(self.model_dir, self.conf_file)
self.vocab_path = os.path.join(self.model_dir, 'vocab_info.txt')
lac = hub.Module(name="lac")
# self.__tokenizer = SimpleTokenizer(self.vocab_path)
self.__tokenizer = LACTokenizer(self.vocab_path, lac)
self.vocabulary = self.__engine.get_model().get_vocab()
self.config = self.__engine.get_config()
self.topic_words = self.__engine.get_model().topic_words()
self.topic_sum_table = self.__engine.get_model().topic_sum()
def take_elem(word_count):
return word_count.count
for i in range(self.config.num_topics):
self.topic_words[i].sort(key=take_elem, reverse=True)
logger.info("Finish Initialization.")
def infer_doc_topic_distribution(self, document):
"""
        This interface infers the topic distribution of a document.
        Args:
            document(str): the input document text.
        Returns:
            results(list): the topic distribution of the document.
"""
tokens = self.__tokenizer.tokenize(document)
if tokens == []:
return []
results = []
sentences = []
sent = []
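        # Group the tokens into pseudo-sentences of 5 words each, since SentenceLDA
        # assigns one topic per sentence rather than per word.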
for i in range(len(tokens)):
sent.append(tokens[i])
if len(sent) % 5 == 0:
sentences.append(sent)
sent = []
if len(sent) > 0:
sentences.append(sent)
doc = SLDADoc()
self.__engine.infer(sentences, doc)
topics = doc.sparse_topic_dist()
for topic in topics:
results.append({"topic id": topic.tid, "distribution": topic.prob})
return results
def show_topic_keywords(self, topic_id, k=10):
"""
        This interface returns the top k keywords under a specific topic.
        Args:
            topic_id(int): id of the topic to inspect.
            k(int): number of top keywords to return.
        Returns:
            results(dict): the keywords of the given topic and their corresponding
            probabilities.
"""
EPS = 1e-8
results = {}
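        # keyword probability = word count under the topic / total word count of the topic
        # (EPS guards against division by zero).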
if 0 <= topic_id < self.config.num_topics:
k = min(k, len(self.topic_words[topic_id]))
for i in range(k):
prob = self.topic_words[topic_id][i].count / \
(self.topic_sum_table[topic_id] + EPS)
results[self.vocabulary[self.topic_words[topic_id]
[i].word_id]] = prob
return results
        else:
            logger.error("%d is out of range!" % topic_id)
            return results
import os
import numpy as np
from paddlehub.common.logger import logger
from slda_news.vocab import OOV
EPS = 1e-06
class WordAndDis(object):
def __init__(self):
self.word = None
self.distance = None
class SemanticMatching(object):
def __init__(self):
pass
def l2_norm(self, vec):
"""Calculate the length of vector.
"""
result = np.sqrt(np.sum(vec**2))
return result
def cosine_similarity(self, vec1, vec2):
norm1 = self.l2_norm(vec1)
norm2 = self.l2_norm(vec2)
result = np.sum(vec1 * vec2) / norm1 / norm2
return result
def likelihood_based_similarity(self, terms, doc_topic_dist, model):
"""
Args:
terms: list of strings
doc_topic_dist: list of Topic class
model: TopicModel class
"""
num_of_term_in_vocab = 0
result = 0
for i in range(len(terms)):
term_id = model.term_id(terms[i])
if term_id == OOV:
continue
num_of_term_in_vocab += 1
for j in range(len(doc_topic_dist)):
topic_id = doc_topic_dist[j].tid
prob = doc_topic_dist[j].prob
result += model.word_topic_value(term_id, topic_id) * 1.0 / \
model.topic_sum_value(topic_id) * prob
if num_of_term_in_vocab == 0:
return result
return result / num_of_term_in_vocab
    def kullback_leibler_divergence(self, dist1, dist2):
        assert dist1.shape == dist2.shape
        # Clamp small values to EPS on copies so that log(0) is avoided and the
        # caller's arrays are not modified in place.
        dist1 = np.where(dist1 < EPS, EPS, dist1)
        dist2 = np.where(dist2 < EPS, EPS, dist2)
        result = np.sum(dist1 * np.log(dist1 / dist2))
        return result
def jensen_shannon_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
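        # JSD(P, Q) = 0.5 * KL(P || M) + 0.5 * KL(Q || M), where M = (P + Q) / 2.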
dist1[dist1 < EPS] = EPS
dist2[dist2 < EPS] = EPS
mean = (dist1 + dist2) * 0.5
jsd = self.kullback_leibler_divergence(dist1, mean) * 0.5 + \
self.kullback_leibler_divergence(dist2, mean) * 0.5
return jsd
def hellinger_distance(self, dist1, dist2):
assert dist1.shape == dist2.shape
result = np.sum((np.sqrt(dist1) - np.sqrt(dist2))**2)
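        # Scale by 1 / sqrt(2) (≈ 0.7071067812) so the distance falls in [0, 1].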
result = np.sqrt(result) * 0.7071067812
return result
import time
import yaml
import numpy as np
from paddlehub.common.logger import logger
from slda_news.config import ModelType
def load_prototxt(config_file, config):
"""
Args:
config_file: model configuration file.
config: ModelConfig class
"""
logger.info("Loading SLDA config.")
with open(config_file, 'r') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
if yaml_dict["type"] == "LDA":
config.type = ModelType.LDA
else:
config.type = ModelType.SLDA
config.num_topics = yaml_dict["num_topics"]
config.alpha = yaml_dict["alpha"]
config.beta = yaml_dict["beta"]
config.word_topic_file = yaml_dict["word_topic_file"]
config.vocab_file = yaml_dict["vocab_file"]
def fix_random_seed(seed=2147483647):
np.random.seed(seed)
def rand(min_=0, max_=1):
return np.random.uniform(low=min_, high=max_)
def rand_k(k):
"""Returns an integer float number between [0, k - 1].
"""
return int(rand() * k)
def timeit(f):
"""Return time cost of function f.
"""
def timed(*args, **kwargs):
start_time = time.time()
result = f(*args, **kwargs)
end_time = time.time()
print(" [-] %s : %2.5f sec" % (f.__name__, end_time - start_time))
return result
return timed
"""
This file defines the basic config information of LDA/SLDA model.
"""
class ModelType:
LDA = 0
SLDA = 1
class ModelConfig:
type = None
num_topics = None
alpha = None
beta = None
word_topic_file = None
vocab_file = None