import numpy as np


class Topic(object):
    """Basic data structure of a topic: a topic id and its probability."""

    def __init__(self, tid, prob):
        self.tid = tid  # topic id
        self.prob = prob  # topic probability


class Token(object):
    """Basic storage unit of LDA documents: a word id and its assigned topic."""

    def __init__(self, topic, id):
        self.topic = topic
        self.id = id


class Sentence(object):
    """Basic storage unit of SentenceLDA documents.

    Holds the word ids of one sentence and the topic id assigned to it.
    """

    def __init__(self, topic, tokens):
        self.topic = topic
        self.tokens = tokens


class LDADoc(object):
    """The storage structure of the LDA model's inference result."""

    def __init__(self):
        self._num_topics = None  # Number of topics.
        self._num_accum = None  # Number of accumulated sample rounds.
        self._alpha = None  # Document prior parameter.
        self._tokens = None  # Storage structure of inference results.
        self._topic_sum = None  # Document's topic sum in one round samples.
        self._accum_topic_sum = None  # Accumulated results of topic sum.

    def init(self, num_topics):
        """Initialize (or reset) the document for ``num_topics`` topics."""
        self._num_topics = num_topics
        self._num_accum = 0
        self._tokens = []
        self._topic_sum = np.zeros(self._num_topics)
        self._accum_topic_sum = np.zeros(self._num_topics)

    def _check_topic(self, topic):
        """Assert that ``topic`` lies in ``[0, num_topics)``.

        Raises:
            AssertionError: if the topic id is out of range.
        """
        assert 0 <= topic < self._num_topics, "Topic %d out of range!" % topic

    def add_token(self, token):
        """Add a new word to the document.

        Args:
            token: Token class object.

        Raises:
            AssertionError: if ``token.topic`` is out of range.
        """
        self._check_topic(token.topic)
        self._tokens.append(token)
        self._topic_sum[token.topic] += 1

    def token(self, index):
        """Return the Token stored at position ``index``."""
        return self._tokens[index]

    def set_topic(self, index, new_topic):
        """Set the topic of the word at ``index`` to ``new_topic``.

        The per-round topic counts are updated accordingly.

        Raises:
            AssertionError: if ``new_topic`` is out of range.
        """
        self._check_topic(new_topic)
        old_topic = self._tokens[index].topic
        if new_topic == old_topic:
            return
        self._tokens[index].topic = new_topic
        self._topic_sum[old_topic] -= 1
        self._topic_sum[new_topic] += 1

    def set_alpha(self, alpha):
        """Set the document prior parameter alpha."""
        self._alpha = alpha

    def size(self):
        """Return the number of words in the document."""
        return len(self._tokens)

    def topic_sum(self, topic_id):
        """Return the current-round count of units assigned to ``topic_id``."""
        return self._topic_sum[topic_id]

    def sparse_topic_dist(self, sort=True):
        """Return the document's topic distribution in sparse format.

        Only topics with a non-zero accumulated count are included.

        Args:
            sort: if True (default), order the result by descending topic
                probability; otherwise keep ascending topic-id order.

        Returns:
            A list of Topic objects. The list is empty when nothing has been
            accumulated yet.
        """
        total = np.sum(self._accum_topic_sum)
        if total == 0:
            # Nothing accumulated: return an empty list (not None) so callers
            # can always iterate over the result.
            return []
        topic_dist = [
            Topic(tid, count * 1.0 / total)
            for tid, count in enumerate(self._accum_topic_sum)
            if count != 0
        ]
        if sort:
            topic_dist.sort(key=lambda topic: topic.prob, reverse=True)
        return topic_dist

    def dense_topic_dist(self):
        """Return the document's topic distribution in dense format.

        The prior parameter alpha is taken into account. An empty document
        yields an all-zero vector.

        Note:
            The accumulated counts are averaged over the number of accumulated
            rounds, so ``accumulate_topic_num`` must have been called at least
            once for a non-empty document; otherwise the division by zero
            rounds produces non-finite values.

        Returns:
            A numpy array of length ``num_topics``.
        """
        if self.size() == 0:
            return np.zeros(self._num_topics)
        averaged = self._accum_topic_sum * 1.0 / self._num_accum
        normalizer = self.size() + self._alpha * self._num_topics
        return (averaged + self._alpha) / normalizer

    def accumulate_topic_num(self):
        """Add the current round's topic counts to the accumulated totals."""
        self._accum_topic_sum += self._topic_sum
        self._num_accum += 1


class SLDADoc(LDADoc):
    """Sentence LDA document, inherited from LDADoc.

    Adds the add_sentence interface; topics are assigned per sentence.
    """

    def __init__(self):
        super().__init__()
        self.__sentences = None

    def init(self, num_topics):
        """Initialize (or reset) the document for ``num_topics`` topics.

        Note that the token list of the base class is not created here; only
        the sentence storage is.
        """
        self._num_topics = num_topics
        self.__sentences = []
        self._num_accum = 0
        self._topic_sum = np.zeros(self._num_topics)
        self._accum_topic_sum = np.zeros(self._num_topics)

    def add_sentence(self, sent):
        """Add a new sentence to the document.

        Args:
            sent: Sentence class object.

        Raises:
            AssertionError: if ``sent.topic`` is out of range.
        """
        self._check_topic(sent.topic)
        self.__sentences.append(sent)
        self._topic_sum[sent.topic] += 1

    def set_topic(self, index, new_topic):
        """Set the topic of the sentence at ``index`` to ``new_topic``.

        The per-round topic counts are updated accordingly.

        Raises:
            AssertionError: if ``new_topic`` is out of range.
        """
        self._check_topic(new_topic)
        old_topic = self.__sentences[index].topic
        if new_topic == old_topic:
            return
        self.__sentences[index].topic = new_topic
        self._topic_sum[old_topic] -= 1
        self._topic_sum[new_topic] += 1

    def size(self):
        """Return the number of sentences in the document."""
        return len(self.__sentences)

    def sent(self, index):
        """Return the Sentence stored at position ``index``."""
        return self.__sentences[index]