Unverified commit 9b70b1f3 authored by SiMing Dai, committed by GitHub

Add topic model (#759)

Parent 3eece7b5
...@@ -48,10 +48,7 @@ class ChineseTextDetectionDB(hub.Module):
         try:
             import shapely, pyclipper
         except:
-            print(
-                'This module requires the shapely, pyclipper tools. The running enviroment does not meet the requirments. Please install the two packages.'
-            )
-            exit()
+            raise ImportError('This module requires the shapely, pyclipper tools. The running environment does not meet the requirements. Please install the two packages.')

     def _set_config(self):
         """
......
...@@ -48,10 +48,7 @@ class ChineseTextDetectionDBServer(hub.Module):
         try:
             import shapely, pyclipper
         except:
-            print(
-                'This module requires the shapely, pyclipper tools. The running enviroment does not meet the requirments. Please install the two packages.'
-            )
-            exit()
+            raise ImportError('This module requires the shapely, pyclipper tools. The running environment does not meet the requirements. Please install the two packages.')

     def _set_config(self):
         """
......
## Model Overview
A topic model is a statistical model that clusters the latent semantic structure of documents in an unsupervised way, and LDA (Latent Dirichlet Allocation) is one such topic model. Based on word co-occurrence statistics, LDA fits a word-document-topic distribution and thereby maps words and texts into a common semantic space. This Module is based on a news-domain dataset built by Baidu.
<p align="center">
<img src="https://bj.bcebos.com/paddlehub/model/nlp/semantic_model/lda.png" hspace='10'/> <br />
</p>
For more details, please refer to the [LDA paper](http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf).
Note: this Module is contributed by the third-party developer DesmonDay.
## LDA Model API
### cal_doc_distance(doc_text1, doc_text2)
Calculates the distance between two input documents, returning both the Jensen-Shannon divergence and the Hellinger distance.
**Parameters**
- doc_text1(str): the first input document.
- doc_text2(str): the second input document.
**Returns**
- jsd(float): the [Jensen-Shannon divergence](https://blog.csdn.net/FrankieHello/article/details/80614422?utm_source=copy) between the two documents.
- hd(float): the [Hellinger distance](http://blog.sina.com.cn/s/blog_85f1ffb70101e65d.html) between the two documents.
### cal_doc_keywords_similarity(document, top_k=10)
Finds the top k keywords of the input document together with their similarity to the original document.
**Parameters**
- document(str): the input document.
- top_k(int): the number of keywords to return.
**Returns**
- results(list): the keywords and their similarity to the original document. Each element of the list is a dict whose key is a keyword and whose value is that keyword's similarity to the original document.
### cal_query_doc_similarity(query, document)
Calculates the similarity between a short document (query) and a long document.
**Parameters**
- query(str): the input short document.
- document(str): the input long document.
**Returns**
- lda_sim(float): the similarity between the short document and the long document.
### infer_doc_topic_distribution(document)
Infers the topic distribution of a document.
**Parameters**
- document(str): the input document.
**Returns**
- results(list): the topic IDs and their probabilities in the document's topic distribution. Each element of the list is a dict with the keys "topic id" and "distribution".
### show_topic_keywords(topic_id, k=10)
Shows the keywords under a given topic; it can be combined with the topic-distribution inference API.
**Parameters**
- topic_id(int): the topic ID.
- k(int): the number of top keywords to return for the topic.
**Returns**
- results(dict): the top k keywords of the given topic and each keyword's probability under that topic.
### Code Example
Below are usage examples for some of the APIs; a further sketch combining the remaining APIs follows the block.
``` python
import paddlehub as hub
lda_news = hub.Module(name="lda_news")
jsd, hd = lda_news.cal_doc_distance(doc_text1="今天的天气如何,适合出去游玩吗", doc_text2="感觉今天的天气不错,可以出去玩一玩了")
# jsd = 0.003109, hd = 0.0573171
lda_sim = lda_news.cal_query_doc_similarity(query='百度搜索引擎', document='百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。百度超过千亿的中文网页数据库,可以瞬间找到相关的搜索结果。')
# LDA similarity = 0.06826
results = lda_news.cal_doc_keywords_similarity('百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。百度超过千亿的中文网页数据库,可以瞬间找到相关的搜索结果。')
# [{'word': '百度', 'similarity': 0.12943492762349573},
# {'word': '信息', 'similarity': 0.06139783578769882},
# {'word': '找到', 'similarity': 0.055296603463188265},
# {'word': '搜索', 'similarity': 0.04270794098349327},
# {'word': '全球', 'similarity': 0.03773627056367886},
# {'word': '超过', 'similarity': 0.03478658388202199},
# {'word': '相关', 'similarity': 0.026295857219683725},
# {'word': '获取', 'similarity': 0.021313585287833996},
# {'word': '中文', 'similarity': 0.020187103312009513},
# {'word': '搜索引擎', 'similarity': 0.007092890537169911}]
```
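The topic-distribution and topic-keyword APIs can be chained as in the minimal sketch below. It only uses the calls documented above; the concrete numbers depend on the released model files, so no output is listed.
``` python
import paddlehub as hub

lda_news = hub.Module(name="lda_news")

# Infer the topic distribution of a document. Each element of the result is a
# dict with the keys "topic id" and "distribution", sorted by probability.
topic_dist = lda_news.infer_doc_topic_distribution("百度是全球最大的中文搜索引擎")

if topic_dist:
    # Inspect the top 10 keywords of the most probable topic.
    top_topic = topic_dist[0]["topic id"]
    keywords = lda_news.show_topic_keywords(topic_id=top_topic, k=10)
    print(keywords)  # dict: keyword -> probability under this topic
```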
## View the Code
https://github.com/baidu/Familia
## Dependencies
paddlepaddle >= 1.8.2
paddlehub >= 1.8.0
## Release History
* 1.0.0
  Initial release
"""
This file defines the basic config information of LDA/SLDA model.
"""
class ModelType:
LDA = 0
SLDA = 1
class ModelConfig:
type = None
num_topics = None
alpha = None
beta = None
word_topic_file = None
vocab_file = None
import numpy as np
class Topic(object):
"""Basic data structure of topic, contains topic id and
corresponding probability.
"""
def __init__(self, tid, prob):
self.tid = tid # topic id
self.prob = prob # topic probability
class Token(object):
"""Basic storage unit of LDA documents, contains word id
and corresponding topic.
"""
def __init__(self, topic, id):
self.topic = topic
self.id = id
class Sentence(object):
"""Basic storage unit of SentenceLDA documents, contains word ids
of the sentence and its corresponding topic id.
"""
def __init__(self, topic, tokens):
self.topic = topic
self.tokens = tokens
class LDADoc(object):
"""The storage structure of LDA model's inference result.
"""
def __init__(self):
self._num_topics = None # Number of topics.
self._num_accum = None # Number of accumulated sample rounds.
self._alpha = None # Document prior parameter.
self._tokens = None # Storage structure of inference results.
self._topic_sum = None # Document's topic sum in one round samples.
self._accum_topic_sum = None # Accumulated results of topic sum.
def init(self, num_topics):
"""Initialize the LDADoc according to num_topics.
"""
self._num_topics = num_topics
self._num_accum = 0
self._tokens = []
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_token(self, token):
"""Add new word to current LDADoc.
Arg:
token: Token class object.
"""
assert token.topic >= 0, "Topic %d out of range!" % token.topic
assert token.topic < self._num_topics, "Topic %d out of range!" % token.topic
self._tokens.append(token)
self._topic_sum[token.topic] += 1
def token(self, index):
return self._tokens[index]
def set_topic(self, index, new_topic):
"""Set the index word's topic to new_topic, and update the corresponding
topic distribution.
"""
assert new_topic >= 0, "Topic %d out of range!" % new_topic
assert new_topic < self._num_topics, "Topic %d out of range!" % new_topic
old_topic = self._tokens[index].topic
if new_topic == old_topic:
return
self._tokens[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def set_alpha(self, alpha):
self._alpha = alpha
def size(self):
"""Return number of words in LDADoc.
"""
return len(self._tokens)
def topic_sum(self, topic_id):
return self._topic_sum[topic_id]
def sparse_topic_dist(self, sort=True):
"""Return the topic distribution of documents in sparse format.
By default, it is sorted according to the topic probability
under the descending order.
"""
topic_dist = []
sum_ = np.sum(self._accum_topic_sum)
if sum_ == 0:
return
for i in range(0, self._num_topics):
if self._accum_topic_sum[i] == 0:
continue
topic_dist.append(Topic(i, self._accum_topic_sum[i] * 1.0 / sum_))
if sort:
def take_elem(topic):
return topic.prob
topic_dist.sort(key=take_elem, reverse=True)
if topic_dist is None:
topic_dist = []
return topic_dist
def dense_topic_dist(self):
"""Return the distribution of document topics in dense format,
taking into account the prior parameter alpha.
"""
dense_dist = np.zeros(self._num_topics)
if self.size() == 0:
return dense_dist
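        # Smoothed estimate: average the accumulated topic counts over the
        # sample rounds, add the alpha prior, and normalize by
        # (doc length + alpha * num_topics).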
dense_dist = (
self._accum_topic_sum * 1.0 / self._num_accum + self._alpha) / (
self.size() + self._alpha * self._num_topics)
return dense_dist
def accumulate_topic_num(self):
self._accum_topic_sum += self._topic_sum
self._num_accum += 1
class SLDADoc(LDADoc):
"""Sentence LDA Document, inherited from LDADoc.
Add add_sentence interface.
"""
def __init__(self):
super().__init__()
self.__sentences = None
def init(self, num_topics):
"""Initialize the SLDADoc according to num_topics.
"""
self._num_topics = num_topics
self.__sentences = []
self._num_accum = 0
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_sentence(self, sent):
"""Add new sentence to current SLDADoc.
Arg:
sent: Sentence class object.
"""
assert sent.topic >= 0, "Topic %d out of range!" % (sent.topic)
assert sent.topic < self._num_topics, "Topic %d out of range!" % (
sent.topic)
self.__sentences.append(sent)
self._topic_sum[sent.topic] += 1
def set_topic(self, index, new_topic):
assert new_topic >= 0, "Topic %d out of range!" % (new_topic)
assert new_topic < self._num_topics, "Topic %d out of range!" % (
new_topic)
old_topic = self.__sentences[index].topic
if new_topic == old_topic:
return
self.__sentences[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def size(self):
"""Return number of sentences in SLDADoc.
"""
return len(self.__sentences)
def sent(self, index):
return self.__sentences[index]
import os
from paddlehub.common.logger import logger
from lda_news.config import ModelConfig
from lda_news.util import load_prototxt, fix_random_seed, rand_k
from lda_news.model import TopicModel
from lda_news.sampler import GibbsSampler, MHSampler
from lda_news.document import LDADoc, SLDADoc, Token, Sentence
from lda_news.vocab import OOV
class SamplerType:
GibbsSampling = 0
MetropolisHastings = 1
class InferenceEngine(object):
def __init__(self,
model_dir,
conf_file,
type=SamplerType.MetropolisHastings):
# Read model configuration.
config = ModelConfig()
conf_file_path = os.path.join(model_dir, conf_file)
load_prototxt(conf_file_path, config)
self.__model = TopicModel(model_dir, config)
self.__config = config
# Initialize the sampler according to the configuration.
if type == SamplerType.GibbsSampling:
self.__sampler = GibbsSampler(self.__model)
elif type == SamplerType.MetropolisHastings:
self.__sampler = MHSampler(self.__model)
def infer(self, input, doc):
"""Perform LDA topic inference on input, and store the results in doc.
Args:
input: a list of strings after tokenization.
doc: LDADoc type or SLDADoc type.
"""
fix_random_seed()
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for token in input:
id_ = self.__model.term_id(token)
if id_ != OOV:
init_topic = rand_k(self.__model.num_topics())
doc.add_token(Token(init_topic, id_))
self.lda_infer(doc, 20, 50)
elif isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for sent in input:
words = []
for token in sent:
id_ = self.__model.term_id(token)
if id_ != OOV:
words.append(id_)
init_topic = rand_k(self.__model.num_topics())
doc.add_sentence(Sentence(init_topic, words))
self.slda_infer(doc, 20, 50)
else:
logger.error("Wrong Doc Type!")
def lda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def slda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def model_type(self):
return self.__model.type()
def get_model(self):
return self.__model
def get_config(self):
return self.__config
import os
from collections import OrderedDict
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_news.vocab import Vocab, WordCount
class TopicModel(object):
"""Storage Structure of Topic model, including vocabulary and word topic count.
"""
def __init__(self, model_dir, config):
"""
Args:
model_dir: the path of model directory
config: ModelConfig class.
"""
self.__word_topic = None # Model parameter of word topic.
self.__vocab = Vocab() # Vocab data structure of model.
self.__num_topics = config.num_topics # Number of topics.
self.__alpha = config.alpha
self.__alpha_sum = self.__alpha * self.__num_topics
self.__beta = config.beta
self.__beta_sum = None
self.__type = config.type # Model type.
self.__topic_sum = np.zeros(
self.__num_topics,
dtype="int64") # Accum sum of each topic in word topic.
self.__topic_words = [[] for _ in range(self.__num_topics)]
word_topic_path = os.path.join(model_dir, config.word_topic_file)
vocab_path = os.path.join(model_dir, config.vocab_file)
self.load_model(word_topic_path, vocab_path)
def term_id(self, term):
return self.__vocab.get_id(term)
def load_model(self, word_topic_path, vocab_path):
# Loading vocabulary
self.__vocab.load(vocab_path)
self.__beta_sum = self.__beta * self.__vocab.size()
        self.__word_topic = [{} for _ in range(self.__vocab.size())]  # List of dicts, one per word.
self.__load_word_dict(word_topic_path)
logger.info(
"Model Info: #num_topics=%d #vocab_size=%d alpha=%f beta=%f" %
(self.num_topics(), self.vocab_size(), self.alpha(), self.beta()))
def word_topic_value(self, word_id, topic_id):
"""Return value of specific word under specific topic in the model.
"""
word_dict = self.__word_topic[word_id]
if topic_id not in word_dict:
return 0
return word_dict[topic_id]
def word_topic(self, term_id):
"""Return the topic distribution of a word.
"""
return self.__word_topic[term_id]
def topic_sum_value(self, topic_id):
return self.__topic_sum[topic_id]
def topic_sum(self):
return self.__topic_sum
def num_topics(self):
return self.__num_topics
def vocab_size(self):
return self.__vocab.size()
def alpha(self):
return self.__alpha
def alpha_sum(self):
return self.__alpha_sum
def beta(self):
return self.__beta
def beta_sum(self):
return self.__beta_sum
def type(self):
return self.__type
def __load_word_dict(self, word_dict_path):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
with open(word_dict_path, 'r') as f:
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
term_id = int(fields[0])
assert term_id < self.vocab_size(), "Term id out of range!"
assert term_id >= 0, "Term id out of range!"
for i in range(1, len(fields)):
topic_count = fields[i].split(":")
assert len(topic_count) == 2, "Topic count format error!"
topic_id = int(topic_count[0])
assert topic_id >= 0, "Topic out of range!"
assert topic_id < self.__num_topics, "Topic out of range!"
count = int(topic_count[1])
assert count >= 0, "Topic count error!"
self.__word_topic[term_id][topic_id] = count
self.__topic_sum[topic_id] += count
self.__topic_words[topic_id].append(
WordCount(term_id, count))
new_dict = OrderedDict()
for key in sorted(self.__word_topic[term_id]):
new_dict[key] = self.__word_topic[term_id][key]
self.__word_topic[term_id] = new_dict
def get_vocab(self):
return self.__vocab.vocabulary()
def topic_words(self):
return self.__topic_words
import os
import paddlehub as hub
from paddlehub.module.module import moduleinfo
from paddlehub.common.logger import logger
from lda_news.inference_engine import InferenceEngine
from lda_news.document import LDADoc, SLDADoc
from lda_news.semantic_matching import SemanticMatching, WordAndDis
from lda_news.tokenizer import LACTokenizer, SimpleTokenizer
from lda_news.config import ModelType
from lda_news.vocab import Vocab, WordCount
@moduleinfo(
name="lda_news",
version="1.0.0",
summary=
"This is a PaddleHub Module for LDA topic model in news dataset, where we can calculate doc distance, calculate the similarity between query and document, etc",
author="DesmonDay",
author_email="",
type="nlp/semantic_model")
class TopicModel(hub.Module):
def _initialize(self):
"""
Initialize with the necessary elements.
"""
self.model_dir = os.path.join(self.directory, 'news')
self.conf_file = 'lda.conf'
self.__engine = InferenceEngine(self.model_dir, self.conf_file)
self.vocab_path = os.path.join(self.model_dir, 'vocab_info.txt')
lac = hub.Module(name="lac")
# self.__tokenizer = SimpleTokenizer(self.vocab_path)
self.__tokenizer = LACTokenizer(self.vocab_path, lac)
self.vocabulary = self.__engine.get_model().get_vocab()
self.config = self.__engine.get_config()
self.topic_words = self.__engine.get_model().topic_words()
self.topic_sum_table = self.__engine.get_model().topic_sum()
def take_elem(word_count):
return word_count.count
for i in range(self.config.num_topics):
self.topic_words[i].sort(key=take_elem, reverse=True)
logger.info("Finish initialization.")
def cal_doc_distance(self, doc_text1, doc_text2):
"""
This interface calculates the distance between documents.
Args:
doc_text1(str): the input document text 1.
doc_text2(str): the input document text 2.
Returns:
jsd(float): Jensen-Shannon Divergence distance of two documents.
hd(float): Hellinger Distance of two documents.
"""
doc1_tokens = self.__tokenizer.tokenize(doc_text1)
doc2_tokens = self.__tokenizer.tokenize(doc_text2)
# Document topic inference.
doc1, doc2 = LDADoc(), LDADoc()
self.__engine.infer(doc1_tokens, doc1)
self.__engine.infer(doc2_tokens, doc2)
# To calculate jsd, we need dense document topic distribution.
dense_dict1 = doc1.dense_topic_dist()
dense_dict2 = doc2.dense_topic_dist()
# Calculate the distance between distributions.
# The smaller the distance, the higher the document semantic similarity.
sm = SemanticMatching()
jsd = sm.jensen_shannon_divergence(dense_dict1, dense_dict2)
hd = sm.hellinger_distance(dense_dict1, dense_dict2)
return jsd, hd
def cal_doc_keywords_similarity(self, document, top_k=10):
"""
        This interface can be used to find the top k keywords of a document.
Args:
document(str): the input document text.
top_k(int): top k keywords of this document.
Returns:
results(list): contains top_k keywords and their corresponding
similarity compared to document.
"""
d_tokens = self.__tokenizer.tokenize(document)
# Do topic inference on documents to obtain topic distribution.
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
items = []
words = set()
for word in d_tokens:
if word in words:
continue
words.add(word)
wd = WordAndDis()
wd.word = word
sm = SemanticMatching()
wd.distance = sm.likelihood_based_similarity(
terms=[word],
doc_topic_dist=doc_topic_dist,
model=self.__engine.get_model())
items.append(wd)
def take_elem(word_dis):
return word_dis.distance
items.sort(key=take_elem, reverse=True)
results = []
size = len(items)
for i in range(top_k):
if i >= size:
break
results.append({
"word": items[i].word,
"similarity": items[i].distance
})
return results
def cal_query_doc_similarity(self, query, document):
"""
This interface calculates the similarity between query and document.
Args:
query(str): the input query text.
document(str): the input document text.
Returns:
lda_sim(float): likelihood based similarity between query and document
based on LDA.
"""
q_tokens = self.__tokenizer.tokenize(query)
d_tokens = self.__tokenizer.tokenize(document)
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
sm = SemanticMatching()
lda_sim = sm.likelihood_based_similarity(q_tokens, doc_topic_dist,
self.__engine.get_model())
return lda_sim
def infer_doc_topic_distribution(self, document):
"""
This interface infers the topic distribution of document.
Args:
document(str): the input document text.
Returns:
results(list): returns the topic distribution of document.
"""
tokens = self.__tokenizer.tokenize(document)
if tokens == []:
return []
results = []
doc = LDADoc()
self.__engine.infer(tokens, doc)
topics = doc.sparse_topic_dist()
for topic in topics:
results.append({"topic id": topic.tid, "distribution": topic.prob})
return results
def show_topic_keywords(self, topic_id, k=10):
"""
        This interface returns the first k keywords under a specific topic.
Args:
topic_id(int): topic information we want to know.
k(int): top k keywords.
Returns:
results(dict): contains specific topic's keywords and corresponding
probability.
"""
EPS = 1e-8
results = {}
if 0 <= topic_id < self.config.num_topics:
k = min(k, len(self.topic_words[topic_id]))
for i in range(k):
prob = self.topic_words[topic_id][i].count / \
(self.topic_sum_table[topic_id] + EPS)
results[self.vocabulary[self.topic_words[topic_id]
[i].word_id]] = prob
return results
else:
logger.error("%d is out of range!" % topic_id)
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_news.document import LDADoc, SLDADoc, Token, Sentence
from lda_news.vose_alias import VoseAlias
from lda_news.util import rand, rand_k
class Sampler(object):
def __init__(self):
pass
def sample_doc(self, doc):
"""Sample LDA or SLDA topics for documents.
"""
raise NotImplementedError
class MHSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
self.__topic_indexes = None
self.__alias_tables = None
self.__prob_sum = None
self.__beta_alias = VoseAlias()
self.__beta_prior_sum = None
self.__mh_steps = 2
self.__construct_alias_table()
def __construct_alias_table(self):
"""Construct alias table for all words.
"""
logger.info("Construct alias table for alias sampling method.")
vocab_size = self.__model.vocab_size()
self.__topic_indexes = [[] for _ in range(vocab_size)]
self.__alias_tables = [VoseAlias() for _ in range(vocab_size)]
self.__prob_sum = np.zeros(vocab_size)
# Construct each word's alias table (prior is not included).
for i in tqdm(range(vocab_size)):
dist = []
prob_sum = 0
for key in self.__model.word_topic(i):
topic_id = key
word_topic_count = self.__model.word_topic(i)[key]
topic_sum = self.__model.topic_sum_value(topic_id)
self.__topic_indexes[i].append(topic_id)
q = word_topic_count / (topic_sum + self.__model.beta_sum())
dist.append(q)
prob_sum += q
self.__prob_sum[i] = prob_sum
if len(dist) > 0:
                dist = np.array(dist, dtype=np.float64)
self.__alias_tables[i].initialize(dist)
# Build prior parameter beta's alias table.
beta_dist = self.__model.beta() / (
self.__model.topic_sum() + self.__model.beta_sum())
self.__beta_prior_sum = np.sum(beta_dist)
self.__beta_alias.initialize(beta_dist)
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
new_topic = token.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, token)
new_topic = self.__word_proposal(doc, token, doc_proposed_topic)
return new_topic
def __sample_sentence(self, doc, sent):
new_topic = sent.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, sent)
new_topic = self.__word_proposal(doc, sent, doc_proposed_topic)
return new_topic
def __doc_proposal(self, doc, token):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
if dart < doc.size():
token_index = int(dart)
new_topic = doc.token(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
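                # Branchless accept/reject: mask is -1 (all bits set) when the
                # proposal is accepted and 0 otherwise, so the bitwise select
                # below returns new_topic on accept and old_topic on reject.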
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
if dart < doc.size():
token_index = int(dart)
new_topic = doc.sent(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __word_proposal(self, doc, token, old_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
new_topic = self.__propose(token.id)
if new_topic != old_topic:
proposal_old = self.__word_proposal_distribution(
token.id, old_topic)
proposal_new = self.__word_proposal_distribution(
token.id, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
new_topic = old_topic
for word_id in sent.tokens:
new_topic = self.__propose(word_id)
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__word_proposal_distribution(
word_id, old_topic)
proposal_new = self.__word_proposal_distribution(
word_id, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
new_topic = (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __proportional_function(self, doc, token, new_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dt_alpha = doc.topic_sum(new_topic) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(
token.id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
return dt_alpha * wt_beta / t_sum_beta_sum
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
result = doc.topic_sum(new_topic) + self.__model.alpha()
if new_topic == old_topic:
result -= 1
for word_id in sent.tokens:
wt_beta = self.__model.word_topic_value(
word_id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
wt_beta -= 1
t_sum_beta_sum -= 1
result *= wt_beta / t_sum_beta_sum
return result
else:
logger.error("Wrong input argument type!")
def __word_proposal_distribution(self, word_id, topic):
wt_beta = self.__model.word_topic_value(word_id,
topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
topic) + self.__model.beta_sum()
return wt_beta / t_sum_beta_sum
def __doc_proposal_distribution(self, doc, topic):
return doc.topic_sum(topic) + self.__model.alpha()
def __propose(self, word_id):
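        # Choose between the word's own alias table and the beta-prior alias
        # table in proportion to their probability mass, then draw a topic.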
dart = rand() * (self.__prob_sum[word_id] + self.__beta_prior_sum)
if dart < self.__prob_sum[word_id]:
idx = self.__alias_tables[word_id].generate()
topic = self.__topic_indexes[word_id][idx]
else:
topic = self.__beta_alias.generate()
return topic
class GibbsSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
old_topic = token.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
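        # Build the unnormalized full conditional probability of each topic and
        # its running cumulative sum for the inverse-CDF sampling below.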
for i in range(num_topics):
dt_alpha = doc.topic_sum(i) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(token.id,
i) + self.__model.beta()
            t_sum_beta_sum = self.__model.topic_sum_value(i) + self.__model.beta_sum()
if i == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
prob[i] = dt_alpha * wt_beta / t_sum_beta_sum
sum_ += prob[i]
accum_prob[i] = prob[i] if i == 0 else accum_prob[i - 1] + prob[i]
dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for i in range(1, num_topics):
if accum_prob[i - 1] < dart <= accum_prob[i]:
return i
return num_topics - 1
def __sample_sentence(self, doc, sent):
old_topic = sent.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
for t in range(num_topics):
dt_alpha = doc.topic_sum(t) + self.__model.alpha()
            t_sum_beta_sum = self.__model.topic_sum_value(t) + self.__model.beta_sum()
if t == old_topic:
if dt_alpha > 1:
dt_alpha -= 1
if t_sum_beta_sum > 1:
t_sum_beta_sum -= 1
prob[t] = dt_alpha
for i in range(len(sent.tokens)):
w = sent.tokens[i]
wt_beta = self.__model.word_topic_value(
w, t) + self.__model.beta()
if t == old_topic and wt_beta > 1:
wt_beta -= 1
                # Note: if the sentence is very long, multiplying many small
                # factors makes the probability underflow and lose precision.
prob[t] *= wt_beta / t_sum_beta_sum
sum_ += prob[t]
accum_prob[t] = prob[t] if t == 0 else accum_prob[t - 1] + prob[t]
        dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for t in range(1, num_topics):
if accum_prob[t - 1] < dart <= accum_prob[t]:
return t
return num_topics - 1
import numpy as np
from lda_news.vocab import OOV
EPS = 1e-06
class WordAndDis(object):
def __init__(self):
self.word = None
self.distance = None
class SemanticMatching(object):
def __init__(self):
pass
def l2_norm(self, vec):
"""Calculate the length of vector.
"""
result = np.sqrt(np.sum(vec**2))
return result
def cosine_similarity(self, vec1, vec2):
"""Calculate the cosine similarity between two vectors.
"""
norm1 = self.l2_norm(vec1)
norm2 = self.l2_norm(vec2)
result = np.sum(vec1 * vec2) / norm1 / norm2
return result
def likelihood_based_similarity(self, terms, doc_topic_dist, model):
"""Calculate the likelihood based similarity.
Args:
terms: list of strings
doc_topic_dist: list of Topic class
model: TopicModel class
"""
num_of_term_in_vocab = 0
result = 0
for i in range(len(terms)):
term_id = model.term_id(terms[i])
if term_id == OOV:
continue
num_of_term_in_vocab += 1
for j in range(len(doc_topic_dist)):
topic_id = doc_topic_dist[j].tid
prob = doc_topic_dist[j].prob
result += model.word_topic_value(term_id, topic_id) * 1.0 / \
model.topic_sum_value(topic_id) * prob
if num_of_term_in_vocab == 0:
return result
return result / num_of_term_in_vocab
def kullback_leibler_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist2[dist2 < EPS] = EPS
result = np.sum(dist1 * np.log(dist1 / dist2))
return result
def jensen_shannon_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist1[dist1 < EPS] = EPS
dist2[dist2 < EPS] = EPS
mean = (dist1 + dist2) * 0.5
jsd = self.kullback_leibler_divergence(dist1, mean) * 0.5 + \
self.kullback_leibler_divergence(dist2, mean) * 0.5
return jsd
def hellinger_distance(self, dist1, dist2):
assert dist1.shape == dist2.shape
result = np.sum((np.sqrt(dist1) - np.sqrt(dist2))**2)
result = np.sqrt(result) * 0.7071067812
return result
"""This file defines tokenizer class object.
"""
class Tokenizer(object):
"""Base tokenizer class.
"""
def __init__(self):
pass
def tokenize(self, text):
raise NotImplementedError
class SimpleTokenizer(Tokenizer):
"""Simple version FMM(Forward Maximun Matching) word tokenizer. This tokenizer can only
be used in topic model demo, but not in real business application scenarios.
Notes: This tokenizer can only recognize the words in the corresponding vocab file.
"""
def __init__(self, vocab_path):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__load_vocab(vocab_path)
def tokenize(self, text):
"""Tokenize the input string `text`, and return the tokenize result.
"""
text_len = len(text)
result = []
i = 0
while i < text_len:
word = found_word = ""
# Deal with English characters.
if self.__is_eng_char(text[i]):
for j in range(i, text_len + 1):
if j < text_len and self.__is_eng_char(text[j]):
word += self.__tolower(text[j])
else:
# Forward matching by character granularity.
if word in self.__vocab:
result.append(word)
i = j - 1
break
else:
for j in range(i, min(i + self.__max_word_len, text_len)):
word += text[j]
if word in self.__vocab:
found_word = word
if len(found_word) > 0:
result.append(found_word)
i += len(found_word) - 1
i += 1
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def __is_eng_char(self, c):
"""Check whether char c is an English character.
"""
return (c >= 'A' and c <= 'Z') or (c >= 'a' and c <= 'z')
def __tolower(self, c):
"""Return the lowercase character of the corresponding character, or return
the original character if there is no corresponding lowercase character.
"""
return c.lower()
class LACTokenizer(Tokenizer):
def __init__(self, vocab_path, lac):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__lac = lac
self.__load_vocab(vocab_path)
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def tokenize(self, text):
results = self.__lac.lexical_analysis(
texts=[text], use_gpu=True, batch_size=1, return_tag=True)
# Change English words to lower case.
# And just preserve the word in vocab.
words = results[0]["word"]
result = []
for word in words:
word = word.lower()
if word in self.__vocab:
result.append(word)
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
import time
import yaml
import numpy as np
from paddlehub.common.logger import logger
from lda_news.config import ModelType
def load_prototxt(config_file, config):
"""
Args:
config_file: model configuration file.
config: ModelConfig class
"""
logger.info("Loading LDA config.")
with open(config_file, 'r') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
if yaml_dict["type"] == "LDA":
config.type = ModelType.LDA
else:
config.type = ModelType.SLDA
config.num_topics = yaml_dict["num_topics"]
config.alpha = yaml_dict["alpha"]
config.beta = yaml_dict["beta"]
config.word_topic_file = yaml_dict["word_topic_file"]
config.vocab_file = yaml_dict["vocab_file"]
def fix_random_seed(seed=2147483647):
np.random.seed(seed)
def rand(min_=0, max_=1):
return np.random.uniform(low=min_, high=max_)
def rand_k(k):
"""Returns an integer float number between [0, k - 1].
"""
return int(rand() * k)
def timeit(f):
"""Return time cost of function f.
"""
def timed(*args, **kwargs):
start_time = time.time()
result = f(*args, **kwargs)
end_time = time.time()
print(" [-] %s : %2.5f sec" % (f.__name__, end_time - start_time))
return result
return timed
from paddlehub.common.logger import logger
OOV = -1
class WordCount(object):
def __init__(self, word_id, count):
self.word_id = word_id
self.count = count
class Vocab(object):
def __init__(self):
self.__term2id = {}
self.__id2term = {}
def get_id(self, word):
if word not in self.__term2id:
return OOV
return self.__term2id[word]
def load(self, vocab_file):
self.__term2id = {}
self.__id2term = {}
with open(vocab_file, 'r') as fin:
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(
fields) == 5, "Vocabulary file [%s] format error!" % (
vocab_file)
term = fields[1]
id_ = int(fields[2])
if term in self.__term2id:
logger.error("Duplicate word [%s] in vocab file!" % (term))
continue
self.__term2id[term] = id_
self.__id2term[id_] = term
def size(self):
return len(self.__term2id)
def vocabulary(self):
return self.__id2term
import numpy as np
from lda_news.util import rand, rand_k
class VoseAlias(object):
"""Vose's Alias Method.
"""
def __init__(self):
self.__alias = None
self.__prob = None # np.array
def initialize(self, distribution):
"""Initialize the alias table according to the input distribution
Arg:
distribution: the input distribution.
"""
size = distribution.shape[0]
self.__alias = np.zeros(size, dtype=np.int64)
self.__prob = np.zeros(size)
sum_ = np.sum(distribution)
p = distribution / sum_ * size # Scale up probability.
large, small = [], []
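        # Partition the scaled probabilities: entries below 1.0 go to small,
        # the rest to large (standard Vose alias construction).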
for i, p_ in enumerate(p):
if p_ < 1.0:
small.append(i)
else:
large.append(i)
while large and small:
l = small[0]
g = large[0]
small.pop(0)
large.pop(0)
self.__prob[l] = p[l]
self.__alias[l] = g
p[g] = p[g] + p[l] - 1 # A more numerically stable option.
if p[g] < 1.0:
small.append(g)
else:
large.append(g)
while large:
g = large[0]
large.pop(0)
self.__prob[g] = 1.0
while small:
l = small[0]
small.pop(0)
self.__prob[l] = 1.0
def generate(self):
"""Generate samples from given distribution.
"""
dart1 = rand_k(self.size())
        dart2 = rand()
        # Keep dart1 with probability prob[dart1], otherwise jump to its alias.
        return dart1 if dart2 < self.__prob[dart1] else self.__alias[dart1]
def size(self):
return self.__prob.shape[0]
## Model Overview
A topic model is a statistical model that clusters the latent semantic structure of documents in an unsupervised way, and LDA (Latent Dirichlet Allocation) is one such topic model. Based on word co-occurrence statistics, LDA fits a word-document-topic distribution and thereby maps words and texts into a common semantic space. This Module is based on a novel-domain dataset built by Baidu.
<p align="center">
<img src="https://bj.bcebos.com/paddlehub/model/nlp/semantic_model/lda.png" hspace='10'/> <br />
</p>
For more details, please refer to the [LDA paper](http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf).
Note: this Module is contributed by the third-party developer DesmonDay.
## LDA Model API
### cal_doc_distance(doc_text1, doc_text2)
Calculates the distance between two input documents, returning both the Jensen-Shannon divergence and the Hellinger distance.
**Parameters**
- doc_text1(str): the first input document.
- doc_text2(str): the second input document.
**Returns**
- jsd(float): the [Jensen-Shannon divergence](https://blog.csdn.net/FrankieHello/article/details/80614422?utm_source=copy) between the two documents.
- hd(float): the [Hellinger distance](http://blog.sina.com.cn/s/blog_85f1ffb70101e65d.html) between the two documents.
### cal_doc_keywords_similarity(document, top_k=10)
Finds the top k keywords of the input document together with their similarity to the original document.
**Parameters**
- document(str): the input document.
- top_k(int): the number of keywords to return.
**Returns**
- results(list): the keywords and their similarity to the original document. Each element of the list is a dict whose key is a keyword and whose value is that keyword's similarity to the original document.
### cal_query_doc_similarity(query, document)
Calculates the similarity between a short document (query) and a long document.
**Parameters**
- query(str): the input short document.
- document(str): the input long document.
**Returns**
- lda_sim(float): the similarity between the short document and the long document.
### infer_doc_topic_distribution(document)
Infers the topic distribution of a document.
**Parameters**
- document(str): the input document.
**Returns**
- results(list): the topic IDs and their probabilities in the document's topic distribution. Each element of the list is a dict with the keys "topic id" and "distribution".
### show_topic_keywords(topic_id, k=10)
Shows the keywords under a given topic; it can be combined with the topic-distribution inference API.
**Parameters**
- topic_id(int): the topic ID.
- k(int): the number of top keywords to return for the topic.
**Returns**
- results(dict): the top k keywords of the given topic and each keyword's probability under that topic.
### Code Example
Below are usage examples for some of the APIs; a further sketch combining the remaining APIs follows the block.
``` python
import paddlehub as hub
lda_novel = hub.Module(name="lda_novel")
jsd, hd = lda_novel.cal_doc_distance(doc_text1="老人幸福地看着自己的儿子,露出了欣慰的笑容。", doc_text2="老奶奶看着自己的儿子,幸福地笑了。")
# jsd = 0.01292, hd = 0.11893
lda_sim = lda_novel.cal_query_doc_similarity(query='亲孙女', document='老人激动地打量着面前的女孩,似乎找到了自己的亲孙女一般,双手止不住地颤抖着。')
# LDA similarity = 0.0
```
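As a minimal sketch of the remaining APIs, the topic distribution of a document can be inferred and the keywords of its most probable topic inspected; the concrete values depend on the released model files, so no output is listed.
``` python
import paddlehub as hub

lda_novel = hub.Module(name="lda_novel")

# Infer the topic distribution of a document. Each element of the result is a
# dict with the keys "topic id" and "distribution", sorted by probability.
topic_dist = lda_novel.infer_doc_topic_distribution("老人幸福地看着自己的儿子,露出了欣慰的笑容。")

if topic_dist:
    # Inspect the top 10 keywords of the most probable topic.
    keywords = lda_novel.show_topic_keywords(topic_id=topic_dist[0]["topic id"], k=10)
    print(keywords)  # dict: keyword -> probability under this topic
```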
## View the Code
https://github.com/baidu/Familia
## Dependencies
paddlepaddle >= 1.8.2
paddlehub >= 1.8.0
## Release History
* 1.0.0
  Initial release
"""
This file defines the basic config information of LDA/SLDA model.
"""
class ModelType:
LDA = 0
SLDA = 1
class ModelConfig:
type = None
num_topics = None
alpha = None
beta = None
word_topic_file = None
vocab_file = None
import numpy as np
class Topic(object):
"""Basic data structure of topic, contains topic id and
corresponding probability.
"""
def __init__(self, tid, prob):
self.tid = tid # topic id
self.prob = prob # topic probability
class Token(object):
"""Basic storage unit of LDA documents, contains word id
and corresponding topic.
"""
def __init__(self, topic, id):
self.topic = topic
self.id = id
class Sentence(object):
"""Basic storage unit of SentenceLDA documents, contains word ids
of the sentence and its corresponding topic id.
"""
def __init__(self, topic, tokens):
self.topic = topic
self.tokens = tokens
class LDADoc(object):
"""The storage structure of LDA model's inference result.
"""
def __init__(self):
self._num_topics = None # Number of topics.
self._num_accum = None # Number of accumulated sample rounds.
self._alpha = None # Document prior parameter.
self._tokens = None # Storage structure of inference results.
self._topic_sum = None # Document's topic sum in one round samples.
self._accum_topic_sum = None # Accumulated results of topic sum.
def init(self, num_topics):
"""Initialize the LDADoc according to num_topics.
"""
self._num_topics = num_topics
self._num_accum = 0
self._tokens = []
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_token(self, token):
"""Add new word to current LDADoc.
Arg:
token: Token class object.
"""
assert token.topic >= 0, "Topic %d out of range!" % token.topic
assert token.topic < self._num_topics, "Topic %d out of range!" % token.topic
self._tokens.append(token)
self._topic_sum[token.topic] += 1
def token(self, index):
return self._tokens[index]
def set_topic(self, index, new_topic):
"""Set the index word's topic to new_topic, and update the corresponding
topic distribution.
"""
assert new_topic >= 0, "Topic %d out of range!" % new_topic
assert new_topic < self._num_topics, "Topic %d out of range!" % new_topic
old_topic = self._tokens[index].topic
if new_topic == old_topic:
return
self._tokens[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def set_alpha(self, alpha):
self._alpha = alpha
def size(self):
"""Return number of words in LDADoc.
"""
return len(self._tokens)
def topic_sum(self, topic_id):
return self._topic_sum[topic_id]
def sparse_topic_dist(self, sort=True):
"""Return the topic distribution of documents in sparse format.
By default, it is sorted according to the topic probability
under the descending order.
"""
topic_dist = []
sum_ = np.sum(self._accum_topic_sum)
if sum_ == 0:
return
for i in range(0, self._num_topics):
if self._accum_topic_sum[i] == 0:
continue
topic_dist.append(Topic(i, self._accum_topic_sum[i] * 1.0 / sum_))
if sort:
def take_elem(topic):
return topic.prob
topic_dist.sort(key=take_elem, reverse=True)
if topic_dist is None:
topic_dist = []
return topic_dist
def dense_topic_dist(self):
"""Return the distribution of document topics in dense format,
taking into account the prior parameter alpha.
"""
dense_dist = np.zeros(self._num_topics)
if self.size() == 0:
return dense_dist
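        # Smoothed estimate: average the accumulated topic counts over the
        # sample rounds, add the alpha prior, and normalize by
        # (doc length + alpha * num_topics).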
dense_dist = (
self._accum_topic_sum * 1.0 / self._num_accum + self._alpha) / (
self.size() + self._alpha * self._num_topics)
return dense_dist
def accumulate_topic_num(self):
self._accum_topic_sum += self._topic_sum
self._num_accum += 1
class SLDADoc(LDADoc):
"""Sentence LDA Document, inherited from LDADoc.
Add add_sentence interface.
"""
def __init__(self):
super().__init__()
self.__sentences = None
def init(self, num_topics):
"""Initialize the SLDADoc according to num_topics.
"""
self._num_topics = num_topics
self.__sentences = []
self._num_accum = 0
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_sentence(self, sent):
"""Add new sentence to current SLDADoc.
Arg:
sent: Sentence class object.
"""
assert sent.topic >= 0, "Topic %d out of range!" % (sent.topic)
assert sent.topic < self._num_topics, "Topic %d out of range!" % (
sent.topic)
self.__sentences.append(sent)
self._topic_sum[sent.topic] += 1
def set_topic(self, index, new_topic):
assert new_topic >= 0, "Topic %d out of range!" % (new_topic)
assert new_topic < self._num_topics, "Topic %d out of range!" % (
new_topic)
old_topic = self.__sentences[index].topic
if new_topic == old_topic:
return
self.__sentences[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def size(self):
"""Return number of sentences in SLDADoc.
"""
return len(self.__sentences)
def sent(self, index):
return self.__sentences[index]
import os
from paddlehub.common.logger import logger
from lda_novel.config import ModelConfig
from lda_novel.util import load_prototxt, fix_random_seed, rand_k
from lda_novel.model import TopicModel
from lda_novel.sampler import GibbsSampler, MHSampler
from lda_novel.document import LDADoc, SLDADoc, Token, Sentence
from lda_novel.vocab import OOV
class SamplerType:
GibbsSampling = 0
MetropolisHastings = 1
class InferenceEngine(object):
def __init__(self,
model_dir,
conf_file,
type=SamplerType.MetropolisHastings):
# Read model configuration.
config = ModelConfig()
conf_file_path = os.path.join(model_dir, conf_file)
load_prototxt(conf_file_path, config)
self.__model = TopicModel(model_dir, config)
self.__config = config
# Initialize the sampler according to the configuration.
if type == SamplerType.GibbsSampling:
self.__sampler = GibbsSampler(self.__model)
elif type == SamplerType.MetropolisHastings:
self.__sampler = MHSampler(self.__model)
def infer(self, input, doc):
"""Perform LDA topic inference on input, and store the results in doc.
Args:
input: a list of strings after tokenization.
doc: LDADoc type or SLDADoc type.
"""
fix_random_seed()
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for token in input:
id_ = self.__model.term_id(token)
if id_ != OOV:
init_topic = rand_k(self.__model.num_topics())
doc.add_token(Token(init_topic, id_))
self.lda_infer(doc, 20, 50)
elif isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for sent in input:
words = []
for token in sent:
id_ = self.__model.term_id(token)
if id_ != OOV:
words.append(id_)
init_topic = rand_k(self.__model.num_topics())
doc.add_sentence(Sentence(init_topic, words))
self.slda_infer(doc, 20, 50)
else:
logger.error("Wrong Doc Type!")
def lda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def slda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def model_type(self):
return self.__model.type()
def get_model(self):
return self.__model
def get_config(self):
return self.__config
import os
from collections import OrderedDict
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_novel.vocab import Vocab, WordCount
class TopicModel(object):
"""Storage Structure of Topic model, including vocabulary and word topic count.
"""
def __init__(self, model_dir, config):
"""
Args:
model_dir: the path of model directory
config: ModelConfig class.
"""
self.__word_topic = None # Model parameter of word topic.
self.__vocab = Vocab() # Vocab data structure of model.
self.__num_topics = config.num_topics # Number of topics.
self.__alpha = config.alpha
self.__alpha_sum = self.__alpha * self.__num_topics
self.__beta = config.beta
self.__beta_sum = None
self.__type = config.type # Model type.
self.__topic_sum = np.zeros(
self.__num_topics,
dtype="int64") # Accum sum of each topic in word topic.
self.__topic_words = [[] for _ in range(self.__num_topics)]
word_topic_path = os.path.join(model_dir, config.word_topic_file)
vocab_path = os.path.join(model_dir, config.vocab_file)
self.load_model(word_topic_path, vocab_path)
def term_id(self, term):
return self.__vocab.get_id(term)
def load_model(self, word_topic_path, vocab_path):
# Loading vocabulary
self.__vocab.load(vocab_path)
self.__beta_sum = self.__beta * self.__vocab.size()
        self.__word_topic = [{} for _ in range(self.__vocab.size())]  # List of dicts, one per word.
self.__load_word_dict(word_topic_path)
logger.info(
"Model Info: #num_topics=%d #vocab_size=%d alpha=%f beta=%f" %
(self.num_topics(), self.vocab_size(), self.alpha(), self.beta()))
def word_topic_value(self, word_id, topic_id):
"""Return value of specific word under specific topic in the model.
"""
word_dict = self.__word_topic[word_id]
if topic_id not in word_dict:
return 0
return word_dict[topic_id]
def word_topic(self, term_id):
"""Return the topic distribution of a word.
"""
return self.__word_topic[term_id]
def topic_sum_value(self, topic_id):
return self.__topic_sum[topic_id]
def topic_sum(self):
return self.__topic_sum
def num_topics(self):
return self.__num_topics
def vocab_size(self):
return self.__vocab.size()
def alpha(self):
return self.__alpha
def alpha_sum(self):
return self.__alpha_sum
def beta(self):
return self.__beta
def beta_sum(self):
return self.__beta_sum
def type(self):
return self.__type
def __load_word_dict(self, word_dict_path):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
with open(word_dict_path, 'r') as f:
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
term_id = int(fields[0])
assert term_id < self.vocab_size(), "Term id out of range!"
assert term_id >= 0, "Term id out of range!"
for i in range(1, len(fields)):
topic_count = fields[i].split(":")
assert len(topic_count) == 2, "Topic count format error!"
topic_id = int(topic_count[0])
assert topic_id >= 0, "Topic out of range!"
assert topic_id < self.__num_topics, "Topic out of range!"
count = int(topic_count[1])
assert count >= 0, "Topic count error!"
self.__word_topic[term_id][topic_id] = count
self.__topic_sum[topic_id] += count
self.__topic_words[topic_id].append(
WordCount(term_id, count))
new_dict = OrderedDict()
for key in sorted(self.__word_topic[term_id]):
new_dict[key] = self.__word_topic[term_id][key]
self.__word_topic[term_id] = new_dict
def get_vocab(self):
return self.__vocab.vocabulary()
def topic_words(self):
return self.__topic_words
import os
import paddlehub as hub
from paddlehub.module.module import moduleinfo
from paddlehub.common.logger import logger
from lda_novel.inference_engine import InferenceEngine
from lda_novel.document import LDADoc, SLDADoc
from lda_novel.semantic_matching import SemanticMatching, WordAndDis
from lda_novel.tokenizer import LACTokenizer, SimpleTokenizer
from lda_novel.config import ModelType
from lda_novel.vocab import Vocab, WordCount
@moduleinfo(
name="lda_novel",
version="1.0.0",
summary=
"This is a PaddleHub Module for LDA topic model in novel dataset, where we can calculate doc distance, calculate the similarity between query and document, etc.",
author="DesmonDay",
author_email="",
type="nlp/semantic_model")
class TopicModel(hub.Module):
def _initialize(self):
"""
Initialize with the necessary elements.
"""
self.model_dir = os.path.join(self.directory, 'novel')
self.conf_file = 'lda.conf'
self.__engine = InferenceEngine(self.model_dir, self.conf_file)
self.vocab_path = os.path.join(self.model_dir, 'vocab_info.txt')
lac = hub.Module(name="lac")
# self.__tokenizer = SimpleTokenizer(self.vocab_path)
self.__tokenizer = LACTokenizer(self.vocab_path, lac)
self.vocabulary = self.__engine.get_model().get_vocab()
self.config = self.__engine.get_config()
self.topic_words = self.__engine.get_model().topic_words()
self.topic_sum_table = self.__engine.get_model().topic_sum()
def take_elem(word_count):
return word_count.count
for i in range(self.config.num_topics):
self.topic_words[i].sort(key=take_elem, reverse=True)
logger.info("Finish initialization.")
def cal_doc_distance(self, doc_text1, doc_text2):
"""
This interface calculates the distance between documents.
Args:
doc_text1(str): the input document text 1.
doc_text2(str): the input document text 2.
Returns:
jsd(float): Jensen-Shannon Divergence distance of two documents.
hd(float): Hellinger Distance of two documents.
"""
doc1_tokens = self.__tokenizer.tokenize(doc_text1)
doc2_tokens = self.__tokenizer.tokenize(doc_text2)
# Document topic inference.
doc1, doc2 = LDADoc(), LDADoc()
self.__engine.infer(doc1_tokens, doc1)
self.__engine.infer(doc2_tokens, doc2)
# To calculate jsd, we need dense document topic distribution.
dense_dict1 = doc1.dense_topic_dist()
dense_dict2 = doc2.dense_topic_dist()
# Calculate the distance between distributions.
# The smaller the distance, the higher the document semantic similarity.
sm = SemanticMatching()
jsd = sm.jensen_shannon_divergence(dense_dict1, dense_dict2)
hd = sm.hellinger_distance(dense_dict1, dense_dict2)
return jsd, hd
def cal_doc_keywords_similarity(self, document, top_k=10):
"""
        This interface can be used to find the top k keywords of a document.
Args:
document(str): the input document text.
top_k(int): top k keywords of this document.
Returns:
results(list): contains top_k keywords and their corresponding
similarity compared to document.
"""
d_tokens = self.__tokenizer.tokenize(document)
# Do topic inference on documents to obtain topic distribution.
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
items = []
words = set()
for word in d_tokens:
if word in words:
continue
words.add(word)
wd = WordAndDis()
wd.word = word
sm = SemanticMatching()
wd.distance = sm.likelihood_based_similarity(
terms=[word],
doc_topic_dist=doc_topic_dist,
model=self.__engine.get_model())
items.append(wd)
def take_elem(word_dis):
return word_dis.distance
items.sort(key=take_elem, reverse=True)
results = []
size = len(items)
for i in range(top_k):
if i >= size:
break
results.append({
"word": items[i].word,
"similarity": items[i].distance
})
return results
def cal_query_doc_similarity(self, query, document):
"""
This interface calculates the similarity between query and document.
Args:
query(str): the input query text.
document(str): the input document text.
Returns:
lda_sim(float): likelihood based similarity between query and document
based on LDA.
"""
q_tokens = self.__tokenizer.tokenize(query)
d_tokens = self.__tokenizer.tokenize(document)
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
sm = SemanticMatching()
lda_sim = sm.likelihood_based_similarity(q_tokens, doc_topic_dist,
self.__engine.get_model())
return lda_sim
def infer_doc_topic_distribution(self, document):
"""
This interface infers the topic distribution of document.
Args:
document(str): the input document text.
Returns:
results(list): returns the topic distribution of document.
"""
tokens = self.__tokenizer.tokenize(document)
if tokens == []:
return []
results = []
doc = LDADoc()
self.__engine.infer(tokens, doc)
topics = doc.sparse_topic_dist()
for topic in topics:
results.append({"topic id": topic.tid, "distribution": topic.prob})
return results
def show_topic_keywords(self, topic_id, k=10):
"""
        This interface returns the first k keywords under a specific topic.
Args:
topic_id(int): topic information we want to know.
k(int): top k keywords.
Returns:
results(dict): contains specific topic's keywords and corresponding
probability.
"""
EPS = 1e-8
results = {}
if 0 <= topic_id < self.config.num_topics:
k = min(k, len(self.topic_words[topic_id]))
for i in range(k):
prob = self.topic_words[topic_id][i].count / \
(self.topic_sum_table[topic_id] + EPS)
results[self.vocabulary[self.topic_words[topic_id]
[i].word_id]] = prob
return results
else:
logger.error("%d is out of range!" % topic_id)
import os
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_novel.document import LDADoc, SLDADoc, Token, Sentence
from lda_novel.vose_alias import VoseAlias
from lda_novel.util import rand, rand_k
class Sampler(object):
def __init__(self):
pass
def sample_doc(self, doc):
"""Sample LDA or SLDA topics for documents.
"""
raise NotImplementedError
class MHSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
self.__topic_indexes = None
self.__alias_tables = None
self.__prob_sum = None
self.__beta_alias = VoseAlias()
self.__beta_prior_sum = None
self.__mh_steps = 2
self.__construct_alias_table()
def __construct_alias_table(self):
"""Construct alias table for all words.
"""
logger.info("Construct alias table for alias sampling method.")
vocab_size = self.__model.vocab_size()
self.__topic_indexes = [[] for _ in range(vocab_size)]
self.__alias_tables = [VoseAlias() for _ in range(vocab_size)]
self.__prob_sum = np.zeros(vocab_size)
# Construct each word's alias table (prior is not included).
for i in tqdm(range(vocab_size)):
dist = []
prob_sum = 0
for key in self.__model.word_topic(i):
topic_id = key
word_topic_count = self.__model.word_topic(i)[key]
topic_sum = self.__model.topic_sum_value(topic_id)
self.__topic_indexes[i].append(topic_id)
q = word_topic_count / (topic_sum + self.__model.beta_sum())
dist.append(q)
prob_sum += q
self.__prob_sum[i] = prob_sum
if len(dist) > 0:
dist = np.array(dist, dtype=np.float)
self.__alias_tables[i].initialize(dist)
# Build prior parameter beta's alias table.
beta_dist = self.__model.beta() / (
self.__model.topic_sum() + self.__model.beta_sum())
self.__beta_prior_sum = np.sum(beta_dist)
self.__beta_alias.initialize(beta_dist)
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
new_topic = token.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, token)
new_topic = self.__word_proposal(doc, token, doc_proposed_topic)
return new_topic
def __sample_sentence(self, doc, sent):
new_topic = sent.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, sent)
new_topic = self.__word_proposal(doc, sent, doc_proposed_topic)
return new_topic
def __doc_proposal(self, doc, token):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
if dart < doc.size():
token_index = int(dart)
new_topic = doc.token(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
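                # -(True) is -1 (all bits set) and -(False) is 0, so the bitwise
                # select below keeps new_topic when the move is accepted and
                # old_topic when it is rejected.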
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
if dart < doc.size():
token_index = int(dart)
new_topic = doc.sent(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __word_proposal(self, doc, token, old_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
new_topic = self.__propose(token.id)
if new_topic != old_topic:
proposal_old = self.__word_proposal_distribution(
token.id, old_topic)
proposal_new = self.__word_proposal_distribution(
token.id, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
new_topic = old_topic
for word_id in sent.tokens:
new_topic = self.__propose(word_id)
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__word_proposal_distribution(
word_id, old_topic)
proposal_new = self.__word_proposal_distribution(
word_id, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
new_topic = (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __proportional_function(self, doc, token, new_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dt_alpha = doc.topic_sum(new_topic) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(
token.id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
return dt_alpha * wt_beta / t_sum_beta_sum
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
result = doc.topic_sum(new_topic) + self.__model.alpha()
if new_topic == old_topic:
result -= 1
for word_id in sent.tokens:
wt_beta = self.__model.word_topic_value(
word_id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
wt_beta -= 1
t_sum_beta_sum -= 1
result *= wt_beta / t_sum_beta_sum
return result
else:
logger.error("Wrong input argument type!")
def __word_proposal_distribution(self, word_id, topic):
wt_beta = self.__model.word_topic_value(word_id,
topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
topic) + self.__model.beta_sum()
return wt_beta / t_sum_beta_sum
def __doc_proposal_distribution(self, doc, topic):
return doc.topic_sum(topic) + self.__model.alpha()
def __propose(self, word_id):
dart = rand() * (self.__prob_sum[word_id] + self.__beta_prior_sum)
if dart < self.__prob_sum[word_id]:
idx = self.__alias_tables[word_id].generate()
topic = self.__topic_indexes[word_id][idx]
else:
topic = self.__beta_alias.generate()
return topic
class GibbsSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
old_topic = token.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
for i in range(num_topics):
dt_alpha = doc.topic_sum(i) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(token.id,
i) + self.__model.beta()
            t_sum_beta_sum = self.__model.topic_sum_value(i) + self.__model.beta_sum()
if i == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
prob[i] = dt_alpha * wt_beta / t_sum_beta_sum
sum_ += prob[i]
accum_prob[i] = prob[i] if i == 0 else accum_prob[i - 1] + prob[i]
dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for i in range(1, num_topics):
if accum_prob[i - 1] < dart <= accum_prob[i]:
return i
return num_topics - 1
def __sample_sentence(self, doc, sent):
old_topic = sent.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
for t in range(num_topics):
dt_alpha = doc.topic_sum(t) + self.__model.alpha()
            t_sum_beta_sum = self.__model.topic_sum_value(t) + self.__model.beta_sum()
if t == old_topic:
if dt_alpha > 1:
dt_alpha -= 1
if t_sum_beta_sum > 1:
t_sum_beta_sum -= 1
prob[t] = dt_alpha
for i in range(len(sent.tokens)):
w = sent.tokens[i]
wt_beta = self.__model.word_topic_value(
w, t) + self.__model.beta()
if t == old_topic and wt_beta > 1:
wt_beta -= 1
                # Note: if the sentence is very long, the product of many factors can
                # become extremely small and numerical precision may be lost.
prob[t] *= wt_beta / t_sum_beta_sum
sum_ += prob[t]
accum_prob[t] = prob[t] if t == 0 else accum_prob[t - 1] + prob[t]
        dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for t in range(1, num_topics):
if accum_prob[t - 1] < dart <= accum_prob[t]:
return t
return num_topics - 1
import os
import numpy as np
from paddlehub.common.logger import logger
from lda_novel.vocab import OOV
EPS = 1e-06
class WordAndDis(object):
def __init__(self):
self.word = None
self.distance = None
class SemanticMatching(object):
def __init__(self):
pass
def l2_norm(self, vec):
"""Calculate the length of vector.
"""
result = np.sqrt(np.sum(vec**2))
return result
def cosine_similarity(self, vec1, vec2):
norm1 = self.l2_norm(vec1)
norm2 = self.l2_norm(vec2)
result = np.sum(vec1 * vec2) / norm1 / norm2
return result
def likelihood_based_similarity(self, terms, doc_topic_dist, model):
"""
Args:
terms: list of strings
doc_topic_dist: list of Topic class
model: TopicModel class
"""
num_of_term_in_vocab = 0
result = 0
for i in range(len(terms)):
term_id = model.term_id(terms[i])
if term_id == OOV:
continue
num_of_term_in_vocab += 1
for j in range(len(doc_topic_dist)):
topic_id = doc_topic_dist[j].tid
prob = doc_topic_dist[j].prob
result += model.word_topic_value(term_id, topic_id) * 1.0 / \
model.topic_sum_value(topic_id) * prob
if num_of_term_in_vocab == 0:
return result
return result / num_of_term_in_vocab
def kullback_leibler_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist2[dist2 < EPS] = EPS
result = np.sum(dist1 * np.log(dist1 / dist2))
return result
def jensen_shannon_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist1[dist1 < EPS] = EPS
dist2[dist2 < EPS] = EPS
mean = (dist1 + dist2) * 0.5
jsd = self.kullback_leibler_divergence(dist1, mean) * 0.5 + \
self.kullback_leibler_divergence(dist2, mean) * 0.5
return jsd
def hellinger_distance(self, dist1, dist2):
assert dist1.shape == dist2.shape
result = np.sum((np.sqrt(dist1) - np.sqrt(dist2))**2)
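        # 0.7071067812 is 1 / sqrt(2), the normalization constant of the Hellinger distance.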
result = np.sqrt(result) * 0.7071067812
return result
import os
import numpy as np
from paddlehub.common.logger import logger
class Tokenizer(object):
"""Base tokenizer class.
"""
def __init__(self):
pass
def tokenize(self, text):
raise NotImplementedError
class SimpleTokenizer(Tokenizer):
"""Simple version FMM(Forward Maximun Matching) word tokenizer. This tokenizer can only
be used in topic model demo, but not in real business application scenarios.
Notes: This tokenizer can only recognize the words in the corresponding vocab file.
"""
def __init__(self, vocab_path):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__load_vocab(vocab_path)
def tokenize(self, text):
"""Tokenize the input string `text`, and return the tokenize result.
"""
text_len = len(text)
result = []
i = 0
while i < text_len:
word = found_word = ""
# Deal with English characters.
if self.__is_eng_char(text[i]):
for j in range(i, text_len + 1):
if j < text_len and self.__is_eng_char(text[j]):
word += self.__tolower(text[j])
else:
# Forward matching by character granularity.
if word in self.__vocab:
result.append(word)
i = j - 1
break
else:
for j in range(i, min(i + self.__max_word_len, text_len)):
word += text[j]
if word in self.__vocab:
found_word = word
if len(found_word) > 0:
result.append(found_word)
i += len(found_word) - 1
i += 1
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def __is_eng_char(self, c):
"""Check whether char c is an English character.
"""
return (c >= 'A' and c <= 'Z') or (c >= 'a' and c <= 'z')
def __tolower(self, c):
"""Return the lowercase character of the corresponding character, or return
the original character if there is no corresponding lowercase character.
"""
return c.lower()
class LACTokenizer(Tokenizer):
def __init__(self, vocab_path, lac):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__lac = lac
self.__load_vocab(vocab_path)
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def tokenize(self, text):
results = self.__lac.lexical_analysis(
texts=[text], use_gpu=True, batch_size=1, return_tag=True)
        # Lowercase English words and keep only the words that appear in the vocab.
words = results[0]["word"]
result = []
for word in words:
word = word.lower()
if word in self.__vocab:
result.append(word)
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
import time
import yaml
import numpy as np
from paddlehub.common.logger import logger
from lda_novel.config import ModelType
def load_prototxt(config_file, config):
"""
Args:
config_file: model configuration file.
config: ModelConfig class
"""
logger.info("Loading LDA config.")
with open(config_file, 'r') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
if yaml_dict["type"] == "LDA":
config.type = ModelType.LDA
else:
config.type = ModelType.SLDA
config.num_topics = yaml_dict["num_topics"]
config.alpha = yaml_dict["alpha"]
config.beta = yaml_dict["beta"]
config.word_topic_file = yaml_dict["word_topic_file"]
config.vocab_file = yaml_dict["vocab_file"]
def fix_random_seed(seed=2147483647):
np.random.seed(seed)
def rand(min_=0, max_=1):
return np.random.uniform(low=min_, high=max_)
def rand_k(k):
"""Returns an integer float number between [0, k - 1].
"""
return int(rand() * k)
def timeit(f):
"""Return time cost of function f.
"""
def timed(*args, **kwargs):
start_time = time.time()
result = f(*args, **kwargs)
end_time = time.time()
print(" [-] %s : %2.5f sec" % (f.__name__, end_time - start_time))
return result
return timed
from paddlehub.common.logger import logger
OOV = -1
class WordCount(object):
def __init__(self, word_id, count):
self.word_id = word_id
self.count = count
class Vocab(object):
def __init__(self):
self.__term2id = {}
self.__id2term = {}
def get_id(self, word):
if word not in self.__term2id:
return OOV
return self.__term2id[word]
def load(self, vocab_file):
self.__term2id = {}
self.__id2term = {}
with open(vocab_file, 'r') as fin:
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(
fields) == 5, "Vocabulary file [%s] format error!" % (
vocab_file)
term = fields[1]
id_ = int(fields[2])
if term in self.__term2id:
logger.error("Duplicate word [%s] in vocab file!" % (term))
continue
self.__term2id[term] = id_
self.__id2term[id_] = term
def size(self):
return len(self.__term2id)
def vocabulary(self):
return self.__id2term
import os
import numpy as np
from paddlehub.common.logger import logger
from lda_novel.util import rand, rand_k
class VoseAlias(object):
"""Vose's Alias Method.
"""
def __init__(self):
self.__alias = None
self.__prob = None # np.array
def initialize(self, distribution):
"""Initialize the alias table according to the input distribution
Arg:
distribution: Numpy array.
"""
size = distribution.shape[0]
self.__alias = np.zeros(size, dtype=np.int64)
self.__prob = np.zeros(size)
sum_ = np.sum(distribution)
p = distribution / sum_ * size # Scale up probability.
large, small = [], []
for i, p_ in enumerate(p):
if p_ < 1.0:
small.append(i)
else:
large.append(i)
while large and small:
l = small[0]
g = large[0]
small.pop(0)
large.pop(0)
self.__prob[l] = p[l]
self.__alias[l] = g
p[g] = p[g] + p[l] - 1 # A more numerically stable option.
if p[g] < 1.0:
small.append(g)
else:
large.append(g)
while large:
g = large[0]
large.pop(0)
self.__prob[g] = 1.0
while small:
l = small[0]
small.pop(0)
self.__prob[l] = 1.0
def generate(self):
"""Generate samples from given distribution.
"""
dart1 = rand_k(self.size())
        # Keep the uniformly chosen column with probability prob[dart1];
        # otherwise return its alias.
        dart2 = rand()
        return dart1 if dart2 < self.__prob[dart1] else self.__alias[dart1]
def size(self):
return self.__prob.shape[0]
## Model Overview
A topic model is a statistical model that clusters the latent semantic structure of documents in an unsupervised way, and LDA (Latent Dirichlet Allocation) is one such model. By analyzing word co-occurrence information, LDA fits a word-document-topic distribution and thus maps words and texts into a common semantic space. This Module is trained on a webpage-domain dataset built by Baidu.
<p align="center">
<img src="https://bj.bcebos.com/paddlehub/model/nlp/semantic_model/lda.png" hspace='10'/> <br />
</p>
For more details, please refer to the [LDA paper](http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf).
Note: this Module is contributed by the third-party developer DesmonDay.
## LDA Model API Description
### cal_doc_distance(doc_text1, doc_text2)
Calculates the distance between two input documents, including the Jensen-Shannon divergence (JSD) and the Hellinger distance (HD).
**Parameters**
- doc_text1(str): the first input document.
- doc_text2(str): the second input document.
**Returns**
- jsd(float): the [Jensen-Shannon divergence](https://blog.csdn.net/FrankieHello/article/details/80614422?utm_source=copy) between the two documents.
- hd(float): the [Hellinger Distance](http://blog.sina.com.cn/s/blog_85f1ffb70101e65d.html) between the two documents.
### cal_doc_keywords_similarity(document, top_k=10)
Finds the top k keywords of the input document together with their similarity to the original document.
**Parameters**
- document(str): the input document.
- top_k(int): number of keywords to extract from the input document.
**Returns**
- results(list): contains each keyword and its similarity to the original document. Each element of the list is a dict whose key is the keyword and whose value is the similarity to the original document.
### cal_query_doc_similarity(query, document)
Calculates the similarity between a short text and a long document.
**Parameters**
- query(str): the input short text.
- document(str): the input long document.
**Returns**
- lda_sim(float): the similarity between the short text and the long document.
### infer_doc_topic_distribution(document)
Infers the topic distribution of a document.
**Parameters**
- document(str): the input document.
**Returns**
- results(list): contains the topic IDs and the corresponding probabilities of the topic distribution. Each element of the list is a dict whose key is the topic ID and whose value is the probability of that topic.
### show_topic_keywords(topic_id, k=10)
Shows the keywords under a given topic; it can be used together with the topic-distribution inference API.
**Parameters**
- topic_id(int): topic ID.
- k(int): number of top keywords to return for the topic.
**Returns**
- results(dict): the top k keywords of the given topic and the probability of each keyword.
### Code Example
Below are usage examples for some of the APIs.
``` python
import paddlehub as hub
lda_webpage = hub.Module(name="lda_webpage")
jsd, hd = lda_webpage.cal_doc_distance(doc_text1="百度的网页上有着各种新闻的推荐,内容丰富多彩。", doc_text2="百度首页推荐着各种新闻,还提供了强大的搜索引擎功能。")
# jsd = 0.00249, hd = 0.0510
results = lda_webpage.cal_doc_keywords_similarity('百度首页推荐着各种新闻,还提供了强大的搜索引擎功能。')
# [{'word': '强大', 'similarity': 0.0838851256627093},
# {'word': '推荐', 'similarity': 0.06295345182499558},
# {'word': '新闻', 'similarity': 0.05894049247832139},
# {'word': '提供', 'similarity': 0.04179908620523299},
# {'word': '百度', 'similarity': 0.033778847361833536},
# {'word': '首页', 'similarity': 0.018429949496365026},
# {'word': '功能', 'similarity': 0.011409342579361237},
# {'word': '搜索引擎', 'similarity': 0.010392479335778413}]
```
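The remaining APIs follow the same pattern. The snippet below is a minimal illustrative sketch: the input strings are arbitrary examples and the return values depend on the loaded model, so no outputs are shown.
``` python
import paddlehub as hub
lda_webpage = hub.Module(name="lda_webpage")
# Similarity between a short query text and a longer document.
lda_sim = lda_webpage.cal_query_doc_similarity(
    query='搜索引擎', document='百度首页推荐着各种新闻,还提供了强大的搜索引擎功能。')
# Topic distribution of a document, sorted by probability in descending order.
topic_dist = lda_webpage.infer_doc_topic_distribution('百度首页推荐着各种新闻,还提供了强大的搜索引擎功能。')
# Keywords of the most probable topic, if any topic was inferred.
if topic_dist:
    keywords = lda_webpage.show_topic_keywords(topic_id=topic_dist[0]["topic id"], k=10)
```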
## View the Code
https://github.com/baidu/Familia
## Dependencies
paddlepaddle >= 1.8.2
paddlehub >= 1.8.0
## Release History
* 1.0.0
  First release
"""
This file defines the basic config information of LDA/SLDA model.
"""
class ModelType:
LDA = 0
SLDA = 1
class ModelConfig:
type = None
num_topics = None
alpha = None
beta = None
word_topic_file = None
vocab_file = None
import numpy as np
class Topic(object):
"""Basic data structure of topic, contains topic id and
corresponding probability.
"""
def __init__(self, tid, prob):
self.tid = tid # topic id
self.prob = prob # topic probability
class Token(object):
"""Basic storage unit of LDA documents, contains word id
and corresponding topic.
"""
def __init__(self, topic, id):
self.topic = topic
self.id = id
class Sentence(object):
"""Basic storage unit of SentenceLDA documents, contains word ids
of the sentence and its corresponding topic id.
"""
def __init__(self, topic, tokens):
self.topic = topic
self.tokens = tokens
class LDADoc(object):
"""The storage structure of LDA model's inference result.
"""
def __init__(self):
self._num_topics = None # Number of topics.
self._num_accum = None # Number of accumulated sample rounds.
self._alpha = None # Document prior parameter.
self._tokens = None # Storage structure of inference results.
self._topic_sum = None # Document's topic sum in one round samples.
self._accum_topic_sum = None # Accumulated results of topic sum.
def init(self, num_topics):
"""Initialize the LDADoc according to num_topics.
"""
self._num_topics = num_topics
self._num_accum = 0
self._tokens = []
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_token(self, token):
"""Add new word to current LDADoc.
Arg:
token: Token class object.
"""
assert token.topic >= 0, "Topic %d out of range!" % token.topic
assert token.topic < self._num_topics, "Topic %d out of range!" % token.topic
self._tokens.append(token)
self._topic_sum[token.topic] += 1
def token(self, index):
return self._tokens[index]
def set_topic(self, index, new_topic):
"""Set the index word's topic to new_topic, and update the corresponding
topic distribution.
"""
assert new_topic >= 0, "Topic %d out of range!" % new_topic
assert new_topic < self._num_topics, "Topic %d out of range!" % new_topic
old_topic = self._tokens[index].topic
if new_topic == old_topic:
return
self._tokens[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def set_alpha(self, alpha):
self._alpha = alpha
def size(self):
"""Return number of words in LDADoc.
"""
return len(self._tokens)
def topic_sum(self, topic_id):
return self._topic_sum[topic_id]
def sparse_topic_dist(self, sort=True):
"""Return the topic distribution of documents in sparse format.
By default, it is sorted according to the topic probability
under the descending order.
"""
topic_dist = []
sum_ = np.sum(self._accum_topic_sum)
        if sum_ == 0:
            return topic_dist
for i in range(0, self._num_topics):
if self._accum_topic_sum[i] == 0:
continue
topic_dist.append(Topic(i, self._accum_topic_sum[i] * 1.0 / sum_))
if sort:
def take_elem(topic):
return topic.prob
topic_dist.sort(key=take_elem, reverse=True)
if topic_dist is None:
topic_dist = []
return topic_dist
def dense_topic_dist(self):
"""Return the distribution of document topics in dense format,
taking into account the prior parameter alpha.
"""
dense_dist = np.zeros(self._num_topics)
if self.size() == 0:
return dense_dist
dense_dist = (
self._accum_topic_sum * 1.0 / self._num_accum + self._alpha) / (
self.size() + self._alpha * self._num_topics)
return dense_dist
def accumulate_topic_num(self):
self._accum_topic_sum += self._topic_sum
self._num_accum += 1
class SLDADoc(LDADoc):
"""Sentence LDA Document, inherited from LDADoc.
Add add_sentence interface.
"""
def __init__(self):
super().__init__()
self.__sentences = None
def init(self, num_topics):
"""Initialize the SLDADoc according to num_topics.
"""
self._num_topics = num_topics
self.__sentences = []
self._num_accum = 0
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_sentence(self, sent):
"""Add new sentence to current SLDADoc.
Arg:
sent: Sentence class object.
"""
assert sent.topic >= 0, "Topic %d out of range!" % (sent.topic)
assert sent.topic < self._num_topics, "Topic %d out of range!" % (
sent.topic)
self.__sentences.append(sent)
self._topic_sum[sent.topic] += 1
def set_topic(self, index, new_topic):
assert new_topic >= 0, "Topic %d out of range!" % (new_topic)
assert new_topic < self._num_topics, "Topic %d out of range!" % (
new_topic)
old_topic = self.__sentences[index].topic
if new_topic == old_topic:
return
self.__sentences[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def size(self):
"""Return number of sentences in SLDADoc.
"""
return len(self.__sentences)
def sent(self, index):
return self.__sentences[index]
import os
from paddlehub.common.logger import logger
from lda_webpage.config import ModelConfig
from lda_webpage.util import load_prototxt, fix_random_seed, rand_k
from lda_webpage.model import TopicModel
from lda_webpage.sampler import GibbsSampler, MHSampler
from lda_webpage.document import LDADoc, SLDADoc, Token, Sentence
from lda_webpage.vocab import OOV
class SamplerType:
GibbsSampling = 0
MetropolisHastings = 1
class InferenceEngine(object):
def __init__(self,
model_dir,
conf_file,
type=SamplerType.MetropolisHastings):
# Read model configuration.
config = ModelConfig()
conf_file_path = os.path.join(model_dir, conf_file)
load_prototxt(conf_file_path, config)
self.__model = TopicModel(model_dir, config)
self.__config = config
# Initialize the sampler according to the configuration.
if type == SamplerType.GibbsSampling:
self.__sampler = GibbsSampler(self.__model)
elif type == SamplerType.MetropolisHastings:
self.__sampler = MHSampler(self.__model)
def infer(self, input, doc):
"""Perform LDA topic inference on input, and store the results in doc.
Args:
input: a list of strings after tokenization.
doc: LDADoc type or SLDADoc type.
"""
fix_random_seed()
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for token in input:
id_ = self.__model.term_id(token)
if id_ != OOV:
init_topic = rand_k(self.__model.num_topics())
doc.add_token(Token(init_topic, id_))
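            # 20 burn-in iterations are discarded; 50 sampling iterations in total.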
self.lda_infer(doc, 20, 50)
elif isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for sent in input:
words = []
for token in sent:
id_ = self.__model.term_id(token)
if id_ != OOV:
words.append(id_)
init_topic = rand_k(self.__model.num_topics())
doc.add_sentence(Sentence(init_topic, words))
self.slda_infer(doc, 20, 50)
else:
logger.error("Wrong Doc Type!")
def lda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def slda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def model_type(self):
return self.__model.type()
def get_model(self):
return self.__model
def get_config(self):
return self.__config
import os
from collections import OrderedDict
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_webpage.vocab import Vocab, WordCount
class TopicModel(object):
"""Storage Structure of Topic model, including vocabulary and word topic count.
"""
def __init__(self, model_dir, config):
"""
Args:
model_dir: the path of model directory
config: ModelConfig class.
"""
self.__word_topic = None # Model parameter of word topic.
self.__vocab = Vocab() # Vocab data structure of model.
self.__num_topics = config.num_topics # Number of topics.
self.__alpha = config.alpha
self.__alpha_sum = self.__alpha * self.__num_topics
self.__beta = config.beta
self.__beta_sum = None
self.__type = config.type # Model type.
self.__topic_sum = np.zeros(
self.__num_topics,
dtype="int64") # Accum sum of each topic in word topic.
self.__topic_words = [[] for _ in range(self.__num_topics)]
word_topic_path = os.path.join(model_dir, config.word_topic_file)
vocab_path = os.path.join(model_dir, config.vocab_file)
self.load_model(word_topic_path, vocab_path)
def term_id(self, term):
return self.__vocab.get_id(term)
def load_model(self, word_topic_path, vocab_path):
# Loading vocabulary
self.__vocab.load(vocab_path)
self.__beta_sum = self.__beta * self.__vocab.size()
        self.__word_topic = [{} for _ in range(self.__vocab.size())]  # List of per-word {topic_id: count} dicts.
self.__load_word_dict(word_topic_path)
logger.info(
"Model Info: #num_topics=%d #vocab_size=%d alpha=%f beta=%f" %
(self.num_topics(), self.vocab_size(), self.alpha(), self.beta()))
def word_topic_value(self, word_id, topic_id):
"""Return value of specific word under specific topic in the model.
"""
word_dict = self.__word_topic[word_id]
if topic_id not in word_dict:
return 0
return word_dict[topic_id]
def word_topic(self, term_id):
"""Return the topic distribution of a word.
"""
return self.__word_topic[term_id]
def topic_sum_value(self, topic_id):
return self.__topic_sum[topic_id]
def topic_sum(self):
return self.__topic_sum
def num_topics(self):
return self.__num_topics
def vocab_size(self):
return self.__vocab.size()
def alpha(self):
return self.__alpha
def alpha_sum(self):
return self.__alpha_sum
def beta(self):
return self.__beta
def beta_sum(self):
return self.__beta_sum
def type(self):
return self.__type
def __load_word_dict(self, word_dict_path):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
with open(word_dict_path, 'r') as f:
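            # Each line has the format: "<term_id> <topic_id>:<count> <topic_id>:<count> ...".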
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
term_id = int(fields[0])
assert term_id < self.vocab_size(), "Term id out of range!"
assert term_id >= 0, "Term id out of range!"
for i in range(1, len(fields)):
topic_count = fields[i].split(":")
assert len(topic_count) == 2, "Topic count format error!"
topic_id = int(topic_count[0])
assert topic_id >= 0, "Topic out of range!"
assert topic_id < self.__num_topics, "Topic out of range!"
count = int(topic_count[1])
assert count >= 0, "Topic count error!"
self.__word_topic[term_id][topic_id] = count
self.__topic_sum[topic_id] += count
self.__topic_words[topic_id].append(
WordCount(term_id, count))
new_dict = OrderedDict()
for key in sorted(self.__word_topic[term_id]):
new_dict[key] = self.__word_topic[term_id][key]
self.__word_topic[term_id] = new_dict
def get_vocab(self):
return self.__vocab.vocabulary()
def topic_words(self):
return self.__topic_words
import os
import paddlehub as hub
from paddlehub.module.module import moduleinfo
from paddlehub.common.logger import logger
from lda_webpage.inference_engine import InferenceEngine
from lda_webpage.document import LDADoc
from lda_webpage.semantic_matching import SemanticMatching, WordAndDis
from lda_webpage.tokenizer import LACTokenizer, SimpleTokenizer
from lda_webpage.config import ModelType
from lda_webpage.vocab import Vocab, WordCount
@moduleinfo(
name="lda_webpage",
version="1.0.0",
summary=
"This is a PaddleHub Module for LDA topic model in webpage dataset, where we can calculate doc distance, calculate the similarity between query and document, etc.",
author="DesmonDay",
author_email="",
type="nlp/semantic_model")
class TopicModel(hub.Module):
def _initialize(self):
"""
Initialize with the necessary elements.
"""
self.model_dir = os.path.join(self.directory, 'webpage')
self.conf_file = 'lda.conf'
self.__engine = InferenceEngine(self.model_dir, self.conf_file)
self.vocab_path = os.path.join(self.model_dir, 'vocab_info.txt')
lac = hub.Module(name="lac")
# self.__tokenizer = SimpleTokenizer(self.vocab_path)
self.__tokenizer = LACTokenizer(self.vocab_path, lac)
self.vocabulary = self.__engine.get_model().get_vocab()
self.config = self.__engine.get_config()
self.topic_words = self.__engine.get_model().topic_words()
self.topic_sum_table = self.__engine.get_model().topic_sum()
def take_elem(word_count):
return word_count.count
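        # Pre-sort each topic's words by count so show_topic_keywords can take the top k directly.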
for i in range(self.config.num_topics):
self.topic_words[i].sort(key=take_elem, reverse=True)
logger.info("Finish initialization.")
def cal_doc_distance(self, doc_text1, doc_text2):
"""
This interface calculates the distance between documents.
Args:
doc_text1(str): the input document text 1.
doc_text2(str): the input document text 2.
Returns:
jsd(float): Jensen-Shannon Divergence distance of two documents.
hd(float): Hellinger Distance of two documents.
"""
doc1_tokens = self.__tokenizer.tokenize(doc_text1)
doc2_tokens = self.__tokenizer.tokenize(doc_text2)
# Document topic inference.
doc1, doc2 = LDADoc(), LDADoc()
self.__engine.infer(doc1_tokens, doc1)
self.__engine.infer(doc2_tokens, doc2)
# To calculate jsd, we need dense document topic distribution.
dense_dict1 = doc1.dense_topic_dist()
dense_dict2 = doc2.dense_topic_dist()
# Calculate the distance between distributions.
# The smaller the distance, the higher the document semantic similarity.
sm = SemanticMatching()
jsd = sm.jensen_shannon_divergence(dense_dict1, dense_dict2)
hd = sm.hellinger_distance(dense_dict1, dense_dict2)
return jsd, hd
def cal_doc_keywords_similarity(self, document, top_k=10):
"""
This interface can be used to find topk keywords of document.
Args:
document(str): the input document text.
top_k(int): top k keywords of this document.
Returns:
results(list): contains top_k keywords and their
corresponding similarity compared to document.
"""
d_tokens = self.__tokenizer.tokenize(document)
# Do topic inference on documents to obtain topic distribution.
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
items = []
words = set()
for word in d_tokens:
if word in words:
continue
words.add(word)
wd = WordAndDis()
wd.word = word
sm = SemanticMatching()
wd.distance = sm.likelihood_based_similarity(
terms=[word],
doc_topic_dist=doc_topic_dist,
model=self.__engine.get_model())
items.append(wd)
def take_elem(word_dis):
return word_dis.distance
items.sort(key=take_elem, reverse=True)
results = []
size = len(items)
for i in range(top_k):
if i >= size:
break
results.append({
"word": items[i].word,
"similarity": items[i].distance
})
return results
def cal_query_doc_similarity(self, query, document):
"""
This interface calculates the similarity between query and document.
Args:
query(str): the input query text.
document(str): the input document text.
Returns:
lda_sim(float): likelihood based similarity between query and document based on LDA.
"""
q_tokens = self.__tokenizer.tokenize(query)
d_tokens = self.__tokenizer.tokenize(document)
doc = LDADoc()
self.__engine.infer(d_tokens, doc)
doc_topic_dist = doc.sparse_topic_dist()
sm = SemanticMatching()
lda_sim = sm.likelihood_based_similarity(q_tokens, doc_topic_dist,
self.__engine.get_model())
return lda_sim
def infer_doc_topic_distribution(self, document):
"""
This interface infers the topic distribution of document.
Args:
document(str): the input document text.
Returns:
results(list): returns the topic distribution of document.
"""
tokens = self.__tokenizer.tokenize(document)
if tokens == []:
return []
results = []
doc = LDADoc()
self.__engine.infer(tokens, doc)
topics = doc.sparse_topic_dist()
for topic in topics:
results.append({"topic id": topic.tid, "distribution": topic.prob})
return results
def show_topic_keywords(self, topic_id, k=10):
"""
This interface returns the first k keywords under specific topic.
Args:
topic_id(int): topic information we want to know.
k(int): top k keywords.
Returns:
results(dict): contains specific topic's keywords and
corresponding probability.
"""
EPS = 1e-8
results = {}
if 0 <= topic_id < self.config.num_topics:
k = min(k, len(self.topic_words[topic_id]))
for i in range(k):
prob = self.topic_words[topic_id][i].count / \
(self.topic_sum_table[topic_id] + EPS)
results[self.vocabulary[self.topic_words[topic_id]
[i].word_id]] = prob
return results
else:
logger.error("%d is out of range!" % topic_id)
import os
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from lda_webpage.document import LDADoc, SLDADoc, Token, Sentence
from lda_webpage.vose_alias import VoseAlias
from lda_webpage.util import rand, rand_k
class Sampler(object):
def __init__(self):
pass
def sample_doc(self, doc):
"""Sample LDA or SLDA topics for documents.
"""
raise NotImplementedError
class MHSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
self.__topic_indexes = None
self.__alias_tables = None
self.__prob_sum = None
self.__beta_alias = VoseAlias()
self.__beta_prior_sum = None
self.__mh_steps = 2
self.__construct_alias_table()
def __construct_alias_table(self):
"""Construct alias table for all words.
"""
logger.info("Construct alias table for alias sampling method.")
vocab_size = self.__model.vocab_size()
self.__topic_indexes = [[] for _ in range(vocab_size)]
self.__alias_tables = [VoseAlias() for _ in range(vocab_size)]
self.__prob_sum = np.zeros(vocab_size)
# Construct each word's alias table (prior is not included).
for i in tqdm(range(vocab_size)):
dist = []
prob_sum = 0
for key in self.__model.word_topic(i):
topic_id = key
word_topic_count = self.__model.word_topic(i)[key]
topic_sum = self.__model.topic_sum_value(topic_id)
self.__topic_indexes[i].append(topic_id)
q = word_topic_count / (topic_sum + self.__model.beta_sum())
dist.append(q)
prob_sum += q
self.__prob_sum[i] = prob_sum
if len(dist) > 0:
dist = np.array(dist, dtype=np.float)
self.__alias_tables[i].initialize(dist)
# Build prior parameter beta's alias table.
beta_dist = self.__model.beta() / (
self.__model.topic_sum() + self.__model.beta_sum())
self.__beta_prior_sum = np.sum(beta_dist)
self.__beta_alias.initialize(beta_dist)
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
new_topic = token.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, token)
new_topic = self.__word_proposal(doc, token, doc_proposed_topic)
return new_topic
def __sample_sentence(self, doc, sent):
new_topic = sent.topic
for i in range(self.__mh_steps):
doc_proposed_topic = self.__doc_proposal(doc, sent)
new_topic = self.__word_proposal(doc, sent, doc_proposed_topic)
return new_topic
def __doc_proposal(self, doc, token):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
if dart < doc.size():
token_index = int(dart)
new_topic = doc.token(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
dart = rand() * (doc.size() + self.__model.alpha_sum())
if dart < doc.size():
token_index = int(dart)
new_topic = doc.sent(token_index).topic
else:
new_topic = rand_k(self.__model.num_topics())
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__doc_proposal_distribution(doc, old_topic)
proposal_new = self.__doc_proposal_distribution(doc, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __word_proposal(self, doc, token, old_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
new_topic = self.__propose(token.id)
if new_topic != old_topic:
proposal_old = self.__word_proposal_distribution(
token.id, old_topic)
proposal_new = self.__word_proposal_distribution(
token.id, new_topic)
proportion_old = self.__proportional_function(
doc, token, old_topic)
proportion_new = self.__proportional_function(
doc, token, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
return (new_topic & mask) | (old_topic & ~mask)
return new_topic
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
new_topic = old_topic
for word_id in sent.tokens:
new_topic = self.__propose(word_id)
if new_topic != old_topic:
proportion_old = self.__proportional_function(
doc, sent, old_topic)
proportion_new = self.__proportional_function(
doc, sent, new_topic)
proposal_old = self.__word_proposal_distribution(
word_id, old_topic)
proposal_new = self.__word_proposal_distribution(
word_id, new_topic)
transition_prob = float((proportion_new * proposal_old) /
(proportion_old * proposal_new))
rejection = rand()
mask = -(rejection < transition_prob)
new_topic = (new_topic & mask) | (old_topic & ~mask)
return new_topic
def __proportional_function(self, doc, token, new_topic):
if isinstance(doc, LDADoc) and isinstance(token, Token):
old_topic = token.topic
dt_alpha = doc.topic_sum(new_topic) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(
token.id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
return dt_alpha * wt_beta / t_sum_beta_sum
elif isinstance(doc, SLDADoc) and isinstance(token, Sentence):
sent = token
old_topic = sent.topic
result = doc.topic_sum(new_topic) + self.__model.alpha()
if new_topic == old_topic:
result -= 1
for word_id in sent.tokens:
wt_beta = self.__model.word_topic_value(
word_id, new_topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
new_topic) + self.__model.beta_sum()
if new_topic == old_topic and wt_beta > 1:
wt_beta -= 1
t_sum_beta_sum -= 1
result *= wt_beta / t_sum_beta_sum
return result
else:
logger.error("Wrong input argument type!")
def __word_proposal_distribution(self, word_id, topic):
wt_beta = self.__model.word_topic_value(word_id,
topic) + self.__model.beta()
t_sum_beta_sum = self.__model.topic_sum_value(
topic) + self.__model.beta_sum()
return wt_beta / t_sum_beta_sum
def __doc_proposal_distribution(self, doc, topic):
return doc.topic_sum(topic) + self.__model.alpha()
def __propose(self, word_id):
dart = rand() * (self.__prob_sum[word_id] + self.__beta_prior_sum)
if dart < self.__prob_sum[word_id]:
idx = self.__alias_tables[word_id].generate()
topic = self.__topic_indexes[word_id][idx]
else:
topic = self.__beta_alias.generate()
return topic
class GibbsSampler(Sampler):
def __init__(self, model):
super().__init__()
self.__model = model
def sample_doc(self, doc):
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_token(doc, doc.token(i))
doc.set_topic(i, new_topic)
elif isinstance(doc, SLDADoc):
for i in range(doc.size()):
new_topic = self.__sample_sentence(doc, doc.sent(i))
doc.set_topic(i, new_topic)
def __sample_token(self, doc, token):
old_topic = token.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
for i in range(num_topics):
dt_alpha = doc.topic_sum(i) + self.__model.alpha()
wt_beta = self.__model.word_topic_value(token.id,
i) + self.__model.beta()
            t_sum_beta_sum = self.__model.topic_sum_value(i) + self.__model.beta_sum()
if i == old_topic and wt_beta > 1:
if dt_alpha > 1:
dt_alpha -= 1
wt_beta -= 1
t_sum_beta_sum -= 1
prob[i] = dt_alpha * wt_beta / t_sum_beta_sum
sum_ += prob[i]
accum_prob[i] = prob[i] if i == 0 else accum_prob[i - 1] + prob[i]
dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for i in range(1, num_topics):
if accum_prob[i - 1] < dart <= accum_prob[i]:
return i
return num_topics - 1
def __sample_sentence(self, doc, sent):
old_topic = sent.topic
num_topics = self.__model.num_topics()
accum_prob = np.zeros(num_topics)
prob = np.zeros(num_topics)
sum_ = 0
for t in range(num_topics):
dt_alpha = doc.topic_sum(t) + self.__model.alpha()
            t_sum_beta_sum = self.__model.topic_sum_value(t) + self.__model.beta_sum()
if t == old_topic:
if dt_alpha > 1:
dt_alpha -= 1
if t_sum_beta_sum > 1:
t_sum_beta_sum -= 1
prob[t] = dt_alpha
for i in range(len(sent.tokens)):
w = sent.tokens[i]
wt_beta = self.__model.word_topic_value(
w, t) + self.__model.beta()
if t == old_topic and wt_beta > 1:
wt_beta -= 1
                # Note: if the sentence is very long, the product of many factors can
                # become extremely small and numerical precision may be lost.
prob[t] *= wt_beta / t_sum_beta_sum
sum_ += prob[t]
accum_prob[t] = prob[t] if t == 0 else accum_prob[t - 1] + prob[t]
        dart = rand() * sum_
if dart <= accum_prob[0]:
return 0
for t in range(1, num_topics):
if accum_prob[t - 1] < dart <= accum_prob[t]:
return t
return num_topics - 1
import os
import numpy as np
from paddlehub.common.logger import logger
from lda_webpage.vocab import OOV
EPS = 1e-06
class WordAndDis(object):
def __init__(self):
self.word = None
self.distance = None
class SemanticMatching(object):
def __init__(self):
pass
def l2_norm(self, vec):
"""Calculate the length of vector.
"""
result = np.sqrt(np.sum(vec**2))
return result
def cosine_similarity(self, vec1, vec2):
norm1 = self.l2_norm(vec1)
norm2 = self.l2_norm(vec2)
result = np.sum(vec1 * vec2) / norm1 / norm2
return result
def likelihood_based_similarity(self, terms, doc_topic_dist, model):
"""
Args:
terms: list of strings
doc_topic_dist: list of Topic class
model: TopicModel class
"""
num_of_term_in_vocab = 0
result = 0
for i in range(len(terms)):
term_id = model.term_id(terms[i])
if term_id == OOV:
continue
num_of_term_in_vocab += 1
for j in range(len(doc_topic_dist)):
topic_id = doc_topic_dist[j].tid
prob = doc_topic_dist[j].prob
result += model.word_topic_value(term_id, topic_id) * 1.0 / \
model.topic_sum_value(topic_id) * prob
if num_of_term_in_vocab == 0:
return result
return result / num_of_term_in_vocab
def kullback_leibler_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist2[dist2 < EPS] = EPS
result = np.sum(dist1 * np.log(dist1 / dist2))
return result
def jensen_shannon_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist1[dist1 < EPS] = EPS
dist2[dist2 < EPS] = EPS
mean = (dist1 + dist2) * 0.5
jsd = self.kullback_leibler_divergence(dist1, mean) * 0.5 + \
self.kullback_leibler_divergence(dist2, mean) * 0.5
return jsd
def hellinger_distance(self, dist1, dist2):
assert dist1.shape == dist2.shape
result = np.sum((np.sqrt(dist1) - np.sqrt(dist2))**2)
result = np.sqrt(result) * 0.7071067812
return result
import os
import numpy as np
from paddlehub.common.logger import logger
class Tokenizer(object):
"""Base tokenizer class.
"""
def __init__(self):
pass
def tokenize(self, text):
raise NotImplementedError
class SimpleTokenizer(Tokenizer):
"""Simple version FMM(Forward Maximun Matching) word tokenizer. This tokenizer can only
be used in topic model demo, but not in real business application scenarios.
Notes: This tokenizer can only recognize the words in the corresponding vocab file.
"""
def __init__(self, vocab_path):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__load_vocab(vocab_path)
def tokenize(self, text):
"""Tokenize the input string `text`, and return the tokenize result.
"""
text_len = len(text)
result = []
i = 0
while i < text_len:
word = found_word = ""
# Deal with English characters.
if self.__is_eng_char(text[i]):
for j in range(i, text_len + 1):
if j < text_len and self.__is_eng_char(text[j]):
word += self.__tolower(text[j])
else:
# Forward matching by character granularity.
if word in self.__vocab:
result.append(word)
i = j - 1
break
else:
for j in range(i, min(i + self.__max_word_len, text_len)):
word += text[j]
if word in self.__vocab:
found_word = word
if len(found_word) > 0:
result.append(found_word)
i += len(found_word) - 1
i += 1
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def __is_eng_char(self, c):
"""Check whether char c is an English character.
"""
return (c >= 'A' and c <= 'Z') or (c >= 'a' and c <= 'z')
def __tolower(self, c):
"""Return the lowercase character of the corresponding character, or return
the original character if there is no corresponding lowercase character.
"""
return c.lower()
class LACTokenizer(Tokenizer):
def __init__(self, vocab_path, lac):
super().__init__()
self.__max_word_len = 0
self.__vocab = set()
self.__lac = lac
self.__load_vocab(vocab_path)
def __load_vocab(self, vocab_path):
"""Load the word dictionary.
"""
with open(vocab_path, 'r') as fin:
vocab_size = 0
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(fields) >= 2
word = fields[1]
self.__max_word_len = max(self.__max_word_len, len(word))
self.__vocab.add(word)
vocab_size += 1
def tokenize(self, text):
results = self.__lac.lexical_analysis(
texts=[text], use_gpu=True, batch_size=1, return_tag=True)
        # Lowercase English words and keep only the words that appear in the vocab.
words = results[0]["word"]
result = []
for word in words:
word = word.lower()
if word in self.__vocab:
result.append(word)
return result
def contains(self, word):
"""Check whether the word is in the vocabulary.
"""
return word in self.__vocab
import time
import yaml
import numpy as np
from paddlehub.common.logger import logger
from lda_webpage.config import ModelType
def load_prototxt(config_file, config):
"""
Args:
config_file: model configuration file.
config: ModelConfig class
"""
logger.info("Loading LDA config.")
with open(config_file, 'r') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
if yaml_dict["type"] == "LDA":
config.type = ModelType.LDA
else:
config.type = ModelType.SLDA
config.num_topics = yaml_dict["num_topics"]
config.alpha = yaml_dict["alpha"]
config.beta = yaml_dict["beta"]
config.word_topic_file = yaml_dict["word_topic_file"]
config.vocab_file = yaml_dict["vocab_file"]
def fix_random_seed(seed=2147483647):
np.random.seed(seed)
def rand(min_=0, max_=1):
return np.random.uniform(low=min_, high=max_)
def rand_k(k):
"""Returns an integer float number between [0, k - 1].
"""
return int(rand() * k)
def timeit(f):
"""Return time cost of function f.
"""
def timed(*args, **kwargs):
start_time = time.time()
result = f(*args, **kwargs)
end_time = time.time()
print(" [-] %s : %2.5f sec" % (f.__name__, end_time - start_time))
return result
return timed
from paddlehub.common.logger import logger
OOV = -1
class WordCount(object):
def __init__(self, word_id, count):
self.word_id = word_id
self.count = count
class Vocab(object):
def __init__(self):
self.__term2id = {}
self.__id2term = {}
def get_id(self, word):
if word not in self.__term2id:
return OOV
return self.__term2id[word]
def load(self, vocab_file):
self.__term2id = {}
self.__id2term = {}
with open(vocab_file, 'r') as fin:
for line in fin.readlines():
fields = line.strip().split('\t')
assert len(
fields) == 5, "Vocabulary file [%s] format error!" % (
vocab_file)
term = fields[1]
id_ = int(fields[2])
if term in self.__term2id:
logger.error("Duplicate word [%s] in vocab file!" % (term))
continue
self.__term2id[term] = id_
self.__id2term[id_] = term
def size(self):
return len(self.__term2id)
def vocabulary(self):
return self.__id2term
import os
import numpy as np
from paddlehub.common.logger import logger
from lda_webpage.util import rand, rand_k
class VoseAlias(object):
"""Vose's Alias Method.
"""
def __init__(self):
self.__alias = None
self.__prob = None # np.array
def initialize(self, distribution):
"""Initialize the alias table according to the input distribution
Arg:
distribution: Numpy array.
"""
size = distribution.shape[0]
self.__alias = np.zeros(size, dtype=np.int64)
self.__prob = np.zeros(size)
sum_ = np.sum(distribution)
p = distribution / sum_ * size # Scale up probability.
large, small = [], []
for i, p_ in enumerate(p):
if p_ < 1.0:
small.append(i)
else:
large.append(i)
while large and small:
l = small[0]
g = large[0]
small.pop(0)
large.pop(0)
self.__prob[l] = p[l]
self.__alias[l] = g
p[g] = p[g] + p[l] - 1 # A more numerically stable option.
if p[g] < 1.0:
small.append(g)
else:
large.append(g)
while large:
g = large[0]
large.pop(0)
self.__prob[g] = 1.0
while small:
l = small[0]
small.pop(0)
self.__prob[l] = 1.0
def generate(self):
"""Generate samples from given distribution.
"""
dart1 = rand_k(self.size())
        # Keep the uniformly chosen column with probability prob[dart1];
        # otherwise return its alias.
        dart2 = rand()
        return dart1 if dart2 < self.__prob[dart1] else self.__alias[dart1]
def size(self):
return self.__prob.shape[0]
## Model Overview
A topic model is a statistical model that clusters the latent semantic structure of documents in an unsupervised way, and SLDA (Sentence-LDA) is one such model. SLDA extends LDA: LDA assumes that each word corresponds to a topic, whereas SLDA assumes that each sentence corresponds to a topic. This Module is trained on a news-domain dataset built by Baidu.
<p align="center">
<img src="https://bj.bcebos.com/paddlehub/model/nlp/semantic_model/slda.png" hspace='10'/> <br />
</p>
For more details, please refer to the [SLDA paper](https://pdfs.semanticscholar.org/c311/778adb9484c86250e915aecd9714f4206050.pdf).
Note: this Module is contributed by the third-party developer DesmonDay.
## SLDA Model API Description
### infer_doc_topic_distribution(document)
Infers the topic distribution of a document.
**Parameters**
- document(str): the input document.
**Returns**
- results(list): contains the topic IDs and the corresponding probabilities of the topic distribution. Each element of the list is a dict whose key is the topic ID and whose value is the probability of that topic.
### show_topic_keywords(topic_id, k=10)
Shows the keywords under a given topic; it can be used together with the topic-distribution inference API.
**Parameters**
- topic_id(int): topic ID.
- k(int): number of top keywords to return for the topic.
**Returns**
- results(dict): the top k keywords of the given topic and the probability of each keyword.
### Code Example
Below is a usage example of the APIs.
``` python
import paddlehub as hub
slda_news = hub.Module(name="slda_news")
topic_dist = slda_news.infer_doc_topic_distribution("百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。")
# {378: 0.5, 804: 0.5}
keywords = slda_news.show_topic_keywords(topic_id=804, k=10)
# {'百度': 0.08269021676897842,
# '搜索': 0.04154762385123992,
# '推广': 0.026193527138926424,
# '贴吧': 0.02125616298078334,
# '排名': 0.019595252609963018,
# '关键词': 0.015173719446828477,
# '广告': 0.013552941381750894,
# '搜索引擎': 0.010038529194616577,
# '公司': 0.009388342219512786,
# '网站': 0.009173721627932065}
```
## View the Code
https://github.com/baidu/Familia
## Dependencies
paddlepaddle >= 1.8.2
paddlehub >= 1.8.0
## Release History
* 1.0.0
  First release
"""
This file defines the basic config information of LDA/SLDA model.
"""
class ModelType:
LDA = 0
SLDA = 1
class ModelConfig:
type = None
num_topics = None
alpha = None
beta = None
word_topic_file = None
vocab_file = None
import numpy as np
class Topic(object):
"""Basic data structure of topic, contains topic id and
corresponding probability.
"""
def __init__(self, tid, prob):
self.tid = tid # topic id
self.prob = prob # topic probability
class Token(object):
"""Basic storage unit of LDA documents, contains word id
and corresponding topic.
"""
def __init__(self, topic, id):
self.topic = topic
self.id = id
class Sentence(object):
"""Basic storage unit of SentenceLDA documents, contains word ids
of the sentence and its corresponding topic id.
"""
def __init__(self, topic, tokens):
self.topic = topic
self.tokens = tokens
class LDADoc(object):
"""The storage structure of LDA model's inference result.
"""
def __init__(self):
self._num_topics = None # Number of topics.
self._num_accum = None # Number of accumulated sample rounds.
self._alpha = None # Document prior parameter.
self._tokens = None # Storage structure of inference results.
self._topic_sum = None # Document's topic sum in one round samples.
self._accum_topic_sum = None # Accumulated results of topic sum.
def init(self, num_topics):
"""Initialize the LDADoc according to num_topics.
"""
self._num_topics = num_topics
self._num_accum = 0
self._tokens = []
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_token(self, token):
"""Add new word to current LDADoc.
Arg:
token: Token class object.
"""
assert token.topic >= 0, "Topic %d out of range!" % token.topic
assert token.topic < self._num_topics, "Topic %d out of range!" % token.topic
self._tokens.append(token)
self._topic_sum[token.topic] += 1
def token(self, index):
return self._tokens[index]
def set_topic(self, index, new_topic):
"""Set the index word's topic to new_topic, and update the corresponding
topic distribution.
"""
assert new_topic >= 0, "Topic %d out of range!" % new_topic
assert new_topic < self._num_topics, "Topic %d out of range!" % new_topic
old_topic = self._tokens[index].topic
if new_topic == old_topic:
return
self._tokens[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def set_alpha(self, alpha):
self._alpha = alpha
def size(self):
"""Return number of words in LDADoc.
"""
return len(self._tokens)
def topic_sum(self, topic_id):
return self._topic_sum[topic_id]
def sparse_topic_dist(self, sort=True):
"""Return the topic distribution of documents in sparse format.
By default, it is sorted according to the topic probability
under the descending order.
"""
topic_dist = []
sum_ = np.sum(self._accum_topic_sum)
        if sum_ == 0:
            return topic_dist
for i in range(0, self._num_topics):
if self._accum_topic_sum[i] == 0:
continue
topic_dist.append(Topic(i, self._accum_topic_sum[i] * 1.0 / sum_))
if sort:
def take_elem(topic):
return topic.prob
topic_dist.sort(key=take_elem, reverse=True)
if topic_dist is None:
topic_dist = []
return topic_dist
def dense_topic_dist(self):
"""Return the distribution of document topics in dense format,
taking into account the prior parameter alpha.
"""
dense_dist = np.zeros(self._num_topics)
if self.size() == 0:
return dense_dist
dense_dist = (
self._accum_topic_sum * 1.0 / self._num_accum + self._alpha) / (
self.size() + self._alpha * self._num_topics)
return dense_dist
def accumulate_topic_num(self):
self._accum_topic_sum += self._topic_sum
self._num_accum += 1
class SLDADoc(LDADoc):
"""Sentence LDA Document, inherited from LDADoc.
Add add_sentence interface.
"""
def __init__(self):
super().__init__()
self.__sentences = None
def init(self, num_topics):
"""Initialize the SLDADoc according to num_topics.
"""
self._num_topics = num_topics
self.__sentences = []
self._num_accum = 0
self._topic_sum = np.zeros(self._num_topics)
self._accum_topic_sum = np.zeros(self._num_topics)
def add_sentence(self, sent):
"""Add new sentence to current SLDADoc.
Arg:
sent: Sentence class object.
"""
assert sent.topic >= 0, "Topic %d out of range!" % (sent.topic)
assert sent.topic < self._num_topics, "Topic %d out of range!" % (
sent.topic)
self.__sentences.append(sent)
self._topic_sum[sent.topic] += 1
def set_topic(self, index, new_topic):
assert new_topic >= 0, "Topic %d out of range!" % (new_topic)
assert new_topic < self._num_topics, "Topic %d out of range!" % (
new_topic)
old_topic = self.__sentences[index].topic
if new_topic == old_topic:
return
self.__sentences[index].topic = new_topic
self._topic_sum[old_topic] -= 1
self._topic_sum[new_topic] += 1
def size(self):
"""Return number of sentences in SLDADoc.
"""
return len(self.__sentences)
def sent(self, index):
return self.__sentences[index]
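The document classes above are plain containers with explicit bookkeeping. Here is a minimal sketch of how an LDADoc accumulates topic counts across sampling rounds, using only the classes defined above; the word ids, topic ids, and alpha value are illustrative.

``` python
from slda_news.document import LDADoc, Token

doc = LDADoc()
doc.init(num_topics=5)
doc.set_alpha(0.1)
doc.add_token(Token(topic=2, id=7))    # word 7 initially assigned to topic 2
doc.add_token(Token(topic=4, id=11))   # word 11 initially assigned to topic 4
doc.accumulate_topic_num()             # fold one sampling round into the accumulator
print([(t.tid, round(t.prob, 2)) for t in doc.sparse_topic_dist()])  # [(2, 0.5), (4, 0.5)]
print(doc.dense_topic_dist())          # dense distribution smoothed by alpha
```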
import os
from paddlehub.common.logger import logger
from slda_news.config import ModelConfig
from slda_news.util import load_prototxt, fix_random_seed, rand_k
from slda_news.model import TopicModel
from slda_news.sampler import GibbsSampler, MHSampler
from slda_news.document import LDADoc, SLDADoc, Token, Sentence
from slda_news.vocab import OOV
class SamplerType:
GibbsSampling = 0
MetropolisHastings = 1
class InferenceEngine(object):
def __init__(self,
model_dir,
conf_file,
type=SamplerType.MetropolisHastings):
# Read model configuration.
config = ModelConfig()
conf_file_path = os.path.join(model_dir, conf_file)
load_prototxt(conf_file_path, config)
self.__model = TopicModel(model_dir, config)
self.__config = config
# Initialize the sampler according to the configuration.
if type == SamplerType.GibbsSampling:
self.__sampler = GibbsSampler(self.__model)
elif type == SamplerType.MetropolisHastings:
self.__sampler = MHSampler(self.__model)
def infer(self, input, doc):
"""Perform LDA topic inference on input, and store the results in doc.
Args:
input: a list of strings after tokenization.
doc: LDADoc type or SLDADoc type.
"""
fix_random_seed()
if isinstance(doc, LDADoc) and not isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for token in input:
id_ = self.__model.term_id(token)
if id_ != OOV:
init_topic = rand_k(self.__model.num_topics())
doc.add_token(Token(init_topic, id_))
self.lda_infer(doc, 20, 50)
elif isinstance(doc, SLDADoc):
doc.init(self.__model.num_topics())
doc.set_alpha(self.__model.alpha())
for sent in input:
words = []
for token in sent:
id_ = self.__model.term_id(token)
if id_ != OOV:
words.append(id_)
init_topic = rand_k(self.__model.num_topics())
doc.add_sentence(Sentence(init_topic, words))
self.slda_infer(doc, 20, 50)
else:
logger.error("Wrong Doc Type!")
def lda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def slda_infer(self, doc, burn_in_iter, total_iter):
assert burn_in_iter >= 0
assert total_iter > 0
assert total_iter > burn_in_iter
for iter_ in range(total_iter):
self.__sampler.sample_doc(doc)
if iter_ >= burn_in_iter:
doc.accumulate_topic_num()
def model_type(self):
return self.__model.type()
def get_model(self):
return self.__model
def get_config(self):
return self.__config
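A minimal sketch of driving the InferenceEngine directly on pre-tokenized input follows. The model directory, config file name, and tokens are illustrative; for an SLDA model the input is a list of sentences, each a list of tokens, as the infer docstring describes.

``` python
from slda_news.document import SLDADoc
from slda_news.inference_engine import InferenceEngine

engine = InferenceEngine(model_dir="news", conf_file="slda.conf")
doc = SLDADoc()
# Two pseudo-sentences of tokens; out-of-vocabulary tokens are silently skipped.
engine.infer([["百度", "搜索"], ["新闻", "资讯"]], doc)  # 50 iterations, accumulated after 20 burn-in rounds
for topic in doc.sparse_topic_dist():
    print(topic.tid, topic.prob)
```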
import os
from collections import OrderedDict
import numpy as np
from tqdm import tqdm
from paddlehub.common.logger import logger
from slda_news.vocab import Vocab, WordCount
class TopicModel(object):
"""Storage Structure of Topic model, including vocabulary and word topic count.
"""
def __init__(self, model_dir, config):
"""
Args:
model_dir: the path of model directory
config: ModelConfig class.
"""
self.__word_topic = None # Model parameter of word topic.
self.__vocab = Vocab() # Vocab data structure of model.
self.__num_topics = config.num_topics # Number of topics.
self.__alpha = config.alpha
self.__alpha_sum = self.__alpha * self.__num_topics
self.__beta = config.beta
self.__beta_sum = None
self.__type = config.type # Model type.
self.__topic_sum = np.zeros(
self.__num_topics,
dtype="int64") # Accum sum of each topic in word topic.
self.__topic_words = [[] for _ in range(self.__num_topics)]
word_topic_path = os.path.join(model_dir, config.word_topic_file)
vocab_path = os.path.join(model_dir, config.vocab_file)
self.load_model(word_topic_path, vocab_path)
def term_id(self, term):
return self.__vocab.get_id(term)
def load_model(self, word_topic_path, vocab_path):
# Loading vocabulary
self.__vocab.load(vocab_path)
self.__beta_sum = self.__beta * self.__vocab.size()
        self.__word_topic = [{} for _ in range(self.__vocab.size())] # List of per-word topic-count dicts.
self.__load_word_dict(word_topic_path)
logger.info(
"Model Info: #num_topics=%d #vocab_size=%d alpha=%f beta=%f" %
(self.num_topics(), self.vocab_size(), self.alpha(), self.beta()))
def word_topic_value(self, word_id, topic_id):
"""Return value of specific word under specific topic in the model.
"""
word_dict = self.__word_topic[word_id]
if topic_id not in word_dict:
return 0
return word_dict[topic_id]
def word_topic(self, term_id):
"""Return the topic distribution of a word.
"""
return self.__word_topic[term_id]
def topic_sum_value(self, topic_id):
return self.__topic_sum[topic_id]
def topic_sum(self):
return self.__topic_sum
def num_topics(self):
return self.__num_topics
def vocab_size(self):
return self.__vocab.size()
def alpha(self):
return self.__alpha
def alpha_sum(self):
return self.__alpha_sum
def beta(self):
return self.__beta
def beta_sum(self):
return self.__beta_sum
def type(self):
return self.__type
def __load_word_dict(self, word_dict_path):
"""Load the word topic parameters.
"""
logger.info("Loading word topic.")
with open(word_dict_path, 'r') as f:
for line in tqdm(f.readlines()):
fields = line.strip().split(" ")
assert len(fields) > 0, "Model file format error!"
term_id = int(fields[0])
assert term_id < self.vocab_size(), "Term id out of range!"
assert term_id >= 0, "Term id out of range!"
for i in range(1, len(fields)):
topic_count = fields[i].split(":")
assert len(topic_count) == 2, "Topic count format error!"
topic_id = int(topic_count[0])
assert topic_id >= 0, "Topic out of range!"
assert topic_id < self.__num_topics, "Topic out of range!"
count = int(topic_count[1])
assert count >= 0, "Topic count error!"
self.__word_topic[term_id][topic_id] = count
self.__topic_sum[topic_id] += count
self.__topic_words[topic_id].append(
WordCount(term_id, count))
new_dict = OrderedDict()
for key in sorted(self.__word_topic[term_id]):
new_dict[key] = self.__word_topic[term_id][key]
self.__word_topic[term_id] = new_dict
def get_vocab(self):
return self.__vocab.vocabulary()
def topic_words(self):
return self.__topic_words
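__load_word_dict fixes the on-disk format of the word-topic file: one term per line, followed by space-separated topic_id:count pairs. A minimal sketch of that parsing step on an illustrative line (the ids and counts are made up):

``` python
line = "7 12:3 95:1"    # term 7 occurs 3 times under topic 12 and once under topic 95
fields = line.strip().split(" ")
term_id = int(fields[0])
topic_counts = {int(t): int(c) for t, c in (pair.split(":") for pair in fields[1:])}
# topic_counts == {12: 3, 95: 1}; these counts back word_topic_value() and are
# summed per topic to give the denominators used when ranking topic keywords.
```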
import os
import paddlehub as hub
from paddlehub.module.module import moduleinfo
from paddlehub.common.logger import logger
from slda_news.inference_engine import InferenceEngine
from slda_news.document import SLDADoc
from slda_news.semantic_matching import SemanticMatching, WordAndDis
from slda_news.tokenizer import LACTokenizer, SimpleTokenizer
from slda_news.config import ModelType
from slda_news.vocab import Vocab, WordCount
@moduleinfo(
name="slda_news",
version="1.0.0",
summary=
"This is a PaddleHub Module for SLDA topic model in news dataset, where we can infer the topic distribution of document.",
author="DesmonDay",
author_email="",
type="nlp/semantic_model")
class TopicModel(hub.Module):
def _initialize(self):
"""Initialize with the necessary elements.
"""
self.model_dir = os.path.join(self.directory, 'news')
self.conf_file = 'slda.conf'
self.__engine = InferenceEngine(self.model_dir, self.conf_file)
self.vocab_path = os.path.join(self.model_dir, 'vocab_info.txt')
lac = hub.Module(name="lac")
# self.__tokenizer = SimpleTokenizer(self.vocab_path)
self.__tokenizer = LACTokenizer(self.vocab_path, lac)
self.vocabulary = self.__engine.get_model().get_vocab()
self.config = self.__engine.get_config()
self.topic_words = self.__engine.get_model().topic_words()
self.topic_sum_table = self.__engine.get_model().topic_sum()
def take_elem(word_count):
return word_count.count
for i in range(self.config.num_topics):
self.topic_words[i].sort(key=take_elem, reverse=True)
logger.info("Finish Initialization.")
def infer_doc_topic_distribution(self, document):
"""
This interface infers the topic distribution of document.
Args:
document(str): the input document text.
Returns:
results(list): returns the topic distribution of document.
"""
tokens = self.__tokenizer.tokenize(document)
if tokens == []:
return []
results = []
sentences = []
sent = []
for i in range(len(tokens)):
sent.append(tokens[i])
if len(sent) % 5 == 0:
sentences.append(sent)
sent = []
if len(sent) > 0:
sentences.append(sent)
doc = SLDADoc()
self.__engine.infer(sentences, doc)
topics = doc.sparse_topic_dist()
for topic in topics:
results.append({"topic id": topic.tid, "distribution": topic.prob})
return results
def show_topic_keywords(self, topic_id, k=10):
"""
        This interface returns the top k keywords under a specific topic.
        Args:
            topic_id(int): id of the topic to inspect.
k(int): top k keywords.
Returns:
results(dict): contains specific topic's keywords and corresponding
probability.
"""
EPS = 1e-8
results = {}
if 0 <= topic_id < self.config.num_topics:
k = min(k, len(self.topic_words[topic_id]))
for i in range(k):
prob = self.topic_words[topic_id][i].count / \
(self.topic_sum_table[topic_id] + EPS)
results[self.vocabulary[self.topic_words[topic_id]
[i].word_id]] = prob
return results
        else:
            logger.error("%d is out of range!" % topic_id)
            return results
import os
import numpy as np
from paddlehub.common.logger import logger
from slda_news.vocab import OOV
EPS = 1e-06
class WordAndDis(object):
def __init__(self):
self.word = None
self.distance = None
class SemanticMatching(object):
def __init__(self):
pass
def l2_norm(self, vec):
"""Calculate the length of vector.
"""
result = np.sqrt(np.sum(vec**2))
return result
def cosine_similarity(self, vec1, vec2):
norm1 = self.l2_norm(vec1)
norm2 = self.l2_norm(vec2)
result = np.sum(vec1 * vec2) / norm1 / norm2
return result
def likelihood_based_similarity(self, terms, doc_topic_dist, model):
"""
Args:
terms: list of strings
doc_topic_dist: list of Topic class
model: TopicModel class
"""
num_of_term_in_vocab = 0
result = 0
for i in range(len(terms)):
term_id = model.term_id(terms[i])
if term_id == OOV:
continue
num_of_term_in_vocab += 1
for j in range(len(doc_topic_dist)):
topic_id = doc_topic_dist[j].tid
prob = doc_topic_dist[j].prob
result += model.word_topic_value(term_id, topic_id) * 1.0 / \
model.topic_sum_value(topic_id) * prob
if num_of_term_in_vocab == 0:
return result
return result / num_of_term_in_vocab
def kullback_leibler_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist2[dist2 < EPS] = EPS
result = np.sum(dist1 * np.log(dist1 / dist2))
return result
def jensen_shannon_divergence(self, dist1, dist2):
assert dist1.shape == dist2.shape
dist1[dist1 < EPS] = EPS
dist2[dist2 < EPS] = EPS
mean = (dist1 + dist2) * 0.5
jsd = self.kullback_leibler_divergence(dist1, mean) * 0.5 + \
self.kullback_leibler_divergence(dist2, mean) * 0.5
return jsd
def hellinger_distance(self, dist1, dist2):
assert dist1.shape == dist2.shape
result = np.sum((np.sqrt(dist1) - np.sqrt(dist2))**2)
result = np.sqrt(result) * 0.7071067812
return result
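A minimal sketch of the distance helpers above on two toy distributions. Note that jensen_shannon_divergence clips its inputs in place, so copies are passed here to leave the originals untouched.

``` python
import numpy as np

from slda_news.semantic_matching import SemanticMatching

sm = SemanticMatching()
p = np.array([0.7, 0.2, 0.1])
q = np.array([0.5, 0.3, 0.2])
print(sm.cosine_similarity(p, q))
print(sm.jensen_shannon_divergence(p.copy(), q.copy()))  # symmetric, at most ln(2)
print(sm.hellinger_distance(p, q))                       # bounded by 1
```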
import time
import yaml
import numpy as np
from paddlehub.common.logger import logger
from slda_news.config import ModelType
def load_prototxt(config_file, config):
"""
Args:
config_file: model configuration file.
config: ModelConfig class
"""
logger.info("Loading SLDA config.")
with open(config_file, 'r') as f:
yaml_dict = yaml.load(f, Loader=yaml.FullLoader)
# Assignment.
if yaml_dict["type"] == "LDA":
config.type = ModelType.LDA
else:
config.type = ModelType.SLDA
config.num_topics = yaml_dict["num_topics"]
config.alpha = yaml_dict["alpha"]
config.beta = yaml_dict["beta"]
config.word_topic_file = yaml_dict["word_topic_file"]
config.vocab_file = yaml_dict["vocab_file"]
def fix_random_seed(seed=2147483647):
np.random.seed(seed)
def rand(min_=0, max_=1):
return np.random.uniform(low=min_, high=max_)
def rand_k(k):
"""Returns an integer float number between [0, k - 1].
"""
return int(rand() * k)
def timeit(f):
"""Return time cost of function f.
"""
def timed(*args, **kwargs):
start_time = time.time()
result = f(*args, **kwargs)
end_time = time.time()
print(" [-] %s : %2.5f sec" % (f.__name__, end_time - start_time))
return result
return timed
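Despite its name, load_prototxt reads the config file with yaml.load, so the config is plain YAML. A minimal sketch of a file it could parse; the field names match the keys accessed above, while the values are illustrative only.

``` python
import yaml

example_conf = """
type: "SLDA"
num_topics: 2000
alpha: 0.05
beta: 0.01
word_topic_file: "word_topic.model"
vocab_file: "vocab_info.txt"
"""
# Parsed exactly as load_prototxt does before copying fields into ModelConfig.
print(yaml.load(example_conf, Loader=yaml.FullLoader))
```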
"""
This file defines the basic config information of LDA/SLDA model.
"""
class ModelType:
LDA = 0
SLDA = 1
class ModelConfig:
type = None
num_topics = None
alpha = None
beta = None
word_topic_file = None
vocab_file = None