From 311c49d99e6f13f8bc5d4bc6691e63543320bf2e Mon Sep 17 00:00:00 2001
From: SiMing Dai <908660116@qq.com>
Date: Wed, 29 Jul 2020 20:12:01 +0800
Subject: [PATCH] Fix topic model (#784)

* fix bug

* fix lac gpu setting

* refine README

* fix README
---
 .../text/semantic_model/lda_news/README.md    | 19 ++++++++++++
 .../text/semantic_model/lda_news/tokenizer.py |  2 +-
 .../text/semantic_model/lda_novel/README.md   | 31 +++++++++++++++++++
 .../semantic_model/lda_novel/tokenizer.py     |  2 +-
 .../text/semantic_model/lda_webpage/README.md | 21 +++++++++++++
 .../semantic_model/lda_webpage/tokenizer.py   |  2 +-
 .../semantic_model/slda_news/tokenizer.py     |  2 +-
 .../text/semantic_model/slda_novel/README.md  | 26 ++++++++++++++++
 .../semantic_model/slda_novel/tokenizer.py    |  2 +-
 .../semantic_model/slda_webpage/README.md     | 11 +++++++
 .../semantic_model/slda_webpage/tokenizer.py  |  2 +-
 .../text/semantic_model/slda_weibo/README.md  | 25 +++++++++++++++
 .../semantic_model/slda_weibo/tokenizer.py    |  2 +-
 13 files changed, 140 insertions(+), 7 deletions(-)

diff --git a/hub_module/modules/text/semantic_model/lda_news/README.md b/hub_module/modules/text/semantic_model/lda_news/README.md
index 90b20da2..9e3078d6 100644
--- a/hub_module/modules/text/semantic_model/lda_news/README.md
+++ b/hub_module/modules/text/semantic_model/lda_news/README.md
@@ -100,6 +100,25 @@ results = lda_news.cal_doc_keywords_similarity('百度是全球最大的中文
 #  {'word': '中文', 'similarity': 0.020187103312009513},
 #  {'word': '搜索引擎', 'similarity': 0.007092890537169911}]
 
+results = lda_news.infer_doc_topic_distribution("最近有学者新出了一篇论文，关于自然语言处理的，可厉害了")
+# [{'topic id': 216, 'distribution': 0.5222222222222223},
+#  {'topic id': 1789, 'distribution': 0.18888888888888888},
+#  {'topic id': 98, 'distribution': 0.1111111111111111},
+#  {'topic id': 805, 'distribution': 0.044444444444444446},
+#  {'topic id': 56, 'distribution': 0.03333333333333333}, ...]
+
+keywords = lda_news.show_topic_keywords(topic_id=216)
+# {'研究': 0.1753955534055716,
+#  '学术': 0.13158917246453747,
+#  '论文': 0.1178632702247961,
+#  '课题': 0.057840811145163484,
+#  '发表': 0.05614630212471184,
+#  '成果': 0.03587086607950555,
+#  '期刊': 0.030608728068521086,
+#  '科研': 0.0216061375112729,
+#  '学者': 0.017739360125774,
+#  '科学': 0.015553720885167896}
+
 ```
 
 ## 查看代码
diff --git a/hub_module/modules/text/semantic_model/lda_news/tokenizer.py b/hub_module/modules/text/semantic_model/lda_news/tokenizer.py
index 419fbab7..e9aaa06f 100644
--- a/hub_module/modules/text/semantic_model/lda_news/tokenizer.py
+++ b/hub_module/modules/text/semantic_model/lda_news/tokenizer.py
@@ -109,7 +109,7 @@ class LACTokenizer(Tokenizer):
 
     def tokenize(self, text):
         results = self.__lac.lexical_analysis(
-            texts=[text], use_gpu=True, batch_size=1, return_tag=True)
+            texts=[text], use_gpu=False, batch_size=1, return_tag=True)
         # Change English words to lower case.
         # And just preserve the word in vocab.
         words = results[0]["word"]
diff --git a/hub_module/modules/text/semantic_model/lda_novel/README.md b/hub_module/modules/text/semantic_model/lda_novel/README.md
index 698e51c0..5f330594 100644
--- a/hub_module/modules/text/semantic_model/lda_novel/README.md
+++ b/hub_module/modules/text/semantic_model/lda_novel/README.md
@@ -88,6 +88,37 @@ jsd, hd = lda_novel.cal_doc_distance(doc_text1="老人幸福地看着自己的
 lda_sim = lda_novel.cal_query_doc_similarity(query='亲孙女', document='老人激动地打量着面前的女孩，似乎找到了自己的亲孙女一般，双手止不住地颤抖着。')
 # LDA similarity = 0.0
 
+results = lda_novel.cal_doc_keywords_similarity('百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息，找到所求。百度超过千亿的中文网页数据库，可以瞬间找到相关的搜索结果。')
+# [{'word': '信息', 'similarity': 0.014140977159719738},
+#  {'word': '找到', 'similarity': 0.012251022010382823},
+#  {'word': '搜索', 'similarity': 0.004262275169349261},
+#  {'word': '网页', 'similarity': 0.0026937499565468327},
+#  {'word': '百度', 'similarity': 0.0021199508577209015},
+#  {'word': '全球', 'similarity': 0.0010464078137351785},
+#  {'word': '中文', 'similarity': 0.0009866259107630141},
+#  {'word': '瞬间', 'similarity': 0.0009262589016537221},
+#  {'word': '超过', 'similarity': 0.0008362863020592123},
+#  {'word': '相关', 'similarity': 0.000793663877590302}]
+
+results = lda_novel.infer_doc_topic_distribution("妈妈告诉女儿，今天爸爸过生日，放学后要早点回家一起庆祝")
+# [{'topic id': 0, 'distribution': 0.7166666666666667},
+#  {'topic id': 64, 'distribution': 0.11666666666666667},
+#  {'topic id': 125, 'distribution': 0.020833333333333332},
+#  {'topic id': 131, 'distribution': 0.016666666666666666},
+#  {'topic id': 137, 'distribution': 0.016666666666666666}, ...]
+
+keywords = lda_novel.show_topic_keywords(topic_id=0)
+# {'妈妈': 0.36114392028319225,
+#  '爸爸': 0.18456064543161096,
+#  '女儿': 0.03591842787260316,
+#  '孩子': 0.01567368390197123,
+#  '家里': 0.014277018999815379,
+#  '回家': 0.013514888275429099,
+#  '回来': 0.013275213681108526,
+#  '爸妈': 0.007931677222119656,
+#  '告诉': 0.006841933742906693,
+#  '父母': 0.00627464639375944}
+
 ```
 
 ## 查看代码
diff --git a/hub_module/modules/text/semantic_model/lda_novel/tokenizer.py b/hub_module/modules/text/semantic_model/lda_novel/tokenizer.py
index b0a097b6..562eaa63 100644
--- a/hub_module/modules/text/semantic_model/lda_novel/tokenizer.py
+++ b/hub_module/modules/text/semantic_model/lda_novel/tokenizer.py
@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
 
     def tokenize(self, text):
         results = self.__lac.lexical_analysis(
-            texts=[text], use_gpu=True, batch_size=1, return_tag=True)
+            texts=[text], use_gpu=False, batch_size=1, return_tag=True)
         # Change English words to lower case.
         # And just preserve the word in vocab.
         words = results[0]["word"]
diff --git a/hub_module/modules/text/semantic_model/lda_webpage/README.md b/hub_module/modules/text/semantic_model/lda_webpage/README.md
index 527e1b96..a859876d 100644
--- a/hub_module/modules/text/semantic_model/lda_webpage/README.md
+++ b/hub_module/modules/text/semantic_model/lda_webpage/README.md
@@ -95,6 +95,27 @@ results = lda_webpage.cal_doc_keywords_similarity('百度首页推荐着各种
 #   {'word': '功能', 'similarity': 0.011409342579361237},
 #   {'word': '搜索引擎', 'similarity': 0.010392479335778413}]
 
+out = lda_webpage.cal_query_doc_similarity(query='百度搜索引擎', document='百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息，找到所求。百度超过千亿的中文网页数据库，可以瞬间找到相关的搜索结果。')
+# out = 0.0283
+
+results = lda_webpage.infer_doc_topic_distribution("百度文库非常的好用，我们不仅在里面找到需要的文档，同时可以通过续费畅读精品文档。")
+# [{'topic id': 3458, 'distribution': 0.5277777777777778},
+#  {'topic id': 1927, 'distribution': 0.17777777777777778},
+#  {'topic id': 1497, 'distribution': 0.05},
+#  {'topic id': 1901, 'distribution': 0.03333333333333333}, ...]
+
+keywords = lda_webpage.show_topic_keywords(topic_id=3458)
+# {'price': 0.10977647395316775,
+#  '文档': 0.06445075002937038,
+#  '财富值': 0.04012675135746289,
+#  '文库': 0.03953267826572788,
+#  'len': 0.038856163693739426,
+#  'tag': 0.03868762622172197,
+#  'current': 0.03728225157463761,
+#  'cut': 0.03448665775467454,
+#  '尺寸': 0.03250387028891812,
+#  '财富': 0.02902896727051734}
+
 ```
 
 ## 查看代码
diff --git a/hub_module/modules/text/semantic_model/lda_webpage/tokenizer.py b/hub_module/modules/text/semantic_model/lda_webpage/tokenizer.py
index b0a097b6..562eaa63 100644
--- a/hub_module/modules/text/semantic_model/lda_webpage/tokenizer.py
+++ b/hub_module/modules/text/semantic_model/lda_webpage/tokenizer.py
@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
 
     def tokenize(self, text):
         results = self.__lac.lexical_analysis(
-            texts=[text], use_gpu=True, batch_size=1, return_tag=True)
+            texts=[text], use_gpu=False, batch_size=1, return_tag=True)
         # Change English words to lower case.
         # And just preserve the word in vocab.
         words = results[0]["word"]
diff --git a/hub_module/modules/text/semantic_model/slda_news/tokenizer.py b/hub_module/modules/text/semantic_model/slda_news/tokenizer.py
index b0a097b6..562eaa63 100644
--- a/hub_module/modules/text/semantic_model/slda_news/tokenizer.py
+++ b/hub_module/modules/text/semantic_model/slda_news/tokenizer.py
@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
 
     def tokenize(self, text):
         results = self.__lac.lexical_analysis(
-            texts=[text], use_gpu=True, batch_size=1, return_tag=True)
+            texts=[text], use_gpu=False, batch_size=1, return_tag=True)
         # Change English words to lower case.
         # And just preserve the word in vocab.
         words = results[0]["word"]
diff --git a/hub_module/modules/text/semantic_model/slda_novel/README.md b/hub_module/modules/text/semantic_model/slda_novel/README.md
index c98ee04f..a08ae571 100644
--- a/hub_module/modules/text/semantic_model/slda_novel/README.md
+++ b/hub_module/modules/text/semantic_model/slda_novel/README.md
@@ -37,6 +37,32 @@
 
 - results(dict): 返回对应文档的前k个关键词，以及各个关键词在文档中的出现概率。
 
+### 代码示例
+
+这里展示部分API的使用示例。
+
+``` python
+import paddlehub as hub
+
+slda_novel = hub.Module(name="slda_novel")
+
+topic_dist = slda_novel.infer_doc_topic_distribution("妈妈告诉女儿，今天爸爸过生日，放学后要早点回家一起庆祝")
+# [{'topic id': 222, 'distribution': 0.5}, {'topic id': 362, 'distribution': 0.5}]
+
+keywords = slda_novel.show_topic_keywords(topic_id=222)
+# {'回来': 0.044502306717752,
+#  '回去': 0.036457065533017245,
+#  '回家': 0.029136327306669554,
+#  '明天': 0.028762575780517493,
+#  '休息': 0.022904260192395567,
+#  '晚上': 0.021970839714261954,
+#  '时间': 0.020756626422891028,
+#  '好好': 0.019726413882856498,
+#  '电话': 0.017195445214734463,
+#  '吃饭': 0.01521839547511471}
+
+```
+
 ## 查看代码
 https://github.com/baidu/Familia
 
diff --git a/hub_module/modules/text/semantic_model/slda_novel/tokenizer.py b/hub_module/modules/text/semantic_model/slda_novel/tokenizer.py
index b0a097b6..562eaa63 100644
--- a/hub_module/modules/text/semantic_model/slda_novel/tokenizer.py
+++ b/hub_module/modules/text/semantic_model/slda_novel/tokenizer.py
@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
 
     def tokenize(self, text):
         results = self.__lac.lexical_analysis(
-            texts=[text], use_gpu=True, batch_size=1, return_tag=True)
+            texts=[text], use_gpu=False, batch_size=1, return_tag=True)
         # Change English words to lower case.
         # And just preserve the word in vocab.
         words = results[0]["word"]
diff --git a/hub_module/modules/text/semantic_model/slda_webpage/README.md b/hub_module/modules/text/semantic_model/slda_webpage/README.md
index 3f9bd197..8ab7891e 100644
--- a/hub_module/modules/text/semantic_model/slda_webpage/README.md
+++ b/hub_module/modules/text/semantic_model/slda_webpage/README.md
@@ -54,6 +54,17 @@ topic_dist = slda_webpage.infer_doc_topic_distribution("百度是全球最大的
 #  {'topic id': 4410, 'distribution': 0.016666666666666666},
 #  {'topic id': 4676, 'distribution': 0.016666666666666666}]
 
+keywords = slda_webpage.show_topic_keywords(topic_id=4687)
+# {'市场': 0.07413332566788851,
+#  '增长': 0.045259383167567974,
+#  '规模': 0.030225253512468797,
+#  '用户': 0.02278765317990645,
+#  '超过': 0.019395970334729278,
+#  '份额': 0.019091932266952005,
+#  '全球': 0.018879934814238216,
+#  '手机': 0.01252139322404175,
+#  '美元': 0.01202885155424257,
+#  '收入': 0.011096560279140084}
 
 ```
 
diff --git a/hub_module/modules/text/semantic_model/slda_webpage/tokenizer.py b/hub_module/modules/text/semantic_model/slda_webpage/tokenizer.py
index b0a097b6..562eaa63 100644
--- a/hub_module/modules/text/semantic_model/slda_webpage/tokenizer.py
+++ b/hub_module/modules/text/semantic_model/slda_webpage/tokenizer.py
@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
 
     def tokenize(self, text):
         results = self.__lac.lexical_analysis(
-            texts=[text], use_gpu=True, batch_size=1, return_tag=True)
+            texts=[text], use_gpu=False, batch_size=1, return_tag=True)
         # Change English words to lower case.
         # And just preserve the word in vocab.
         words = results[0]["word"]
diff --git a/hub_module/modules/text/semantic_model/slda_weibo/README.md b/hub_module/modules/text/semantic_model/slda_weibo/README.md
index 707a7446..edd7f737 100644
--- a/hub_module/modules/text/semantic_model/slda_weibo/README.md
+++ b/hub_module/modules/text/semantic_model/slda_weibo/README.md
@@ -37,6 +37,31 @@
 
 - results(dict): 返回对应文档的前k个关键词，以及各个关键词在文档中的出现概率。
 
+### 代码示例
+
+这里展示API的使用示例。
+
+``` python
+import paddlehub as hub
+
+slda_weibo = hub.Module(name="slda_weibo")
+
+topic_dist = slda_weibo.infer_doc_topic_distribution("百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息，找到所求。")
+# [{'topic id': 874, 'distribution': 0.5}, {'topic id': 1764, 'distribution': 0.5}]
+
+keywords = slda_weibo.show_topic_keywords(topic_id=874)
+# {'数据': 0.07850538018570305,
+#  '更新': 0.04504777051711974,
+#  '出口': 0.023363758946167185,
+#  '信息': 0.020567061200812687,
+#  '全国': 0.015975367546781145,
+#  '双十一': 0.014998636225687216,
+#  '地理': 0.013257422965959297,
+#  '官方': 0.012913598174463106,
+#  '支持': 0.01177359809763076,
+#  '说话': 0.011205999070328388}
+
+```
 ## 查看代码
 https://github.com/baidu/Familia
 
diff --git a/hub_module/modules/text/semantic_model/slda_weibo/tokenizer.py b/hub_module/modules/text/semantic_model/slda_weibo/tokenizer.py
index b0a097b6..562eaa63 100644
--- a/hub_module/modules/text/semantic_model/slda_weibo/tokenizer.py
+++ b/hub_module/modules/text/semantic_model/slda_weibo/tokenizer.py
@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
 
     def tokenize(self, text):
         results = self.__lac.lexical_analysis(
-            texts=[text], use_gpu=True, batch_size=1, return_tag=True)
+            texts=[text], use_gpu=False, batch_size=1, return_tag=True)
         # Change English words to lower case.
         # And just preserve the word in vocab.
         words = results[0]["word"]
-- 
GitLab