From 311c49d99e6f13f8bc5d4bc6691e63543320bf2e Mon Sep 17 00:00:00 2001 From: SiMing Dai <908660116@qq.com> Date: Wed, 29 Jul 2020 20:12:01 +0800 Subject: [PATCH] Fix topic model (#784) * fix bug * fix lac gpu setting * refine README * fix README --- .../text/semantic_model/lda_news/README.md | 19 ++++++++++++ .../text/semantic_model/lda_news/tokenizer.py | 2 +- .../text/semantic_model/lda_novel/README.md | 31 +++++++++++++++++++ .../semantic_model/lda_novel/tokenizer.py | 2 +- .../text/semantic_model/lda_webpage/README.md | 21 +++++++++++++ .../semantic_model/lda_webpage/tokenizer.py | 2 +- .../semantic_model/slda_news/tokenizer.py | 2 +- .../text/semantic_model/slda_novel/README.md | 26 ++++++++++++++++ .../semantic_model/slda_novel/tokenizer.py | 2 +- .../semantic_model/slda_webpage/README.md | 11 +++++++ .../semantic_model/slda_webpage/tokenizer.py | 2 +- .../text/semantic_model/slda_weibo/README.md | 25 +++++++++++++++ .../semantic_model/slda_weibo/tokenizer.py | 2 +- 13 files changed, 140 insertions(+), 7 deletions(-) diff --git a/hub_module/modules/text/semantic_model/lda_news/README.md b/hub_module/modules/text/semantic_model/lda_news/README.md index 90b20da2..9e3078d6 100644 --- a/hub_module/modules/text/semantic_model/lda_news/README.md +++ b/hub_module/modules/text/semantic_model/lda_news/README.md @@ -100,6 +100,25 @@ results = lda_news.cal_doc_keywords_similarity('百度是全球最大的中文 # {'word': '中文', 'similarity': 0.020187103312009513}, # {'word': '搜索引擎', 'similarity': 0.007092890537169911}] +results = lda_news.infer_doc_topic_distribution("最近有学者新出了一篇论文,关于自然语言处理的,可厉害了") +# [{'topic id': 216, 'distribution': 0.5222222222222223}, +# {'topic id': 1789, 'distribution': 0.18888888888888888}, +# {'topic id': 98, 'distribution': 0.1111111111111111}, +# {'topic id': 805, 'distribution': 0.044444444444444446}, +# {'topic id': 56, 'distribution': 0.03333333333333333}, ...] + +keywords = lda_news.show_topic_keywords(topic_id=216) +# {'研究': 0.1753955534055716, +# '学术': 0.13158917246453747, +# '论文': 0.1178632702247961, +# '课题': 0.057840811145163484, +# '发表': 0.05614630212471184, +# '成果': 0.03587086607950555, +# '期刊': 0.030608728068521086, +# '科研': 0.0216061375112729, +# '学者': 0.017739360125774, +# '科学': 0.015553720885167896} + ``` ## 查看代码 diff --git a/hub_module/modules/text/semantic_model/lda_news/tokenizer.py b/hub_module/modules/text/semantic_model/lda_news/tokenizer.py index 419fbab7..e9aaa06f 100644 --- a/hub_module/modules/text/semantic_model/lda_news/tokenizer.py +++ b/hub_module/modules/text/semantic_model/lda_news/tokenizer.py @@ -109,7 +109,7 @@ class LACTokenizer(Tokenizer): def tokenize(self, text): results = self.__lac.lexical_analysis( - texts=[text], use_gpu=True, batch_size=1, return_tag=True) + texts=[text], use_gpu=False, batch_size=1, return_tag=True) # Change English words to lower case. # And just preserve the word in vocab. words = results[0]["word"] diff --git a/hub_module/modules/text/semantic_model/lda_novel/README.md b/hub_module/modules/text/semantic_model/lda_novel/README.md index 698e51c0..5f330594 100644 --- a/hub_module/modules/text/semantic_model/lda_novel/README.md +++ b/hub_module/modules/text/semantic_model/lda_novel/README.md @@ -88,6 +88,37 @@ jsd, hd = lda_novel.cal_doc_distance(doc_text1="老人幸福地看着自己的 lda_sim = lda_novel.cal_query_doc_similarity(query='亲孙女', document='老人激动地打量着面前的女孩,似乎找到了自己的亲孙女一般,双手止不住地颤抖着。') # LDA similarity = 0.0 +results = lda_novel.cal_doc_keywords_similarity('百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。百度超过千亿的中文网页数据库,可以瞬间找到相关的搜索结果。') +# [{'word': '信息', 'similarity': 0.014140977159719738}, +# {'word': '找到', 'similarity': 0.012251022010382823}, +# {'word': '搜索', 'similarity': 0.004262275169349261}, +# {'word': '网页', 'similarity': 0.0026937499565468327}, +# {'word': '百度', 'similarity': 0.0021199508577209015}, +# {'word': '全球', 'similarity': 0.0010464078137351785}, +# {'word': '中文', 'similarity': 0.0009866259107630141}, +# {'word': '瞬间', 'similarity': 0.0009262589016537221}, +# {'word': '超过', 'similarity': 0.0008362863020592123}, +# {'word': '相关', 'similarity': 0.000793663877590302}] + +results = lda_novel.infer_doc_topic_distribution("妈妈告诉女儿,今天爸爸过生日,放学后要早点回家一起庆祝") +# [{'topic id': 0, 'distribution': 0.7166666666666667}, +# {'topic id': 64, 'distribution': 0.11666666666666667}, +# {'topic id': 125, 'distribution': 0.020833333333333332}, +# {'topic id': 131, 'distribution': 0.016666666666666666}, +# {'topic id': 137, 'distribution': 0.016666666666666666}, ...] + +keywords = lda_novel.show_topic_keywords(topic_id=0) +# {'妈妈': 0.36114392028319225, +# '爸爸': 0.18456064543161096, +# '女儿': 0.03591842787260316, +# '孩子': 0.01567368390197123, +# '家里': 0.014277018999815379, +# '回家': 0.013514888275429099, +# '回来': 0.013275213681108526, +# '爸妈': 0.007931677222119656, +# '告诉': 0.006841933742906693, +# '父母': 0.00627464639375944} + ``` ## 查看代码 diff --git a/hub_module/modules/text/semantic_model/lda_novel/tokenizer.py b/hub_module/modules/text/semantic_model/lda_novel/tokenizer.py index b0a097b6..562eaa63 100644 --- a/hub_module/modules/text/semantic_model/lda_novel/tokenizer.py +++ b/hub_module/modules/text/semantic_model/lda_novel/tokenizer.py @@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer): def tokenize(self, text): results = self.__lac.lexical_analysis( - texts=[text], use_gpu=True, batch_size=1, return_tag=True) + texts=[text], use_gpu=False, batch_size=1, return_tag=True) # Change English words to lower case. # And just preserve the word in vocab. words = results[0]["word"] diff --git a/hub_module/modules/text/semantic_model/lda_webpage/README.md b/hub_module/modules/text/semantic_model/lda_webpage/README.md index 527e1b96..a859876d 100644 --- a/hub_module/modules/text/semantic_model/lda_webpage/README.md +++ b/hub_module/modules/text/semantic_model/lda_webpage/README.md @@ -95,6 +95,27 @@ results = lda_webpage.cal_doc_keywords_similarity('百度首页推荐着各种 # {'word': '功能', 'similarity': 0.011409342579361237}, # {'word': '搜索引擎', 'similarity': 0.010392479335778413}] +out = lda_webpage.cal_query_doc_similarity(query='百度搜索引擎', document='百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。百度超过千亿的中文网页数据库,可以瞬间找到相关的搜索结果。') +# out = 0.0283 + +results = lda_webpage.infer_doc_topic_distribution("百度文库非常的好用,我们不仅在里面找到需要的文档,同时可以通过续费畅读精品文档。") +# [{'topic id': 3458, 'distribution': 0.5277777777777778}, +# {'topic id': 1927, 'distribution': 0.17777777777777778}, +# {'topic id': 1497, 'distribution': 0.05}, +# {'topic id': 1901, 'distribution': 0.03333333333333333}...] + +keywords = lda_webpage.show_topic_keywords(3458) +# {'price': 0.10977647395316775, +# '文档': 0.06445075002937038, +# '财富值': 0.04012675135746289, +# '文库': 0.03953267826572788, +# 'len': 0.038856163693739426, +# 'tag': 0.03868762622172197, +# 'current': 0.03728225157463761, +# 'cut': 0.03448665775467454, +# '尺寸': 0.03250387028891812, +# '财富': 0.02902896727051734} + ``` ## 查看代码 diff --git a/hub_module/modules/text/semantic_model/lda_webpage/tokenizer.py b/hub_module/modules/text/semantic_model/lda_webpage/tokenizer.py index b0a097b6..562eaa63 100644 --- a/hub_module/modules/text/semantic_model/lda_webpage/tokenizer.py +++ b/hub_module/modules/text/semantic_model/lda_webpage/tokenizer.py @@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer): def tokenize(self, text): results = self.__lac.lexical_analysis( - texts=[text], use_gpu=True, batch_size=1, return_tag=True) + texts=[text], use_gpu=False, batch_size=1, return_tag=True) # Change English words to lower case. # And just preserve the word in vocab. words = results[0]["word"] diff --git a/hub_module/modules/text/semantic_model/slda_news/tokenizer.py b/hub_module/modules/text/semantic_model/slda_news/tokenizer.py index b0a097b6..562eaa63 100644 --- a/hub_module/modules/text/semantic_model/slda_news/tokenizer.py +++ b/hub_module/modules/text/semantic_model/slda_news/tokenizer.py @@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer): def tokenize(self, text): results = self.__lac.lexical_analysis( - texts=[text], use_gpu=True, batch_size=1, return_tag=True) + texts=[text], use_gpu=False, batch_size=1, return_tag=True) # Change English words to lower case. # And just preserve the word in vocab. words = results[0]["word"] diff --git a/hub_module/modules/text/semantic_model/slda_novel/README.md b/hub_module/modules/text/semantic_model/slda_novel/README.md index c98ee04f..a08ae571 100644 --- a/hub_module/modules/text/semantic_model/slda_novel/README.md +++ b/hub_module/modules/text/semantic_model/slda_novel/README.md @@ -37,6 +37,32 @@ - results(dict): 返回对应文档的前k个关键词,以及各个关键词在文档中的出现概率。 +### 代码示例 + +这里展示部分API的使用示例。 + +``` python +import paddlehub as hub + +slda_novel = hub.Module("slda_novel") + +topic_dist = slda_novel.infer_doc_topic_distribution("妈妈告诉女儿,今天爸爸过生日,放学后要早点回家一起庆祝") +# [{'topic id': 222, 'distribution': 0.5}, {'topic id': 362, 'distribution': 0.5}] + +keywords = slda_novel.show_topic_keywords(topic_id=222) +# {'回来': 0.044502306717752, +# '回去': 0.036457065533017245, +# '回家': 0.029136327306669554, +# '明天': 0.028762575780517493, +# '休息': 0.022904260192395567, +# '晚上': 0.021970839714261954, +# '时间': 0.020756626422891028, +# '好好': 0.019726413882856498, +# '电话': 0.017195445214734463, +# '吃饭': 0.01521839547511471} + +``` + ## 查看代码 https://github.com/baidu/Familia diff --git a/hub_module/modules/text/semantic_model/slda_novel/tokenizer.py b/hub_module/modules/text/semantic_model/slda_novel/tokenizer.py index b0a097b6..562eaa63 100644 --- a/hub_module/modules/text/semantic_model/slda_novel/tokenizer.py +++ b/hub_module/modules/text/semantic_model/slda_novel/tokenizer.py @@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer): def tokenize(self, text): results = self.__lac.lexical_analysis( - texts=[text], use_gpu=True, batch_size=1, return_tag=True) + texts=[text], use_gpu=False, batch_size=1, return_tag=True) # Change English words to lower case. # And just preserve the word in vocab. words = results[0]["word"] diff --git a/hub_module/modules/text/semantic_model/slda_webpage/README.md b/hub_module/modules/text/semantic_model/slda_webpage/README.md index 3f9bd197..8ab7891e 100644 --- a/hub_module/modules/text/semantic_model/slda_webpage/README.md +++ b/hub_module/modules/text/semantic_model/slda_webpage/README.md @@ -54,6 +54,17 @@ topic_dist = slda_webpage.infer_doc_topic_distribution("百度是全球最大的 # {'topic id': 4410, 'distribution': 0.016666666666666666}, # {'topic id': 4676, 'distribution': 0.016666666666666666}] +keywords = slda_webpage.show_topic_keywords(topic_id=4687) +# {'市场': 0.07413332566788851, +# '增长': 0.045259383167567974, +# '规模': 0.030225253512468797, +# '用户': 0.02278765317990645, +# '超过': 0.019395970334729278, +# '份额': 0.019091932266952005, +# '全球': 0.018879934814238216, +# '手机': 0.01252139322404175, +# '美元': 0.01202885155424257, +# '收入': 0.011096560279140084} ``` diff --git a/hub_module/modules/text/semantic_model/slda_webpage/tokenizer.py b/hub_module/modules/text/semantic_model/slda_webpage/tokenizer.py index b0a097b6..562eaa63 100644 --- a/hub_module/modules/text/semantic_model/slda_webpage/tokenizer.py +++ b/hub_module/modules/text/semantic_model/slda_webpage/tokenizer.py @@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer): def tokenize(self, text): results = self.__lac.lexical_analysis( - texts=[text], use_gpu=True, batch_size=1, return_tag=True) + texts=[text], use_gpu=False, batch_size=1, return_tag=True) # Change English words to lower case. # And just preserve the word in vocab. words = results[0]["word"] diff --git a/hub_module/modules/text/semantic_model/slda_weibo/README.md b/hub_module/modules/text/semantic_model/slda_weibo/README.md index 707a7446..edd7f737 100644 --- a/hub_module/modules/text/semantic_model/slda_weibo/README.md +++ b/hub_module/modules/text/semantic_model/slda_weibo/README.md @@ -37,6 +37,31 @@ - results(dict): 返回对应文档的前k个关键词,以及各个关键词在文档中的出现概率。 +### 代码示例 + +这里展示API的使用示例。 + +``` python +import paddlehub as hub + +slda_weibo = hub.Module(name="slda_weibo") + +topic_dist = slda_weibo.infer_doc_topic_distribution("百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。") +# [{'topic id': 874, 'distribution': 0.5}, {'topic id': 1764, 'distribution': 0.5}] + +keywords = slda_weibo.show_topic_keywords(topic_id=874) +# {'数据': 0.07850538018570305, +# '更新': 0.04504777051711974, +# '出口': 0.023363758946167185, +# '信息': 0.020567061200812687, +# '全国': 0.015975367546781145, +# '双十一': 0.014998636225687216, +# '地理': 0.013257422965959297, +# '官方': 0.012913598174463106, +# '支持': 0.01177359809763076, +# '说话': 0.011205999070328388} + +``` ## 查看代码 https://github.com/baidu/Familia diff --git a/hub_module/modules/text/semantic_model/slda_weibo/tokenizer.py b/hub_module/modules/text/semantic_model/slda_weibo/tokenizer.py index b0a097b6..562eaa63 100644 --- a/hub_module/modules/text/semantic_model/slda_weibo/tokenizer.py +++ b/hub_module/modules/text/semantic_model/slda_weibo/tokenizer.py @@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer): def tokenize(self, text): results = self.__lac.lexical_analysis( - texts=[text], use_gpu=True, batch_size=1, return_tag=True) + texts=[text], use_gpu=False, batch_size=1, return_tag=True) # Change English words to lower case. # And just preserve the word in vocab. words = results[0]["word"] -- GitLab