diff --git a/hub_module/modules/text/semantic_model/lda_news/README.md b/hub_module/modules/text/semantic_model/lda_news/README.md
index 90b20da21d66d2f1b49b72123a1229b8ed46e1a5..9e3078d676515ecc2e57cc833023cf8b1451471a 100644
--- a/hub_module/modules/text/semantic_model/lda_news/README.md
+++ b/hub_module/modules/text/semantic_model/lda_news/README.md
@@ -100,6 +100,25 @@ results = lda_news.cal_doc_keywords_similarity('百度是全球最大的中文
 # {'word': '中文', 'similarity': 0.020187103312009513},
 # {'word': '搜索引擎', 'similarity': 0.007092890537169911}]
 
+results = lda_news.infer_doc_topic_distribution("最近有学者新出了一篇论文,关于自然语言处理的,可厉害了")
+# [{'topic id': 216, 'distribution': 0.5222222222222223},
+# {'topic id': 1789, 'distribution': 0.18888888888888888},
+# {'topic id': 98, 'distribution': 0.1111111111111111},
+# {'topic id': 805, 'distribution': 0.044444444444444446},
+# {'topic id': 56, 'distribution': 0.03333333333333333}, ...]
+
+keywords = lda_news.show_topic_keywords(topic_id=216)
+# {'研究': 0.1753955534055716,
+# '学术': 0.13158917246453747,
+# '论文': 0.1178632702247961,
+# '课题': 0.057840811145163484,
+# '发表': 0.05614630212471184,
+# '成果': 0.03587086607950555,
+# '期刊': 0.030608728068521086,
+# '科研': 0.0216061375112729,
+# '学者': 0.017739360125774,
+# '科学': 0.015553720885167896}
+
 ```
 
 ## 查看代码
diff --git a/hub_module/modules/text/semantic_model/lda_news/tokenizer.py b/hub_module/modules/text/semantic_model/lda_news/tokenizer.py
index 419fbab7edbc8033ad1a57cb794e68d68daec307..e9aaa06fea1c20efc8789b70b2c771413853b8aa 100644
--- a/hub_module/modules/text/semantic_model/lda_news/tokenizer.py
+++ b/hub_module/modules/text/semantic_model/lda_news/tokenizer.py
@@ -109,7 +109,7 @@ class LACTokenizer(Tokenizer):
 
     def tokenize(self, text):
         results = self.__lac.lexical_analysis(
-            texts=[text], use_gpu=True, batch_size=1, return_tag=True)
+            texts=[text], use_gpu=False, batch_size=1, return_tag=True)
         # Change English words to lower case.
         # And just preserve the word in vocab.
         words = results[0]["word"]
diff --git a/hub_module/modules/text/semantic_model/lda_novel/README.md b/hub_module/modules/text/semantic_model/lda_novel/README.md
index 698e51c0af1d0a89a25a2f3e2f97dd481a5a0119..5f33059452a2e3fbd8f970410e867afe8c6d20c6 100644
--- a/hub_module/modules/text/semantic_model/lda_novel/README.md
+++ b/hub_module/modules/text/semantic_model/lda_novel/README.md
@@ -88,6 +88,37 @@ jsd, hd = lda_novel.cal_doc_distance(doc_text1="老人幸福地看着自己的
 lda_sim = lda_novel.cal_query_doc_similarity(query='亲孙女', document='老人激动地打量着面前的女孩,似乎找到了自己的亲孙女一般,双手止不住地颤抖着。')
 # LDA similarity = 0.0
 
+results = lda_novel.cal_doc_keywords_similarity('百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。百度超过千亿的中文网页数据库,可以瞬间找到相关的搜索结果。')
+# [{'word': '信息', 'similarity': 0.014140977159719738},
+# {'word': '找到', 'similarity': 0.012251022010382823},
+# {'word': '搜索', 'similarity': 0.004262275169349261},
+# {'word': '网页', 'similarity': 0.0026937499565468327},
+# {'word': '百度', 'similarity': 0.0021199508577209015},
+# {'word': '全球', 'similarity': 0.0010464078137351785},
+# {'word': '中文', 'similarity': 0.0009866259107630141},
+# {'word': '瞬间', 'similarity': 0.0009262589016537221},
+# {'word': '超过', 'similarity': 0.0008362863020592123},
+# {'word': '相关', 'similarity': 0.000793663877590302}]
+
+results = lda_novel.infer_doc_topic_distribution("妈妈告诉女儿,今天爸爸过生日,放学后要早点回家一起庆祝")
+# [{'topic id': 0, 'distribution': 0.7166666666666667},
+# {'topic id': 64, 'distribution': 0.11666666666666667},
+# {'topic id': 125, 'distribution': 0.020833333333333332},
+# {'topic id': 131, 'distribution': 0.016666666666666666},
+# {'topic id': 137, 'distribution': 0.016666666666666666}, ...]
+
+keywords = lda_novel.show_topic_keywords(topic_id=0)
+# {'妈妈': 0.36114392028319225,
+# '爸爸': 0.18456064543161096,
+# '女儿': 0.03591842787260316,
+# '孩子': 0.01567368390197123,
+# '家里': 0.014277018999815379,
+# '回家': 0.013514888275429099,
+# '回来': 0.013275213681108526,
+# '爸妈': 0.007931677222119656,
+# '告诉': 0.006841933742906693,
+# '父母': 0.00627464639375944}
+
 ```
 
 ## 查看代码
diff --git a/hub_module/modules/text/semantic_model/lda_novel/tokenizer.py b/hub_module/modules/text/semantic_model/lda_novel/tokenizer.py
index b0a097b601e4195f9e167028d35330683d19e29c..562eaa633c45c6150a85a67347ee4c08539d94b2 100644
--- a/hub_module/modules/text/semantic_model/lda_novel/tokenizer.py
+++ b/hub_module/modules/text/semantic_model/lda_novel/tokenizer.py
@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
 
     def tokenize(self, text):
         results = self.__lac.lexical_analysis(
-            texts=[text], use_gpu=True, batch_size=1, return_tag=True)
+            texts=[text], use_gpu=False, batch_size=1, return_tag=True)
         # Change English words to lower case.
         # And just preserve the word in vocab.
         words = results[0]["word"]
diff --git a/hub_module/modules/text/semantic_model/lda_webpage/README.md b/hub_module/modules/text/semantic_model/lda_webpage/README.md
index 527e1b96bc8aa0e7825237f9dae72598282569ec..a859876de56a24e59a37dc72de31c02dda49606c 100644
--- a/hub_module/modules/text/semantic_model/lda_webpage/README.md
+++ b/hub_module/modules/text/semantic_model/lda_webpage/README.md
@@ -95,6 +95,27 @@ results = lda_webpage.cal_doc_keywords_similarity('百度首页推荐着各种
 # {'word': '功能', 'similarity': 0.011409342579361237},
 # {'word': '搜索引擎', 'similarity': 0.010392479335778413}]
 
+out = lda_webpage.cal_query_doc_similarity(query='百度搜索引擎', document='百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。百度超过千亿的中文网页数据库,可以瞬间找到相关的搜索结果。')
+# out = 0.0283
+
+results = lda_webpage.infer_doc_topic_distribution("百度文库非常的好用,我们不仅在里面找到需要的文档,同时可以通过续费畅读精品文档。")
+# [{'topic id': 3458, 'distribution': 0.5277777777777778},
+# {'topic id': 1927, 'distribution': 0.17777777777777778},
+# {'topic id': 1497, 'distribution': 0.05},
+# {'topic id': 1901, 'distribution': 0.03333333333333333}...]
+
+keywords = lda_webpage.show_topic_keywords(3458)
+# {'price': 0.10977647395316775,
+# '文档': 0.06445075002937038,
+# '财富值': 0.04012675135746289,
+# '文库': 0.03953267826572788,
+# 'len': 0.038856163693739426,
+# 'tag': 0.03868762622172197,
+# 'current': 0.03728225157463761,
+# 'cut': 0.03448665775467454,
+# '尺寸': 0.03250387028891812,
+# '财富': 0.02902896727051734}
+
 ```
 
 ## 查看代码
diff --git a/hub_module/modules/text/semantic_model/lda_webpage/tokenizer.py b/hub_module/modules/text/semantic_model/lda_webpage/tokenizer.py
index b0a097b601e4195f9e167028d35330683d19e29c..562eaa633c45c6150a85a67347ee4c08539d94b2 100644
--- a/hub_module/modules/text/semantic_model/lda_webpage/tokenizer.py
+++ b/hub_module/modules/text/semantic_model/lda_webpage/tokenizer.py
@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
 
     def tokenize(self, text):
         results = self.__lac.lexical_analysis(
-            texts=[text], use_gpu=True, batch_size=1, return_tag=True)
+            texts=[text], use_gpu=False, batch_size=1, return_tag=True)
         # Change English words to lower case.
         # And just preserve the word in vocab.
         words = results[0]["word"]
diff --git a/hub_module/modules/text/semantic_model/slda_news/tokenizer.py b/hub_module/modules/text/semantic_model/slda_news/tokenizer.py
index b0a097b601e4195f9e167028d35330683d19e29c..562eaa633c45c6150a85a67347ee4c08539d94b2 100644
--- a/hub_module/modules/text/semantic_model/slda_news/tokenizer.py
+++ b/hub_module/modules/text/semantic_model/slda_news/tokenizer.py
@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
 
     def tokenize(self, text):
         results = self.__lac.lexical_analysis(
-            texts=[text], use_gpu=True, batch_size=1, return_tag=True)
+            texts=[text], use_gpu=False, batch_size=1, return_tag=True)
         # Change English words to lower case.
         # And just preserve the word in vocab.
         words = results[0]["word"]
diff --git a/hub_module/modules/text/semantic_model/slda_novel/README.md b/hub_module/modules/text/semantic_model/slda_novel/README.md
index c98ee04f962f37e4e0d98adfbef33a39b0cc17e6..a08ae571b0691a3dd99ecb53cdeea323f3cc67aa 100644
--- a/hub_module/modules/text/semantic_model/slda_novel/README.md
+++ b/hub_module/modules/text/semantic_model/slda_novel/README.md
@@ -37,6 +37,32 @@
 
 - results(dict): 返回对应文档的前k个关键词,以及各个关键词在文档中的出现概率。
 
+### 代码示例
+
+这里展示部分API的使用示例。
+
+``` python
+import paddlehub as hub
+
+slda_novel = hub.Module("slda_novel")
+
+topic_dist = slda_novel.infer_doc_topic_distribution("妈妈告诉女儿,今天爸爸过生日,放学后要早点回家一起庆祝")
+# [{'topic id': 222, 'distribution': 0.5}, {'topic id': 362, 'distribution': 0.5}]
+
+keywords = slda_novel.show_topic_keywords(topic_id=222)
+# {'回来': 0.044502306717752,
+# '回去': 0.036457065533017245,
+# '回家': 0.029136327306669554,
+# '明天': 0.028762575780517493,
+# '休息': 0.022904260192395567,
+# '晚上': 0.021970839714261954,
+# '时间': 0.020756626422891028,
+# '好好': 0.019726413882856498,
+# '电话': 0.017195445214734463,
+# '吃饭': 0.01521839547511471}
+
+```
+
 ## 查看代码
 
 https://github.com/baidu/Familia
diff --git a/hub_module/modules/text/semantic_model/slda_novel/tokenizer.py b/hub_module/modules/text/semantic_model/slda_novel/tokenizer.py
index b0a097b601e4195f9e167028d35330683d19e29c..562eaa633c45c6150a85a67347ee4c08539d94b2 100644
--- a/hub_module/modules/text/semantic_model/slda_novel/tokenizer.py
+++ b/hub_module/modules/text/semantic_model/slda_novel/tokenizer.py
@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
 
     def tokenize(self, text):
         results = self.__lac.lexical_analysis(
-            texts=[text], use_gpu=True, batch_size=1, return_tag=True)
+            texts=[text], use_gpu=False, batch_size=1, return_tag=True)
         # Change English words to lower case.
         # And just preserve the word in vocab.
         words = results[0]["word"]
diff --git a/hub_module/modules/text/semantic_model/slda_webpage/README.md b/hub_module/modules/text/semantic_model/slda_webpage/README.md
index 3f9bd197997d046744ae833b59ddd63064f3058e..8ab7891e8503b3a2c31f4bb71a68351f9955dcba 100644
--- a/hub_module/modules/text/semantic_model/slda_webpage/README.md
+++ b/hub_module/modules/text/semantic_model/slda_webpage/README.md
@@ -54,6 +54,17 @@ topic_dist = slda_webpage.infer_doc_topic_distribution("百度是全球最大的
 # {'topic id': 4410, 'distribution': 0.016666666666666666},
 # {'topic id': 4676, 'distribution': 0.016666666666666666}]
 
+keywords = slda_webpage.show_topic_keywords(topic_id=4687)
+# {'市场': 0.07413332566788851,
+# '增长': 0.045259383167567974,
+# '规模': 0.030225253512468797,
+# '用户': 0.02278765317990645,
+# '超过': 0.019395970334729278,
+# '份额': 0.019091932266952005,
+# '全球': 0.018879934814238216,
+# '手机': 0.01252139322404175,
+# '美元': 0.01202885155424257,
+# '收入': 0.011096560279140084}
 
 ```
diff --git a/hub_module/modules/text/semantic_model/slda_webpage/tokenizer.py b/hub_module/modules/text/semantic_model/slda_webpage/tokenizer.py
index b0a097b601e4195f9e167028d35330683d19e29c..562eaa633c45c6150a85a67347ee4c08539d94b2 100644
--- a/hub_module/modules/text/semantic_model/slda_webpage/tokenizer.py
+++ b/hub_module/modules/text/semantic_model/slda_webpage/tokenizer.py
@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
 
     def tokenize(self, text):
         results = self.__lac.lexical_analysis(
-            texts=[text], use_gpu=True, batch_size=1, return_tag=True)
+            texts=[text], use_gpu=False, batch_size=1, return_tag=True)
         # Change English words to lower case.
         # And just preserve the word in vocab.
         words = results[0]["word"]
diff --git a/hub_module/modules/text/semantic_model/slda_weibo/README.md b/hub_module/modules/text/semantic_model/slda_weibo/README.md
index 707a7446ecdbb7048a479350170fb4826b976897..edd7f73763cc460e6365adbf08dfb5b9ae3d657a 100644
--- a/hub_module/modules/text/semantic_model/slda_weibo/README.md
+++ b/hub_module/modules/text/semantic_model/slda_weibo/README.md
@@ -37,6 +37,31 @@
 
 - results(dict): 返回对应文档的前k个关键词,以及各个关键词在文档中的出现概率。
 
+### 代码示例
+
+这里展示API的使用示例。
+
+``` python
+import paddlehub as hub
+
+slda_weibo = hub.Module(name="slda_weibo")
+
+topic_dist = slda_weibo.infer_doc_topic_distribution("百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。")
+# [{'topic id': 874, 'distribution': 0.5}, {'topic id': 1764, 'distribution': 0.5}]
+
+keywords = slda_weibo.show_topic_keywords(topic_id=874)
+# {'数据': 0.07850538018570305,
+# '更新': 0.04504777051711974,
+# '出口': 0.023363758946167185,
+# '信息': 0.020567061200812687,
+# '全国': 0.015975367546781145,
+# '双十一': 0.014998636225687216,
+# '地理': 0.013257422965959297,
+# '官方': 0.012913598174463106,
+# '支持': 0.01177359809763076,
+# '说话': 0.011205999070328388}
+
+```
 ## 查看代码
 
 https://github.com/baidu/Familia
diff --git a/hub_module/modules/text/semantic_model/slda_weibo/tokenizer.py b/hub_module/modules/text/semantic_model/slda_weibo/tokenizer.py
index b0a097b601e4195f9e167028d35330683d19e29c..562eaa633c45c6150a85a67347ee4c08539d94b2 100644
--- a/hub_module/modules/text/semantic_model/slda_weibo/tokenizer.py
+++ b/hub_module/modules/text/semantic_model/slda_weibo/tokenizer.py
@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
 
     def tokenize(self, text):
         results = self.__lac.lexical_analysis(
-            texts=[text], use_gpu=True, batch_size=1, return_tag=True)
+            texts=[text], use_gpu=False, batch_size=1, return_tag=True)
         # Change English words to lower case.
         # And just preserve the word in vocab.
         words = results[0]["word"]
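The README snippets added above all follow the same pattern: `infer_doc_topic_distribution` returns a list of `{'topic id': ..., 'distribution': ...}` dicts, and `show_topic_keywords(topic_id=...)` returns a keyword-to-probability dict for one topic. A minimal sketch of how the two calls compose, using `lda_news` from the first hunk; loading the module via `hub.Module(name="lda_news")` and assuming the returned keyword dict is ordered by probability (as in the README output) are my additions, not part of this diff.

``` python
import paddlehub as hub

# Hypothetical usage sketch; module name and ordering assumption noted above.
lda_news = hub.Module(name="lda_news")

text = "最近有学者新出了一篇论文,关于自然语言处理的,可厉害了"
topic_dist = lda_news.infer_doc_topic_distribution(text)

# Walk the most probable topics and print a few keywords for each one.
for entry in sorted(topic_dist, key=lambda e: e["distribution"], reverse=True)[:3]:
    keywords = lda_news.show_topic_keywords(topic_id=entry["topic id"])
    top_words = ", ".join(list(keywords)[:5])
    print("topic {} (p={:.3f}): {}".format(entry["topic id"], entry["distribution"], top_words))
```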
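The `tokenizer.py` change repeated across all seven modules is the same one-line flip from `use_gpu=True` to `use_gpu=False` inside `LACTokenizer.tokenize`, so segmentation no longer assumes a GPU-enabled PaddlePaddle build. The sketch below shows the underlying LAC call that `tokenize` wraps, runnable on a CPU-only install; loading the analyzer via `hub.Module(name="lac")` and reading a `"tag"` field are assumptions based on `return_tag=True`, not something this diff states.

``` python
import paddlehub as hub

# Stand-alone sketch of the lexical-analysis call made inside LACTokenizer.tokenize.
lac = hub.Module(name="lac")
results = lac.lexical_analysis(
    texts=["百度是全球最大的中文搜索引擎"],
    use_gpu=False,  # matches the value now hard-coded in the tokenizers
    batch_size=1,
    return_tag=True)

print(results[0]["word"])  # segmented tokens, the same field the tokenizer reads
print(results[0]["tag"])   # per-token tags returned because return_tag=True (assumed key name)
```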