未验证 提交 311c49d9 编写于 作者: S SiMing Dai 提交者: GitHub

Fix topic model (#784)

* fix bug

* fix lac gpu setting

* refine README

* fix README
上级 e35ff5ec
...@@ -100,6 +100,25 @@ results = lda_news.cal_doc_keywords_similarity('百度是全球最大的中文 ...@@ -100,6 +100,25 @@ results = lda_news.cal_doc_keywords_similarity('百度是全球最大的中文
# {'word': '中文', 'similarity': 0.020187103312009513}, # {'word': '中文', 'similarity': 0.020187103312009513},
# {'word': '搜索引擎', 'similarity': 0.007092890537169911}] # {'word': '搜索引擎', 'similarity': 0.007092890537169911}]
results = lda_news.infer_doc_topic_distribution("最近有学者新出了一篇论文,关于自然语言处理的,可厉害了")
# [{'topic id': 216, 'distribution': 0.5222222222222223},
# {'topic id': 1789, 'distribution': 0.18888888888888888},
# {'topic id': 98, 'distribution': 0.1111111111111111},
# {'topic id': 805, 'distribution': 0.044444444444444446},
# {'topic id': 56, 'distribution': 0.03333333333333333}, ...]
keywords = lda_news.show_topic_keywords(topic_id=216)
# {'研究': 0.1753955534055716,
# '学术': 0.13158917246453747,
# '论文': 0.1178632702247961,
# '课题': 0.057840811145163484,
# '发表': 0.05614630212471184,
# '成果': 0.03587086607950555,
# '期刊': 0.030608728068521086,
# '科研': 0.0216061375112729,
# '学者': 0.017739360125774,
# '科学': 0.015553720885167896}
``` ```
## 查看代码 ## 查看代码
......
...@@ -109,7 +109,7 @@ class LACTokenizer(Tokenizer): ...@@ -109,7 +109,7 @@ class LACTokenizer(Tokenizer):
def tokenize(self, text): def tokenize(self, text):
results = self.__lac.lexical_analysis( results = self.__lac.lexical_analysis(
texts=[text], use_gpu=True, batch_size=1, return_tag=True) texts=[text], use_gpu=False, batch_size=1, return_tag=True)
# Change English words to lower case. # Change English words to lower case.
# And just preserve the word in vocab. # And just preserve the word in vocab.
words = results[0]["word"] words = results[0]["word"]
......
...@@ -88,6 +88,37 @@ jsd, hd = lda_novel.cal_doc_distance(doc_text1="老人幸福地看着自己的 ...@@ -88,6 +88,37 @@ jsd, hd = lda_novel.cal_doc_distance(doc_text1="老人幸福地看着自己的
lda_sim = lda_novel.cal_query_doc_similarity(query='亲孙女', document='老人激动地打量着面前的女孩,似乎找到了自己的亲孙女一般,双手止不住地颤抖着。') lda_sim = lda_novel.cal_query_doc_similarity(query='亲孙女', document='老人激动地打量着面前的女孩,似乎找到了自己的亲孙女一般,双手止不住地颤抖着。')
# LDA similarity = 0.0 # LDA similarity = 0.0
results = lda_novel.cal_doc_keywords_similarity('百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。百度超过千亿的中文网页数据库,可以瞬间找到相关的搜索结果。')
# [{'word': '信息', 'similarity': 0.014140977159719738},
# {'word': '找到', 'similarity': 0.012251022010382823},
# {'word': '搜索', 'similarity': 0.004262275169349261},
# {'word': '网页', 'similarity': 0.0026937499565468327},
# {'word': '百度', 'similarity': 0.0021199508577209015},
# {'word': '全球', 'similarity': 0.0010464078137351785},
# {'word': '中文', 'similarity': 0.0009866259107630141},
# {'word': '瞬间', 'similarity': 0.0009262589016537221},
# {'word': '超过', 'similarity': 0.0008362863020592123},
# {'word': '相关', 'similarity': 0.000793663877590302}]
results = lda_novel.infer_doc_topic_distribution("妈妈告诉女儿,今天爸爸过生日,放学后要早点回家一起庆祝")
# [{'topic id': 0, 'distribution': 0.7166666666666667},
# {'topic id': 64, 'distribution': 0.11666666666666667},
# {'topic id': 125, 'distribution': 0.020833333333333332},
# {'topic id': 131, 'distribution': 0.016666666666666666},
# {'topic id': 137, 'distribution': 0.016666666666666666}, ...]
keywords = lda_novel.show_topic_keywords(topic_id=0)
# {'妈妈': 0.36114392028319225,
# '爸爸': 0.18456064543161096,
# '女儿': 0.03591842787260316,
# '孩子': 0.01567368390197123,
# '家里': 0.014277018999815379,
# '回家': 0.013514888275429099,
# '回来': 0.013275213681108526,
# '爸妈': 0.007931677222119656,
# '告诉': 0.006841933742906693,
# '父母': 0.00627464639375944}
``` ```
## 查看代码 ## 查看代码
......
...@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer): ...@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
def tokenize(self, text): def tokenize(self, text):
results = self.__lac.lexical_analysis( results = self.__lac.lexical_analysis(
texts=[text], use_gpu=True, batch_size=1, return_tag=True) texts=[text], use_gpu=False, batch_size=1, return_tag=True)
# Change English words to lower case. # Change English words to lower case.
# And just preserve the word in vocab. # And just preserve the word in vocab.
words = results[0]["word"] words = results[0]["word"]
......
...@@ -95,6 +95,27 @@ results = lda_webpage.cal_doc_keywords_similarity('百度首页推荐着各种 ...@@ -95,6 +95,27 @@ results = lda_webpage.cal_doc_keywords_similarity('百度首页推荐着各种
# {'word': '功能', 'similarity': 0.011409342579361237}, # {'word': '功能', 'similarity': 0.011409342579361237},
# {'word': '搜索引擎', 'similarity': 0.010392479335778413}] # {'word': '搜索引擎', 'similarity': 0.010392479335778413}]
out = lda_webpage.cal_query_doc_similarity(query='百度搜索引擎', document='百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。百度超过千亿的中文网页数据库,可以瞬间找到相关的搜索结果。')
# out = 0.0283
results = lda_webpage.infer_doc_topic_distribution("百度文库非常的好用,我们不仅在里面找到需要的文档,同时可以通过续费畅读精品文档。")
# [{'topic id': 3458, 'distribution': 0.5277777777777778},
# {'topic id': 1927, 'distribution': 0.17777777777777778},
# {'topic id': 1497, 'distribution': 0.05},
# {'topic id': 1901, 'distribution': 0.03333333333333333}, ...]
keywords = lda_webpage.show_topic_keywords(3458)
# {'price': 0.10977647395316775,
# '文档': 0.06445075002937038,
# '财富值': 0.04012675135746289,
# '文库': 0.03953267826572788,
# 'len': 0.038856163693739426,
# 'tag': 0.03868762622172197,
# 'current': 0.03728225157463761,
# 'cut': 0.03448665775467454,
# '尺寸': 0.03250387028891812,
# '财富': 0.02902896727051734}
``` ```
## 查看代码 ## 查看代码
......
...@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer): ...@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
def tokenize(self, text): def tokenize(self, text):
results = self.__lac.lexical_analysis( results = self.__lac.lexical_analysis(
texts=[text], use_gpu=True, batch_size=1, return_tag=True) texts=[text], use_gpu=False, batch_size=1, return_tag=True)
# Change English words to lower case. # Change English words to lower case.
# And just preserve the word in vocab. # And just preserve the word in vocab.
words = results[0]["word"] words = results[0]["word"]
......
...@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer): ...@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
def tokenize(self, text): def tokenize(self, text):
results = self.__lac.lexical_analysis( results = self.__lac.lexical_analysis(
texts=[text], use_gpu=True, batch_size=1, return_tag=True) texts=[text], use_gpu=False, batch_size=1, return_tag=True)
# Change English words to lower case. # Change English words to lower case.
# And just preserve the word in vocab. # And just preserve the word in vocab.
words = results[0]["word"] words = results[0]["word"]
......
...@@ -37,6 +37,32 @@ ...@@ -37,6 +37,32 @@
- results(dict): 返回对应文档的前k个关键词,以及各个关键词在文档中的出现概率。 - results(dict): 返回对应文档的前k个关键词,以及各个关键词在文档中的出现概率。
### 代码示例
这里展示部分API的使用示例。
``` python
import paddlehub as hub
slda_novel = hub.Module(name="slda_novel")
topic_dist = slda_novel.infer_doc_topic_distribution("妈妈告诉女儿,今天爸爸过生日,放学后要早点回家一起庆祝")
# [{'topic id': 222, 'distribution': 0.5}, {'topic id': 362, 'distribution': 0.5}]
keywords = slda_novel.show_topic_keywords(topic_id=222)
# {'回来': 0.044502306717752,
# '回去': 0.036457065533017245,
# '回家': 0.029136327306669554,
# '明天': 0.028762575780517493,
# '休息': 0.022904260192395567,
# '晚上': 0.021970839714261954,
# '时间': 0.020756626422891028,
# '好好': 0.019726413882856498,
# '电话': 0.017195445214734463,
# '吃饭': 0.01521839547511471}
```
## 查看代码 ## 查看代码
https://github.com/baidu/Familia https://github.com/baidu/Familia
......
...@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer): ...@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
def tokenize(self, text): def tokenize(self, text):
results = self.__lac.lexical_analysis( results = self.__lac.lexical_analysis(
texts=[text], use_gpu=True, batch_size=1, return_tag=True) texts=[text], use_gpu=False, batch_size=1, return_tag=True)
# Change English words to lower case. # Change English words to lower case.
# And just preserve the word in vocab. # And just preserve the word in vocab.
words = results[0]["word"] words = results[0]["word"]
......
...@@ -54,6 +54,17 @@ topic_dist = slda_webpage.infer_doc_topic_distribution("百度是全球最大的 ...@@ -54,6 +54,17 @@ topic_dist = slda_webpage.infer_doc_topic_distribution("百度是全球最大的
# {'topic id': 4410, 'distribution': 0.016666666666666666}, # {'topic id': 4410, 'distribution': 0.016666666666666666},
# {'topic id': 4676, 'distribution': 0.016666666666666666}] # {'topic id': 4676, 'distribution': 0.016666666666666666}]
keywords = slda_webpage.show_topic_keywords(topic_id=4687)
# {'市场': 0.07413332566788851,
# '增长': 0.045259383167567974,
# '规模': 0.030225253512468797,
# '用户': 0.02278765317990645,
# '超过': 0.019395970334729278,
# '份额': 0.019091932266952005,
# '全球': 0.018879934814238216,
# '手机': 0.01252139322404175,
# '美元': 0.01202885155424257,
# '收入': 0.011096560279140084}
``` ```
......
...@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer): ...@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
def tokenize(self, text): def tokenize(self, text):
results = self.__lac.lexical_analysis( results = self.__lac.lexical_analysis(
texts=[text], use_gpu=True, batch_size=1, return_tag=True) texts=[text], use_gpu=False, batch_size=1, return_tag=True)
# Change English words to lower case. # Change English words to lower case.
# And just preserve the word in vocab. # And just preserve the word in vocab.
words = results[0]["word"] words = results[0]["word"]
......
...@@ -37,6 +37,31 @@ ...@@ -37,6 +37,31 @@
- results(dict): 返回对应文档的前k个关键词,以及各个关键词在文档中的出现概率。 - results(dict): 返回对应文档的前k个关键词,以及各个关键词在文档中的出现概率。
### 代码示例
这里展示API的使用示例。
``` python
import paddlehub as hub
slda_weibo = hub.Module(name="slda_weibo")
topic_dist = slda_weibo.infer_doc_topic_distribution("百度是全球最大的中文搜索引擎、致力于让网民更便捷地获取信息,找到所求。")
# [{'topic id': 874, 'distribution': 0.5}, {'topic id': 1764, 'distribution': 0.5}]
keywords = slda_weibo.show_topic_keywords(topic_id=874)
# {'数据': 0.07850538018570305,
# '更新': 0.04504777051711974,
# '出口': 0.023363758946167185,
# '信息': 0.020567061200812687,
# '全国': 0.015975367546781145,
# '双十一': 0.014998636225687216,
# '地理': 0.013257422965959297,
# '官方': 0.012913598174463106,
# '支持': 0.01177359809763076,
# '说话': 0.011205999070328388}
```
## 查看代码 ## 查看代码
https://github.com/baidu/Familia https://github.com/baidu/Familia
......
...@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer): ...@@ -111,7 +111,7 @@ class LACTokenizer(Tokenizer):
def tokenize(self, text): def tokenize(self, text):
results = self.__lac.lexical_analysis( results = self.__lac.lexical_analysis(
texts=[text], use_gpu=True, batch_size=1, return_tag=True) texts=[text], use_gpu=False, batch_size=1, return_tag=True)
# Change English words to lower case. # Change English words to lower case.
# And just preserve the word in vocab. # And just preserve the word in vocab.
words = results[0]["word"] words = results[0]["word"]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册