Commit dac98aa8 authored by: Hai Liang Wang

Closed #43 smoothing scores in compare API

Parent 3e872fed
# 2.3
* Add a smoothing strategy when computing similarity scores
# v1.6
* Use ```jieba``` instead of ```thulac``` as the tokenizer.
* Refine console logging for Jupyter notebooks.
\ No newline at end of file
......@@ -12,7 +12,7 @@ Chinese Synonyms for Natural Language Processing and Understanding.
```
pip install -U synonyms
```
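A minimal usage sketch after installation (the words below are illustrative; `synonyms.nearby` and `synonyms.compare(..., seg=True)` are the calls exercised in the demo further down this page):
```
import synonyms

# nearest neighbours of a word
print(synonyms.nearby("人脸"))
# similarity score in [0, 1], with word segmentation enabled
print(synonyms.compare("旅游", "游历", seg=True))
```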
Compatible with py2 and py3; the current stable release is v2.2. **Node.js users can also use [node-synonyms](https://www.npmjs.com/package/node-synonyms).**
Compatible with py2 and py3; the current stable release is v2.3. **Node.js users can also use [node-synonyms](https://www.npmjs.com/package/node-synonyms).**
```
npm install node-synonyms
......@@ -103,10 +103,10 @@ data is built based on [wikidata-corpus](https://github.com/Samurais/wikidata-co
## Evaluation
### 同义词词林
《同义词词林》 was compiled by Mei Jiaju et al. in 1983. The edition in wide use today is the extended version (《同义词词林扩展版》) maintained by the HIT Research Center for Social Computing and Information Retrieval. It divides Chinese words into fine-grained major and minor categories and organizes the relations between them; the extended version contains 77,343 entries, of which 32,470 are shared as open data.
《同义词词林》 was compiled by Mei Jiaju et al. in 1983. The edition in wide use today is the extended version (《同义词词林扩展版》) maintained by the HIT Research Center for Social Computing and Information Retrieval. It divides Chinese words into fine-grained major and minor categories and organizes the relations between them; the extended version contains more than 70,000 entries, of which more than 30,000 are shared as open data.
### 知网, HowNet
HowNet (知网) is not merely a semantic dictionary but a knowledge system; relations between words are one of its basic use cases. HowNet contains 8,265 entries.
HowNet (知网) is not merely a semantic dictionary but a knowledge system; relations between words are one of its basic use cases. HowNet contains more than 8,000 entries.
The standard international benchmark for word-similarity algorithms is the set of human judgments over English word pairs published by Miller & Charles. It consists of 30 English word pairs: ten highly related, ten moderately related and ten weakly related. 38 subjects rated the semantic relatedness of these 30 pairs, and the average of their ratings serves as the human gold standard. Each synonym tool then scores the same pairs, and its scores are compared with the human judgments, for example with the Pearson correlation coefficient. In Chinese NLP, a common practice is to run the same comparison on a translated version of this word list.
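This protocol can be reproduced in a few lines. A sketch, assuming a placeholder list of word pairs with made-up gold scores (the real Miller & Charles values and their Chinese translations are not reproduced here):
```
import numpy as np
import synonyms

# Placeholder pairs and human gold scores -- substitute the benchmark data.
pairs = [("轿车", "汽车"), ("宝石", "宝物"), ("中午", "正午")]
human = [0.90, 0.80, 0.95]

tool = [synonyms.compare(w1, w2, seg=True) for w1, w2 in pairs]
# Pearson correlation between the tool's scores and the human judgments.
print("Pearson r = %.3f" % np.corrcoef(tool, human)[0, 1])
```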
......@@ -115,7 +115,7 @@ Synonyms的词表容量是125,792,下面选择一些在同义词词林、知
![](./assets/5.png)
Note: the Cilin and HowNet data and scores come from https://github.com/yaleimeng/Final_word_Similarity
Note: the Cilin and HowNet data and scores come from https://github.com/yaleimeng/Final_word_Similarity; Synonyms is continuously being improved, so newer scores may differ from the figure above.
## Benchmark
......
synonyms>=2.0
\ No newline at end of file
synonyms>=2.3
\ No newline at end of file
......@@ -36,9 +36,9 @@ import synonyms # https://github.com/huyingxi/Synonyms
import numpy
import unittest
# run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample
compare_ = lambda x,y,z: "%s vs %s: %f" % (x, y, synonyms.compare(x, y, seg=z))
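# compare_ formats one comparison: x and y are the two phrases, z is passed to
# synonyms.compare as the seg flag (whether to segment the input first).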
# run testcase: python /Users/hain/ai/Synonyms/demo.py Test.testExample
class Test(unittest.TestCase):
    '''
......@@ -50,6 +50,39 @@ class Test(unittest.TestCase):
    def tearDown(self):
        pass
    def test_pairs(self):
        print("test_pairs")
        print("*" * 30)
        print(compare_("轿车", "汽车", True))
        print("*" * 30)
        print(compare_("宝石", "宝物", True))
        print("*" * 30)
        print(compare_("旅游", "游历", True))
        print("*" * 30)
        print(compare_("男孩子", "小伙子", True))
        print("*" * 30)
        print(compare_("海岸", "海滨", True))
        print("*" * 30)
        print(compare_("庇护所", "精神病院", True))
        print("*" * 30)
        print(compare_("魔术师", "巫师", True))
        print("*" * 30)
        print(compare_("中午", "正午", True))
        print("*" * 30)
        print(compare_("火炉", "炉灶", True))
        print("*" * 30)
        print(compare_("食物", "水果", True))
        print("*" * 30)
        print(compare_("鸡", "公鸡", True))
        print("*" * 30)
        print(compare_("鸟", "鹤", True))
        print("*" * 30)
        print(compare_("工具", "器械", True))
        print("*" * 30)
        print(compare_("兄弟", "和尚", True))
        print("*" * 30)
        print(compare_("起重机", "器械", True))
    def test_similarity(self):
        '''
        Generate sentence similarity
......@@ -73,6 +106,12 @@ class Test(unittest.TestCase):
        print("发生历史性变革 vs 发生历史性变革:", r)
        # assert r > 0, "the similarity should be greater than zero"
        sen1 = "骨折"
        sen2 = "巴赫"
        r = synonyms.compare(sen1, sen2, seg=True)
        print("%s vs %s" % (sen1, sen2), r)
    def test_nearby(self):
        synonyms.display("人脸")  # synonyms.display calls synonyms.nearby
......
......@@ -13,7 +13,7 @@ Welcome
setup(
    name='synonyms',
    version='2.2',
    version='2.3',
    description='Chinese Synonyms for Natural Language Processing and Understanding',
    long_description=LONGDOC,
    author='Hai Liang Wang, Hu Ying Xi',
......
......@@ -47,6 +47,7 @@ import shutil
from synonyms.word2vec import KeyedVectors
from synonyms.utils import any2utf8
from synonyms.utils import any2unicode
from synonyms.utils import sigmoid
import jieba.posseg as _tokenizer
import jieba
......@@ -58,7 +59,6 @@ _size = 0
_vectors = None
_stopwords = set()
'''
nearby
'''
......@@ -195,10 +195,13 @@ def _levenshtein_distance(sentence1, sentence2):
    Based on:
        http://rosettacode.org/wiki/Levenshtein_distance#Python
    '''
    first = sentence1.split()
    second = sentence2.split()
    if len(first) > len(second):
    first = any2utf8(sentence1).decode('utf-8', 'ignore')
    second = any2utf8(sentence2).decode('utf-8', 'ignore')
    sentence1_len, sentence2_len = len(first), len(second)
    maxlen = max(sentence1_len, sentence2_len)
    if sentence1_len > sentence2_len:
        first, second = second, first
    distances = range(len(first) + 1)
    for index2, char2 in enumerate(second):
        new_distances = [index2 + 1]
......@@ -211,8 +214,13 @@ def _levenshtein_distance(sentence1, sentence2):
                                              new_distances[-1])))
        distances = new_distances
    levenshtein = distances[-1]
    return 2 ** (-1 * levenshtein)
    # cast before dividing so that integer division does not truncate under py2
    dis = float(maxlen - levenshtein) / maxlen
    # smoothing
    s = (sigmoid(dis * 6) - 0.5) * 2
    # print("smoothing[%s| %s]: %s -> %s" % (sentence1, sentence2, dis, s))
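    # Illustrative values: dis = 0.0, 0.5 and 1.0 map to s ≈ 0.000, 0.905 and
    # 0.995, so mid-range edit-distance scores are pulled upwards while the
    # end points stay close to 0 and 1.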
    return s
# _smooth blends the two scores: x is the word-vector score, y its weight, z the edit-distance score.
_smooth = lambda x, y, z: (x * y) + z
def _similarity_distance(s1, s2):
    '''
......@@ -223,9 +231,21 @@ def _similarity_distance(s1, s2):
    # https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html
    g = 1 / (np.linalg.norm(a - b) + 1)
    u = _levenshtein_distance(s1, s2)
    r = g * 5 + u * 0.8
    r = min(r, 1.0)
    # print("g: %s, u: %s" % (g, u))
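    # The higher the edit-distance score u (strong surface overlap), the
    # smaller the multiplier applied to the word-vector score g; with little
    # surface overlap the vector score dominates via a larger multiplier.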
    if u > 0.8:
        r = _smooth(g, 0.05, u)
    elif u > 0.7:
        r = _smooth(g, 0.1, u)
    elif u > 0.6:
        r = _smooth(g, 0.2, u)
    elif u > 0.5:
        r = _smooth(g, 1, u)
    elif u > 0.4:
        r = _smooth(g, 4, u)
    else:
        r = _smooth(g, 10, u)
    r = min(r, 1.0)
    return float("%.3f" % r)
......
......@@ -239,6 +239,8 @@ def any2unicode(text, encoding='utf8', errors='strict'):
to_unicode = any2unicode
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))
def call_on_class_only(*args, **kwargs):
"""Raise exception when load methods are called on instance"""
......