Commit b23e1c3b authored by Hai Liang Wang

Remove stopwords when computing the edit distance

Parent 99799847
# 3.10
* Remove stopwords when computing the edit distance
# 3.9
* Fix bug
# 3.8
* Obtain the vector of a segmented sentence, composed in BoW (bag-of-words) fashion
......
synonyms>=3.6
\ No newline at end of file
synonyms>=3.10
\ No newline at end of file
......@@ -13,7 +13,7 @@ Welcome
setup(
name='synonyms',
version='3.8.0',
version='3.10.0',
description='Chinese Synonyms for Natural Language Processing and Understanding',
long_description=LONGDOC,
author='Hai Liang Wang, Hu Ying Xi',
......
......@@ -78,7 +78,7 @@ tokenizer settings
'''
tokenizer_dict = os.path.join(curdir, 'data', 'vocab.txt')
if "SYNONYMS_WORDSEG_DICT" in ENVIRON:
    if os.exist(ENVIRON["SYNONYMS_WORDSEG_DICT"]):
    if os.path.exists(ENVIRON["SYNONYMS_WORDSEG_DICT"]):
        print("info: set wordseg dict with %s" % ENVIRON["SYNONYMS_WORDSEG_DICT"])
        tokenizer_dict = ENVIRON["SYNONYMS_WORDSEG_DICT"]
    else: print("warning: can not find dict at [%s]" % ENVIRON["SYNONYMS_WORDSEG_DICT"])
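For context, SYNONYMS_WORDSEG_DICT lets callers point synonyms at their own segmentation dictionary (same role as data/vocab.txt). A minimal sketch, assuming a hypothetical dictionary path not taken from this commit; the variable must be set before synonyms is imported, because the dict is resolved at module load time:

import os
# hypothetical path to a custom segmentation dictionary
os.environ["SYNONYMS_WORDSEG_DICT"] = "/path/to/custom_vocab.txt"

import synonyms  # prints "info: set wordseg dict with ..." when the file exists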
......@@ -303,23 +303,43 @@ def nearby(word):
_cache_nearby[w] = (words, scores)
return words, scores
def compare(s1, s2, seg=True, ignore=False):
def compare(s1, s2, seg=True, ignore=False, stopwords=False):
    '''
    compare similarity
    s1 : sentence1
    s2 : sentence2
    seg : True  : The original sentences need jieba.cut
          False : The original sentences have been cut.
    ignore : True  : ignore OOV words
             False : get vector randomly for OOV words
    stopwords : False : strip stopwords from both sentences before comparing
                True  : keep stopwords
    '''
    if s1 == s2: return 1.0
    s1_words = []
    s2_words = []
    if seg:
        s1 = [x for x in jieba.cut(s1)]
        s2 = [x for x in jieba.cut(s2)]
    else:
        s1 = s1.split()
        s2 = s2.split()
    # check stopwords
    if not stopwords:
        global _stopwords
        for x in s1:
            if not x in _stopwords:
                s1_words.append(x)
        for x in s2:
            if not x in _stopwords:
                s2_words.append(x)
    else:
        s1_words = s1
        s2_words = s2
    assert len(s1) > 0 and len(s2) > 0, "The length of s1 and s2 should > 0."
    return _similarity_distance(s1, s2, ignore)
    return _similarity_distance(s1_words, s2_words, ignore)
def display(word):
print("'%s'近义词:" % word)
......
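For reference, a short usage sketch of the updated compare API (the example sentences are arbitrary, not taken from this commit):

import synonyms

# default: stopwords=False, so stopwords are stripped from both sentences
# before the similarity / edit-distance computation
score_filtered = synonyms.compare("今天的天气不错", "今天天气很好", seg=True)

# keep stopwords by passing stopwords=True
score_raw = synonyms.compare("今天的天气不错", "今天天气很好", seg=True, stopwords=True)

# pre-segmented input: pass whitespace-separated tokens with seg=False
score_preseg = synonyms.compare("今天 天气 不错", "今天 天气 很好", seg=False)

print(score_filtered, score_raw, score_preseg)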