1.3

5c7da8be · hgcvg · fdcf87cd · 5c7da8be
隐藏空白更改
内联并排

Showing with 4 additions and 33 deletions

main.py main.py +4 -33

未找到文件。
--- a/main.py
+++ b/main.py
@@ -7,18 +7,9 @@ import json
 import sys


-# 分词
-# def splitWords(text):
-#     with open(text, 'r', encoding='UTF-8') as f1:
-#         f2 = f1.read()
-#     pattern = re.compile(u"[^a-zA-Z0-9\u4e00-\u9fa5]")  # 匹配过滤
-#     s = pattern.sub("", f2)
-#     f1.close()
-#     length = len(list(jieba.lcut(s)))  # length为分词后词的个数
-#     string = jieba.analyse.extract_tags(s, topK=length)  # 提取主题词
-#     return string

-def getword(text):
+
+def  getword(text):
    with open(text, 'r', encoding='UTF-8') as f1:
        f2 = f1.read()
    f1.close()
@@ -28,7 +19,7 @@ def getword(text):


 # simhash
-def getSimh(s):
+def    getSimh(s):
    i = 0
    weight = len(s)
    fv = [0] * 128  # feature vector
@@ -48,7 +39,7 @@ def getSimh(s):
        i += 1
    simh = ''
    for k in range(len(fv)):  # 降维
-        if fv[k] >= 0:  # 对于n-bit签名的累加结果，大于0则置1，否则置0
+        if fv[k] >=  0:  # 对于n-bit签名的累加结果，大于0则置1，否则置0
            simh += '1'
        else:
            simh += '0'
@@ -68,26 +59,6 @@ def getSimilarity(simh1, simh2):
    return s


-# def main_project():
-#     input()
-#     path1 = ','.join(sys.argv[1:2])  # 获取命令行参数 将列表转换为字符串
-#     path2 = ','.join(sys.argv[2:3])
-#     path3 = ','.join(sys.argv[3:])
-#     if not os.path.exists(path1):
-#         print("论文原文不存在！")
-#         exit()
-#     if not os.path.exists(path2):
-#         print("抄袭论文不存在！")
-#         exit()
-#     simhash1 = getSimh(splitWords(path1))
-#     simhash2 = getSimh(splitWords(path2))
-#     s1 = getSimilarity(simhash1, simhash2)
-#     s2 = round(s1, 2)  # 精确到小数点后两位
-#     print('文章相似度为:%f' % s2)
-#     with open(path3, 'a', encoding='utf-8')as f:  # 将结果写入指定路径path3
-#         f.write(path2 + '与原文的相似度为：')
-#         f.write(json.dumps(s2, ensure_ascii=False) + '\n')
-#     return s2


 def test():