测试文件

d4b17d2f · Sugar_chestnut03 · 1ce8549b · d4b17d2f
隐藏空白更改
内联并排

Showing with 71 additions and 0 deletions

test/main.py test/main.py +71 -0

未找到文件。
--- a/test/main.py
+++ b/test/main.py
+import jieba
+#载入jieba，分词
+import gensim
+# Import gensim; used to compute the cosine similarity
+import re
+#正则匹配，去除多余特殊符号
+import os
+#用于查询绝对路径文件是否存在
+
+def get_file_contents(path):
+    """Return the whole text content of the file at ``path``.
+
+    The file is opened in text mode and decoded as UTF-8.
+
+    Args:
+        path: Path of the file to read.
+
+    Returns:
+        The complete file content as one string ('' for an empty file).
+
+    Raises:
+        OSError: If the file cannot be opened or read.
+        UnicodeDecodeError: If the content is not valid UTF-8.
+    """
+    # ``with`` guarantees the handle is closed even when reading or decoding
+    # raises; a single read() replaces the line-by-line concatenation and
+    # yields the same string.
+    with open(path, 'r', encoding='UTF-8') as f:
+        return f.read()
+
+def filter(str):
+    """Tokenize text with jieba and drop tokens that are not word-like.
+
+    The text is first segmented with ``jieba.lcut``; punctuation, escape
+    characters and other special symbols are then filtered out.
+
+    Args:
+        str: The text to segment (the name shadows the builtin; it is kept
+            so existing callers keep working).
+
+    Returns:
+        A list of the tokens, in their original order, whose first character
+        is an ASCII letter, an ASCII digit, or a character in the range
+        U+4E00 to U+9FA5.
+    """
+    tokens = jieba.lcut(str)
+    # re.match only anchors at the start of the token, so the test is on the
+    # token's first character; everything else is discarded.
+    return [token for token in tokens if re.match(u"[a-zA-Z0-9\u4e00-\u9fa5]", token)]
+
+def calc_similarity(text1,text2):
+    """Compute the similarity of two token lists with gensim.
+
+    Both token lists are turned into bag-of-words vectors over a shared
+    dictionary, indexed with ``gensim.similarities.Similarity`` (used here to
+    obtain the cosine similarity), and ``text1`` is queried against the index.
+
+    Args:
+        text1: Token list of the first document (already filtered).
+        text2: Token list of the second document (already filtered).
+
+    Returns:
+        The similarity score of ``text1`` against ``text2`` as produced by
+        gensim.
+    """
+    documents = [text1, text2]
+    vocabulary = gensim.corpora.Dictionary(documents)
+    bow_vectors = list(map(vocabulary.doc2bow, documents))
+    # '-Similarity-index' is handed to gensim as the index's output prefix.
+    index = gensim.similarities.Similarity(
+        '-Similarity-index', bow_vectors, num_features=len(vocabulary)
+    )
+    query = vocabulary.doc2bow(text1)
+    # Entry 0 is text1 compared with itself; entry 1 is text1 vs. text2.
+    return index[query][1]
+
+def main_test(save_path=r"C:\Users\YUKI\PycharmProjects\pythonProject3\test\result.txt"):
+    """Interactively compare two text files and record their similarity.
+
+    Prompts for the path of the original text and of the suspected-plagiarism
+    text, computes their similarity, prints it with two decimals and writes a
+    short report (command line, both paths, similarity) to ``save_path``.
+
+    Args:
+        save_path: File the report is written to. The default is the
+            machine-specific absolute path this script has always used;
+            pass your own path when running elsewhere.
+
+    Returns:
+        The similarity rounded to two decimal places.
+
+    Raises:
+        SystemExit: If either entered path is not an existing file.
+        OSError: If an input file cannot be read or ``save_path`` cannot be
+            written.
+    """
+    # Sample input used during testing:
+    # C:\Users\YUKI\PycharmProjects\pythonProject3\test\orig.txt
+    orig = input("请输入原文的绝对路径：")
+    # isfile (not exists) so that a directory is rejected here instead of
+    # crashing later in open().
+    if not os.path.isfile(orig):
+        print("原文文件不存在！")
+        # Raise SystemExit directly: the exit() helper is injected by the
+        # site module and is not guaranteed to exist.
+        raise SystemExit
+    print("成功装载原文")
+    # Sample input used during testing:
+    # C:\Users\YUKI\PycharmProjects\pythonProject3\test\orig_0.8_dis_15.txt
+    copy = input("请输入疑似抄袭文的绝对路径：")
+    if not os.path.isfile(copy):
+        print("抄袭论文文件不存在！")
+        raise SystemExit
+    print("成功装载抄袭文件")
+    # Read both files, then tokenize and strip non-word tokens.
+    text1 = filter(get_file_contents(orig))
+    text2 = filter(get_file_contents(copy))
+    similarity = calc_similarity(text1, text2)
+    # The assignment requires the score printed as a float with 2 decimals.
+    print("文章相似度： %.2f"%similarity)
+    # .item() converts the score returned by calc_similarity to a plain
+    # Python number before rounding.
+    result = round(similarity.item(), 2)
+    # ``with`` closes the report file even if the write fails.
+    with open(save_path, 'w', encoding="utf-8") as f:
+        f.write("python" + " " + "main.py\n" + " " + orig + "\n " + copy + " \n" + "文章相似度： %.2f" % similarity)
+    return result
+
+# Script entry point: run the interactive comparison only when executed directly.
+if __name__ == '__main__':
+    main_test()# Wrapped in a function to make unit testing easier; it later turned out not to matter much, so it was left as is.
+
+
+
+