提交 d4b17d2f 编写于 作者: Sugar_chestnut03's avatar Sugar_chestnut03

测试文件

上级 1ce8549b
import jieba
#载入jieba,分词
import gensim
# Load gensim, used to compute the cosine similarity
import re
#正则匹配,去除多余特殊符号
import os
#用于查询绝对路径文件是否存在
def get_file_contents(path):
    """Return the whole text content of the file at ``path``.

    The file is opened in text mode and decoded as UTF-8.

    Args:
        path: Path of the file to read.

    Returns:
        The complete file content as one string; an empty string when the
        file is empty.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # The context manager closes the handle even when reading or decoding
    # raises; a single read() replaces the line-by-line concatenation.
    with open(path, 'r', encoding='UTF-8') as f:
        return f.read()
# Tokenize the text with jieba first, then filter out punctuation, escape
# characters and other special symbols.
def filter(str):
    """Tokenize ``str`` with jieba and keep only the word-like tokens.

    A token is kept when its first character is an ASCII letter, an ASCII
    digit, or a character in the range U+4E00..U+9FA5 (Chinese characters,
    per the original comment). ``match`` anchors at the start of the token
    only, so the remaining characters of a token are not inspected.

    Args:
        str: The text to tokenize. The name shadows the builtin but is kept
            because it is part of the existing signature.

    Returns:
        A list of the kept tokens, in the order jieba produced them.
    """
    wanted = re.compile(u"[a-zA-Z0-9\u4e00-\u9fa5]")
    return [token for token in jieba.lcut(str) if wanted.match(token)]
# Takes the filtered token lists and computes their cosine similarity by
# calling gensim.similarities.Similarity.
def calc_similarity(text1, text2):
    """Return the similarity score between two token lists.

    Both lists are used to build a shared gensim dictionary and a two-document
    bag-of-words corpus. ``text1`` is then queried against a ``Similarity``
    index built over that corpus.

    Args:
        text1: Token list of the first document (also used as the query).
        text2: Token list of the second document.

    Returns:
        The entry at position 1 of the query result, i.e. the score of
        ``text1`` against ``text2`` (the second document in the corpus).
    """
    documents = [text1, text2]
    vocab = gensim.corpora.Dictionary(documents)

    bow_vectors = []
    for tokens in documents:
        bow_vectors.append(vocab.doc2bow(tokens))

    # '-Similarity-index' is the output prefix handed to gensim for the index.
    index = gensim.similarities.Similarity(
        '-Similarity-index',
        bow_vectors,
        num_features=len(vocab),
    )

    query = vocab.doc2bow(text1)
    scores = index[query]
    # Position 0 is text1 against itself; position 1 is text1 against text2.
    return scores[1]
def main_test(save_path=r"C:\Users\YUKI\PycharmProjects\pythonProject3\test\result.txt"):
    """Prompt for two file paths, print their similarity and save a report.

    The user is asked for the path of the original text and of the suspected
    plagiarised text. Both files are read, tokenized and filtered, and their
    similarity is printed with two decimals and written to ``save_path``.

    Args:
        save_path: Where the result report is written. The default is the
            absolute path that used to be hard-coded in this function; pass
            your own path instead of editing the source.

    Returns:
        The similarity rounded to two decimals, as a Python float.

    Raises:
        SystemExit: With status 1 when either input path is not an existing
            file (the previous ``exit()`` call reported status 0).
    """
    # Path of the original text. Path used during testing:
    #   C:/Users/YUKI/PycharmProjects/pythonProject3/test/orig.txt
    orig = input("请输入原文的绝对路径:")
    # isfile() also rejects directories, which would otherwise fail in open().
    if not os.path.isfile(orig):
        print("原文文件不存在!")
        raise SystemExit(1)
    print("成功装载原文")

    # Path of the suspected copy. Path used during testing:
    #   C:/Users/YUKI/PycharmProjects/pythonProject3/test/orig_0.8_dis_15.txt
    copy = input("请输入疑似抄袭文的绝对路径:")
    if not os.path.isfile(copy):
        print("抄袭论文文件不存在!")
        raise SystemExit(1)
    print("成功装载抄袭文件")

    # Read both files, then drop everything that is not a word-like token.
    text1 = filter(get_file_contents(orig))
    text2 = filter(get_file_contents(copy))

    similarity = calc_similarity(text1, text2)
    # The assignment requires a float printed with two decimals.
    print("文章相似度: %.2f" % similarity)

    # Same report layout as before: command line, both paths, then the score.
    report = ("python" + " " + "main.py\n" + " " + orig + "\n " + copy + " \n"
              + "文章相似度: %.2f" % similarity)
    # The context manager closes the file even if the write fails.
    with open(save_path, 'w', encoding="utf-8") as f:
        f.write(report)

    # float() accepts both numpy scalars and plain Python floats.
    return round(float(similarity), 2)
if __name__ == "__main__":
    # The logic was wrapped in main_test() to make unit testing easier; it
    # turned out not to matter much, so it was left this way.
    main_test()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册