提交 5c7da8be 编写于 作者: hgcvg's avatar hgcvg

1.3

上级 fdcf87cd
......@@ -7,18 +7,9 @@ import json
import sys
# 分词
# def splitWords(text):
# with open(text, 'r', encoding='UTF-8') as f1:
# f2 = f1.read()
# pattern = re.compile(u"[^a-zA-Z0-9\u4e00-\u9fa5]") # 匹配过滤
# s = pattern.sub("", f2)
# f1.close()
# length = len(list(jieba.lcut(s))) # length为分词后词的个数
# string = jieba.analyse.extract_tags(s, topK=length) # 提取主题词
# return string
def getword(text):
def getword(text):
with open(text, 'r', encoding='UTF-8') as f1:
f2 = f1.read()
f1.close()
......@@ -28,7 +19,7 @@ def getword(text):
# simhash
def getSimh(s):
def getSimh(s):
i = 0
weight = len(s)
fv = [0] * 128 # feature vector
......@@ -48,7 +39,7 @@ def getSimh(s):
i += 1
simh = ''
for k in range(len(fv)): # 降维
if fv[k] >= 0: # 对于n-bit签名的累加结果,大于等于0则置1,否则置0
if fv[k] >= 0: # 对于n-bit签名的累加结果,大于等于0则置1,否则置0
simh += '1'
else:
simh += '0'
......@@ -68,26 +59,6 @@ def getSimilarity(simh1, simh2):
return s
# def main_project():
# input()
# path1 = ','.join(sys.argv[1:2]) # 获取命令行参数 将列表转换为字符串
# path2 = ','.join(sys.argv[2:3])
# path3 = ','.join(sys.argv[3:])
# if not os.path.exists(path1):
# print("论文原文不存在!")
# exit()
# if not os.path.exists(path2):
# print("抄袭论文不存在!")
# exit()
# simhash1 = getSimh(splitWords(path1))
# simhash2 = getSimh(splitWords(path2))
# s1 = getSimilarity(simhash1, simhash2)
# s2 = round(s1, 2) # 精确到小数点后两位
# print('文章相似度为:%f' % s2)
# with open(path3, 'a', encoding='utf-8')as f: # 将结果写入指定路径path3
# f.write(path2 + '与原文的相似度为:')
# f.write(json.dumps(s2, ensure_ascii=False) + '\n')
# return s2
def test():
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册