From 26e5eeac92ff06fe892fb875c07da7560749ce59 Mon Sep 17 00:00:00 2001 From: boomyuan0000 Date: Wed, 8 Mar 2023 14:20:25 +0800 Subject: [PATCH] =?UTF-8?q?=E5=B7=B2=E5=88=A0=E9=99=A4simhash/.idea/.gitig?= =?UTF-8?q?nore,=20simhash/.idea/compiler.xml,=20simhash/.idea/encodings.x?= =?UTF-8?q?ml,=20simhash/.idea/jarRepositories.xml,=20simhash/.idea/misc.x?= =?UTF-8?q?ml,=20simhash/src/main/java/exceptions/FileAnalyseException.jav?= =?UTF-8?q?a,=20simhash/src/main/java/exceptions/HashException.java,=20sim?= =?UTF-8?q?hash/src/main/java/exceptions/NotExistFileException.java,=20sim?= =?UTF-8?q?hash/src/main/java/utils/CalculationUtils.java,=20simhash/src/m?= =?UTF-8?q?ain/java/utils/CommonUtils.java,=20simhash/src/main/java/Main.j?= =?UTF-8?q?ava,=20simhash/src/test/java/MainTest.java,=20simhash/target/cl?= =?UTF-8?q?asses/exceptions/FileAnalyseException.class,=20simhash/target/c?= =?UTF-8?q?lasses/exceptions/HashException.class,=20simhash/target/classes?= =?UTF-8?q?/exceptions/NotExistFileException.class,=20simhash/target/class?= =?UTF-8?q?es/utils/CalculationUtils.class,=20simhash/target/classes/utils?= =?UTF-8?q?/CommonUtils.class,=20simhash/target/classes/Main.class,=20simh?= =?UTF-8?q?ash/target/classes/classpath.index,=20simhash/target/test-class?= =?UTF-8?q?es/MainTest.class,=20simhash/target/test-classes/classpath.inde?= =?UTF-8?q?x,=20simhash/pom.xml?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- simhash/.idea/.gitignore | 8 - simhash/.idea/compiler.xml | 13 - simhash/.idea/encodings.xml | 7 - simhash/.idea/jarRepositories.xml | 20 -- simhash/.idea/misc.xml | 14 - simhash/pom.xml | 50 ---- simhash/src/main/java/Main.java | 42 --- .../java/exceptions/FileAnalyseException.java | 10 - .../main/java/exceptions/HashException.java | 12 - .../exceptions/NotExistFileException.java | 11 - .../src/main/java/utils/CalculationUtils.java | 141 ---------- simhash/src/main/java/utils/CommonUtils.java | 71 ----- simhash/src/test/java/MainTest.java | 252 ------------------ simhash/target/classes/Main.class | Bin 2599 -> 0 bytes simhash/target/classes/classpath.index | Bin 136 -> 0 bytes .../exceptions/FileAnalyseException.class | Bin 374 -> 0 bytes .../classes/exceptions/HashException.class | Bin 372 -> 0 bytes .../exceptions/NotExistFileException.class | Bin 387 -> 0 bytes .../classes/utils/CalculationUtils.class | Bin 5051 -> 0 bytes .../target/classes/utils/CommonUtils.class | Bin 3869 -> 0 bytes simhash/target/test-classes/MainTest.class | Bin 7780 -> 0 bytes simhash/target/test-classes/classpath.index | Bin 96 -> 0 bytes 22 files changed, 651 deletions(-) delete mode 100644 simhash/.idea/.gitignore delete mode 100644 simhash/.idea/compiler.xml delete mode 100644 simhash/.idea/encodings.xml delete mode 100644 simhash/.idea/jarRepositories.xml delete mode 100644 simhash/.idea/misc.xml delete mode 100644 simhash/pom.xml delete mode 100644 simhash/src/main/java/Main.java delete mode 100644 simhash/src/main/java/exceptions/FileAnalyseException.java delete mode 100644 simhash/src/main/java/exceptions/HashException.java delete mode 100644 simhash/src/main/java/exceptions/NotExistFileException.java delete mode 100644 simhash/src/main/java/utils/CalculationUtils.java delete mode 100644 simhash/src/main/java/utils/CommonUtils.java delete mode 100644 simhash/src/test/java/MainTest.java delete mode 100644 simhash/target/classes/Main.class delete mode 100644 simhash/target/classes/classpath.index delete mode 100644 simhash/target/classes/exceptions/FileAnalyseException.class delete mode 100644 simhash/target/classes/exceptions/HashException.class delete mode 100644 simhash/target/classes/exceptions/NotExistFileException.class delete mode 100644 simhash/target/classes/utils/CalculationUtils.class delete mode 100644 simhash/target/classes/utils/CommonUtils.class delete mode 100644 simhash/target/test-classes/MainTest.class delete mode 100644 simhash/target/test-classes/classpath.index diff --git a/simhash/.idea/.gitignore b/simhash/.idea/.gitignore deleted file mode 100644 index 35410ca..0000000 --- a/simhash/.idea/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# 默认忽略的文件 -/shelf/ -/workspace.xml -# 基于编辑器的 HTTP 客户端请求 -/httpRequests/ -# Datasource local storage ignored files -/dataSources/ -/dataSources.local.xml diff --git a/simhash/.idea/compiler.xml b/simhash/.idea/compiler.xml deleted file mode 100644 index e5fa1de..0000000 --- a/simhash/.idea/compiler.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - - - - - - - - - \ No newline at end of file diff --git a/simhash/.idea/encodings.xml b/simhash/.idea/encodings.xml deleted file mode 100644 index aa00ffa..0000000 --- a/simhash/.idea/encodings.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - \ No newline at end of file diff --git a/simhash/.idea/jarRepositories.xml b/simhash/.idea/jarRepositories.xml deleted file mode 100644 index 5a2f139..0000000 --- a/simhash/.idea/jarRepositories.xml +++ /dev/null @@ -1,20 +0,0 @@ - - - - - - - - - - - \ No newline at end of file diff --git a/simhash/.idea/misc.xml b/simhash/.idea/misc.xml deleted file mode 100644 index 82dbec8..0000000 --- a/simhash/.idea/misc.xml +++ /dev/null @@ -1,14 +0,0 @@ - - - - - - - - - - \ No newline at end of file diff --git a/simhash/pom.xml b/simhash/pom.xml deleted file mode 100644 index e30812d..0000000 --- a/simhash/pom.xml +++ /dev/null @@ -1,50 +0,0 @@ - - - 4.0.0 - - org.example - simhash - 1.0-SNAPSHOT - - - org.apache.commons - commons-lang3 - 3.5 - - - cn.hutool - hutool-all - 5.7.13 - - - com.hankcs.nlp - hanlp-lucene-plugin - 1.1.7 - - - junit - junit - 4.13.2 - - - cn.hutool - hutool-http - 5.8.14 - - - org.junit.jupiter - junit-jupiter-api - 5.8.2 - test - - - - - 17 - 17 - UTF-8 - - - \ No newline at end of file diff --git a/simhash/src/main/java/Main.java b/simhash/src/main/java/Main.java deleted file mode 100644 index 1355d7e..0000000 --- a/simhash/src/main/java/Main.java +++ /dev/null @@ -1,42 +0,0 @@ -import cn.hutool.core.date.DateUtil; -import exceptions.FileAnalyseException; -import exceptions.NotExistFileException; -import utils.CalculationUtils; -import utils.CommonUtils; -import java.util.Map; - -public class Main { - //合法参数个数为3 - static final int ARGS_NUM = 3; - public static void main(String[] args){ - // 读取并解析参数 - if (args.length != ARGS_NUM) { - throw new IllegalArgumentException("参数个数不正确"); - } - // 解析文件,处理分词 - Map originWordCount = null; - Map compareWordCount = null; - try { - //得到原文本的关键词和词频 - originWordCount = CommonUtils.analyseText(CommonUtils.readFileToStr(args[0])); - //以及比对文本的关键词的关键词和词频 - compareWordCount = CommonUtils.analyseText(CommonUtils.readFileToStr(args[1])); - } catch (FileAnalyseException | NotExistFileException e) { - e.printStackTrace(); - } - // 获取simHash值 - String simHash1 = CalculationUtils.calculateSimHash(originWordCount); - String simHash2 = CalculationUtils.calculateSimHash(compareWordCount); - //计算相似度,保留两位小数 - double result = CalculationUtils.getSimilarity(simHash1, simHash2); - String format = String.format("相似度为:%.2f", result); - System.out.println(format); - String writeFileContent = "---------------------------------------" + "\n" + - "原文件:" + args[0] + "\n" + - "对比文件:" + args[1] + "\n" + - format + "\n" + - "比较时间为:" + DateUtil.now() + "\n"; - ; - CommonUtils.writeFile(args[2],writeFileContent); - } -} diff --git a/simhash/src/main/java/exceptions/FileAnalyseException.java b/simhash/src/main/java/exceptions/FileAnalyseException.java deleted file mode 100644 index 31a1e64..0000000 --- a/simhash/src/main/java/exceptions/FileAnalyseException.java +++ /dev/null @@ -1,10 +0,0 @@ -package exceptions; - -/** - * 文件解析异常 - */ -public class FileAnalyseException extends Exception { - public FileAnalyseException(String message) { - super(message); - } -} \ No newline at end of file diff --git a/simhash/src/main/java/exceptions/HashException.java b/simhash/src/main/java/exceptions/HashException.java deleted file mode 100644 index 4259661..0000000 --- a/simhash/src/main/java/exceptions/HashException.java +++ /dev/null @@ -1,12 +0,0 @@ -package exceptions; - -import java.security.NoSuchAlgorithmException; - -/** - * MD5算法hash异常 - */ -public class HashException extends NoSuchAlgorithmException { - public HashException(String message) { - super(message); - } -} \ No newline at end of file diff --git a/simhash/src/main/java/exceptions/NotExistFileException.java b/simhash/src/main/java/exceptions/NotExistFileException.java deleted file mode 100644 index 377c24e..0000000 --- a/simhash/src/main/java/exceptions/NotExistFileException.java +++ /dev/null @@ -1,11 +0,0 @@ -package exceptions; - -import java.io.FileNotFoundException; -/** - * 找不到文件的文件解析异常 - */ -public class NotExistFileException extends FileNotFoundException { - public NotExistFileException(String message) { - super(message); - } -} \ No newline at end of file diff --git a/simhash/src/main/java/utils/CalculationUtils.java b/simhash/src/main/java/utils/CalculationUtils.java deleted file mode 100644 index da8a6c3..0000000 --- a/simhash/src/main/java/utils/CalculationUtils.java +++ /dev/null @@ -1,141 +0,0 @@ -package utils; - -import cn.hutool.core.util.StrUtil; -import exceptions.HashException; -import java.nio.charset.StandardCharsets; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; -import java.util.Map; - -/** - * 与计算有关的工具类 - */ -public class CalculationUtils { - //hash码长度为128 - static final int HASH_BIT = 128; - /** - * 采用MD5算法对关键词进行hash,得到的hash值使用16进制解析,再利用算法取128位二进制数作为hash值 - * @param word 词语 - * @return 128位二进制hash值 - */ - public static String wordHash(String word) throws HashException { - //如果传入词语为null或“”或“ ” - if (word == null || StrUtil.isBlank(word) || StrUtil.isEmpty(word)) { - throw new HashException("词语为空"); - } - try { - // 采用MD5算法进行hash - MessageDigest digest = MessageDigest.getInstance("MD5"); - digest.update(word.getBytes(StandardCharsets.UTF_8)); - // hash值转为32位16进制的散列值 - StringBuilder hash = new StringBuilder(); - for (byte b : digest.digest()) { - hash.append(String.format("%02x", b)); - } - // 16进制的散列值转为128位二进制码 - StringBuilder finalHash = new StringBuilder(); - String strTemp; - for (int i = 0; i < hash.length(); i++) { - // 每一位16进制数加上0000,最后截取后4位,得到便是这位数的二进制 - strTemp = "0000" + Integer.toBinaryString(Integer.parseInt(hash.substring(i, i + 1), 16)); - finalHash.append(strTemp.substring(strTemp.length() - 4)); - } - // 不为128则为hash异常 - if (finalHash.length() != HASH_BIT) { - throw new HashException("hash值长度不为128"); - } - return finalHash.toString(); - } catch (NoSuchAlgorithmException e) { - throw new HashException("MD5算法异常"); - } - } - - /** - * 给二进制hash值加权 - * @param hash 二进制哈希值 - * @param weight 权重 - * @return 加权后的二进制哈希值 - */ - public static int[] hashWeight(String hash, int weight) { - // 新建一个数组用于存放加权后的二进制哈希值 - int[] hashArray = new int[HASH_BIT]; - // 遍历二进制哈希值,0则是-1,1则是1,将每一位加权后存入数组 - for (int i = 0; i < hash.length(); i++) { - if (hash.charAt(i) == '1') { - hashArray[i] = weight; - } else { - hashArray[i] = -1 * weight; - } - } - return hashArray; - } - - /** - * 合并后的hash进行降维,最终得到simHash - * @param mergeHash 合并后的hash值 - * @return sim哈希值 - */ - public static String getSimHash(int[] mergeHash){ - // 使用StringBuilder存储simHash - StringBuilder simHash = new StringBuilder(); - // 遍历合并后的hash值,大于0则是1,小于0则是0 - for (int hash : mergeHash) { - if (hash > 0) { - simHash.append("1"); - } else { - simHash.append("0"); - } - } - return simHash.toString(); - } - - /** - * 根据词语得到simHash - * @param wordCount 词语及其出现次数 - * @return simHash - */ - public static String calculateSimHash(Map wordCount){ - // 新建一个数组用于存放合并后的hash值,初始值为0 - int[] mergeHash = new int[HASH_BIT]; - for (int i = 0; i < HASH_BIT; i++) { - mergeHash[i] = 0; - } - // 遍历词语及其出现次数,对每一个词语进行hash加权,然后合并 - wordCount.forEach((word,count) -> { - try { - int[] tempHash = hashWeight(wordHash(word),count);//加权后的hash值 - for (int i = 0; i < tempHash.length; i++) { - mergeHash[i] += tempHash[i]; - } - } catch (HashException e) { - e.printStackTrace(); - } - }); - // 降维得到simHash - return getSimHash(mergeHash); - } - - /** - * 计算两个simHash的相似度 - * @param simHash1 simHash1 - * @param simHash2 simHash2 - * @return 相似度 - */ - public static double getSimilarity(String simHash1, String simHash2) { - // 得到两个simHash的汉明距离 - // 遍历simHash1和simHash2,不相同则汉明距离加1 - int hamingDistance = 0; - int same=0; - for (int i = 0; i < simHash1.length(); i++) { - if (simHash1.charAt(i) != simHash2.charAt(i)) { - hamingDistance++; - } - if (simHash1.charAt(i)=='1' && simHash2.charAt(i)=='1') { - same++; - } - } - System.out.println("两个simHash的汉明距离为:" + hamingDistance); - // 用杰卡德系数计算文本相似度 - return (double)same/(hamingDistance+same); - } -} \ No newline at end of file diff --git a/simhash/src/main/java/utils/CommonUtils.java b/simhash/src/main/java/utils/CommonUtils.java deleted file mode 100644 index e78d1ae..0000000 --- a/simhash/src/main/java/utils/CommonUtils.java +++ /dev/null @@ -1,71 +0,0 @@ -package utils; - -import cn.hutool.core.io.FileUtil; -import cn.hutool.core.util.StrUtil; -import com.hankcs.hanlp.HanLP; -import com.hankcs.hanlp.seg.common.Term; -import exceptions.FileAnalyseException; -import exceptions.NotExistFileException; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - - -/** - * 非计算的工具类 - */ -public class CommonUtils { - //最少关键词数为3 - public static final int SHORT_WORD_LENGTH = 3; - /** - * 从文件中读取文本 - * @param filePath 文件路径 - * @return 读取出的文本 - */ - public static String readFileToStr(String filePath) throws NotExistFileException { - try { - return FileUtil.readUtf8String(filePath);//返回读取的文本 - } catch (Exception e) { - throw new NotExistFileException("该绝对路径的文件不存在"); - } - } - - /** - * 把文本解析并过滤后转为map - * @param text 读取的文本 - * @return 存放词语和词频的map - */ - public static Map analyseText(String text) throws FileAnalyseException { - //文本内容为null或“”或“ ”时,文件解析异常 - if (text == null || StrUtil.isBlank(text) || StrUtil.isEmpty(text)) { - throw new FileAnalyseException("文件解析异常,解析内容为空"); - } - // 提取关键词 - List keyList = HanLP.extractKeyword(text, text.length()); - //提取出的关键词小于3 - if (keyList.size() <= SHORT_WORD_LENGTH) { - throw new FileAnalyseException("文件解析异常,关键词太少"); - } - // 分词,找出所有词语 - List termList = HanLP.segment(text); - List allWords = termList.stream().map(term -> term.word).collect(Collectors.toList()); - // 用于存放关键词和词频的map - Map wordCount = new HashMap<>(keyList.size()); - // 遍历全部词语,获取关键词词频,返回存词语和词频的map - for (String s:keyList) { - wordCount.put(s, Collections.frequency(allWords, s)); - } - return wordCount; - } - - /** - * 将查重结果写入指定文件 - * @param filePath 文件路径 - * @param content 查重结果内容 - */ - public static void writeFile(String filePath, String content) { - FileUtil.appendString(content, filePath, "utf-8"); - } -} \ No newline at end of file diff --git a/simhash/src/test/java/MainTest.java b/simhash/src/test/java/MainTest.java deleted file mode 100644 index 71146b6..0000000 --- a/simhash/src/test/java/MainTest.java +++ /dev/null @@ -1,252 +0,0 @@ -import com.hankcs.hanlp.HanLP; -import exceptions.HashException; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; -import exceptions.FileAnalyseException; -import exceptions.NotExistFileException; -import utils.CalculationUtils; -import utils.CommonUtils; -import java.util.Arrays; -import java.util.Map; - - -public class MainTest { - //读取文件后得到的文本 - static String analyseStr; - //两个示例句子 - static String originSentence = "今天是星期天,天气晴,今天晚上我要去看电影。"; - static String compareSentence = "今天是周天,天气晴朗,我晚上要去看电影。"; - //比对结果写入的文件 - static String writeFilePath = "E:\\测试文本\\write.txt"; - //原文件 - static String OrigFilePath = "E:\\测试文本\\orig.txt"; - //5个比对文件 - static String CopyFilePath1 = "E:\\测试文本\\orig_0.8_add.txt"; - static String CopyFilePath2 = "E:\\测试文本\\orig_0.8_del.txt"; - static String CopyFilePath3 = "E:\\测试文本\\orig_0.8_dis_1.txt"; - static String CopyFilePath4 = "E:\\测试文本\\orig_0.8_dis_10.txt"; - static String CopyFilePath5 = "E:\\测试文本\\orig_0.8_dis_15.txt"; - - /** - * 测试写入文件 - */ - @Test - void testWriteFile(){ - CommonUtils.writeFile(writeFilePath, "------successfully content entry------"); - try { - String s = CommonUtils.readFileToStr(writeFilePath); - Assertions.assertTrue(s.contains("------successfully content entry------"),"写入文件失败"); - } catch (NotExistFileException e) { - e.printStackTrace(); - Assertions.fail("写入文件失败"); - } - } - - /** - * 测试读取不存在的文件 - */ - @Test - void testReadFileNotExist(){ - try { - CommonUtils.readFileToStr("E:\\not existing.txt"); - Assertions.fail("没有抛出异常"); - } catch (NotExistFileException e) { - e.printStackTrace(); - Assertions.assertTrue(true); - } - } - - /** - * 测试文件解析异常(为null,为“”,为“ ”) - */ - @Test - void testFileAnalyseException(){ - try { - CommonUtils.analyseText(null); - Assertions.fail("没有抛出异常"); - } catch (FileAnalyseException e) { - e.printStackTrace(); - Assertions.assertTrue(true); - } - try { - CommonUtils.analyseText(""); - Assertions.fail("没有抛出异常"); - } catch (FileAnalyseException e) { - e.printStackTrace(); - Assertions.assertTrue(true); - } - try { - CommonUtils.analyseText(" "); - Assertions.fail("没有抛出异常"); - } catch (FileAnalyseException e) { - e.printStackTrace(); - Assertions.assertTrue(true); - } - } - - /** - * 测试读取文件并查看分词结果 - */ - @Test - void testReadFile(){ - try { - //测试句子分词 - System.out.println("分词结果为:"+CommonUtils.analyseText(originSentence)); - //测试文本分词 - analyseStr = CommonUtils.readFileToStr(OrigFilePath); - System.out.println("分词结果为:"+CommonUtils.analyseText(analyseStr)); - } catch (Exception e) { - e.printStackTrace(); - Assertions.fail("分词结果有误"); - } - } - - /** - * 测试MD5算法hash计算hash,检查所得到hash值是否为128位 - */ - @Test - void testWordHash(){ - HanLP.extractKeyword(originSentence, originSentence.length()).forEach( - word -> { - try { - String hash = CalculationUtils.wordHash(word); - System.out.println(word +" : "+ hash); - Assertions.assertEquals(128, hash.length(), "hash值长度不是128"); - } catch (HashException e) { - Assertions.fail("哈希出错"); - e.printStackTrace(); - } - } - ); - } - - /** - * 测试哈希异常(得到hash值为空) - */ - @Test - void testHashException(){ - try { - CalculationUtils.wordHash(""); - Assertions.fail("没有抛出异常"); - } catch (HashException e) { - e.printStackTrace(); - Assertions.assertTrue(true); - } - try { - CalculationUtils.wordHash(null); - Assertions.fail("没有抛出异常"); - } catch (HashException e) { - e.printStackTrace(); - Assertions.assertTrue(true); - } - try { - CalculationUtils.wordHash(" "); - Assertions.fail("没有抛出异常"); - } catch (HashException e) { - e.printStackTrace(); - Assertions.assertTrue(true); - } - } - - /** - * 测试加权算法 - */ - @Test - void testHashWeight(){ - Map map = null; - try { - map = CommonUtils.analyseText(originSentence); - } catch (FileAnalyseException e) { - e.printStackTrace(); - Assertions.fail("解析错误"); - } - map.forEach((word, count) -> { - try { - String hash = CalculationUtils.wordHash(word); - int[] hashWeight = CalculationUtils.hashWeight(hash,count); - //打印加权后的hash值 - System.out.println(word +" : "+ Arrays.toString(hashWeight)); - Assertions.assertEquals(128, hashWeight.length, "加权后的hash值长度不是128"); - } catch (HashException e) { - Assertions.fail("哈希出错"); - e.printStackTrace(); - } - }); - } - - /** - * 测试计算simHash - */ - @Test - void testCalculateSimHash() { - try { - String hash1 = CalculationUtils.calculateSimHash(CommonUtils.analyseText(originSentence)); - System.out.println("原句子\"" + originSentence + "\"的simHash值为:" + hash1); - Assertions.assertEquals(hash1.length(), 128, "hash值长度不是128"); - String hash2=CalculationUtils.calculateSimHash(CommonUtils.analyseText((CommonUtils.readFileToStr(OrigFilePath)))); - System.out.println("原文本的simHash值为:" + hash2); - Assertions.assertEquals(hash2.length(), 128, "hash值长度不是128"); - } catch (FileAnalyseException | NotExistFileException e) { - e.printStackTrace(); - } - } - /** - * 测试计算句子相似度 - */ - @Test - void testGetSimilarity1(){ - String hash1 = null; - String hash2 = null; - try { - hash1 = CalculationUtils.calculateSimHash(CommonUtils.analyseText(originSentence)); - hash2 = CalculationUtils.calculateSimHash(CommonUtils.analyseText(compareSentence)); - } catch (FileAnalyseException e) { - e.printStackTrace(); - Assertions.fail("解析错误"); - } - double similarity = CalculationUtils.getSimilarity(hash1, hash2); - String format = String.format("两个句子的相似度为:%.2f", similarity); - System.out.println(format); - Assertions.assertTrue(0 <= similarity && similarity <= 1, "相似度不在0-1之间"); - } - /** - * 测试计算文本相似度 - */ - @Test - void testGetSimilarity2(){ - String hash1; - String hash2; - try { - hash1 = CalculationUtils.calculateSimHash(CommonUtils.analyseText(CommonUtils.readFileToStr(OrigFilePath))); - hash2 = CalculationUtils.calculateSimHash(CommonUtils.analyseText(CommonUtils.readFileToStr(CopyFilePath1))); - double similarity = CalculationUtils.getSimilarity(hash1, hash2); - String format = String.format("两个文本的相似度为:%.2f", similarity); - System.out.println(format); - Assertions.assertTrue(0 <= similarity && similarity <= 1, "相似度不在0-1之间"); - } catch (FileAnalyseException | NotExistFileException e) { - e.printStackTrace(); - } - } - - /** - * 测试主函数 - */ - @Test - void testMain(){ - String[] args = new String[3]; - args[0] = OrigFilePath; - args[1]=CopyFilePath1; - args[2] = writeFilePath; - Main.main(args); - - args[1]=CopyFilePath2; - Main.main(args); - args[1]=CopyFilePath3; - Main.main(args); - args[1]=CopyFilePath4; - Main.main(args); - args[1]=CopyFilePath5; - Main.main(args); - args[0] = CopyFilePath3; - } -} \ No newline at end of file diff --git a/simhash/target/classes/Main.class b/simhash/target/classes/Main.class deleted file mode 100644 index 0f561640eb051ae4284750fe8ad3a87322ec6501..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2599 zcma)8-*XdH6#j0LcGG1`ODL2gP<|9>N=u{)N(0rB@S~U(BqfM|c$?g&OE-Jt?xr;U zM0oH?eNl1rfpMJC8D|u6oDvJ-yMISp^vVA~J)2F_WEvfs>AiRFp7Y&vzWd#C_xHc& z9|1UuTPhmhQ{dOoh$e=%YkY=>4Q^(_XC|(Rl*7<;L^pM3h{4wrN-792_(!>JDo`1A zu9%M)hRASZ*vjOy!gP+$rbN!sr5S+6uwn7zn@gYmxKRAA)PA`1{nwAb`awlA*18L~ zl0{E^1uNlLx|xZDk}B3As9?Q@c5Gl+mv?l-4o77+E6wxnn4!fIJbgkp#JD6khCTIw zR%5%!9U3;FlVJ@vxlypixR|AUcl|ejyL@<*=OPN8V(1dfIoe@2fME}%lDUGd4BM-W z#w5|A+m35ou~N~6?Fx2i*y(P-`V|%`CJe1PBE(5JJT*OT@sv=}jok|NXxNJ$hK*%j zxRJ^m+-1E?PFt!xB@%k}6t|~V(yQbWsz**YdNu4rAMu+J4ms-vw{)k#u(u99s}Vx6 z0QOThkH0J~Jbbt~|II>i{;!8$?A|{xsp4rIP%xn389ZB~xR}gO7=l5W;YMDZnRHbV zs}9dIh^RP(=M+4z;V=ek5Io_KW+$bU<+PcH>qL90DqK(Gk?KXt;%mkc991x+;RU={ zyGX%yM3&Bj%+m@RJTzT~&w6`AddyOY#}HKz({S8Xx3{acHk4;+R}E9eNt{wJqTw`N zV%U;0!&7-jN+X<-mI$Z0Bf>Fi+)brrO?jQ6y{AshK|0f;8pd#jp(D$uMO2z8?p)BF zDH_=hHyt`Lm+Np+MM*jTCz+`ef_tuB)^H99hUV)e8R4q^iDi>y2n?S)nYcQ3ew2ZY zkp7k361d%DXA<)~yJ6 zXQ`)C(^IX4a@rw^fDv?gTT7L&au8i#Ze<8-eH~YdM_PJDH!nyl9hG^L)Y0nESUGGY zq?F9&xFxEL0=9QJ4=@B*(@u<(-w#j@may{%0karERrn>294fk$Q*Hi*+%gA;Y6Cg4 zdY!775kkudD>6h0x0I?56}=Lf3MLf1L9?YYwVW1%nvi)bRo?4nm*@Si1;uzwO2>9A zo*Na;luX;LO_)(}9kU7wu2kP-=w7LLx;Z1KMcCU6PrWC2%8?dTp3kdwTQ>ASPjNGC z2)jEj<#aw5smf%vm}?xziJ&^>)tg(VLX;t*+%OU(@JIk}GhFF^vOjh4qdQBVeX#Ig zj_M&(7w`VEboZ0038vEXACGP=-JbjN_Rn6~WOUVPBc>^=sKISp*b3fb*u9FjdTc0o zpJChoo4RMCL>+cOp$`Ip(akuH^VBElsR0-0pShn1)V@M9Vd@$6jlK85?v+d~QtQ43 z0(g~L%`?CyT&AaHT)}HJL(c35*)`K#q4!bK9ke_^^Tptr`)K)%#o1`DZ=i^_A}P7o zUmlHqi@;d_9JcK%V%Jz7HG}^CPJa=h!;Qgy|F3Ac=-bp-L^x5z!9nFGL^_o@3=b*bOwqzdyledM-N?ao6G$c_~9mS8g313p#ayS&HebGAANdY8#>5- zGkI)6A3Es@+k%7GiU{4_QEVrcc2IgdF^N6Mk>xBxB@!-EqJ4yY6|cjsT?KGVP#;7F zDWr*zb_n{VDWZdjk$;BlA{fFHjr@dt2s*A2^GWQY>n=n4lOxP&8UZde<*670!e$E%taWJ#(Ydy|?i2Qo zvXx+1SK1c6<57$T;gFYlmkzYyORLPx@eh~qs=Q-DJJXhLrjs#Ow`y#J>11WD%t%$* z<8Lyo@3kXzvwwVhgm}Wvsbb;uD;^;kmeVR{ABu4BSGF^Mg)U+#!#|QQb;xWA>I>`n n=;ZVj^deh;M5eC}Xo#|j4cYd;RA^ySkm#|6_QGphP=K9JU+GyL diff --git a/simhash/target/classes/exceptions/HashException.class b/simhash/target/classes/exceptions/HashException.class deleted file mode 100644 index 2fbf9f364c478780d4d255df5ac107eac899f56b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 372 zcmZ`#O-sZu6r9vhmi5&_ z$Oos4W{A#|R&K(ueVBDS71u?YyGH3^d^C%(%CIZH^)|iXw*0XkUYa|}Fvyga)3%<= z=9bSZVn&$}yqfVwdHh9&u2hy`FZ<7M%n;YoT3&P(e-2_ua?>_KUaRhtU)P*^Irb4# y6#g+P3WNF@p$V~`kB(1Xz@BLekWlx*`QF1i6|nQELLVE1NRLenI8^U3kn4^M=Q~y}!HyxJExgfVCK57ac@|!+Z6pQnu-sOFgk}Y#Xx}t#bX~*cw7~ z#fIIGuy>lZ)>4@wow|xmF*uvWSSK9mZ-1Ri#AwB}^Kn07!W+BLgkHu*PnzXiSGQ_j ziqg+)uF9FJ*z+$jbaz}6PO|@k2ZVU3>sl3U)UN{x$<#JguKhWLgFmmk^lLaqEY0{w w@}(P@8^VU7dObS7cm_QQ3XsV3JwhBEL7Rg1KV9fzOPKhvjb59zBP_t~2ct+{VgLXD diff --git a/simhash/target/classes/utils/CalculationUtils.class b/simhash/target/classes/utils/CalculationUtils.class deleted file mode 100644 index fab98327300ab60c841461105c940433d2e89267..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5051 zcmcIoYj_-G89lQ*+1YF-CAk$ z_dD--&->0d`@%aD2LY_c8v%IWRUoSsuQg(NmcGx5=~h=NYiWckp{6~HD^^LUbqp6yb||zcX-s& zX>WlGx|pHH;jo~iSw#z$GSazpRI})4NyBisaY#X{gt;f;<`@s>j6_t=1Q3E(EIwPs za;)IusLPq$K(l_lPlCL-b=lPcoP%=}w5vD|t0kOr!kz%gkZ;!#IlXVIgxL+@#sZm$ zVcXU7Ra}4$?y%L&BsE6moFZ`;E?DU(x4qDqs{?_$Q^i`Wqd6^|)>BalEk(@DTBGeeE>UqQ`XyAJ z;H4{N=`lUSD5ixd*c3(z{Bxb@YUG}M7*sKYjbzp`!$wNWj5@<25sK|w#k}97;xY^~ zxJhk9?=({p&Dv;KaXtyD3l%OYvR_drmf%7m#}5g|H!9dnsG&ZIy3*v=zQ-f{TAb%*nL0XLx{i4+4U^00&a%H*okkF*4- z%@1-fwN^obr6PyzLMstbiX@!bFg!`BUF){yQW4=|C~OG!a!EbYJ_}ca73{+*K7x-* zD6_HJvwJjdpisu^vB6{m;9Hzi4aF>c*;@wnkZ^w!m85ziEq9FS>bo6hyBHT5=cyXJuNden?c`0BOsp>W(T4G4c7^3ZV%2wQ<{A;>JOlFF>of}ccQkc$X zt;~R)WI(HCrY5Ht66&3SDVa`txzLm>7?3b;#`?)x!$Pu^DV=aaE)ld#)0D|~Q@*e+ zB0*AY!l*pkLj_Mr@C*}wU@A-KukR{14y6K8M<%0&73G}<&Nu<$wAa}rblHoJsFvm)2E&{gS7U79@b1zLfhBS zVrjn-OKDavL;5Z{3E}^0rAQqjvrax52};N!UuPB3Z~-PaLmSspw=RoqCmxrvfXB5Y z4=Za8QG|uO<*b(EZC*l4Zi#A(P9}U2U(Z37fth<0^h`3*IK;hJ0MbqzYrEg=a7D&S zB2IK>7qLU(^qaX%L|<=+B&t5S%r6y{f`un+niexeOZVtj+>B<+eRwH=pW_z_ekm%< zuOuw8i;7*njMR2>MCWNpoYtHd(t0gonHkc0XCbZbM91g`o}ChUc2T!!j^xsX@pri{ zz&~x_0A*EV`hPh?xF+avqTk@z#E;*Qt5*J#=-tZ!yn^2;_`Ql(@dp<78Qyi!fe9l7+1o|Nox8Ck@jE9dZj6T0@}7~ji>@(&)$KjP-q(Oo-^?7R8Mowpquzvt*f`+3NCYxmGV>YZ1Pt7@>tcM$C@#OH|{8r_8foNUEFK0X&%G+-8j3M zt4rj~W4NfIB9GoY1}5+U;9R+7o;*(;!{thJ3?H1b@j?riRpOfaT$GwX$-c1kD~!Q`Car? zK0k(L*hQvDoSDXlh})l2!s*^4=N5MLpSJ8AhL?19Xq-Z@-%H$$|LBg^qHk*vV& zcm!V~>lqB<>wNOkP5_V6x`@+D*vqxRUe;CGVdYMbs*5=T~i+*uzn096>aw; z&|E*iZx2eF>pk=1eLK92(W}h@hr`+eTOX_G;rTRu0YB9`$Wfu^b}ZEeWJ{t87IIT`-Ngm)=J-cRTQXyTX7 zGPYM?lMVlLx?0CTK4J$={GO2~RB+7-e2bxyY|slJA14DPgl$J;TtwtwsFmJ^?@il@ z3G8>$igNSv4+z?~e8NN@l*-4gxgyqFXkccLj1aZNtKo--&nwaR4SQ3&#UvRRSBui#z*4=+*E21FXI%YKYsP(^@U5<^QVgMzgB$X-4GhFT4udQU`@5anCor$oPuT*w_}~a zx=-brW0DbO;U=pjvwH~JH0AupVpCdXNPjA)n+ZMIaZsAPL&XMZj~YFxv^}HXPJtF* zRx+F2re@4!{Yl*=aF3h9&2wize(R;;?MFAHrtbt6+AT^l4`N36iQG zvol)4eq5h9YGuX++WhHyr1pEPIuON4P;?o#;eG|5SMh+1W`)D<;=09OC z!m0swtM~%G$coR}EZvl((l6((O3Jwz4YPmtsEFYafjTXnHfG4;68VXonUG~0>GfWS zLN6Xw(5Ipwj|tpXMq!C!>kp12|gFA#L6=*8PkPrx$?UzIt^putz zVEH-)e^6@ZDki|e6fm9T)g{~hctc3yu!3n723Qhn%9cH7$S_fCn8>(o&NA5S$D}M3 z^^H1rxJEnJcNWqLo>Gwk&A7lg{`*Z^pVXPMazrz7`rw32zSLQM2_~Pz5d}w890T1i zFNa8AnQc19oJ+;z&#g*8l}q(_h83Q(C${X6)Bl?)zJ+fKsH}P29A`H;v%R-Ullg49 zVtxd?6|X|PgCp92@8Ven-&1iMCphdY#t^%jM~$2fkzqYULIUm>3RgNKnnQhq`-TsV z4(@y8P`qd0(cwM;(NBb8>#ttel*ojXovU_S&VW%%n@mwE9G)&0W64J<_`w+XEG1x8gb~)gJPxF_0?8 zeeS##augeel9Q%p=Q8Btv0J0}e;7WR;dqj4^MtNq*cVT!<~ad5LD}CD+SP`Z8XMPI zef+j=WuoozTLR)Aubyp(aM>4k-CA8BG-Ty62`~7jSozGoPtFkD8unP0o#la_-mBYF z)_Arth>Ib7fZr+jP{l|1JsmmY#u+O~^N2OAN8(OG)X*k)Q?N3eNy}3HufO_X@|0W$ zyk(3VdbTxgS<|_6m3iSqbBW?G3(znAzi+xOKU>!iv}ZZSMj0aU(I(7&>S|jW?!H*P zCBjz-;X%x+co*lJaGrCt-!%11%+Rt~J*(gfZ&Ve4R>MZYH6GcYEy}~)1ri(Sw1#s| zpc&8MB%j~sR}Fr^6PME+LH<6^l?b1L&$`Yzhz}f*7x*i;fmL{szp5*MQ}`jj8t@~o z)HupdqSkTJ)R~8JF}{Gt1L2i(SasFW2++-PN=SPv_yq3Y$_CnQaqO<8ZV(~7gqI!p zCj1yDDD@Nmr~e*>+OP0Ez}-gr{V86h6$UMib4}sSna)3BC4H|wjSwH}U&rzVv>XU; zoWtE$J3B9-HIL5t<^tGW!L#1xc4sq>oqO*s;9-Gz>;?+xsX-o(7x1MTT*J131?)fI zD$ZfFE5O%-0$p{@bp?D~;B~agH{OP?p{*;}9B2-1X%6Nwm4{h?U5n9kpZu*kaM6ig z=6^pi13D4rg>^k|+-=xO=(k}fugBe_>L9lB+Od;Fb>TGcvu|RTgHbn;I7uFUMr49A zW4z{I)P`pm#Azba%?r@a`Bp<4y?BFfwY0RI7TzLCzi_+;=ugU@S(N|fqWqaf`Cn09 zV!sJ>;f8-fDM~F z!%ye&jnQ&($v;7X9^Nf_7YjiamP@~UJI5qNQ%LrVXRchQAK^O^*BYm;+&X=5Nu9oj j-*CSczjdVE#|1uR4bO7z5=mabWv+y{@&{bSAJP0D(O^pJ diff --git a/simhash/target/test-classes/MainTest.class b/simhash/target/test-classes/MainTest.class deleted file mode 100644 index 940d97752b520cd25ea4271b532957d8c47adc3f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 7780 zcmcIpd3;p$wg3LIFvD;|!hk>kA?#TRL^c_WvIGfDfRYfzNcA$gNk%3!aTX-)Tiit! z#kzq;(JJCvgeVRHwC!73?cToqwRSNR@NK@W+Sb?lzUSUMlRL?1+ds+&?!EI{&-tG3 z`Towmy#3q$Hvn9rYCXt7t_yh@@-a%G=th0FULDj!9o3t5-e~w^3ZqsALV;MVLT+hU z%V@YU+69k>G0+s;jd~!|Y(zP%;O0mmW~>VYjm>(jQ^DIXv{X|p5(stp+!(7cugd%r zjr;vZG};~y275~U;ZV#7#Y*@u(qr{|P=s+Vcr}d21ch<&SRfd!UK8%>3Wv7Jv%=`~ zjtcWjM{YN=rL0Akov2|FCez}Gp|^>_&0!iy%d)IdHgXXUreLZI(==R)=^2PwZW0yo zhrrOrsm?Xmbn1~N8Qm$p} z@f+Rdtm=*7SlymLG$shtrJ6mMk8%wa!iBNRYT zJQ*8~1UdqtCK6nN?`7^aXlTSng-KodE@MqN#;B$|OS_SkjGOUU z7q)0%DhfJaO>C*#`!n=w)y(o;V?Sqifg+KQ$$2ggQhes44Xx7-)B)LqjJ5 zzXn^xPbdv?IGf@Uo$@HK<9@J&mt(Kg*#i{jrw9|#pD@=B=__Xx&+ z%ETz)A2A@!sz4zb|;_RiO-zxszrZ6+Ro2vs3NAJL$F5D%K-c3y$a+Haq zFe|Hfj`@~<_h{HBWG?dCZW&F1E-~ASTpSRlA5>VLC0tpQYGjie_Yt>FE|f*YHI=Ko~3;th*_$%lR;=w7!1iVr4B}&6p=a3Jx_K;nhueaDh7AG+}9 z9Ty%;44gY>`q2OWxku+NsA>1$Nj&Akmoz+$XEKClZ8*L&NKtp~)`M|lQ@f<_TBifB=nQAKwn?hO0D}AzU{(yG<+A|OGVn0j#ZI} z-b2>9V^WRqe3X;CH6>@J&e~9TS*jgym3R9fBW%P0W*wlnn2IqFk`B!MkrJ12pNir!j!_w283eD2Yp(`Ci zYbUOw#g-CZEmt&gOVtCZgHs`25}I|hd9PtfB^F-oJc!uoi8_uYwKm%*PIb#vTVwS| z2SYhNi^Nj3f_m4^HhoSe7IU~yX%;=p;!ar$2`zRj|?jYh09+!ieyr6zk+v6|vi zQ+dv-X$o_OR6G#c9o}VBTQRW;?{%yy!x0|rhn;TP1Fc0_gSG{Y=$wXdcvrmJ=Wt|X za5E-0lOb7#JMqtQ4%jHgTT-lJA=H%rapIn0a{abc-yhsdp# zC|t@>>53@JkY!ACRkET)QSs_SAtSOTDAjD#rTE4>cSJ<9kt=L zfO7aAN{8eHk56uSS2gb}h|N2{@tAkM8Jc%ieCC}sn0aT#W!_m|nfI@n_d>pK@naIN z7uj0PJ71}hU;aAONj~J@Jzk|7D8Rq)s#yc@ulP6q0{)%7Ip*ZUyo-@#r{MAyBv8nE zg74cE-h$I8_RdIP=5dT|s5p%>`BAy;B$slt7QU3=j~X7M*Lo7cH0#60$uo4wOf z%2$UnUMox^N;tldU$y?jG-o<-c_0628dhc-_Fr7bg`6EO%=h5G`9pm8AKrogOPR!lZQYw zk&715hm=jlnKo^rP3~bf{UT+Pa@jVmR|zc3H0cU4$y@sg?N?=QKS`)SsK&y_FKM$0 zs*hpzp_czF&^Pj{9k7ZtSj}5lHI5#ecRReFOYshLV*y$-c=tJ)af5vLV4uO> zopjUBShq2*?dix?n{4}~$&3{a7s$4d3$4T1_N%nvbu?V)G>KO8?`hnSKwCd<1kNCo zgQ0Yj&%8hvdDLU>YNgyHd#_9~|2nn&5OIVdu}tQCL6}2AWo!2SN%nS&IKurNGFyn# zV-u$*gE+U+g4>u2cX8zHSgN_Z2uB_j#3bxBx9apl041+6Q=#5 zOa&%GJ}{#=+m2ceHT*aDE!X{+t4T!77P_M#2{0ZxAG0~f1-FY3GMfBMkP&JgHYufj zhnt8eO_m=C?4?JN40K_?nF+boL&L1^O`_0VVaA6)bKsN7UwwbO^!T|jeL9Gg0G|_>A_Vh$Mm*_ zpVM;1L5|6zGFti5;fQZy8ov>g;M?T*cTtJ&5smLtLVtkk@Iw=YS{gjf z=7WT6N16|HP8LvBprUyB~UK{cDE zT&0HLua#PjK3R z{4`Di4@kj}d1y1E+;SN~FLw$qrbMpA{!!=o{#lVr)S4>t2v<`DIpb=oNZ=LueLDSg zCjE3S{q&~w#Mx(>hBUpF>Ijxh`7D^EZW+fMn8cc8DuXwZg~?o&B^CGu_3;DhddCDZ$B z34DDc>0e%zz&HCzRgnSeMAaMKA5bYo550dQT1aX5f>r!T$(0iPd6Yij7@c^QN$?zl z^gK(lmpS4ElK({p_$8J?Up9?h%u)H&+5$7_$1)Iasj)Pu(oBYrOjnkvLiWm^WvWPx zquD}t9;CY{f%Fl;Qp{{s#?K^y=8 diff --git a/simhash/target/test-classes/classpath.index b/simhash/target/test-classes/classpath.index deleted file mode 100644 index c91390c74815757960dbd0886b529702b8074878..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 96 ecmb1OU}QiAa{E_3@Vo8Et%xd2BroI{szLxpkOffy -- GitLab