diff --git a/helloworld.txt b/helloworld.txt deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/simhash/.idea/.gitignore b/simhash/.idea/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..35410cacdc5e87f985c93a96520f5e11a5c822e4 --- /dev/null +++ b/simhash/.idea/.gitignore @@ -0,0 +1,8 @@ +# 默认忽略的文件 +/shelf/ +/workspace.xml +# 基于编辑器的 HTTP 客户端请求 +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/simhash/.idea/compiler.xml b/simhash/.idea/compiler.xml new file mode 100644 index 0000000000000000000000000000000000000000..e5fa1de71db3fef18d34e22b1774d73fadcc8e67 --- /dev/null +++ b/simhash/.idea/compiler.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/simhash/.idea/encodings.xml b/simhash/.idea/encodings.xml new file mode 100644 index 0000000000000000000000000000000000000000..aa00ffab7828f4818589659c804ec2cfd99baed3 --- /dev/null +++ b/simhash/.idea/encodings.xml @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/simhash/.idea/jarRepositories.xml b/simhash/.idea/jarRepositories.xml new file mode 100644 index 0000000000000000000000000000000000000000..5a2f139ce25c6f225e0cb5fb199704f51273de00 --- /dev/null +++ b/simhash/.idea/jarRepositories.xml @@ -0,0 +1,20 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/simhash/.idea/misc.xml b/simhash/.idea/misc.xml new file mode 100644 index 0000000000000000000000000000000000000000..82dbec8ad28463aed32007a93ffc07865ae98968 --- /dev/null +++ b/simhash/.idea/misc.xml @@ -0,0 +1,14 @@ + + + + + + + + + + \ No newline at end of file diff --git a/simhash/pom.xml b/simhash/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..e30812d890a7125eee67dfe6fbbecda91f1e637a --- /dev/null +++ b/simhash/pom.xml @@ -0,0 +1,50 @@ + + + 4.0.0 + + org.example + simhash + 1.0-SNAPSHOT + + + 
org.apache.commons + commons-lang3 + 3.5 + + + cn.hutool + hutool-all + 5.7.13 + + + com.hankcs.nlp + hanlp-lucene-plugin + 1.1.7 + + + junit + junit + 4.13.2 + + + cn.hutool + hutool-http + 5.8.14 + + + org.junit.jupiter + junit-jupiter-api + 5.8.2 + test + + + + + 17 + 17 + UTF-8 + + + \ No newline at end of file diff --git a/simhash/src/main/java/Main.java b/simhash/src/main/java/Main.java new file mode 100644 index 0000000000000000000000000000000000000000..40c8b9379abfbd52dc1d810388e121b67984f429 --- /dev/null +++ b/simhash/src/main/java/Main.java @@ -0,0 +1,39 @@ +import cn.hutool.core.date.DateUtil; +import exceptions.FileAnalyseException; +import exceptions.NotExistFileException; +import utils.CalculationUtils; +import utils.CommonUtils; +import java.util.Map; + +public class Main { + //合法参数个数为3 + static final int ARGS_NUM = 3; + public static void main(String[] args){ + // 读取并解析参数 + if (args.length != ARGS_NUM) { + throw new IllegalArgumentException("参数个数不正确"); + } + // 解析文件,处理分词 + Map originWordCount = null; + Map compareWordCount = null; + try { + originWordCount = CommonUtils.analyseText(CommonUtils.readFileToStr(args[0])); + compareWordCount = CommonUtils.analyseText(CommonUtils.readFileToStr(args[1])); + } catch (FileAnalyseException | NotExistFileException e) { + e.printStackTrace(); + } + // 获取simHash值 + String simHash1 = CalculationUtils.calculateSimHash(originWordCount); + String simHash2 = CalculationUtils.calculateSimHash(compareWordCount); + //计算相似度,保留两位小数 + double result = CalculationUtils.getSimilarity(simHash1, simHash2); + String format = String.format("相似度为:%.2f", result); + String writeFileContent = "---------------------------------------" + "\n" + + "原文件:" + args[0] + "\n" + + "对比文件:" + args[1] + "\n" + + format + "\n" + + "比较时间为:" + DateUtil.now() + "\n"; + ; + CommonUtils.writeFile(args[2],writeFileContent); + } +} diff --git a/simhash/src/main/java/exceptions/FileAnalyseException.java 
// File: simhash/src/main/java/exceptions/FileAnalyseException.java
package exceptions;

/**
 * Signals that a text could not be analysed: the text was empty after being read,
 * or filtering left too few usable words.
 *
 * @author HJW
 */
public class FileAnalyseException extends Exception {
    private static final long serialVersionUID = 1L;

    /**
     * @param message description of the analysis failure
     */
    public FileAnalyseException(String message) {
        super(message);
    }

    /**
     * @param message description of the analysis failure
     * @param cause   underlying exception, kept so the original stack trace is not lost
     */
    public FileAnalyseException(String message, Throwable cause) {
        super(message, cause);
    }
}

// File: simhash/src/main/java/exceptions/HashException.java
package exceptions;

import java.security.NoSuchAlgorithmException;

/**
 * Signals a failure while hashing a word with MD5.
 *
 * <p>Because this type extends {@link NoSuchAlgorithmException}, a
 * {@code catch (NoSuchAlgorithmException e)} clause also catches it.
 *
 * @author HJW
 */
public class HashException extends NoSuchAlgorithmException {
    private static final long serialVersionUID = 1L;

    /**
     * @param message description of the hashing failure
     */
    public HashException(String message) {
        super(message);
    }

    /**
     * @param message description of the hashing failure
     * @param cause   underlying exception, kept so the original stack trace is not lost
     */
    public HashException(String message, Throwable cause) {
        super(message, cause);
    }
}

// File: simhash/src/main/java/exceptions/NotExistFileException.java
package exceptions;

import java.io.FileNotFoundException;

/**
 * Custom exception raised when a file cannot be found.
 *
 * @author HJW
 */
public class NotExistFileException extends FileNotFoundException {
    private static final long serialVersionUID = 1L;

    /**
     * @param message description of the missing file
     */
    public NotExistFileException(String message) {
        super(message);
    }

    /**
     * @param message description of the missing file
     * @param cause   underlying exception, kept so the original stack trace is not lost
     */
    public NotExistFileException(String message, Throwable cause) {
        super(message);
        // FileNotFoundException has no (String, Throwable) constructor.
        initCause(cause);
    }
}
// File: simhash/src/main/java/utils/CalculationUtils.java
package utils;

import cn.hutool.core.util.StrUtil;
import exceptions.HashException;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Map;
import java.util.Objects;

/**
 * Calculation helpers for SimHash: per-word hashing, weighting, merging into a
 * fingerprint, and comparing two fingerprints.
 */
public class CalculationUtils {
    /** Number of bits in a word hash and in a SimHash fingerprint (an MD5 digest is 128 bits). */
    static final int HASH_BIT = 128;
    /** Largest Hamming distance (inclusive) scored with the 256 divisor. */
    static final int DISTANCE_WAY1 = 16;
    /** Largest Hamming distance (inclusive) scored with the 128 divisor. */
    static final int DISTANCE_WAY2 = 32;
    /** Largest Hamming distance (inclusive) scored with the 64 divisor; beyond it similarity is 0. */
    static final int DISTANCE_WAY3 = 64;

    /**
     * Hashes a word with MD5 and renders the digest as a binary string.
     *
     * @param word the word to hash
     * @return a string of {@value #HASH_BIT} characters, each {@code '0'} or {@code '1'},
     *         most significant bit of each digest byte first
     * @throws HashException if {@code word} is null, empty or blank, if MD5 is unavailable
     *                       (the original exception is attached as the cause), or if the
     *                       digest is not {@value #HASH_BIT} bits long
     */
    public static String wordHash(String word) throws HashException {
        if (word == null || StrUtil.isBlank(word)) {
            throw new HashException("词语为空");
        }

        // Only the digest lookup sits inside the try block. HashException is itself a
        // NoSuchAlgorithmException, so throwing one inside the try would be caught below
        // and re-reported with the wrong message.
        final byte[] digest;
        try {
            digest = MessageDigest.getInstance("MD5").digest(word.getBytes(StandardCharsets.UTF_8));
        } catch (NoSuchAlgorithmException e) {
            HashException wrapped = new HashException("MD5算法异常");
            wrapped.initCause(e);
            throw wrapped;
        }

        // Emit every digest byte as 8 bits, high bit first. This yields the same string as
        // formatting the digest as hex and expanding each hex digit to 4 bits.
        StringBuilder bits = new StringBuilder(HASH_BIT);
        for (byte b : digest) {
            for (int shift = Byte.SIZE - 1; shift >= 0; shift--) {
                bits.append((b >> shift) & 1);
            }
        }

        if (bits.length() != HASH_BIT) {
            throw new HashException("hash值长度不为128");
        }
        return bits.toString();
    }

    /**
     * Applies a weight to a binary hash: a {@code '1'} bit becomes {@code +weight}, any other
     * character becomes {@code -weight}.
     *
     * @param hash   binary hash string, at most {@value #HASH_BIT} characters
     * @param weight weight of the word (its frequency)
     * @return an array of length {@value #HASH_BIT}; positions past the end of {@code hash} are 0
     * @throws NullPointerException     if {@code hash} is null
     * @throws IllegalArgumentException if {@code hash} is longer than {@value #HASH_BIT} characters
     */
    public static int[] hashWeight(String hash, int weight) {
        Objects.requireNonNull(hash, "hash");
        if (hash.length() > HASH_BIT) {
            throw new IllegalArgumentException(
                    "hash has " + hash.length() + " bits, at most " + HASH_BIT + " are supported");
        }

        int[] weighted = new int[HASH_BIT];
        for (int i = 0; i < hash.length(); i++) {
            weighted[i] = hash.charAt(i) == '1' ? weight : -weight;
        }
        return weighted;
    }

    /**
     * Reduces the merged weight vector to the SimHash bit string.
     *
     * @param mergeHash per-bit sums of the weighted word hashes
     * @return one character per element: {@code '1'} where the sum is positive, otherwise
     *         {@code '0'} (a sum of exactly 0 also maps to {@code '0'})
     */
    public static String getSimHash(int[] mergeHash) {
        StringBuilder simHash = new StringBuilder(mergeHash.length);
        for (int sum : mergeHash) {
            simHash.append(sum > 0 ? '1' : '0');
        }
        return simHash.toString();
    }

    /**
     * Computes the SimHash of a document from its word frequencies.
     *
     * <p>Words that cannot be hashed (for example blank keys) are skipped and reported on
     * standard error, so a single bad entry does not abort the whole calculation.
     *
     * @param wordCount words mapped to their number of occurrences
     * @return the {@value #HASH_BIT}-character SimHash bit string
     * @throws NullPointerException if {@code wordCount} is null
     */
    public static String calculateSimHash(Map<String, Integer> wordCount) {
        Objects.requireNonNull(wordCount, "wordCount");

        // Java zero-initialises int arrays, so no explicit fill is needed.
        int[] mergeHash = new int[HASH_BIT];
        for (Map.Entry<String, Integer> entry : wordCount.entrySet()) {
            final int[] weighted;
            try {
                weighted = hashWeight(wordHash(entry.getKey()), entry.getValue());
            } catch (HashException e) {
                System.err.println("无法计算词语的hash,已跳过:" + e.getMessage());
                continue;
            }
            for (int i = 0; i < weighted.length; i++) {
                mergeHash[i] += weighted[i];
            }
        }

        return getSimHash(mergeHash);
    }

    /**
     * Scores the similarity of two SimHash strings from their Hamming distance.
     *
     * <p>The distance is counted over the length of {@code simHash1}. The score is banded:
     * up to {@value #DISTANCE_WAY1} differing bits give {@code 1 - d/256}, up to
     * {@value #DISTANCE_WAY2} give {@code 1 - d/128}, up to {@value #DISTANCE_WAY3} give
     * {@code 1 - d/64}, and anything larger gives 0.
     *
     * @param simHash1 first SimHash bit string
     * @param simHash2 second SimHash bit string, at least as long as {@code simHash1}
     * @return similarity between 0 and 1
     * @throws NullPointerException     if either argument is null
     * @throws IllegalArgumentException if {@code simHash2} is shorter than {@code simHash1}
     */
    public static double getSimilarity(String simHash1, String simHash2) {
        Objects.requireNonNull(simHash1, "simHash1");
        Objects.requireNonNull(simHash2, "simHash2");
        if (simHash2.length() < simHash1.length()) {
            throw new IllegalArgumentException(
                    "simHash2 (" + simHash2.length() + " bits) is shorter than simHash1 ("
                            + simHash1.length() + " bits)");
        }

        // Hamming distance: number of positions at which the two strings differ.
        int distance = 0;
        for (int i = 0; i < simHash1.length(); i++) {
            if (simHash1.charAt(i) != simHash2.charAt(i)) {
                distance++;
            }
        }

        // distance is never negative, so each band only needs its upper bound.
        if (distance <= DISTANCE_WAY1) {
            return 1 - (double) distance / 256;
        }
        if (distance <= DISTANCE_WAY2) {
            return 1 - (double) distance / 128;
        }
        if (distance <= DISTANCE_WAY3) {
            return 1 - (double) distance / 64;
        }
        return 0;
    }
}
// File: simhash/src/main/java/utils/CommonUtils.java
package utils;

import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.StrUtil;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import exceptions.FileAnalyseException;
import exceptions.NotExistFileException;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;


/**
 * Non-calculation helpers: reading the input files, turning text into keyword
 * frequencies, and writing the report.
 */
public class CommonUtils {
    /**
     * Keyword-count threshold used by {@link #analyseText(String)}.
     * NOTE(review): the original comments describe 3 as the minimum keyword count, but the
     * check rejects texts with 3 or fewer keywords. Behaviour is left unchanged; confirm
     * which of the two is intended.
     */
    public static final int SHORT_WORD_LENGTH = 3;

    /**
     * Reads a whole file as UTF-8 text.
     *
     * @param filePath path of the file to read
     * @return the file content
     * @throws NotExistFileException if reading fails for any reason; the underlying
     *                               exception is attached as the cause
     */
    public static String readFileToStr(String filePath) throws NotExistFileException {
        try {
            return FileUtil.readUtf8String(filePath);
        } catch (Exception e) {
            // FileNotFoundException offers no (String, Throwable) constructor, so the
            // cause is attached with initCause instead of being dropped.
            NotExistFileException wrapped = new NotExistFileException("该绝对路径的文件不存在");
            wrapped.initCause(e);
            throw wrapped;
        }
    }

    /**
     * Extracts the keywords of a text and counts how often each occurs in the segmented text.
     *
     * @param text the text that was read
     * @return a map from each extracted keyword to its number of occurrences among the
     *         segmented words (0 if the keyword never appears as a segmented word)
     * @throws FileAnalyseException if {@code text} is null, empty or blank, or if no more than
     *                              {@link #SHORT_WORD_LENGTH} keywords could be extracted
     */
    public static Map<String, Integer> analyseText(String text) throws FileAnalyseException {
        if (text == null || StrUtil.isBlank(text)) {
            throw new FileAnalyseException("文件解析异常,解析内容为空");
        }

        // Ask for up to text.length() keywords, as the original code does.
        List<String> keyList = HanLP.extractKeyword(text, text.length());
        if (keyList.size() <= SHORT_WORD_LENGTH) {
            throw new FileAnalyseException("文件解析异常,关键词太少");
        }

        // Count every segmented word once up front, instead of rescanning the whole
        // word list for each keyword.
        Map<String, Integer> frequencies = new HashMap<>();
        for (Term term : HanLP.segment(text)) {
            frequencies.merge(term.word, 1, Integer::sum);
        }

        Map<String, Integer> wordCount = new HashMap<>(keyList.size());
        for (String keyword : keyList) {
            wordCount.put(keyword, frequencies.getOrDefault(keyword, 0));
        }
        return wordCount;
    }

    /**
     * Appends the comparison report to the given file using UTF-8.
     *
     * @param filePath path of the result file
     * @param content  report text to append
     */
    public static void writeFile(String filePath, String content) {
        FileUtil.appendString(content, filePath, "utf-8");
    }
}

// File: simhash/src/test/java/MainTest.java
import org.junit.jupiter.api.Test;

/**
 * Smoke test for {@link Main}.
 *
 * <p>NOTE(review): the paths below are absolute paths on one developer's machine and the
 * test makes no assertions, so it only shows that {@code Main.main} finishes without throwing
 * when those files exist.
 */
public class MainTest {
    static String writeFilePath = "E:\\测试文本\\write.txt";
    static String OrigFilePath = "E:\\测试文本\\orig.txt";
    static String CopyFilePath1 = "E:\\测试文本\\orig_0.8_add.txt";

    /**
     * Runs the main function once with the original file, one comparison file and the
     * result file.
     */
    @Test
    void testMain() {
        String[] args = new String[3];
        args[0] = OrigFilePath;
        args[1] = CopyFilePath1;
        args[2] = writeFilePath;
        Main.main(args);
    }
}

// [diff residue] This line also carried "Binary files ... differ" entries for compiled
// output under simhash/target/classes/ (Main.class, classpath.index,
// exceptions/FileAnalyseException.class, and the start of exceptions/HashException.class).
b/simhash/target/classes/exceptions/HashException.class differ diff --git a/simhash/target/classes/exceptions/NotExistFileException.class b/simhash/target/classes/exceptions/NotExistFileException.class new file mode 100644 index 0000000000000000000000000000000000000000..0318cb4fd72d8fbf922646f96ae0fa7864cfe6be Binary files /dev/null and b/simhash/target/classes/exceptions/NotExistFileException.class differ diff --git a/simhash/target/classes/utils/CalculationUtils.class b/simhash/target/classes/utils/CalculationUtils.class new file mode 100644 index 0000000000000000000000000000000000000000..17eec45b74425e88d2783a66ca3419e2603382ef Binary files /dev/null and b/simhash/target/classes/utils/CalculationUtils.class differ diff --git a/simhash/target/classes/utils/CommonUtils.class b/simhash/target/classes/utils/CommonUtils.class new file mode 100644 index 0000000000000000000000000000000000000000..a66b00e332940e3a56bcd0d0ec90a6908d10a234 Binary files /dev/null and b/simhash/target/classes/utils/CommonUtils.class differ diff --git a/simhash/target/test-classes/MainTest.class b/simhash/target/test-classes/MainTest.class new file mode 100644 index 0000000000000000000000000000000000000000..232debddd15f6020ee04f567fe236e24a519af77 Binary files /dev/null and b/simhash/target/test-classes/MainTest.class differ diff --git a/simhash/target/test-classes/classpath.index b/simhash/target/test-classes/classpath.index new file mode 100644 index 0000000000000000000000000000000000000000..c91390c74815757960dbd0886b529702b8074878 Binary files /dev/null and b/simhash/target/test-classes/classpath.index differ