diff --git a/helloworld.txt b/helloworld.txt
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/simhash/.idea/.gitignore b/simhash/.idea/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..35410cacdc5e87f985c93a96520f5e11a5c822e4
--- /dev/null
+++ b/simhash/.idea/.gitignore
@@ -0,0 +1,8 @@
+# 默认忽略的文件
+/shelf/
+/workspace.xml
+# 基于编辑器的 HTTP 客户端请求
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/simhash/.idea/compiler.xml b/simhash/.idea/compiler.xml
new file mode 100644
index 0000000000000000000000000000000000000000..e5fa1de71db3fef18d34e22b1774d73fadcc8e67
--- /dev/null
+++ b/simhash/.idea/compiler.xml
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/simhash/.idea/encodings.xml b/simhash/.idea/encodings.xml
new file mode 100644
index 0000000000000000000000000000000000000000..aa00ffab7828f4818589659c804ec2cfd99baed3
--- /dev/null
+++ b/simhash/.idea/encodings.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/simhash/.idea/jarRepositories.xml b/simhash/.idea/jarRepositories.xml
new file mode 100644
index 0000000000000000000000000000000000000000..5a2f139ce25c6f225e0cb5fb199704f51273de00
--- /dev/null
+++ b/simhash/.idea/jarRepositories.xml
@@ -0,0 +1,20 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/simhash/.idea/misc.xml b/simhash/.idea/misc.xml
new file mode 100644
index 0000000000000000000000000000000000000000..82dbec8ad28463aed32007a93ffc07865ae98968
--- /dev/null
+++ b/simhash/.idea/misc.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/simhash/pom.xml b/simhash/pom.xml
new file mode 100644
index 0000000000000000000000000000000000000000..e30812d890a7125eee67dfe6fbbecda91f1e637a
--- /dev/null
+++ b/simhash/pom.xml
@@ -0,0 +1,50 @@
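+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>org.example</groupId>
+    <artifactId>simhash</artifactId>
+    <version>1.0-SNAPSHOT</version>
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-lang3</artifactId>
+            <version>3.5</version>
+        </dependency>
+        <dependency>
+            <groupId>cn.hutool</groupId>
+            <artifactId>hutool-all</artifactId>
+            <version>5.7.13</version>
+        </dependency>
+        <dependency>
+            <groupId>com.hankcs.nlp</groupId>
+            <artifactId>hanlp-lucene-plugin</artifactId>
+            <version>1.1.7</version>
+        </dependency>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.13.2</version>
+        </dependency>
+        <dependency>
+            <groupId>cn.hutool</groupId>
+            <artifactId>hutool-http</artifactId>
+            <version>5.8.14</version>
+        </dependency>
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter-api</artifactId>
+            <version>5.8.2</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <properties>
+        <maven.compiler.source>17</maven.compiler.source>
+        <maven.compiler.target>17</maven.compiler.target>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>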
\ No newline at end of file
diff --git a/simhash/src/main/java/Main.java b/simhash/src/main/java/Main.java
new file mode 100644
index 0000000000000000000000000000000000000000..40c8b9379abfbd52dc1d810388e121b67984f429
--- /dev/null
+++ b/simhash/src/main/java/Main.java
@@ -0,0 +1,39 @@
+import cn.hutool.core.date.DateUtil;
+import exceptions.FileAnalyseException;
+import exceptions.NotExistFileException;
+import utils.CalculationUtils;
+import utils.CommonUtils;
+import java.util.Map;
+
+/**
+ * Command-line entry point: compares two text files with SimHash and appends a
+ * short similarity report to a result file.
+ */
+public class Main {
+    /** Exact number of command-line arguments the program accepts. */
+    static final int ARGS_NUM = 3;
+
+    /**
+     * Reads both input files, computes their SimHash values and appends the
+     * similarity (two decimal places) together with the file names and the
+     * current time to the result file.
+     *
+     * @param args {@code args[0]} is the original file, {@code args[1]} the file
+     *             compared against it, {@code args[2]} the file the report is
+     *             appended to
+     * @throws IllegalArgumentException if the number of arguments is not {@link #ARGS_NUM}
+     * @throws IllegalStateException if either input file cannot be read or analysed;
+     *                               the underlying exception is attached as the cause
+     */
+    public static void main(String[] args) {
+        if (args.length != ARGS_NUM) {
+            throw new IllegalArgumentException("参数个数不正确");
+        }
+
+        // Read and tokenise both inputs. A failure here must stop the run: merely
+        // printing it would leave the maps null and surface later as an unrelated
+        // NullPointerException inside the hash calculation.
+        final Map<String, Integer> originWordCount;
+        final Map<String, Integer> compareWordCount;
+        try {
+            originWordCount = CommonUtils.analyseText(CommonUtils.readFileToStr(args[0]));
+            compareWordCount = CommonUtils.analyseText(CommonUtils.readFileToStr(args[1]));
+        } catch (FileAnalyseException | NotExistFileException e) {
+            throw new IllegalStateException("文件读取或解析失败: " + e.getMessage(), e);
+        }
+
+        // One SimHash per document, then a similarity score derived from both.
+        String simHash1 = CalculationUtils.calculateSimHash(originWordCount);
+        String simHash2 = CalculationUtils.calculateSimHash(compareWordCount);
+        double result = CalculationUtils.getSimilarity(simHash1, simHash2);
+
+        // Build the report block; the similarity is rendered with two decimals.
+        String format = String.format("相似度为:%.2f", result);
+        String writeFileContent = "---------------------------------------" + "\n" +
+                "原文件:" + args[0] + "\n" +
+                "对比文件:" + args[1] + "\n" +
+                format + "\n" +
+                "比较时间为:" + DateUtil.now() + "\n";
+        CommonUtils.writeFile(args[2], writeFileContent);
+    }
+}
diff --git a/simhash/src/main/java/exceptions/FileAnalyseException.java b/simhash/src/main/java/exceptions/FileAnalyseException.java
new file mode 100644
index 0000000000000000000000000000000000000000..11aec77998e36d564d6696ba7b8745ee892c4d04
--- /dev/null
+++ b/simhash/src/main/java/exceptions/FileAnalyseException.java
@@ -0,0 +1,12 @@
+package exceptions;
+
+/**
+ * Signals that the text of a file could not be analysed, for example because
+ * the text is empty or too few usable words remain after filtering.
+ *
+ * @author HJW
+ * @date 2022-09-21 12:57
+ */
+public class FileAnalyseException extends Exception {
+    /**
+     * Creates the exception with a description of what went wrong.
+     *
+     * @param message human-readable description of the failure
+     */
+    public FileAnalyseException(String message) {
+        super(message);
+    }
+
+    /**
+     * Creates the exception with a description and the underlying cause, so the
+     * original failure is not lost when it is translated into this type.
+     *
+     * @param message human-readable description of the failure
+     * @param cause   the exception that triggered this one; may be {@code null}
+     */
+    public FileAnalyseException(String message, Throwable cause) {
+        super(message, cause);
+    }
+}
\ No newline at end of file
diff --git a/simhash/src/main/java/exceptions/HashException.java b/simhash/src/main/java/exceptions/HashException.java
new file mode 100644
index 0000000000000000000000000000000000000000..c9e19026607f6ecbc1a0fb9ebd461ea8bdf8ab3b
--- /dev/null
+++ b/simhash/src/main/java/exceptions/HashException.java
@@ -0,0 +1,14 @@
+package exceptions;
+
+import java.security.NoSuchAlgorithmException;
+
+/**
+ * Signals that a word could not be hashed with MD5.
+ *
+ * <p>Extends {@link NoSuchAlgorithmException}, so any {@code catch} of that
+ * type also catches this exception.
+ *
+ * @author HJW
+ * @date 2022-09-21 12:57
+ */
+public class HashException extends NoSuchAlgorithmException {
+    /**
+     * Creates the exception with a description of what went wrong.
+     *
+     * @param message human-readable description of the failure
+     */
+    public HashException(String message) {
+        super(message);
+    }
+
+    /**
+     * Creates the exception with a description and the underlying cause, so the
+     * original failure is not lost when it is translated into this type.
+     *
+     * @param message human-readable description of the failure
+     * @param cause   the exception that triggered this one; may be {@code null}
+     */
+    public HashException(String message, Throwable cause) {
+        super(message, cause);
+    }
+}
\ No newline at end of file
diff --git a/simhash/src/main/java/exceptions/NotExistFileException.java b/simhash/src/main/java/exceptions/NotExistFileException.java
new file mode 100644
index 0000000000000000000000000000000000000000..9dc0f966899e87c7b73df7581a1edd6da08c1e99
--- /dev/null
+++ b/simhash/src/main/java/exceptions/NotExistFileException.java
@@ -0,0 +1,14 @@
+package exceptions;
+
+import java.io.FileNotFoundException;
+
+
+/**
+ * Custom exception raised when a file to be read cannot be found.
+ *
+ * @author HJW
+ */
+public class NotExistFileException extends FileNotFoundException {
+    /**
+     * Creates the exception with a description of what went wrong.
+     *
+     * @param message human-readable description of the failure
+     */
+    public NotExistFileException(String message) {
+        super(message);
+    }
+
+    /**
+     * Creates the exception with a description and the underlying cause.
+     *
+     * @param message human-readable description of the failure
+     * @param cause   the exception that triggered this one; may be {@code null}
+     */
+    public NotExistFileException(String message, Throwable cause) {
+        super(message);
+        // FileNotFoundException has no (String, Throwable) constructor, so the
+        // cause has to be attached after construction.
+        initCause(cause);
+    }
+}
\ No newline at end of file
diff --git a/simhash/src/main/java/utils/CalculationUtils.java b/simhash/src/main/java/utils/CalculationUtils.java
new file mode 100644
index 0000000000000000000000000000000000000000..664a0a966b4656bf134f4470be959213a7f91c7f
--- /dev/null
+++ b/simhash/src/main/java/utils/CalculationUtils.java
@@ -0,0 +1,158 @@
+package utils;
+
+import cn.hutool.core.util.StrUtil;
+import exceptions.HashException;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+import java.util.Map;
+
+/**
+ * 与计算有关的工具类
+ */
+public class CalculationUtils {
+ static final int HASH_BIT = 128;
+ static final int DISTANCE_WAY1 = 16;
+ static final int DISTANCE_WAY2 = 32;
+ static final int DISTANCE_WAY3 = 64;
+
+ /**
+ * 采用MD5进行对词语进行hash,得到的hash值使用16进制解析 再利用算法取128位二进制
+ * @param word 词语
+ * @return 128位二进制
+ */
+ public static String wordHash(String word) throws HashException {
+ //如果传入词语为null或“”或“ ”
+ if (word == null || StrUtil.isBlank(word) || StrUtil.isEmpty(word)) {
+ throw new HashException("词语为空");
+ }
+ try {
+ // 采用MD5算法进行hash
+ MessageDigest digest = MessageDigest.getInstance("MD5");
+ digest.update(word.getBytes(StandardCharsets.UTF_8));
+ // hash值转为32位16进制
+ StringBuilder hash = new StringBuilder();
+ for (byte b : digest.digest()) {
+ hash.append(String.format("%02x", b));
+ }
+
+ // 16进制转为128位2进制码
+ StringBuilder finalHash = new StringBuilder();
+ String strTemp;
+ for (int i = 0; i < hash.length(); i ++) {
+ // 每一位16进制数加上0000 最后截取后面的4位 得到便是这位数的二进制
+ strTemp = "0000" + Integer.toBinaryString(Integer.parseInt(hash.substring(i, i + 1), 16));
+ finalHash.append(strTemp.substring(strTemp.length() - 4));
+ }
+
+ // 不为128直接报错
+ if (finalHash.length() != HASH_BIT) {
+ throw new HashException("hash值长度不为128");
+ }
+
+ return finalHash.toString();
+
+ } catch (NoSuchAlgorithmException e) {
+ throw new HashException("MD5算法异常");
+ }
+
+ }
+
+ /**
+ * 给二进制哈希值加权
+ * @param hash 二进制哈希值
+ * @param weight 权重
+ * @return 加权后的二进制哈希值
+ */
+ public static int[] hashWeight(String hash, int weight) {
+ // 新建一个数组用于存放加权后的二进制哈希值
+ int[] hashArray = new int[HASH_BIT];
+ // 遍历二进制哈希值,0则是-1,1则是1,将每一位加权后存入数组
+ for (int i = 0; i < hash.length(); i++) {
+ if (hash.charAt(i) == '1') {
+ hashArray[i] = weight;
+ } else {
+ hashArray[i] = -1 * weight;
+ }
+ }
+
+ return hashArray;
+ }
+
+ /**
+ * 得到的合并后的hash值进行降维,最终得到simHash
+ * @param mergeHash 合并后的hash值
+ * @return sim哈希值
+ */
+ public static String getSimHash(int[] mergeHash){
+ // 使用StringBuilder存储simHash
+ StringBuilder simHash = new StringBuilder();
+ // 遍历合并后的hash值,大于0则是1,小于0则是0
+ for (int hash : mergeHash) {
+ if (hash > 0) {
+ simHash.append("1");
+ } else {
+ simHash.append("0");
+ }
+ }
+ return simHash.toString();
+ }
+
+
+ /**
+ * 根据词语得到simHash
+ * @param wordCount 词语及其出现次数
+ * @return simHash
+ */
+ public static String calculateSimHash(Map wordCount){
+ // 新建一个数组用于存放合并后的hash值,初始值为0
+ int[] mergeHash = new int[HASH_BIT];
+ for (int i = 0; i < HASH_BIT; i++) {
+ mergeHash[i] = 0;
+ }
+ // 遍历词语及其出现次数,对每一个词语进行hash加权,然后合并
+ wordCount.forEach((word,count) -> {
+ try {
+ int[] tempHash = hashWeight(wordHash(word),count);
+ for (int i = 0; i < tempHash.length; i++) {
+ mergeHash[i] += tempHash[i];
+ }
+ } catch (HashException e) {
+ e.printStackTrace();
+ }
+ });
+
+ // 降维得到simHash
+ return getSimHash(mergeHash);
+ }
+
+ /**
+ * 计算两个simHash的相似度
+ * @param simHash1 simHash1
+ * @param simHash2 simHash2
+ * @return 相似度
+ */
+ public static double getSimilarity(String simHash1, String simHash2) {
+ // 汉明距离
+ int distance = 0;
+ // 遍历simHash1和simHash2,不相同则汉明距离加1
+ for (int i = 0; i < simHash1.length(); i++) {
+ if (simHash1.charAt(i) != simHash2.charAt(i)) {
+ distance++;
+ }
+ }
+// System.out.println("汉明距离为:" + distance);
+ // 更换计算策略
+ if (distance >= 0 && distance <= DISTANCE_WAY1) {
+ return 1 - (double) distance / 256;
+ } else if (distance > 16 && distance <= DISTANCE_WAY2) {
+ return 1 - (double) distance / 128;
+ }else if (distance > 32 && distance <= DISTANCE_WAY3) {
+ return 1 - (double) distance / 64;
+ }else {
+ return 0;
+ }
+
+ }
+
+}
\ No newline at end of file
diff --git a/simhash/src/main/java/utils/CommonUtils.java b/simhash/src/main/java/utils/CommonUtils.java
new file mode 100644
index 0000000000000000000000000000000000000000..e78d1ae0f3b89e90a31e5061e55185c6f7319404
--- /dev/null
+++ b/simhash/src/main/java/utils/CommonUtils.java
@@ -0,0 +1,71 @@
+package utils;
+
+import cn.hutool.core.io.FileUtil;
+import cn.hutool.core.util.StrUtil;
+import com.hankcs.hanlp.HanLP;
+import com.hankcs.hanlp.seg.common.Term;
+import exceptions.FileAnalyseException;
+import exceptions.NotExistFileException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+
+/**
+ * Helpers that are not calculations: reading a file, turning text into a
+ * keyword/frequency map, and writing the result file.
+ */
+public class CommonUtils {
+    /** Minimum number of extracted keywords a text must yield to be analysed. */
+    public static final int SHORT_WORD_LENGTH = 3;
+
+    /**
+     * Reads a whole file as UTF-8 text.
+     *
+     * @param filePath path of the file to read
+     * @return the text that was read
+     * @throws NotExistFileException if reading fails for any reason; the
+     *                               underlying exception is attached as the cause
+     */
+    public static String readFileToStr(String filePath) throws NotExistFileException {
+        try {
+            return FileUtil.readUtf8String(filePath);
+        } catch (RuntimeException e) {
+            // The call declares no checked exceptions, so RuntimeException covers
+            // everything it can throw. Keep the original failure as the cause.
+            NotExistFileException wrapped = new NotExistFileException("该绝对路径的文件不存在");
+            wrapped.initCause(e);
+            throw wrapped;
+        }
+    }
+
+    /**
+     * Extracts the keywords of a text and maps each one to the number of times
+     * it occurs among the text's segmented words.
+     *
+     * @param text the text to analyse
+     * @return keywords mapped to their frequency; a keyword that does not occur
+     *         as a segmented word is mapped to 0
+     * @throws FileAnalyseException if {@code text} is null or blank, or if fewer
+     *                              than {@link #SHORT_WORD_LENGTH} keywords are extracted
+     */
+    public static Map<String, Integer> analyseText(String text) throws FileAnalyseException {
+        // StrUtil.isBlank already covers null, "" and whitespace-only input.
+        if (StrUtil.isBlank(text)) {
+            throw new FileAnalyseException("文件解析异常,解析内容为空");
+        }
+
+        // Ask for as many keywords as the text has characters (an upper bound).
+        List<String> keyList = HanLP.extractKeyword(text, text.length());
+        // SHORT_WORD_LENGTH is the minimum that is still accepted, so only a
+        // strictly smaller keyword count is rejected.
+        if (keyList.size() < SHORT_WORD_LENGTH) {
+            throw new FileAnalyseException("文件解析异常,关键词太少");
+        }
+
+        // Count every segmented word once up front instead of rescanning the
+        // whole word list for each keyword.
+        Map<String, Integer> frequencies = new HashMap<>();
+        for (Term term : HanLP.segment(text)) {
+            frequencies.merge(term.word, 1, Integer::sum);
+        }
+
+        Map<String, Integer> wordCount = new HashMap<>(keyList.size());
+        for (String keyword : keyList) {
+            wordCount.put(keyword, frequencies.getOrDefault(keyword, 0));
+        }
+        return wordCount;
+    }
+
+    /**
+     * Appends the comparison result to the given file using UTF-8.
+     *
+     * @param filePath path of the file to append to
+     * @param content  the result text to append
+     */
+    public static void writeFile(String filePath, String content) {
+        FileUtil.appendString(content, filePath, "utf-8");
+    }
+}
\ No newline at end of file
diff --git a/simhash/src/test/java/MainTest.java b/simhash/src/test/java/MainTest.java
new file mode 100644
index 0000000000000000000000000000000000000000..c9879a91f58a09a0e900f56a8b99184c362eb456
--- /dev/null
+++ b/simhash/src/test/java/MainTest.java
@@ -0,0 +1,17 @@
+import org.junit.jupiter.api.Test;
+public class MainTest {
+ static String writeFilePath = "E:\\测试文本\\write.txt";
+ static String OrigFilePath = "E:\\测试文本\\orig.txt";
+ static String CopyFilePath1 = "E:\\测试文本\\orig_0.8_add.txt";
+ /**
+ * 测试主函数
+ */
+ @Test
+ void testMain(){
+ String[] args = new String[3];
+ args[0] = OrigFilePath;
+ args[1] = CopyFilePath1;
+ args[2] = writeFilePath;
+ Main.main(args);
+ }
+}
\ No newline at end of file
diff --git a/simhash/target/classes/Main.class b/simhash/target/classes/Main.class
new file mode 100644
index 0000000000000000000000000000000000000000..52bc6bc9e84e51a69a679bb356dc83289f94bd74
Binary files /dev/null and b/simhash/target/classes/Main.class differ
diff --git a/simhash/target/classes/classpath.index b/simhash/target/classes/classpath.index
new file mode 100644
index 0000000000000000000000000000000000000000..710196b307608a6994e17db7259659a6ae4d8f59
Binary files /dev/null and b/simhash/target/classes/classpath.index differ
diff --git a/simhash/target/classes/exceptions/FileAnalyseException.class b/simhash/target/classes/exceptions/FileAnalyseException.class
new file mode 100644
index 0000000000000000000000000000000000000000..11551ebc3366ded552220d16ff4e26675736aec0
Binary files /dev/null and b/simhash/target/classes/exceptions/FileAnalyseException.class differ
diff --git a/simhash/target/classes/exceptions/HashException.class b/simhash/target/classes/exceptions/HashException.class
new file mode 100644
index 0000000000000000000000000000000000000000..b13d9027bd8738487895a841c52869310a3f7ffa
Binary files /dev/null and b/simhash/target/classes/exceptions/HashException.class differ
diff --git a/simhash/target/classes/exceptions/NotExistFileException.class b/simhash/target/classes/exceptions/NotExistFileException.class
new file mode 100644
index 0000000000000000000000000000000000000000..0318cb4fd72d8fbf922646f96ae0fa7864cfe6be
Binary files /dev/null and b/simhash/target/classes/exceptions/NotExistFileException.class differ
diff --git a/simhash/target/classes/utils/CalculationUtils.class b/simhash/target/classes/utils/CalculationUtils.class
new file mode 100644
index 0000000000000000000000000000000000000000..17eec45b74425e88d2783a66ca3419e2603382ef
Binary files /dev/null and b/simhash/target/classes/utils/CalculationUtils.class differ
diff --git a/simhash/target/classes/utils/CommonUtils.class b/simhash/target/classes/utils/CommonUtils.class
new file mode 100644
index 0000000000000000000000000000000000000000..a66b00e332940e3a56bcd0d0ec90a6908d10a234
Binary files /dev/null and b/simhash/target/classes/utils/CommonUtils.class differ
diff --git a/simhash/target/test-classes/MainTest.class b/simhash/target/test-classes/MainTest.class
new file mode 100644
index 0000000000000000000000000000000000000000..232debddd15f6020ee04f567fe236e24a519af77
Binary files /dev/null and b/simhash/target/test-classes/MainTest.class differ
diff --git a/simhash/target/test-classes/classpath.index b/simhash/target/test-classes/classpath.index
new file mode 100644
index 0000000000000000000000000000000000000000..c91390c74815757960dbd0886b529702b8074878
Binary files /dev/null and b/simhash/target/test-classes/classpath.index differ