提交 26e5eeac 编写于 作者: boomyuan0000's avatar boomyuan0000

已删除simhash/.idea/.gitignore, simhash/.idea/compiler.xml,...

已删除simhash/.idea/.gitignore, simhash/.idea/compiler.xml, simhash/.idea/encodings.xml, simhash/.idea/jarRepositories.xml, simhash/.idea/misc.xml, simhash/src/main/java/exceptions/FileAnalyseException.java, simhash/src/main/java/exceptions/HashException.java, simhash/src/main/java/exceptions/NotExistFileException.java, simhash/src/main/java/utils/CalculationUtils.java, simhash/src/main/java/utils/CommonUtils.java, simhash/src/main/java/Main.java, simhash/src/test/java/MainTest.java, simhash/target/classes/exceptions/FileAnalyseException.class, simhash/target/classes/exceptions/HashException.class, simhash/target/classes/exceptions/NotExistFileException.class, simhash/target/classes/utils/CalculationUtils.class, simhash/target/classes/utils/CommonUtils.class, simhash/target/classes/Main.class, simhash/target/classes/classpath.index, simhash/target/test-classes/MainTest.class, simhash/target/test-classes/classpath.index, simhash/pom.xml
上级 40d1dbec
# 默认忽略的文件
/shelf/
/workspace.xml
# 基于编辑器的 HTTP 客户端请求
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CompilerConfiguration">
<annotationProcessing>
<profile name="Maven default annotation processors profile" enabled="true">
<sourceOutputDir name="target/generated-sources/annotations" />
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
<outputRelativeToContentRoot value="true" />
<module name="simhash" />
</profile>
</annotationProcessing>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding">
<file url="file://$PROJECT_DIR$/src/main/java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/src/main/resources" charset="UTF-8" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RemoteRepositoriesConfiguration">
<remote-repository>
<option name="id" value="central" />
<option name="name" value="Central Repository" />
<option name="url" value="https://maven.aliyun.com/nexus/content/groups/public/" />
</remote-repository>
<remote-repository>
<option name="id" value="central" />
<option name="name" value="Maven Central repository" />
<option name="url" value="https://repo1.maven.org/maven2" />
</remote-repository>
<remote-repository>
<option name="id" value="jboss.community" />
<option name="name" value="JBoss Community repository" />
<option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" />
</remote-repository>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ExternalStorageConfigurationManager" enabled="true" />
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
</list>
</option>
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_17" default="true" project-jdk-name="17" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>simhash</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.5</version>
</dependency>
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.7.13</version>
</dependency>
<dependency>
<groupId>com.hankcs.nlp</groupId>
<artifactId>hanlp-lucene-plugin</artifactId>
<version>1.1.7</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13.2</version>
</dependency>
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-http</artifactId>
<version>5.8.14</version>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<version>5.8.2</version>
<scope>test</scope>
</dependency>
</dependencies>
<properties>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
</project>
\ No newline at end of file
import cn.hutool.core.date.DateUtil;
import exceptions.FileAnalyseException;
import exceptions.NotExistFileException;
import utils.CalculationUtils;
import utils.CommonUtils;
import java.util.Map;
public class Main {
    // Required number of command-line arguments:
    // [0] origin file, [1] compare file, [2] result output file.
    static final int ARGS_NUM = 3;

    /**
     * Entry point: computes the simHash similarity between two text files
     * and appends a formatted report to the output file.
     *
     * @param args [0] path of the origin text, [1] path of the text to compare,
     *             [2] path of the file the result is appended to
     * @throws IllegalArgumentException if the argument count is not {@link #ARGS_NUM}
     */
    public static void main(String[] args) {
        // Validate argument count before touching the file system.
        if (args.length != ARGS_NUM) {
            throw new IllegalArgumentException("参数个数不正确");
        }
        // Keyword -> frequency maps for the origin and the compared text.
        Map<String, Integer> originWordCount;
        Map<String, Integer> compareWordCount;
        try {
            originWordCount = CommonUtils.analyseText(CommonUtils.readFileToStr(args[0]));
            compareWordCount = CommonUtils.analyseText(CommonUtils.readFileToStr(args[1]));
        } catch (FileAnalyseException | NotExistFileException e) {
            // BUGFIX: previously execution continued with null maps and crashed
            // with an NPE inside calculateSimHash; abort cleanly instead.
            e.printStackTrace();
            return;
        }
        // Compute both simHash fingerprints and their similarity (two decimals).
        String simHash1 = CalculationUtils.calculateSimHash(originWordCount);
        String simHash2 = CalculationUtils.calculateSimHash(compareWordCount);
        double result = CalculationUtils.getSimilarity(simHash1, simHash2);
        String format = String.format("相似度为:%.2f", result);
        System.out.println(format);
        // Append a human-readable report entry to the output file.
        String writeFileContent = "---------------------------------------" + "\n" +
                "原文件:" + args[0] + "\n" +
                "对比文件:" + args[1] + "\n" +
                format + "\n" +
                "比较时间为:" + DateUtil.now() + "\n";
        CommonUtils.writeFile(args[2], writeFileContent);
    }
}
package exceptions;
/**
 * Checked exception thrown when a text cannot be analysed — the content is
 * blank or the keyword extraction yields too few keywords (see
 * CommonUtils.analyseText).
 */
public class FileAnalyseException extends Exception {
    /**
     * @param message detail message describing why the analysis failed
     */
    public FileAnalyseException(String message) {
        super(message);
    }
}
\ No newline at end of file
package exceptions;
import java.security.NoSuchAlgorithmException;
/**
 * Exception signalling a failure while MD5-hashing a word.
 * NOTE(review): extends NoSuchAlgorithmException, presumably so it can be
 * thrown from MessageDigest-based code paths without widening signatures —
 * confirm this design intent.
 */
public class HashException extends NoSuchAlgorithmException {
    /**
     * @param message detail message describing the hashing failure
     */
    public HashException(String message) {
        super(message);
    }
}
\ No newline at end of file
package exceptions;
import java.io.FileNotFoundException;
/**
 * Exception thrown when a file to be analysed cannot be found/read
 * (see CommonUtils.readFileToStr). Extends FileNotFoundException so
 * callers can treat it as a standard missing-file condition.
 */
public class NotExistFileException extends FileNotFoundException {
    /**
     * @param message detail message describing the missing file
     */
    public NotExistFileException(String message) {
        super(message);
    }
}
\ No newline at end of file
package utils;
import cn.hutool.core.util.StrUtil;
import exceptions.HashException;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Map;
/**
 * simHash calculation helpers: per-word MD5 hashing, weighting, merging,
 * dimensionality reduction, and similarity scoring.
 */
public class CalculationUtils {
    // Bit length of the binary hash (MD5 digest = 128 bits).
    static final int HASH_BIT = 128;

    /**
     * Hashes a word with MD5 and expands the 32-character hex digest into a
     * 128-character binary ('0'/'1') string.
     *
     * @param word the word to hash; must be non-null and non-blank
     * @return a 128-character binary string
     * @throws HashException if the word is blank, the result length is wrong,
     *                       or the MD5 algorithm is unavailable
     */
    public static String wordHash(String word) throws HashException {
        // isBlank() already covers empty and whitespace-only strings, so the
        // former triple check (null || isBlank || isEmpty) was redundant.
        if (word == null || word.isBlank()) {
            throw new HashException("词语为空");
        }
        try {
            MessageDigest digest = MessageDigest.getInstance("MD5");
            digest.update(word.getBytes(StandardCharsets.UTF_8));
            // Render the 16-byte digest as a 32-character lowercase hex string.
            StringBuilder hex = new StringBuilder();
            for (byte b : digest.digest()) {
                hex.append(String.format("%02x", b));
            }
            // Expand each hex digit into exactly 4 binary digits.
            StringBuilder finalHash = new StringBuilder(HASH_BIT);
            for (int i = 0; i < hex.length(); i++) {
                String bits = Integer.toBinaryString(Character.digit(hex.charAt(i), 16));
                // Left-pad to 4 bits ("0000".substring(4) is "").
                finalHash.append("0000", 0, 4 - bits.length()).append(bits);
            }
            // Defensive check: 32 hex digits * 4 bits must be 128.
            if (finalHash.length() != HASH_BIT) {
                throw new HashException("hash值长度不为128");
            }
            return finalHash.toString();
        } catch (NoSuchAlgorithmException e) {
            throw new HashException("MD5算法异常");
        }
    }

    /**
     * Applies a weight to a binary hash: each '1' bit becomes {@code +weight},
     * each '0' bit becomes {@code -weight}.
     *
     * @param hash   binary hash string ('0'/'1' characters)
     * @param weight weight (word frequency)
     * @return array of length {@link #HASH_BIT} with the weighted values
     */
    public static int[] hashWeight(String hash, int weight) {
        int[] weighted = new int[HASH_BIT];
        for (int i = 0; i < hash.length(); i++) {
            weighted[i] = (hash.charAt(i) == '1') ? weight : -weight;
        }
        return weighted;
    }

    /**
     * Reduces a merged weight vector to the final simHash string:
     * positive entries map to '1', all others to '0'.
     *
     * @param mergeHash merged (summed) weight vector
     * @return binary simHash string
     */
    public static String getSimHash(int[] mergeHash) {
        StringBuilder simHash = new StringBuilder(mergeHash.length);
        for (int value : mergeHash) {
            simHash.append(value > 0 ? '1' : '0');
        }
        return simHash.toString();
    }

    /**
     * Computes the simHash of a text given its keyword -> frequency map:
     * hash each keyword, weight by frequency, sum per bit, then reduce.
     *
     * @param wordCount keyword -> occurrence count
     * @return 128-character binary simHash
     */
    public static String calculateSimHash(Map<String, Integer> wordCount) {
        // Java zero-initializes int arrays; the former explicit fill loop was redundant.
        int[] mergeHash = new int[HASH_BIT];
        wordCount.forEach((word, count) -> {
            try {
                int[] weighted = hashWeight(wordHash(word), count);
                for (int i = 0; i < weighted.length; i++) {
                    mergeHash[i] += weighted[i];
                }
            } catch (HashException e) {
                // Best effort: skip a word that fails to hash (e.g. blank key).
                e.printStackTrace();
            }
        });
        return getSimHash(mergeHash);
    }

    /**
     * Computes the similarity of two simHashes via a Jaccard-style ratio:
     * {@code both-ones / (both-ones + differing-bits)}.
     *
     * @param simHash1 first binary simHash
     * @param simHash2 second binary simHash (same length as the first)
     * @return similarity in [0, 1]
     */
    public static double getSimilarity(String simHash1, String simHash2) {
        int hammingDistance = 0; // positions where the hashes differ
        int bothOnes = 0;        // positions where both hashes have '1'
        for (int i = 0; i < simHash1.length(); i++) {
            char c1 = simHash1.charAt(i);
            char c2 = simHash2.charAt(i);
            if (c1 != c2) {
                hammingDistance++;
            }
            if (c1 == '1' && c2 == '1') {
                bothOnes++;
            }
        }
        System.out.println("两个simHash的汉明距离为:" + hammingDistance);
        // BUGFIX: when both hashes are all zeros the former 0/0 returned NaN;
        // two identical hashes are fully similar.
        int union = hammingDistance + bothOnes;
        return union == 0 ? 1.0 : (double) bothOnes / union;
    }
}
\ No newline at end of file
package utils;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.StrUtil;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import exceptions.FileAnalyseException;
import exceptions.NotExistFileException;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Non-calculation helpers: file reading/writing and keyword analysis.
 */
public class CommonUtils {
    // A text must yield MORE than this many keywords to be analysable
    // (the check below throws when size <= 3, i.e. at least 4 are required).
    public static final int SHORT_WORD_LENGTH = 3;

    /**
     * Reads the whole file as a UTF-8 string.
     *
     * @param filePath path of the file to read
     * @return the file content
     * @throws NotExistFileException if the file cannot be read
     */
    public static String readFileToStr(String filePath) throws NotExistFileException {
        try {
            return FileUtil.readUtf8String(filePath);
        } catch (Exception e) {
            NotExistFileException notExist = new NotExistFileException("该绝对路径的文件不存在");
            // BUGFIX: preserve the underlying cause instead of discarding it.
            notExist.initCause(e);
            throw notExist;
        }
    }

    /**
     * Extracts keywords from the text and counts each keyword's frequency
     * among all segmented words.
     *
     * @param text text to analyse
     * @return map of keyword -> occurrence count
     * @throws FileAnalyseException if the text is blank or yields too few keywords
     */
    public static Map<String, Integer> analyseText(String text) throws FileAnalyseException {
        // StrUtil.isBlank covers empty and whitespace-only; the former extra
        // isEmpty check was redundant.
        if (text == null || StrUtil.isBlank(text)) {
            throw new FileAnalyseException("文件解析异常,解析内容为空");
        }
        // Extract up to text.length() keywords (effectively all of them).
        List<String> keyList = HanLP.extractKeyword(text, text.length());
        if (keyList.size() <= SHORT_WORD_LENGTH) {
            throw new FileAnalyseException("文件解析异常,关键词太少");
        }
        // Segment the full text so keyword frequencies can be counted.
        List<Term> termList = HanLP.segment(text);
        List<String> allWords = termList.stream().map(term -> term.word).collect(Collectors.toList());
        Map<String, Integer> wordCount = new HashMap<>(keyList.size());
        for (String keyword : keyList) {
            wordCount.put(keyword, Collections.frequency(allWords, keyword));
        }
        return wordCount;
    }

    /**
     * Appends the comparison result to the given file using UTF-8.
     *
     * @param filePath path of the output file
     * @param content  content to append
     */
    public static void writeFile(String filePath, String content) {
        FileUtil.appendString(content, filePath, "utf-8");
    }
}
\ No newline at end of file
import com.hankcs.hanlp.HanLP;
import exceptions.HashException;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import exceptions.FileAnalyseException;
import exceptions.NotExistFileException;
import utils.CalculationUtils;
import utils.CommonUtils;
import java.util.Arrays;
import java.util.Map;
public class MainTest {
//读取文件后得到的文本
static String analyseStr;
//两个示例句子
static String originSentence = "今天是星期天,天气晴,今天晚上我要去看电影。";
static String compareSentence = "今天是周天,天气晴朗,我晚上要去看电影。";
//比对结果写入的文件
static String writeFilePath = "E:\\测试文本\\write.txt";
//原文件
static String OrigFilePath = "E:\\测试文本\\orig.txt";
//5个比对文件
static String CopyFilePath1 = "E:\\测试文本\\orig_0.8_add.txt";
static String CopyFilePath2 = "E:\\测试文本\\orig_0.8_del.txt";
static String CopyFilePath3 = "E:\\测试文本\\orig_0.8_dis_1.txt";
static String CopyFilePath4 = "E:\\测试文本\\orig_0.8_dis_10.txt";
static String CopyFilePath5 = "E:\\测试文本\\orig_0.8_dis_15.txt";
/**
* 测试写入文件
*/
@Test
void testWriteFile(){
CommonUtils.writeFile(writeFilePath, "------successfully content entry------");
try {
String s = CommonUtils.readFileToStr(writeFilePath);
Assertions.assertTrue(s.contains("------successfully content entry------"),"写入文件失败");
} catch (NotExistFileException e) {
e.printStackTrace();
Assertions.fail("写入文件失败");
}
}
/**
* 测试读取不存在的文件
*/
@Test
void testReadFileNotExist(){
try {
CommonUtils.readFileToStr("E:\\not existing.txt");
Assertions.fail("没有抛出异常");
} catch (NotExistFileException e) {
e.printStackTrace();
Assertions.assertTrue(true);
}
}
/**
* 测试文件解析异常(为null,为“”,为“ ”)
*/
@Test
void testFileAnalyseException(){
try {
CommonUtils.analyseText(null);
Assertions.fail("没有抛出异常");
} catch (FileAnalyseException e) {
e.printStackTrace();
Assertions.assertTrue(true);
}
try {
CommonUtils.analyseText("");
Assertions.fail("没有抛出异常");
} catch (FileAnalyseException e) {
e.printStackTrace();
Assertions.assertTrue(true);
}
try {
CommonUtils.analyseText(" ");
Assertions.fail("没有抛出异常");
} catch (FileAnalyseException e) {
e.printStackTrace();
Assertions.assertTrue(true);
}
}
/**
* 测试读取文件并查看分词结果
*/
@Test
void testReadFile(){
try {
//测试句子分词
System.out.println("分词结果为:"+CommonUtils.analyseText(originSentence));
//测试文本分词
analyseStr = CommonUtils.readFileToStr(OrigFilePath);
System.out.println("分词结果为:"+CommonUtils.analyseText(analyseStr));
} catch (Exception e) {
e.printStackTrace();
Assertions.fail("分词结果有误");
}
}
/**
* 测试MD5算法hash计算hash,检查所得到hash值是否为128位
*/
@Test
void testWordHash(){
HanLP.extractKeyword(originSentence, originSentence.length()).forEach(
word -> {
try {
String hash = CalculationUtils.wordHash(word);
System.out.println(word +" : "+ hash);
Assertions.assertEquals(128, hash.length(), "hash值长度不是128");
} catch (HashException e) {
Assertions.fail("哈希出错");
e.printStackTrace();
}
}
);
}
/**
* 测试哈希异常(得到hash值为空)
*/
@Test
void testHashException(){
try {
CalculationUtils.wordHash("");
Assertions.fail("没有抛出异常");
} catch (HashException e) {
e.printStackTrace();
Assertions.assertTrue(true);
}
try {
CalculationUtils.wordHash(null);
Assertions.fail("没有抛出异常");
} catch (HashException e) {
e.printStackTrace();
Assertions.assertTrue(true);
}
try {
CalculationUtils.wordHash(" ");
Assertions.fail("没有抛出异常");
} catch (HashException e) {
e.printStackTrace();
Assertions.assertTrue(true);
}
}
/**
* 测试加权算法
*/
@Test
void testHashWeight(){
Map<String, Integer> map = null;
try {
map = CommonUtils.analyseText(originSentence);
} catch (FileAnalyseException e) {
e.printStackTrace();
Assertions.fail("解析错误");
}
map.forEach((word, count) -> {
try {
String hash = CalculationUtils.wordHash(word);
int[] hashWeight = CalculationUtils.hashWeight(hash,count);
//打印加权后的hash值
System.out.println(word +" : "+ Arrays.toString(hashWeight));
Assertions.assertEquals(128, hashWeight.length, "加权后的hash值长度不是128");
} catch (HashException e) {
Assertions.fail("哈希出错");
e.printStackTrace();
}
});
}
/**
* 测试计算simHash
*/
@Test
void testCalculateSimHash() {
try {
String hash1 = CalculationUtils.calculateSimHash(CommonUtils.analyseText(originSentence));
System.out.println("原句子\"" + originSentence + "\"的simHash值为:" + hash1);
Assertions.assertEquals(hash1.length(), 128, "hash值长度不是128");
String hash2=CalculationUtils.calculateSimHash(CommonUtils.analyseText((CommonUtils.readFileToStr(OrigFilePath))));
System.out.println("原文本的simHash值为:" + hash2);
Assertions.assertEquals(hash2.length(), 128, "hash值长度不是128");
} catch (FileAnalyseException | NotExistFileException e) {
e.printStackTrace();
}
}
/**
* 测试计算句子相似度
*/
@Test
void testGetSimilarity1(){
String hash1 = null;
String hash2 = null;
try {
hash1 = CalculationUtils.calculateSimHash(CommonUtils.analyseText(originSentence));
hash2 = CalculationUtils.calculateSimHash(CommonUtils.analyseText(compareSentence));
} catch (FileAnalyseException e) {
e.printStackTrace();
Assertions.fail("解析错误");
}
double similarity = CalculationUtils.getSimilarity(hash1, hash2);
String format = String.format("两个句子的相似度为:%.2f", similarity);
System.out.println(format);
Assertions.assertTrue(0 <= similarity && similarity <= 1, "相似度不在0-1之间");
}
/**
* 测试计算文本相似度
*/
@Test
void testGetSimilarity2(){
String hash1;
String hash2;
try {
hash1 = CalculationUtils.calculateSimHash(CommonUtils.analyseText(CommonUtils.readFileToStr(OrigFilePath)));
hash2 = CalculationUtils.calculateSimHash(CommonUtils.analyseText(CommonUtils.readFileToStr(CopyFilePath1)));
double similarity = CalculationUtils.getSimilarity(hash1, hash2);
String format = String.format("两个文本的相似度为:%.2f", similarity);
System.out.println(format);
Assertions.assertTrue(0 <= similarity && similarity <= 1, "相似度不在0-1之间");
} catch (FileAnalyseException | NotExistFileException e) {
e.printStackTrace();
}
}
/**
* 测试主函数
*/
@Test
void testMain(){
String[] args = new String[3];
args[0] = OrigFilePath;
args[1]=CopyFilePath1;
args[2] = writeFilePath;
Main.main(args);
args[1]=CopyFilePath2;
Main.main(args);
args[1]=CopyFilePath3;
Main.main(args);
args[1]=CopyFilePath4;
Main.main(args);
args[1]=CopyFilePath5;
Main.main(args);
args[0] = CopyFilePath3;
}
}
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册