提交 72079d3b 编写于 作者: Y y

simhash

上级 0b52d435
# 默认忽略的文件
/shelf/
/workspace.xml
# 基于编辑器的 HTTP 客户端请求
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="CompilerConfiguration">
<annotationProcessing>
<profile name="Maven default annotation processors profile" enabled="true">
<sourceOutputDir name="target/generated-sources/annotations" />
<sourceTestOutputDir name="target/generated-test-sources/test-annotations" />
<outputRelativeToContentRoot value="true" />
<module name="simhash" />
</profile>
</annotationProcessing>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Encoding">
<file url="file://$PROJECT_DIR$/src/main/java" charset="UTF-8" />
<file url="file://$PROJECT_DIR$/src/main/resources" charset="UTF-8" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="RemoteRepositoriesConfiguration">
<remote-repository>
<option name="id" value="central" />
<option name="name" value="Central Repository" />
<option name="url" value="http://maven.aliyun.com/nexus/content/groups/public/" />
</remote-repository>
<remote-repository>
<option name="id" value="central" />
<option name="name" value="Maven Central repository" />
<option name="url" value="https://repo1.maven.org/maven2" />
</remote-repository>
<remote-repository>
<option name="id" value="jboss.community" />
<option name="name" value="JBoss Community repository" />
<option name="url" value="https://repository.jboss.org/nexus/content/repositories/public/" />
</remote-repository>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ExternalStorageConfigurationManager" enabled="true" />
<component name="MavenProjectsManager">
<option name="originalFiles">
<list>
<option value="$PROJECT_DIR$/pom.xml" />
</list>
</option>
</component>
<component name="ProjectRootManager" version="2" languageLevel="JDK_17" default="true" project-jdk-name="17" project-jdk-type="JavaSDK">
<output url="file://$PROJECT_DIR$/out" />
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven build descriptor for the simhash text-similarity tool. -->
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>simhash</artifactId>
<version>1.0-SNAPSHOT</version>
<dependencies>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.5</version>
</dependency>
<!-- hutool: the sources import cn.hutool.core.* (DateUtil, StrUtil, FileUtil). -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.7.13</version>
</dependency>
<!-- HanLP: the sources import com.hankcs.hanlp.* for keyword extraction and segmentation. -->
<dependency>
<groupId>com.hankcs.nlp</groupId>
<artifactId>hanlp-lucene-plugin</artifactId>
<version>1.1.7</version>
</dependency>
<!-- NOTE(review): no <scope> is given here, so JUnit 4 ends up on the compile classpath;
     the visible test class only uses org.junit.jupiter - confirm whether this is needed. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.13.2</version>
</dependency>
<!-- NOTE(review): a second hutool artifact at a different version (5.8.14 vs 5.7.13 above);
     presumably hutool-all already bundles the http module - verify and align versions. -->
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-http</artifactId>
<version>5.8.14</version>
</dependency>
<!-- NOTE(review): only the Jupiter API is declared; no Jupiter engine is visible in this
     file - confirm that tests are actually discovered and run by the build. -->
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
<version>5.8.2</version>
<scope>test</scope>
</dependency>
</dependencies>
<properties>
<!-- Compile for Java 17 with UTF-8 sources. -->
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
</project>
\ No newline at end of file
import cn.hutool.core.date.DateUtil;
import exceptions.FileAnalyseException;
import exceptions.NotExistFileException;
import utils.CalculationUtils;
import utils.CommonUtils;
import java.util.Map;
/**
 * Command-line entry point of the similarity checker.
 *
 * <p>Reads two text files, computes a SimHash for each, derives a similarity score from the
 * two hashes and appends a short report to an output file.
 */
public class Main {
    // Required number of arguments: original file, file to compare, report file.
    static final int ARGS_NUM = 3;

    /**
     * Runs one comparison.
     *
     * @param args {@code args[0]} path of the original file, {@code args[1]} path of the file
     *             to compare against it, {@code args[2]} path of the report file to append to
     * @throws IllegalArgumentException if the number of arguments is not {@link #ARGS_NUM}
     * @throws IllegalStateException    if either input file cannot be read or analysed; the
     *                                  underlying exception is attached as the cause
     */
    public static void main(String[] args) {
        if (args == null || args.length != ARGS_NUM) {
            throw new IllegalArgumentException("参数个数不正确");
        }

        // Read both files and turn them into keyword -> frequency maps.
        Map<String, Integer> originWordCount;
        Map<String, Integer> compareWordCount;
        try {
            originWordCount = CommonUtils.analyseText(CommonUtils.readFileToStr(args[0]));
            compareWordCount = CommonUtils.analyseText(CommonUtils.readFileToStr(args[1]));
        } catch (FileAnalyseException | NotExistFileException e) {
            // Fail fast: continuing with null maps would only surface later as an
            // unrelated NullPointerException and hide the real reason.
            throw new IllegalStateException("文件读取或解析失败:" + e.getMessage(), e);
        }

        // One SimHash per document.
        String simHash1 = CalculationUtils.calculateSimHash(originWordCount);
        String simHash2 = CalculationUtils.calculateSimHash(compareWordCount);

        // Similarity, reported with two decimal places.
        double result = CalculationUtils.getSimilarity(simHash1, simHash2);
        String format = String.format("相似度为:%.2f", result);
        String writeFileContent = "---------------------------------------" + "\n" +
                "原文件:" + args[0] + "\n" +
                "对比文件:" + args[1] + "\n" +
                format + "\n" +
                "比较时间为:" + DateUtil.now() + "\n";

        CommonUtils.writeFile(args[2], writeFileContent);
    }
}
package exceptions;
/**
 * Checked exception raised when file content cannot be analysed, e.g. the text read from the
 * file is empty or no usable words remain after filtering.
 *
 * @author HJW
 * @date 2022-09-21 12:57
 */
public class FileAnalyseException extends Exception {
    /**
     * @param message description of the analysis failure
     */
    public FileAnalyseException(String message) {
        super(message);
    }

    /**
     * @param message description of the analysis failure
     * @param cause   underlying exception that triggered the failure; may be {@code null}
     */
    public FileAnalyseException(String message, Throwable cause) {
        super(message, cause);
    }
}
\ No newline at end of file
package exceptions;
import java.security.NoSuchAlgorithmException;
/**
 * Exception raised when hashing a word fails (the hash in this project is MD5 based).
 *
 * @author HJW
 * @date 2022-09-21 12:57
 */
public class HashException extends NoSuchAlgorithmException {
    /**
     * @param message description of the hashing failure
     */
    public HashException(String message) {
        super(message);
    }

    /**
     * @param message description of the hashing failure
     * @param cause   underlying exception that triggered the failure; may be {@code null}
     */
    public HashException(String message, Throwable cause) {
        super(message, cause);
    }
}
\ No newline at end of file
package exceptions;
import java.io.FileNotFoundException;
/**
 * Custom exception raised when a file cannot be found.
 *
 * @author HJW
 */
public class NotExistFileException extends FileNotFoundException {
    /**
     * @param message description of the missing file
     */
    public NotExistFileException(String message) {
        super(message);
    }

    /**
     * @param message description of the missing file
     * @param cause   underlying exception that triggered the failure; may be {@code null}
     */
    public NotExistFileException(String message, Throwable cause) {
        super(message);
        // FileNotFoundException offers no (message, cause) constructor, so attach it afterwards.
        initCause(cause);
    }
}
\ No newline at end of file
package utils;
import cn.hutool.core.util.StrUtil;
import exceptions.HashException;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Map;
/**
 * Calculation helpers for SimHash-based text similarity: per-word hashing, weighting,
 * merging into a SimHash and comparing two SimHashes.
 */
public class CalculationUtils {
    // Length in bits of a word hash and of the resulting SimHash (an MD5 digest is 128 bits).
    static final int HASH_BIT = 128;
    // Inclusive upper bounds of the three Hamming-distance bands used by getSimilarity.
    static final int DISTANCE_WAY1 = 16;
    static final int DISTANCE_WAY2 = 32;
    static final int DISTANCE_WAY3 = 64;

    /**
     * Hashes a word with MD5 and renders the digest as a string of 128 binary digits,
     * most significant bit of each byte first.
     *
     * @param word the word to hash
     * @return a 128-character string consisting of '0' and '1'
     * @throws HashException if the word is null or blank, MD5 is unavailable, or the
     *                       resulting bit string is not 128 characters long
     */
    public static String wordHash(String word) throws HashException {
        if (word == null || StrUtil.isBlank(word)) {
            throw new HashException("词语为空");
        }

        // Keep the try block narrow: HashException is itself a NoSuchAlgorithmException, so
        // throwing one inside this try would be caught below and re-labelled as an MD5 error.
        final byte[] md5;
        try {
            md5 = MessageDigest.getInstance("MD5").digest(word.getBytes(StandardCharsets.UTF_8));
        } catch (NoSuchAlgorithmException e) {
            HashException wrapped = new HashException("MD5算法异常");
            wrapped.initCause(e);
            throw wrapped;
        }

        // Emit every byte as 8 binary digits; this yields the same string as converting the
        // digest to hex and then each hex digit to 4 bits.
        StringBuilder bits = new StringBuilder(HASH_BIT);
        for (byte b : md5) {
            for (int shift = Byte.SIZE - 1; shift >= 0; shift--) {
                bits.append((b >> shift) & 1);
            }
        }

        if (bits.length() != HASH_BIT) {
            throw new HashException("hash值长度不为128");
        }
        return bits.toString();
    }

    /**
     * Applies a weight to a binary hash: a '1' digit becomes {@code +weight}, any other
     * character becomes {@code -weight}.
     *
     * @param hash   binary hash string of at most {@link #HASH_BIT} characters
     * @param weight weight of the word
     * @return an array of {@link #HASH_BIT} weighted values; positions beyond the length of
     *         {@code hash} stay 0
     * @throws IllegalArgumentException if {@code hash} is longer than {@link #HASH_BIT}
     */
    public static int[] hashWeight(String hash, int weight) {
        if (hash.length() > HASH_BIT) {
            throw new IllegalArgumentException("hash值长度超过" + HASH_BIT);
        }
        int[] weighted = new int[HASH_BIT];
        for (int i = 0; i < hash.length(); i++) {
            weighted[i] = hash.charAt(i) == '1' ? weight : -weight;
        }
        return weighted;
    }

    /**
     * Reduces a merged weighted hash to a SimHash: positive entries become '1', all other
     * entries (zero or negative) become '0'.
     *
     * @param mergeHash merged weighted hash
     * @return the SimHash as a string of binary digits, one per array entry
     */
    public static String getSimHash(int[] mergeHash){
        StringBuilder simHash = new StringBuilder(mergeHash.length);
        for (int value : mergeHash) {
            simHash.append(value > 0 ? '1' : '0');
        }
        return simHash.toString();
    }

    /**
     * Computes the SimHash of a document from its words and their occurrence counts.
     *
     * <p>Each word is hashed, weighted by its count and summed position-wise; the sum is then
     * reduced with {@link #getSimHash(int[])}. Words that cannot be hashed are reported on
     * standard error and skipped.
     *
     * @param wordCount words and their occurrence counts
     * @return the SimHash of the document
     * @throws IllegalArgumentException if {@code wordCount} is null
     */
    public static String calculateSimHash(Map<String,Integer> wordCount){
        if (wordCount == null) {
            throw new IllegalArgumentException("词语统计结果为空");
        }

        // A new int array is already zero-filled.
        int[] mergeHash = new int[HASH_BIT];
        for (Map.Entry<String, Integer> entry : wordCount.entrySet()) {
            try {
                int[] weighted = hashWeight(wordHash(entry.getKey()), entry.getValue());
                for (int i = 0; i < weighted.length; i++) {
                    mergeHash[i] += weighted[i];
                }
            } catch (HashException e) {
                // Best effort: an unhashable word is reported and left out of the SimHash.
                e.printStackTrace();
            }
        }
        return getSimHash(mergeHash);
    }

    /**
     * Computes the similarity of two SimHashes from their Hamming distance.
     *
     * <p>The score is piecewise: distance up to {@link #DISTANCE_WAY1} gives
     * {@code 1 - d/256}, up to {@link #DISTANCE_WAY2} gives {@code 1 - d/128}, up to
     * {@link #DISTANCE_WAY3} gives {@code 1 - d/64}, and anything larger gives 0.
     *
     * @param simHash1 first SimHash
     * @param simHash2 second SimHash, same length as the first
     * @return the similarity score
     * @throws IllegalArgumentException if either argument is null or the lengths differ
     */
    public static double getSimilarity(String simHash1, String simHash2) {
        if (simHash1 == null || simHash2 == null || simHash1.length() != simHash2.length()) {
            throw new IllegalArgumentException("simHash长度不一致");
        }

        // Hamming distance: number of positions at which the two hashes differ.
        int distance = 0;
        for (int i = 0; i < simHash1.length(); i++) {
            if (simHash1.charAt(i) != simHash2.charAt(i)) {
                distance++;
            }
        }

        if (distance <= DISTANCE_WAY1) {
            return 1 - (double) distance / 256;
        }
        if (distance <= DISTANCE_WAY2) {
            return 1 - (double) distance / 128;
        }
        if (distance <= DISTANCE_WAY3) {
            return 1 - (double) distance / 64;
        }
        return 0;
    }
}
\ No newline at end of file
package utils;
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.StrUtil;
import com.hankcs.hanlp.HanLP;
import com.hankcs.hanlp.seg.common.Term;
import exceptions.FileAnalyseException;
import exceptions.NotExistFileException;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Non-calculation helpers: reading input files, turning text into keyword frequencies and
 * writing the report.
 */
public class CommonUtils {
    // Minimum number of extracted keywords a text must yield to be analysed.
    public static final int SHORT_WORD_LENGTH = 3;

    /**
     * Reads a whole file as UTF-8 text.
     *
     * @param filePath path of the file to read
     * @return the text read from the file
     * @throws NotExistFileException if reading fails for any reason; the underlying exception
     *                               is attached as the cause
     */
    public static String readFileToStr(String filePath) throws NotExistFileException {
        try {
            return FileUtil.readUtf8String(filePath);
        } catch (Exception e) {
            NotExistFileException wrapped = new NotExistFileException("该绝对路径的文件不存在");
            // Keep the real reason (missing file, permissions, ...) for diagnosis.
            wrapped.initCause(e);
            throw wrapped;
        }
    }

    /**
     * Extracts the keywords of a text and counts how often each one occurs among the
     * segmented words of that text.
     *
     * @param text the text to analyse
     * @return map from keyword to its number of occurrences in the segmented text
     * @throws FileAnalyseException if the text is null or blank, or fewer than
     *                              {@link #SHORT_WORD_LENGTH} keywords are extracted
     */
    public static Map<String,Integer> analyseText(String text) throws FileAnalyseException {
        if (text == null || StrUtil.isBlank(text)) {
            throw new FileAnalyseException("文件解析异常,解析内容为空");
        }

        List<String> keyList = HanLP.extractKeyword(text, text.length());
        // The documented minimum is SHORT_WORD_LENGTH keywords, so exactly that many is accepted.
        if (keyList.size() < SHORT_WORD_LENGTH) {
            throw new FileAnalyseException("文件解析异常,关键词太少");
        }

        // Count every segmented word once up front instead of rescanning the full word
        // list for each keyword.
        List<Term> termList = HanLP.segment(text);
        Map<String, Integer> frequencies = new HashMap<>();
        for (Term term : termList) {
            frequencies.merge(term.word, 1, Integer::sum);
        }

        Map<String,Integer> wordCount = new HashMap<>(keyList.size());
        for (String keyword : keyList) {
            wordCount.put(keyword, frequencies.getOrDefault(keyword, 0));
        }
        return wordCount;
    }

    /**
     * Appends the comparison result to the given file using UTF-8.
     *
     * @param filePath path of the report file
     * @param content  report text to append
     */
    public static void writeFile(String filePath, String content) {
        FileUtil.appendString(content, filePath, "utf-8");
    }
}
\ No newline at end of file
import org.junit.jupiter.api.Test;
/**
 * Smoke test that drives {@code Main.main} with fixed file paths.
 */
public class MainTest {
    // Absolute paths on the E: drive: report file, original text and the text to compare.
    static String writeFilePath = "E:\\测试文本\\write.txt";
    static String OrigFilePath = "E:\\测试文本\\orig.txt";
    static String CopyFilePath1 = "E:\\测试文本\\orig_0.8_add.txt";

    /**
     * Runs the main entry point with the original file, the comparison file and the
     * report file, in that order.
     */
    @Test
    void testMain(){
        Main.main(new String[] {OrigFilePath, CopyFilePath1, writeFilePath});
    }
}
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册