提交 9a4510f9 编写于 作者: Y y

simhash

上级 72079d3b
......@@ -17,7 +17,9 @@ public class Main {
Map<String, Integer> originWordCount = null;
Map<String, Integer> compareWordCount = null;
try {
//得到原文本的关键词和词频
originWordCount = CommonUtils.analyseText(CommonUtils.readFileToStr(args[0]));
//以及比对文本的关键词的关键词和词频
compareWordCount = CommonUtils.analyseText(CommonUtils.readFileToStr(args[1]));
} catch (FileAnalyseException | NotExistFileException e) {
e.printStackTrace();
......@@ -28,6 +30,7 @@ public class Main {
//计算相似度,保留两位小数
double result = CalculationUtils.getSimilarity(simHash1, simHash2);
String format = String.format("相似度为:%.2f", result);
System.out.println(format);
String writeFileContent = "---------------------------------------" + "\n" +
"原文件:" + args[0] + "\n" +
"对比文件:" + args[1] + "\n" +
......
package exceptions;
/**
* @author HJW
* @date 2022-09-21 12:57
* 文件解析异常(转字符串为空或者过滤时没有可用词)
* 文件解析异常
*/
public class FileAnalyseException extends Exception {
public FileAnalyseException(String message) {
......
......@@ -3,9 +3,7 @@ package exceptions;
import java.security.NoSuchAlgorithmException;
/**
* @author HJW
* @date 2022-09-21 12:57
* hash异常 md5
* MD5算法hash异常
*/
public class HashException extends NoSuchAlgorithmException {
public HashException(String message) {
......
package exceptions;
import java.io.FileNotFoundException;
/**
* @author HJW
* 找不到文件的自定义异常
* 找不到文件的文件解析异常
*/
public class NotExistFileException extends FileNotFoundException {
public NotExistFileException(String message) {
......
......@@ -11,15 +11,12 @@ import java.util.Map;
* 与计算有关的工具类
*/
public class CalculationUtils {
//hash码长度为128
static final int HASH_BIT = 128;
static final int DISTANCE_WAY1 = 16;
static final int DISTANCE_WAY2 = 32;
static final int DISTANCE_WAY3 = 64;
/**
* 采用MD5进行对词语进行hash,得到的hash值使用16进制解析 再利用算法取128位二进制
* 采用MD5算法对关键词进行hash,得到的hash值使用16进制解析,再利用算法取128位二进制数作为hash值
* @param word 词语
* @return 128位二进制
* @return 128位二进制hash值
*/
public static String wordHash(String word) throws HashException {
//如果传入词语为null或“”或“ ”
......@@ -30,36 +27,31 @@ public class CalculationUtils {
// 采用MD5算法进行hash
MessageDigest digest = MessageDigest.getInstance("MD5");
digest.update(word.getBytes(StandardCharsets.UTF_8));
// hash值转为32位16进制
// hash值转为32位16进制的散列值
StringBuilder hash = new StringBuilder();
for (byte b : digest.digest()) {
hash.append(String.format("%02x", b));
}
// 16进制转为128位2进制码
// 16进制的散列值转为128位二进制码
StringBuilder finalHash = new StringBuilder();
String strTemp;
for (int i = 0; i < hash.length(); i ++) {
// 每一位16进制数加上0000 最后截取后面的4位 得到便是这位数的二进制
for (int i = 0; i < hash.length(); i++) {
// 每一位16进制数加上0000,最后截取后4位,得到便是这位数的二进制
strTemp = "0000" + Integer.toBinaryString(Integer.parseInt(hash.substring(i, i + 1), 16));
finalHash.append(strTemp.substring(strTemp.length() - 4));
}
// 不为128直接报错
// 不为128则为hash异常
if (finalHash.length() != HASH_BIT) {
throw new HashException("hash值长度不为128");
}
return finalHash.toString();
} catch (NoSuchAlgorithmException e) {
throw new HashException("MD5算法异常");
}
}
/**
* 给二进制哈希值加权
* 给二进制hash值加权
* @param hash 二进制哈希值
* @param weight 权重
* @return 加权后的二进制哈希值
......@@ -75,12 +67,11 @@ public class CalculationUtils {
hashArray[i] = -1 * weight;
}
}
return hashArray;
}
/**
* 得到的合并后的hash值进行降维,最终得到simHash
* 合并后的hash进行降维,最终得到simHash
* @param mergeHash 合并后的hash值
* @return sim哈希值
*/
......@@ -98,7 +89,6 @@ public class CalculationUtils {
return simHash.toString();
}
/**
* 根据词语得到simHash
* @param wordCount 词语及其出现次数
......@@ -113,7 +103,7 @@ public class CalculationUtils {
// 遍历词语及其出现次数,对每一个词语进行hash加权,然后合并
wordCount.forEach((word,count) -> {
try {
int[] tempHash = hashWeight(wordHash(word),count);
int[] tempHash = hashWeight(wordHash(word),count);//加权后的hash值
for (int i = 0; i < tempHash.length; i++) {
mergeHash[i] += tempHash[i];
}
......@@ -121,7 +111,6 @@ public class CalculationUtils {
e.printStackTrace();
}
});
// 降维得到simHash
return getSimHash(mergeHash);
}
......@@ -133,26 +122,20 @@ public class CalculationUtils {
* @return 相似度
*/
public static double getSimilarity(String simHash1, String simHash2) {
    // Computes the similarity of two binary simHash strings as a Jaccard coefficient:
    //   (positions where both hashes are '1') / (positions where at least one hash is '1').
    // The second quantity equals hammingDistance + same, because every differing
    // position of two binary strings has exactly one '1'.
    //
    // simHash1, simHash2: strings made of '0'/'1' characters; they must have equal length.
    // Returns a value in [0, 1]; 1.0 when neither hash contains a '1' bit (identical hashes).
    // Throws IllegalArgumentException when the lengths differ, instead of failing with an
    // index error or silently ignoring the tail of the longer hash.
    if (simHash1.length() != simHash2.length()) {
        throw new IllegalArgumentException(
                "simHash lengths differ: " + simHash1.length() + " vs " + simHash2.length());
    }

    int hamingDistance = 0; // number of positions whose characters differ
    int same = 0;           // number of positions where both characters are '1'
    for (int i = 0; i < simHash1.length(); i++) {
        char first = simHash1.charAt(i);
        char second = simHash2.charAt(i);
        if (first != second) {
            hamingDistance++;
        } else if (first == '1') {
            same++;
        }
    }
    System.out.println("两个simHash的汉明距离为:" + hamingDistance);

    int union = hamingDistance + same;
    if (union == 0) {
        // No '1' bit in either hash: the hashes are identical, and 0/0 would yield NaN.
        return 1.0;
    }
    return (double) same / union;
}
}
\ No newline at end of file
import com.hankcs.hanlp.HanLP;
import exceptions.HashException;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import exceptions.FileAnalyseException;
import exceptions.NotExistFileException;
import utils.CalculationUtils;
import utils.CommonUtils;
import java.util.Arrays;
import java.util.Map;
public class MainTest {
// Text obtained after reading a file
static String analyseStr;
// Two sample sentences used by the sentence-level tests
static String originSentence = "今天是星期天,天气晴,今天晚上我要去看电影。";
static String compareSentence = "今天是周天,天气晴朗,我晚上要去看电影。";
// File that the comparison result is written to
static String writeFilePath = "E:\\测试文本\\write.txt";
// Original file
static String OrigFilePath = "E:\\测试文本\\orig.txt";
// The five files compared against the original
static String CopyFilePath1 = "E:\\测试文本\\orig_0.8_add.txt";
static String CopyFilePath2 = "E:\\测试文本\\orig_0.8_del.txt";
static String CopyFilePath3 = "E:\\测试文本\\orig_0.8_dis_1.txt";
static String CopyFilePath4 = "E:\\测试文本\\orig_0.8_dis_10.txt";
static String CopyFilePath5 = "E:\\测试文本\\orig_0.8_dis_15.txt";
/**
 * Writes a marker string to the result file and checks that reading the
 * file back yields text containing that marker.
 */
@Test
void testWriteFile(){
    final String marker = "------successfully content entry------";
    CommonUtils.writeFile(writeFilePath, marker);

    String readBack;
    try {
        readBack = CommonUtils.readFileToStr(writeFilePath);
    } catch (NotExistFileException e) {
        e.printStackTrace();
        Assertions.fail("写入文件失败");
        return; // not reached: fail() always throws; needed for definite assignment
    }
    Assertions.assertTrue(readBack.contains(marker),"写入文件失败");
}
/**
 * Reading a path that does not exist must raise NotExistFileException.
 */
@Test
void testReadFileNotExist(){
    boolean raised = false;
    try {
        CommonUtils.readFileToStr("E:\\not existing.txt");
    } catch (NotExistFileException e) {
        e.printStackTrace();
        raised = true;
    }
    if (!raised) {
        // The read returned normally, so the expected exception was never thrown.
        Assertions.fail("没有抛出异常");
    }
    Assertions.assertTrue(true);
}
/**
 * Text analysis must raise FileAnalyseException for null, the empty string
 * and a single space, checked in that order.
 */
@Test
void testFileAnalyseException(){
    String[] invalidTexts = {null, "", " "};
    for (String invalidText : invalidTexts) {
        try {
            CommonUtils.analyseText(invalidText);
            Assertions.fail("没有抛出异常");
        } catch (FileAnalyseException e) {
            e.printStackTrace();
            Assertions.assertTrue(true);
        }
    }
}
/**
 * Prints the word-segmentation result for the sample sentence and for the
 * original file; any exception along the way fails the test.
 */
@Test
void testReadFile(){
    try {
        // Segment the sample sentence
        Map<String, Integer> sentenceWords = CommonUtils.analyseText(originSentence);
        System.out.println("分词结果为:" + sentenceWords);

        // Read the original file, then segment its text
        analyseStr = CommonUtils.readFileToStr(OrigFilePath);
        Map<String, Integer> fileWords = CommonUtils.analyseText(analyseStr);
        System.out.println("分词结果为:" + fileWords);
    } catch (Exception e) {
        e.printStackTrace();
        Assertions.fail("分词结果有误");
    }
}
/**
 * Hashes every keyword extracted from the sample sentence and checks that
 * each hash string is 128 characters long.
 */
@Test
void testWordHash(){
    for (String word : HanLP.extractKeyword(originSentence, originSentence.length())) {
        try {
            String hash = CalculationUtils.wordHash(word);
            System.out.println(word +" : "+ hash);
            Assertions.assertEquals(128, hash.length(), "hash值长度不是128");
        } catch (HashException e) {
            // Attach the cause to the failure. Previously fail() ran before
            // printStackTrace(), so the stack trace was never printed.
            Assertions.fail("哈希出错", e);
        }
    }
}
/**
 * Hashing must raise HashException for the empty string, null and a single
 * space, checked in that order.
 */
@Test
void testHashException(){
    String[] invalidWords = {"", null, " "};
    for (String invalidWord : invalidWords) {
        try {
            CalculationUtils.wordHash(invalidWord);
            Assertions.fail("没有抛出异常");
        } catch (HashException e) {
            e.printStackTrace();
            Assertions.assertTrue(true);
        }
    }
}
/**
 * Applies the weighting step to the hash of every word of the sample
 * sentence and checks that each weighted array has 128 entries.
 */
@Test
void testHashWeight(){
    Map<String, Integer> wordCounts;
    try {
        wordCounts = CommonUtils.analyseText(originSentence);
    } catch (FileAnalyseException e) {
        e.printStackTrace();
        Assertions.fail("解析错误");
        return; // not reached: fail() always throws; needed for definite assignment
    }

    for (Map.Entry<String, Integer> entry : wordCounts.entrySet()) {
        String word = entry.getKey();
        try {
            String hash = CalculationUtils.wordHash(word);
            int[] hashWeight = CalculationUtils.hashWeight(hash, entry.getValue());
            // Print the weighted hash
            System.out.println(word +" : "+ Arrays.toString(hashWeight));
            Assertions.assertEquals(128, hashWeight.length, "加权后的hash值长度不是128");
        } catch (HashException e) {
            // Attach the cause to the failure. Previously fail() ran before
            // printStackTrace(), so the stack trace was never printed.
            Assertions.fail("哈希出错", e);
        }
    }
}
/**
 * Computes the simHash of the sample sentence and of the original file and
 * checks that both are 128 characters long. A read or analysis error fails
 * the test instead of being printed and ignored.
 */
@Test
void testCalculateSimHash() {
    try {
        String hash1 = CalculationUtils.calculateSimHash(CommonUtils.analyseText(originSentence));
        System.out.println("原句子\"" + originSentence + "\"的simhash值为:" + hash1);
        // JUnit expects (expected, actual); the reversed order produced misleading messages.
        Assertions.assertEquals(128, hash1.length(), "hash值长度不是128");

        String hash2 = CalculationUtils.calculateSimHash(
                CommonUtils.analyseText(CommonUtils.readFileToStr(OrigFilePath)));
        System.out.println("原文本的simhash值为:" + hash2);
        Assertions.assertEquals(128, hash2.length(), "hash值长度不是128");
    } catch (FileAnalyseException | NotExistFileException e) {
        // Without this the test passed even though nothing had been verified.
        Assertions.fail("读取或解析文件失败", e);
    }
}
/**
 * Computes the similarity of the two sample sentences and checks that the
 * value lies between 0 and 1 inclusive.
 */
@Test
void testGetSimilarity1(){
    String originHash = null;
    String compareHash = null;
    try {
        originHash = CalculationUtils.calculateSimHash(CommonUtils.analyseText(originSentence));
        compareHash = CalculationUtils.calculateSimHash(CommonUtils.analyseText(compareSentence));
    } catch (FileAnalyseException e) {
        e.printStackTrace();
        Assertions.fail("解析错误");
    }

    double similarity = CalculationUtils.getSimilarity(originHash, compareHash);
    System.out.println(String.format("两个句子的相似度为:%.2f", similarity));

    boolean withinRange = similarity >= 0 && similarity <= 1;
    Assertions.assertTrue(withinRange, "相似度不在0-1之间");
}
/**
 * Computes the similarity of the original file and the first comparison file
 * and checks that the value lies between 0 and 1 inclusive. A read or
 * analysis error fails the test instead of being printed and ignored.
 */
@Test
void testGetSimilarity2(){
    try {
        String hash1 = CalculationUtils.calculateSimHash(
                CommonUtils.analyseText(CommonUtils.readFileToStr(OrigFilePath)));
        String hash2 = CalculationUtils.calculateSimHash(
                CommonUtils.analyseText(CommonUtils.readFileToStr(CopyFilePath1)));
        double similarity = CalculationUtils.getSimilarity(hash1, hash2);
        String format = String.format("两个文本的相似度为:%.2f", similarity);
        System.out.println(format);
        Assertions.assertTrue(0 <= similarity && similarity <= 1, "相似度不在0-1之间");
    } catch (FileAnalyseException | NotExistFileException e) {
        // Without this the test passed even though the similarity was never checked.
        Assertions.fail("读取或解析文件失败", e);
    }
}
/**
* 测试主函数
*/
......@@ -10,8 +235,18 @@ public class MainTest {
void testMain(){
    // Runs Main.main once per comparison file, always against the original file.
    // Argument slots as used here: [0] original file, [1] file to compare,
    // [2] the path declared above as the result output file.
    String[] copyFilePaths = {
            CopyFilePath1, CopyFilePath2, CopyFilePath3, CopyFilePath4, CopyFilePath5
    };

    String[] args = new String[3];
    args[0] = OrigFilePath;
    args[2] = writeFilePath;
    for (String copyFilePath : copyFilePaths) {
        args[1] = copyFilePath;
        Main.main(args);
    }
}
}
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册