Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
boomyuan0000
3221005153
提交
9a4510f9
3
3221005153
项目概览
boomyuan0000
/
3221005153
通知
3
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
3
3221005153
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
前往新版Gitcode,体验更适合开发者的 AI 搜索 >>
提交
9a4510f9
编写于
3月 08, 2023
作者:
Y
y
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
simhash
上级
72079d3b
变更
12
隐藏空白更改
内联
并排
Showing
12 changed files
with
263 additions
and
49 deletions
+263
-49
simhash/src/main/java/Main.java
simhash/src/main/java/Main.java
+3
-0
simhash/src/main/java/exceptions/FileAnalyseException.java
simhash/src/main/java/exceptions/FileAnalyseException.java
+1
-3
simhash/src/main/java/exceptions/HashException.java
simhash/src/main/java/exceptions/HashException.java
+1
-3
simhash/src/main/java/exceptions/NotExistFileException.java
simhash/src/main/java/exceptions/NotExistFileException.java
+1
-4
simhash/src/main/java/utils/CalculationUtils.java
simhash/src/main/java/utils/CalculationUtils.java
+21
-38
simhash/src/test/java/MainTest.java
simhash/src/test/java/MainTest.java
+236
-1
simhash/target/classes/Main.class
simhash/target/classes/Main.class
+0
-0
simhash/target/classes/exceptions/FileAnalyseException.class
simhash/target/classes/exceptions/FileAnalyseException.class
+0
-0
simhash/target/classes/exceptions/HashException.class
simhash/target/classes/exceptions/HashException.class
+0
-0
simhash/target/classes/exceptions/NotExistFileException.class
...ash/target/classes/exceptions/NotExistFileException.class
+0
-0
simhash/target/classes/utils/CalculationUtils.class
simhash/target/classes/utils/CalculationUtils.class
+0
-0
simhash/target/test-classes/MainTest.class
simhash/target/test-classes/MainTest.class
+0
-0
未找到文件。
simhash/src/main/java/Main.java
浏览文件 @
9a4510f9
...
...
@@ -17,7 +17,9 @@ public class Main {
Map
<
String
,
Integer
>
originWordCount
=
null
;
Map
<
String
,
Integer
>
compareWordCount
=
null
;
try
{
//得到原文本的关键词和词频
originWordCount
=
CommonUtils
.
analyseText
(
CommonUtils
.
readFileToStr
(
args
[
0
]));
//以及比对文本的关键词的关键词和词频
compareWordCount
=
CommonUtils
.
analyseText
(
CommonUtils
.
readFileToStr
(
args
[
1
]));
}
catch
(
FileAnalyseException
|
NotExistFileException
e
)
{
e
.
printStackTrace
();
...
...
@@ -28,6 +30,7 @@ public class Main {
//计算相似度,保留两位小数
double
result
=
CalculationUtils
.
getSimilarity
(
simHash1
,
simHash2
);
String
format
=
String
.
format
(
"相似度为:%.2f"
,
result
);
System
.
out
.
println
(
format
);
String
writeFileContent
=
"---------------------------------------"
+
"\n"
+
"原文件:"
+
args
[
0
]
+
"\n"
+
"对比文件:"
+
args
[
1
]
+
"\n"
+
...
...
simhash/src/main/java/exceptions/FileAnalyseException.java
浏览文件 @
9a4510f9
package
exceptions
;
/**
* @author HJW
* @date 2022-09-21 12:57
* 文件解析异常(转字符串为空或者过滤时没有可用词)
* 文件解析异常
*/
public
class
FileAnalyseException
extends
Exception
{
public
FileAnalyseException
(
String
message
)
{
...
...
simhash/src/main/java/exceptions/HashException.java
浏览文件 @
9a4510f9
...
...
@@ -3,9 +3,7 @@ package exceptions;
import
java.security.NoSuchAlgorithmException
;
/**
* @author HJW
* @date 2022-09-21 12:57
* hash异常 md5
* MD5算法hash异常
*/
public
class
HashException
extends
NoSuchAlgorithmException
{
public
HashException
(
String
message
)
{
...
...
simhash/src/main/java/exceptions/NotExistFileException.java
浏览文件 @
9a4510f9
package
exceptions
;
import
java.io.FileNotFoundException
;
/**
* @author HJW
* 找不到文件的自定义异常
* 找不到文件的文件解析异常
*/
public
class
NotExistFileException
extends
FileNotFoundException
{
public
NotExistFileException
(
String
message
)
{
...
...
simhash/src/main/java/utils/CalculationUtils.java
浏览文件 @
9a4510f9
...
...
@@ -11,15 +11,12 @@ import java.util.Map;
* 与计算有关的工具类
*/
public
class
CalculationUtils
{
//hash码长度为128
static
final
int
HASH_BIT
=
128
;
static
final
int
DISTANCE_WAY1
=
16
;
static
final
int
DISTANCE_WAY2
=
32
;
static
final
int
DISTANCE_WAY3
=
64
;
/**
* 采用MD5
进行对词语进行hash,得到的hash值使用16进制解析 再利用算法取128位二进制
* 采用MD5
算法对关键词进行hash,得到的hash值使用16进制解析,再利用算法取128位二进制数作为hash值
* @param word 词语
* @return 128位二进制
* @return 128位二进制
hash值
*/
public
static
String
wordHash
(
String
word
)
throws
HashException
{
//如果传入词语为null或“”或“ ”
...
...
@@ -30,36 +27,31 @@ public class CalculationUtils {
// 采用MD5算法进行hash
MessageDigest
digest
=
MessageDigest
.
getInstance
(
"MD5"
);
digest
.
update
(
word
.
getBytes
(
StandardCharsets
.
UTF_8
));
// hash值转为32位16进制
// hash值转为32位16进制
的散列值
StringBuilder
hash
=
new
StringBuilder
();
for
(
byte
b
:
digest
.
digest
())
{
hash
.
append
(
String
.
format
(
"%02x"
,
b
));
}
// 16进制转为128位2进制码
// 16进制的散列值转为128位二进制码
StringBuilder
finalHash
=
new
StringBuilder
();
String
strTemp
;
for
(
int
i
=
0
;
i
<
hash
.
length
();
i
++)
{
// 每一位16进制数加上0000
最后截取后面的4位
得到便是这位数的二进制
for
(
int
i
=
0
;
i
<
hash
.
length
();
i
++)
{
// 每一位16进制数加上0000
,最后截取后4位,
得到便是这位数的二进制
strTemp
=
"0000"
+
Integer
.
toBinaryString
(
Integer
.
parseInt
(
hash
.
substring
(
i
,
i
+
1
),
16
));
finalHash
.
append
(
strTemp
.
substring
(
strTemp
.
length
()
-
4
));
}
// 不为128直接报错
// 不为128则为hash异常
if
(
finalHash
.
length
()
!=
HASH_BIT
)
{
throw
new
HashException
(
"hash值长度不为128"
);
}
return
finalHash
.
toString
();
}
catch
(
NoSuchAlgorithmException
e
)
{
throw
new
HashException
(
"MD5算法异常"
);
}
}
/**
* 给二进制
哈希
值加权
* 给二进制
hash
值加权
* @param hash 二进制哈希值
* @param weight 权重
* @return 加权后的二进制哈希值
...
...
@@ -75,12 +67,11 @@ public class CalculationUtils {
hashArray
[
i
]
=
-
1
*
weight
;
}
}
return
hashArray
;
}
/**
*
得到的合并后的hash值
进行降维,最终得到simHash
*
合并后的hash
进行降维,最终得到simHash
* @param mergeHash 合并后的hash值
* @return sim哈希值
*/
...
...
@@ -98,7 +89,6 @@ public class CalculationUtils {
return
simHash
.
toString
();
}
/**
* 根据词语得到simHash
* @param wordCount 词语及其出现次数
...
...
@@ -113,7 +103,7 @@ public class CalculationUtils {
// 遍历词语及其出现次数,对每一个词语进行hash加权,然后合并
wordCount
.
forEach
((
word
,
count
)
->
{
try
{
int
[]
tempHash
=
hashWeight
(
wordHash
(
word
),
count
);
int
[]
tempHash
=
hashWeight
(
wordHash
(
word
),
count
);
//加权后的hash值
for
(
int
i
=
0
;
i
<
tempHash
.
length
;
i
++)
{
mergeHash
[
i
]
+=
tempHash
[
i
];
}
...
...
@@ -121,7 +111,6 @@ public class CalculationUtils {
e
.
printStackTrace
();
}
});
// 降维得到simHash
return
getSimHash
(
mergeHash
);
}
...
...
@@ -133,26 +122,20 @@ public class CalculationUtils {
* @return 相似度
*/
public
static
double
getSimilarity
(
String
simHash1
,
String
simHash2
)
{
// 汉明距离
int
distance
=
0
;
// 得到两个simHash的汉明距离
// 遍历simHash1和simHash2,不相同则汉明距离加1
int
hamingDistance
=
0
;
int
same
=
0
;
for
(
int
i
=
0
;
i
<
simHash1
.
length
();
i
++)
{
if
(
simHash1
.
charAt
(
i
)
!=
simHash2
.
charAt
(
i
))
{
distance
++;
hamingDistance
++;
}
if
(
simHash1
.
charAt
(
i
)==
'1'
&&
simHash2
.
charAt
(
i
)==
'1'
)
{
same
++;
}
}
// System.out.println("汉明距离为:" + distance);
// 更换计算策略
if
(
distance
>=
0
&&
distance
<=
DISTANCE_WAY1
)
{
return
1
-
(
double
)
distance
/
256
;
}
else
if
(
distance
>
16
&&
distance
<=
DISTANCE_WAY2
)
{
return
1
-
(
double
)
distance
/
128
;
}
else
if
(
distance
>
32
&&
distance
<=
DISTANCE_WAY3
)
{
return
1
-
(
double
)
distance
/
64
;
}
else
{
return
0
;
}
System
.
out
.
println
(
"两个simHash的汉明距离为:"
+
hamingDistance
);
// 用杰卡德系数计算文本相似度
return
(
double
)
same
/(
hamingDistance
+
same
);
}
}
\ No newline at end of file
simhash/src/test/java/MainTest.java
浏览文件 @
9a4510f9
import
com.hankcs.hanlp.HanLP
;
import
exceptions.HashException
;
import
org.junit.jupiter.api.Assertions
;
import
org.junit.jupiter.api.Test
;
import
exceptions.FileAnalyseException
;
import
exceptions.NotExistFileException
;
import
utils.CalculationUtils
;
import
utils.CommonUtils
;
import
java.util.Arrays
;
import
java.util.Map
;
public
class
MainTest
{
// Text obtained after reading a file
static
String
analyseStr
;
// Two sample sentences
static
String
originSentence
=
"今天是星期天,天气晴,今天晚上我要去看电影。"
;
static
String
compareSentence
=
"今天是周天,天气晴朗,我晚上要去看电影。"
;
// File the comparison result is written to
static
String
writeFilePath
=
"E:\\测试文本\\write.txt"
;
// Original file
static
String
OrigFilePath
=
"E:\\测试文本\\orig.txt"
;
// The five files compared against the original
static
String
CopyFilePath1
=
"E:\\测试文本\\orig_0.8_add.txt"
;
static
String
CopyFilePath2
=
"E:\\测试文本\\orig_0.8_del.txt"
;
static
String
CopyFilePath3
=
"E:\\测试文本\\orig_0.8_dis_1.txt"
;
static
String
CopyFilePath4
=
"E:\\测试文本\\orig_0.8_dis_10.txt"
;
static
String
CopyFilePath5
=
"E:\\测试文本\\orig_0.8_dis_15.txt"
;
/**
 * Writes a marker line to {@code writeFilePath}, reads the file back and checks
 * that the marker is present in what was read.
 */
@Test
void testWriteFile() {
    final String marker = "------successfully content entry------";
    CommonUtils.writeFile(writeFilePath, marker);

    String readBack;
    try {
        readBack = CommonUtils.readFileToStr(writeFilePath);
    } catch (NotExistFileException e) {
        e.printStackTrace();
        Assertions.fail("写入文件失败");
        return; // never reached: fail(...) always throws; kept for definite assignment
    }

    boolean markerFound = readBack.contains(marker);
    Assertions.assertTrue(markerFound, "写入文件失败");
}
/**
 * Verifies that reading a file that does not exist raises
 * {@link NotExistFileException}.
 */
@Test
void testReadFileNotExist() {
    // assertThrows fails the test with the given message when nothing is thrown
    // (or when a different exception type is thrown), which replaces the manual
    // try / fail / catch pattern and its no-op assertTrue(true).
    Assertions.assertThrows(
            NotExistFileException.class,
            () -> CommonUtils.readFileToStr("E:\\not existing.txt"),
            "没有抛出异常");
}
/**
 * Verifies that analysing unusable text ({@code null}, an empty string, or a
 * single space) raises {@link FileAnalyseException}.
 */
@Test
void testFileAnalyseException() {
    // Same three inputs, in the same order, as the original hand-written blocks.
    String[] unusableTexts = {null, "", " "};
    for (String text : unusableTexts) {
        // Each input must fail on its own; assertThrows reports the message when
        // no exception (or one of the wrong type) is raised.
        Assertions.assertThrows(
                FileAnalyseException.class,
                () -> CommonUtils.analyseText(text),
                "没有抛出异常");
    }
}
/**
 * Prints the word-segmentation result for the sample sentence and for the text
 * read from {@code OrigFilePath}; any exception along the way fails the test.
 */
@Test
void testReadFile() {
    try {
        // Segment the sample sentence
        Map<String, Integer> sentenceWords = CommonUtils.analyseText(originSentence);
        System.out.println("分词结果为:" + sentenceWords);

        // Segment the text loaded from the original file
        analyseStr = CommonUtils.readFileToStr(OrigFilePath);
        Map<String, Integer> fileWords = CommonUtils.analyseText(analyseStr);
        System.out.println("分词结果为:" + fileWords);
    } catch (Exception e) {
        e.printStackTrace();
        Assertions.fail("分词结果有误");
    }
}
/**
 * Hashes every keyword extracted from the sample sentence and checks that each
 * resulting hash string is 128 characters long.
 */
@Test
void testWordHash() {
    HanLP.extractKeyword(originSentence, originSentence.length()).forEach(word -> {
        try {
            String hash = CalculationUtils.wordHash(word);
            System.out.println(word + " : " + hash);
            Assertions.assertEquals(128, hash.length(), "hash值长度不是128");
        } catch (HashException e) {
            // fail(...) throws immediately, so the cause is attached to the failure
            // here; a printStackTrace() placed after it would never execute.
            Assertions.fail("哈希出错", e);
        }
    });
}
/**
 * Verifies that hashing an unusable word (an empty string, {@code null}, or a
 * single space) raises {@link HashException}.
 */
@Test
void testHashException() {
    // Same three inputs, in the same order, as the original hand-written blocks.
    String[] unusableWords = {"", null, " "};
    for (String word : unusableWords) {
        // assertThrows reports the message when no exception (or one of the
        // wrong type) is raised, replacing try / fail / catch + assertTrue(true).
        Assertions.assertThrows(
                HashException.class,
                () -> CalculationUtils.wordHash(word),
                "没有抛出异常");
    }
}
/**
 * Applies the weighting step to the hash of every word of the sample sentence
 * and checks that each weighted vector has 128 entries.
 */
@Test
void testHashWeight() {
    Map<String, Integer> wordCounts;
    try {
        wordCounts = CommonUtils.analyseText(originSentence);
    } catch (FileAnalyseException e) {
        // Attach the cause so the reason is visible in the test report.
        Assertions.fail("解析错误", e);
        return; // never reached: fail(...) always throws; kept for definite assignment
    }

    wordCounts.forEach((word, count) -> {
        try {
            String hash = CalculationUtils.wordHash(word);
            int[] weighted = CalculationUtils.hashWeight(hash, count);
            // Print the weighted hash vector
            System.out.println(word + " : " + Arrays.toString(weighted));
            Assertions.assertEquals(128, weighted.length, "加权后的hash值长度不是128");
        } catch (HashException e) {
            // fail(...) throws immediately, so the cause is attached to the failure
            // here; a printStackTrace() placed after it would never execute.
            Assertions.fail("哈希出错", e);
        }
    });
}
/**
 * Computes the simHash of the sample sentence and of the text read from
 * {@code OrigFilePath}, and checks that each simHash string is 128 characters long.
 */
@Test
void testCalculateSimHash() {
    try {
        String hash1 = CalculationUtils.calculateSimHash(CommonUtils.analyseText(originSentence));
        System.out.println("原句子\"" + originSentence + "\"的simhash值为:" + hash1);
        // JUnit's signature is assertEquals(expected, actual, message).
        Assertions.assertEquals(128, hash1.length(), "hash值长度不是128");

        String hash2 = CalculationUtils.calculateSimHash(
                CommonUtils.analyseText(CommonUtils.readFileToStr(OrigFilePath)));
        System.out.println("原文本的simhash值为:" + hash2);
        Assertions.assertEquals(128, hash2.length(), "hash值长度不是128");
    } catch (FileAnalyseException | NotExistFileException e) {
        // Previously the exception was only printed, so the test passed even when
        // the file was missing or could not be analysed. Fail explicitly instead.
        Assertions.fail("计算simHash时出现异常", e);
    }
}
/**
 * Computes the similarity of the two sample sentences from their simHash values
 * and checks that the result lies between 0 and 1 inclusive.
 */
@Test
void testGetSimilarity1() {
    String originHash;
    String compareHash;
    try {
        Map<String, Integer> originWords = CommonUtils.analyseText(originSentence);
        originHash = CalculationUtils.calculateSimHash(originWords);
        Map<String, Integer> compareWords = CommonUtils.analyseText(compareSentence);
        compareHash = CalculationUtils.calculateSimHash(compareWords);
    } catch (FileAnalyseException e) {
        e.printStackTrace();
        Assertions.fail("解析错误");
        return; // never reached: fail(...) always throws; kept for definite assignment
    }

    double score = CalculationUtils.getSimilarity(originHash, compareHash);
    System.out.println(String.format("两个句子的相似度为:%.2f", score));

    boolean withinUnitRange = score >= 0 && score <= 1;
    Assertions.assertTrue(withinUnitRange, "相似度不在0-1之间");
}
/**
 * Computes the similarity between the text in {@code OrigFilePath} and the text
 * in {@code CopyFilePath1} from their simHash values, and checks that the result
 * lies between 0 and 1 inclusive.
 */
@Test
void testGetSimilarity2() {
    try {
        String hash1 = CalculationUtils.calculateSimHash(
                CommonUtils.analyseText(CommonUtils.readFileToStr(OrigFilePath)));
        String hash2 = CalculationUtils.calculateSimHash(
                CommonUtils.analyseText(CommonUtils.readFileToStr(CopyFilePath1)));

        double similarity = CalculationUtils.getSimilarity(hash1, hash2);
        System.out.println(String.format("两个文本的相似度为:%.2f", similarity));
        Assertions.assertTrue(0 <= similarity && similarity <= 1, "相似度不在0-1之间");
    } catch (FileAnalyseException | NotExistFileException e) {
        // Previously the exception was only printed, so the test passed without
        // checking anything when a file was missing or could not be analysed.
        Assertions.fail("计算文本相似度时出现异常", e);
    }
}
/**
* 测试主函数
*/
...
...
@@ -10,8 +235,18 @@ public class MainTest {
void
testMain
(){
String
[]
args
=
new
String
[
3
];
args
[
0
]
=
OrigFilePath
;
args
[
1
]
=
CopyFilePath1
;
args
[
1
]
=
CopyFilePath1
;
args
[
2
]
=
writeFilePath
;
Main
.
main
(
args
);
args
[
1
]=
CopyFilePath2
;
Main
.
main
(
args
);
args
[
1
]=
CopyFilePath3
;
Main
.
main
(
args
);
args
[
1
]=
CopyFilePath4
;
Main
.
main
(
args
);
args
[
1
]=
CopyFilePath5
;
Main
.
main
(
args
);
args
[
0
]
=
CopyFilePath3
;
}
}
\ No newline at end of file
simhash/target/classes/Main.class
浏览文件 @
9a4510f9
无法预览此类型文件
simhash/target/classes/exceptions/FileAnalyseException.class
浏览文件 @
9a4510f9
无法预览此类型文件
simhash/target/classes/exceptions/HashException.class
浏览文件 @
9a4510f9
无法预览此类型文件
simhash/target/classes/exceptions/NotExistFileException.class
浏览文件 @
9a4510f9
无法预览此类型文件
simhash/target/classes/utils/CalculationUtils.class
浏览文件 @
9a4510f9
无法预览此类型文件
simhash/target/test-classes/MainTest.class
浏览文件 @
9a4510f9
无法预览此类型文件
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录