Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
南宫伊儿
elasticsearch-analysis-ik
提交
43c8bc9f
E
elasticsearch-analysis-ik
项目概览
南宫伊儿
/
elasticsearch-analysis-ik
与 Fork 源项目一致
从无法访问的项目Fork
通知
3
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
E
elasticsearch-analysis-ik
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
43c8bc9f
编写于
5月 12, 2013
作者:
weixin_43283383
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #10 from wyhw/ik_lucene4
elasticsearch ik 0.20.x => 0.90.x
上级
a2dc3c78
5e14e3d6
变更
16
展开全部
隐藏空白更改
内联
并排
Showing
16 changed file
with
1520 addition
and
1580 deletion
+1520
-1580
pom.xml
pom.xml
+2
-2
src/main/java/org/elasticsearch/index/analysis/IkAnalyzer.java
...ain/java/org/elasticsearch/index/analysis/IkAnalyzer.java
+17
-4
src/main/java/org/wltea/analyzer/core/AnalyzeContext.java
src/main/java/org/wltea/analyzer/core/AnalyzeContext.java
+14
-8
src/main/java/org/wltea/analyzer/core/CJKSegmenter.java
src/main/java/org/wltea/analyzer/core/CJKSegmenter.java
+5
-5
src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java
.../java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java
+5
-5
src/main/java/org/wltea/analyzer/core/IKArbitrator.java
src/main/java/org/wltea/analyzer/core/IKArbitrator.java
+3
-4
src/main/java/org/wltea/analyzer/core/IKSegmenter.java
src/main/java/org/wltea/analyzer/core/IKSegmenter.java
+26
-8
src/main/java/org/wltea/analyzer/core/LetterSegmenter.java
src/main/java/org/wltea/analyzer/core/LetterSegmenter.java
+1
-1
src/main/java/org/wltea/analyzer/dic/DictSegment.java
src/main/java/org/wltea/analyzer/dic/DictSegment.java
+1
-9
src/main/java/org/wltea/analyzer/dic/Dictionary.java
src/main/java/org/wltea/analyzer/dic/Dictionary.java
+287
-418
src/main/java/org/wltea/analyzer/dic/Hit.java
src/main/java/org/wltea/analyzer/dic/Hit.java
+9
-3
src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java
src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java
+60
-24
src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java
src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java
+75
-74
src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java
...ava/org/wltea/analyzer/query/IKQueryExpressionParser.java
+716
-716
src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java
src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java
+152
-152
src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java
...a/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java
+147
-147
未找到文件。
pom.xml
浏览文件 @
43c8bc9f
...
@@ -31,7 +31,7 @@
...
@@ -31,7 +31,7 @@
</parent>
</parent>
<properties>
<properties>
<elasticsearch.version>
0.
20.2
</elasticsearch.version>
<elasticsearch.version>
0.
90.0
</elasticsearch.version>
</properties>
</properties>
<repositories>
<repositories>
...
@@ -132,4 +132,4 @@
...
@@ -132,4 +132,4 @@
</plugin>
</plugin>
</plugins>
</plugins>
</build>
</build>
</project>
</project>
\ No newline at end of file
src/main/java/org/elasticsearch/index/analysis/IkAnalyzer.java
浏览文件 @
43c8bc9f
...
@@ -2,19 +2,32 @@ package org.elasticsearch.index.analysis;
...
@@ -2,19 +2,32 @@ package org.elasticsearch.index.analysis;
import
org.apache.lucene.analysis.Analyzer
;
import
org.apache.lucene.analysis.Analyzer
;
import
org.apache.lucene.analysis.TokenStream
;
import
org.apache.lucene.analysis.TokenStream
;
import
org.apache.lucene.analysis.Tokenizer
;
import
org.wltea.analyzer.lucene.IKTokenizer
;
import
org.wltea.analyzer.lucene.IKTokenizer
;
//import org.wltea.lucene.IKTokenizer;
import
java.io.Reader
;
import
java.io.Reader
;
public
class
IkAnalyzer
extends
Analyzer
{
public
class
IkAnalyzer
extends
Analyzer
{
// private boolean isMaxWordLength = false;
@Override
public
TokenStream
tokenStream
(
String
fieldName
,
Reader
reader
)
{
// @Override public TokenStream tokenStream(String fieldName, Reader reader) {
return
new
IKTokenizer
(
reader
,
true
);
//
return new IKTokenizer(reader,true);
}
//
}
public
IkAnalyzer
()
{
public
IkAnalyzer
()
{
super
();
super
();
}
}
@Override
protected
TokenStreamComponents
createComponents
(
String
s
,
Reader
reader
)
{
// new TokenStreamComponents
Tokenizer
tokenizer
=
new
IKTokenizer
(
reader
,
true
);
return
new
TokenStreamComponents
(
tokenizer
,
null
);
//To change body of implemented methods use File | Settings | File Templates.
}
// public boolean isMaxWordLength() {
// return isMaxWordLength;
// }
}
}
src/main/java/org/wltea/analyzer/core/AnalyzeContext.java
浏览文件 @
43c8bc9f
...
@@ -24,11 +24,16 @@
...
@@ -24,11 +24,16 @@
*/
*/
package
org.wltea.analyzer.core
;
package
org.wltea.analyzer.core
;
import
org.wltea.analyzer.dic.Dictionary
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.io.Reader
;
import
java.io.Reader
;
import
java.util.*
;
import
java.util.HashMap
;
import
java.util.HashSet
;
import
java.util.LinkedList
;
import
java.util.Map
;
import
java.util.Set
;
import
org.wltea.analyzer.cfg.Configuration
;
import
org.wltea.analyzer.dic.Dictionary
;
/**
/**
*
*
...
@@ -68,12 +73,12 @@ class AnalyzeContext {
...
@@ -68,12 +73,12 @@ class AnalyzeContext {
private
Map
<
Integer
,
LexemePath
>
pathMap
;
private
Map
<
Integer
,
LexemePath
>
pathMap
;
//最终分词结果集
//最终分词结果集
private
LinkedList
<
Lexeme
>
results
;
private
LinkedList
<
Lexeme
>
results
;
private
boolean
useSmart
;
//分词器配置项
//分词器配置项
private
boolean
useSmart
;
// private Configuration cfg
;
public
AnalyzeContext
(
boolean
useSmart
){
public
AnalyzeContext
(
boolean
useSmart
){
this
.
useSmart
=
useSmart
;
this
.
useSmart
=
useSmart
;
this
.
segmentBuff
=
new
char
[
BUFF_SIZE
];
this
.
segmentBuff
=
new
char
[
BUFF_SIZE
];
this
.
charTypes
=
new
int
[
BUFF_SIZE
];
this
.
charTypes
=
new
int
[
BUFF_SIZE
];
this
.
buffLocker
=
new
HashSet
<
String
>();
this
.
buffLocker
=
new
HashSet
<
String
>();
...
@@ -313,7 +318,7 @@ class AnalyzeContext {
...
@@ -313,7 +318,7 @@ class AnalyzeContext {
while
(
result
!=
null
){
while
(
result
!=
null
){
//数量词合并
//数量词合并
this
.
compound
(
result
);
this
.
compound
(
result
);
if
(
Dictionary
.
isStopWord
(
this
.
segmentBuff
,
result
.
getBegin
()
,
result
.
getLength
())){
if
(
Dictionary
.
getSingleton
().
isStopWord
(
this
.
segmentBuff
,
result
.
getBegin
()
,
result
.
getLength
())){
//是停止词继续取列表的下一个
//是停止词继续取列表的下一个
result
=
this
.
results
.
pollFirst
();
result
=
this
.
results
.
pollFirst
();
}
else
{
}
else
{
...
@@ -344,6 +349,7 @@ class AnalyzeContext {
...
@@ -344,6 +349,7 @@ class AnalyzeContext {
* 组合词元
* 组合词元
*/
*/
private
void
compound
(
Lexeme
result
){
private
void
compound
(
Lexeme
result
){
if
(!
this
.
useSmart
){
if
(!
this
.
useSmart
){
return
;
return
;
}
}
...
...
src/main/java/org/wltea/analyzer/core/CJKSegmenter.java
浏览文件 @
43c8bc9f
...
@@ -25,12 +25,12 @@
...
@@ -25,12 +25,12 @@
*/
*/
package
org.wltea.analyzer.core
;
package
org.wltea.analyzer.core
;
import
org.wltea.analyzer.dic.Dictionary
;
import
org.wltea.analyzer.dic.Hit
;
import
java.util.LinkedList
;
import
java.util.LinkedList
;
import
java.util.List
;
import
java.util.List
;
import
org.wltea.analyzer.dic.Dictionary
;
import
org.wltea.analyzer.dic.Hit
;
/**
/**
* 中文-日韩文子分词器
* 中文-日韩文子分词器
...
@@ -58,7 +58,7 @@ class CJKSegmenter implements ISegmenter {
...
@@ -58,7 +58,7 @@ class CJKSegmenter implements ISegmenter {
//处理词段队列
//处理词段队列
Hit
[]
tmpArray
=
this
.
tmpHits
.
toArray
(
new
Hit
[
this
.
tmpHits
.
size
()]);
Hit
[]
tmpArray
=
this
.
tmpHits
.
toArray
(
new
Hit
[
this
.
tmpHits
.
size
()]);
for
(
Hit
hit
:
tmpArray
){
for
(
Hit
hit
:
tmpArray
){
hit
=
Dictionary
.
matchInMainDict
WithHit
(
context
.
getSegmentBuff
(),
context
.
getCursor
()
,
hit
);
hit
=
Dictionary
.
getSingleton
().
match
WithHit
(
context
.
getSegmentBuff
(),
context
.
getCursor
()
,
hit
);
if
(
hit
.
isMatch
()){
if
(
hit
.
isMatch
()){
//输出当前的词
//输出当前的词
Lexeme
newLexeme
=
new
Lexeme
(
context
.
getBufferOffset
()
,
hit
.
getBegin
()
,
context
.
getCursor
()
-
hit
.
getBegin
()
+
1
,
Lexeme
.
TYPE_CNWORD
);
Lexeme
newLexeme
=
new
Lexeme
(
context
.
getBufferOffset
()
,
hit
.
getBegin
()
,
context
.
getCursor
()
-
hit
.
getBegin
()
+
1
,
Lexeme
.
TYPE_CNWORD
);
...
@@ -77,7 +77,7 @@ class CJKSegmenter implements ISegmenter {
...
@@ -77,7 +77,7 @@ class CJKSegmenter implements ISegmenter {
//*********************************
//*********************************
//再对当前指针位置的字符进行单字匹配
//再对当前指针位置的字符进行单字匹配
Hit
singleCharHit
=
Dictionary
.
matchInMainDict
(
context
.
getSegmentBuff
(),
context
.
getCursor
(),
1
);
Hit
singleCharHit
=
Dictionary
.
getSingleton
().
matchInMainDict
(
context
.
getSegmentBuff
(),
context
.
getCursor
(),
1
);
if
(
singleCharHit
.
isMatch
()){
//首字成词
if
(
singleCharHit
.
isMatch
()){
//首字成词
//输出当前的词
//输出当前的词
Lexeme
newLexeme
=
new
Lexeme
(
context
.
getBufferOffset
()
,
context
.
getCursor
()
,
1
,
Lexeme
.
TYPE_CNWORD
);
Lexeme
newLexeme
=
new
Lexeme
(
context
.
getBufferOffset
()
,
context
.
getCursor
()
,
1
,
Lexeme
.
TYPE_CNWORD
);
...
...
src/main/java/org/wltea/analyzer/core/CN_QuantifierSegmenter.java
浏览文件 @
43c8bc9f
...
@@ -24,14 +24,14 @@
...
@@ -24,14 +24,14 @@
*/
*/
package
org.wltea.analyzer.core
;
package
org.wltea.analyzer.core
;
import
org.wltea.analyzer.dic.Dictionary
;
import
org.wltea.analyzer.dic.Hit
;
import
java.util.HashSet
;
import
java.util.HashSet
;
import
java.util.LinkedList
;
import
java.util.LinkedList
;
import
java.util.List
;
import
java.util.List
;
import
java.util.Set
;
import
java.util.Set
;
import
org.wltea.analyzer.dic.Dictionary
;
import
org.wltea.analyzer.dic.Hit
;
/**
/**
*
*
* 中文数量词子分词器
* 中文数量词子分词器
...
@@ -155,7 +155,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
...
@@ -155,7 +155,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
//处理词段队列
//处理词段队列
Hit
[]
tmpArray
=
this
.
countHits
.
toArray
(
new
Hit
[
this
.
countHits
.
size
()]);
Hit
[]
tmpArray
=
this
.
countHits
.
toArray
(
new
Hit
[
this
.
countHits
.
size
()]);
for
(
Hit
hit
:
tmpArray
){
for
(
Hit
hit
:
tmpArray
){
hit
=
Dictionary
.
matchInMainDict
WithHit
(
context
.
getSegmentBuff
(),
context
.
getCursor
()
,
hit
);
hit
=
Dictionary
.
getSingleton
().
match
WithHit
(
context
.
getSegmentBuff
(),
context
.
getCursor
()
,
hit
);
if
(
hit
.
isMatch
()){
if
(
hit
.
isMatch
()){
//输出当前的词
//输出当前的词
Lexeme
newLexeme
=
new
Lexeme
(
context
.
getBufferOffset
()
,
hit
.
getBegin
()
,
context
.
getCursor
()
-
hit
.
getBegin
()
+
1
,
Lexeme
.
TYPE_COUNT
);
Lexeme
newLexeme
=
new
Lexeme
(
context
.
getBufferOffset
()
,
hit
.
getBegin
()
,
context
.
getCursor
()
-
hit
.
getBegin
()
+
1
,
Lexeme
.
TYPE_COUNT
);
...
@@ -174,7 +174,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
...
@@ -174,7 +174,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
//*********************************
//*********************************
//对当前指针位置的字符进行单字匹配
//对当前指针位置的字符进行单字匹配
Hit
singleCharHit
=
Dictionary
.
matchInQuantifierDict
(
context
.
getSegmentBuff
(),
context
.
getCursor
(),
1
);
Hit
singleCharHit
=
Dictionary
.
getSingleton
().
matchInQuantifierDict
(
context
.
getSegmentBuff
(),
context
.
getCursor
(),
1
);
if
(
singleCharHit
.
isMatch
()){
//首字成量词词
if
(
singleCharHit
.
isMatch
()){
//首字成量词词
//输出当前的词
//输出当前的词
Lexeme
newLexeme
=
new
Lexeme
(
context
.
getBufferOffset
()
,
context
.
getCursor
()
,
1
,
Lexeme
.
TYPE_COUNT
);
Lexeme
newLexeme
=
new
Lexeme
(
context
.
getBufferOffset
()
,
context
.
getCursor
()
,
1
,
Lexeme
.
TYPE_COUNT
);
...
...
src/main/java/org/wltea/analyzer/core/IKArbitrator.java
浏览文件 @
43c8bc9f
...
@@ -38,7 +38,7 @@ class IKArbitrator {
...
@@ -38,7 +38,7 @@ class IKArbitrator {
/**
/**
* 分词歧义处理
* 分词歧义处理
* @param orgLexemes
//
* @param orgLexemes
* @param useSmart
* @param useSmart
*/
*/
void
process
(
AnalyzeContext
context
,
boolean
useSmart
){
void
process
(
AnalyzeContext
context
,
boolean
useSmart
){
...
@@ -87,7 +87,6 @@ class IKArbitrator {
...
@@ -87,7 +87,6 @@ class IKArbitrator {
* 歧义识别
* 歧义识别
* @param lexemeCell 歧义路径链表头
* @param lexemeCell 歧义路径链表头
* @param fullTextLength 歧义路径文本长度
* @param fullTextLength 歧义路径文本长度
* @param option 候选结果路径
* @return
* @return
*/
*/
private
LexemePath
judge
(
QuickSortSet
.
Cell
lexemeCell
,
int
fullTextLength
){
private
LexemePath
judge
(
QuickSortSet
.
Cell
lexemeCell
,
int
fullTextLength
){
...
@@ -120,7 +119,7 @@ class IKArbitrator {
...
@@ -120,7 +119,7 @@ class IKArbitrator {
/**
/**
* 向前遍历,添加词元,构造一个无歧义词元组合
* 向前遍历,添加词元,构造一个无歧义词元组合
* @param LexemePath path
//
* @param LexemePath path
* @return
* @return
*/
*/
private
Stack
<
QuickSortSet
.
Cell
>
forwardPath
(
QuickSortSet
.
Cell
lexemeCell
,
LexemePath
option
){
private
Stack
<
QuickSortSet
.
Cell
>
forwardPath
(
QuickSortSet
.
Cell
lexemeCell
,
LexemePath
option
){
...
@@ -140,7 +139,7 @@ class IKArbitrator {
...
@@ -140,7 +139,7 @@ class IKArbitrator {
/**
/**
* 回滚词元链,直到它能够接受指定的词元
* 回滚词元链,直到它能够接受指定的词元
* @param lexeme
// * @param lexeme
* @param l
* @param l
*/
*/
private
void
backPath
(
Lexeme
l
,
LexemePath
option
){
private
void
backPath
(
Lexeme
l
,
LexemePath
option
){
...
...
src/main/java/org/wltea/analyzer/core/IKSegmenter.java
浏览文件 @
43c8bc9f
...
@@ -23,14 +23,15 @@
...
@@ -23,14 +23,15 @@
*/
*/
package
org.wltea.analyzer.core
;
package
org.wltea.analyzer.core
;
import
org.elasticsearch.common.logging.ESLogger
;
import
org.elasticsearch.common.logging.Loggers
;
import
java.io.IOException
;
import
java.io.IOException
;
import
java.io.Reader
;
import
java.io.Reader
;
import
java.util.ArrayList
;
import
java.util.ArrayList
;
import
java.util.List
;
import
java.util.List
;
import
org.wltea.analyzer.cfg.Configuration
;
//import org.wltea.analyzer.cfg.DefaultConfig;
import
org.wltea.analyzer.dic.Dictionary
;
/**
/**
* IK分词器主类
* IK分词器主类
*
*
...
@@ -39,16 +40,18 @@ public final class IKSegmenter {
...
@@ -39,16 +40,18 @@ public final class IKSegmenter {
//字符窜reader
//字符窜reader
private
Reader
input
;
private
Reader
input
;
//分词器配置项
private
Configuration
cfg
;
//分词器上下文
//分词器上下文
private
AnalyzeContext
context
;
private
AnalyzeContext
context
;
//分词处理器列表
//分词处理器列表
private
List
<
ISegmenter
>
segmenters
;
private
List
<
ISegmenter
>
segmenters
;
//分词歧义裁决器
//分词歧义裁决器
private
IKArbitrator
arbitrator
;
private
IKArbitrator
arbitrator
;
private
ESLogger
logger
=
null
;
private
boolean
useSmart
=
false
;
private
final
boolean
useSmart
;
/**
/**
* IK分词器构造函数
* IK分词器构造函数
* @param input
* @param input
* @param useSmart 为true,使用智能分词策略
* @param useSmart 为true,使用智能分词策略
...
@@ -57,16 +60,31 @@ public final class IKSegmenter {
...
@@ -57,16 +60,31 @@ public final class IKSegmenter {
* 智能分词: 合并数词和量词,对分词结果进行歧义判断
* 智能分词: 合并数词和量词,对分词结果进行歧义判断
*/
*/
public
IKSegmenter
(
Reader
input
,
boolean
useSmart
){
public
IKSegmenter
(
Reader
input
,
boolean
useSmart
){
logger
=
Loggers
.
getLogger
(
"ik-analyzer"
);
this
.
input
=
input
;
this
.
input
=
input
;
// this.cfg = DefaultConfig.getInstance();
this
.
useSmart
=
useSmart
;
this
.
useSmart
=
useSmart
;
this
.
init
();
this
.
init
();
}
/**
* IK分词器构造函数
* @param input
* @param cfg 使用自定义的Configuration构造分词器
*
*/
public
IKSegmenter
(
Reader
input
,
Configuration
cfg
){
this
.
input
=
input
;
this
.
cfg
=
cfg
;
this
.
init
();
}
}
/**
/**
* 初始化
* 初始化
*/
*/
private
void
init
(){
private
void
init
(){
//初始化词典单例
// Dictionary.initial(this.cfg);
// Dictionary.getSingleton();
//初始化分词上下文
//初始化分词上下文
this
.
context
=
new
AnalyzeContext
(
useSmart
);
this
.
context
=
new
AnalyzeContext
(
useSmart
);
//加载子分词器
//加载子分词器
...
...
src/main/java/org/wltea/analyzer/core/LetterSegmenter.java
浏览文件 @
43c8bc9f
...
@@ -120,7 +120,7 @@ class LetterSegmenter implements ISegmenter {
...
@@ -120,7 +120,7 @@ class LetterSegmenter implements ISegmenter {
/**
/**
* 处理数字字母混合输出
* 处理数字字母混合输出
* 如:windos2000 | linliangyi2005@gmail.com
* 如:windos2000 | linliangyi2005@gmail.com
* @param input
//
* @param input
* @param context
* @param context
* @return
* @return
*/
*/
...
...
src/main/java/org/wltea/analyzer/dic/DictSegment.java
浏览文件 @
43c8bc9f
...
@@ -326,13 +326,5 @@ class DictSegment implements Comparable<DictSegment>{
...
@@ -326,13 +326,5 @@ class DictSegment implements Comparable<DictSegment>{
//对当前节点存储的char进行比较
//对当前节点存储的char进行比较
return
this
.
nodeChar
.
compareTo
(
o
.
nodeChar
);
return
this
.
nodeChar
.
compareTo
(
o
.
nodeChar
);
}
}
public
int
getDicNum
(){
if
(
charMap
!=
null
)
{
return
charMap
.
size
();
}
return
0
;
}
}
}
src/main/java/org/wltea/analyzer/dic/Dictionary.java
浏览文件 @
43c8bc9f
此差异已折叠。
点击以展开。
src/main/java/org/wltea/analyzer/dic/Hit.java
浏览文件 @
43c8bc9f
...
@@ -58,7 +58,9 @@ public class Hit {
...
@@ -58,7 +58,9 @@ public class Hit {
public
boolean
isMatch
()
{
public
boolean
isMatch
()
{
return
(
this
.
hitState
&
MATCH
)
>
0
;
return
(
this
.
hitState
&
MATCH
)
>
0
;
}
}
/**
*
*/
public
void
setMatch
()
{
public
void
setMatch
()
{
this
.
hitState
=
this
.
hitState
|
MATCH
;
this
.
hitState
=
this
.
hitState
|
MATCH
;
}
}
...
@@ -69,7 +71,9 @@ public class Hit {
...
@@ -69,7 +71,9 @@ public class Hit {
public
boolean
isPrefix
()
{
public
boolean
isPrefix
()
{
return
(
this
.
hitState
&
PREFIX
)
>
0
;
return
(
this
.
hitState
&
PREFIX
)
>
0
;
}
}
/**
*
*/
public
void
setPrefix
()
{
public
void
setPrefix
()
{
this
.
hitState
=
this
.
hitState
|
PREFIX
;
this
.
hitState
=
this
.
hitState
|
PREFIX
;
}
}
...
@@ -79,7 +83,9 @@ public class Hit {
...
@@ -79,7 +83,9 @@ public class Hit {
public
boolean
isUnmatch
()
{
public
boolean
isUnmatch
()
{
return
this
.
hitState
==
UNMATCH
;
return
this
.
hitState
==
UNMATCH
;
}
}
/**
*
*/
public
void
setUnmatch
()
{
public
void
setUnmatch
()
{
this
.
hitState
=
UNMATCH
;
this
.
hitState
=
UNMATCH
;
}
}
...
...
src/main/java/org/wltea/analyzer/lucene/IKAnalyzer.java
浏览文件 @
43c8bc9f
/**
/**
* IK 中文分词 版本 5.0.1
* IK Analyzer release 5.0.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*
*/
*/
package
org.wltea.analyzer.lucene
;
package
org.wltea.analyzer.lucene
;
import
java.io.Reader
;
import
org.apache.lucene.analysis.Analyzer
;
import
org.apache.lucene.analysis.Analyzer
;
import
org.apache.lucene.analysis.Token
Stream
;
import
org.apache.lucene.analysis.Token
izer
;
import
org.elasticsearch.common.settings.Settings
;
import
org.elasticsearch.common.settings.Settings
;
import
org.wltea.analyzer.dic.Dictionary
;
import
org.wltea.analyzer.dic.Dictionary
;
import
java.io.Reader
;
/**
* IK分词器,Lucene Analyzer接口实现
public
final
class
IKAnalyzer
extends
Analyzer
{
* 兼容Lucene 4.0版本
*/
public
final
class
IKAnalyzer
extends
Analyzer
{
private
boolean
isMaxWordLength
=
false
;
private
boolean
useSmart
;
private
boolean
useSmart
=
false
;
public
boolean
useSmart
()
{
return
useSmart
;
}
public
IKAnalyzer
(){
public
void
setUseSmart
(
boolean
useSmart
)
{
this
.
useSmart
=
useSmart
;
}
/**
* IK分词器Lucene Analyzer接口实现类
*
* 默认细粒度切分算法
*/
public
IKAnalyzer
(){
this
(
false
);
this
(
false
);
}
}
/**
public
IKAnalyzer
(
boolean
isMaxWordLength
){
* IK分词器Lucene Analyzer接口实现类
*
* @param useSmart 当为true时,分词器进行智能切分
*/
public
IKAnalyzer
(
boolean
useSmart
){
super
();
super
();
this
.
setMaxWordLength
(
isMaxWordLength
)
;
this
.
useSmart
=
useSmart
;
}
}
public
IKAnalyzer
(
Settings
indexSetting
,
Settings
settings1
)
{
public
IKAnalyzer
(
Settings
indexSetting
,
Settings
settings1
)
{
super
();
super
();
Dictionary
.
getInstance
().
Init
(
indexSetting
);
Dictionary
.
getInstance
().
Init
(
indexSetting
);
if
(
settings1
.
get
(
"use_smart"
,
"true"
).
equals
(
"true"
)){
if
(
settings1
.
get
(
"use_smart"
,
"true"
).
equals
(
"true"
)){
useSmart
=
true
;
useSmart
=
true
;
}
}
}
}
/**
@Override
* 重载Analyzer接口,构造分词组件
public
TokenStream
tokenStream
(
String
fieldName
,
Reader
reader
)
{
*/
return
new
IKTokenizer
(
reader
,
useSmart
);
@Override
}
protected
TokenStreamComponents
createComponents
(
String
fieldName
,
final
Reader
in
)
{
Tokenizer
_IKTokenizer
=
new
IKTokenizer
(
in
,
this
.
useSmart
());
public
void
setMaxWordLength
(
boolean
isMaxWordLength
)
{
return
new
TokenStreamComponents
(
_IKTokenizer
);
this
.
isMaxWordLength
=
isMaxWordLength
;
}
public
boolean
isMaxWordLength
()
{
return
isMaxWordLength
;
}
}
}
}
src/main/java/org/wltea/analyzer/lucene/IKTokenizer.java
浏览文件 @
43c8bc9f
/**
/**
* IK 中文分词 版本 5.0.1
* IK 中文分词 版本 5.0.1
* IK Analyzer release 5.0.1
* IK Analyzer release 5.0.1
*
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* this work for additional information regarding copyright ownership.
...
@@ -20,94 +20,95 @@
...
@@ -20,94 +20,95 @@
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*
*
*
*/
*/
package
org.wltea.analyzer.lucene
;
package
org.wltea.analyzer.lucene
;
import
java.io.IOException
;
import
java.io.Reader
;
import
org.apache.lucene.analysis.Tokenizer
;
import
org.apache.lucene.analysis.Tokenizer
;
import
org.apache.lucene.analysis.tokenattributes.CharTermAttribute
;
import
org.apache.lucene.analysis.tokenattributes.CharTermAttribute
;
import
org.apache.lucene.analysis.tokenattributes.OffsetAttribute
;
import
org.apache.lucene.analysis.tokenattributes.OffsetAttribute
;
import
org.apache.lucene.analysis.tokenattributes.TypeAttribute
;
import
org.apache.lucene.analysis.tokenattributes.TypeAttribute
;
import
org.wltea.analyzer.core.IKSegmenter
;
import
org.wltea.analyzer.core.IKSegmenter
;
import
org.wltea.analyzer.core.Lexeme
;
import
org.wltea.analyzer.core.Lexeme
;
import
java.io.IOException
;
import
java.io.Reader
;
/**
/**
* IK分词器 Lucene Tokenizer适配器类
* IK分词器 Lucene Tokenizer适配器类
* 兼容Lucene 4.0版本
* 兼容Lucene 4.0版本
*/
*/
public
final
class
IKTokenizer
extends
Tokenizer
{
public
final
class
IKTokenizer
extends
Tokenizer
{
//IK分词器实现
private
IKSegmenter
_IKImplement
;
//词元文本属性
private
final
CharTermAttribute
termAtt
;
//词元位移属性
private
final
OffsetAttribute
offsetAtt
;
//词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量)
private
final
TypeAttribute
typeAtt
;
//记录最后一个词元的结束位置
private
int
endPosition
;
/**
* Lucene 4.0 Tokenizer适配器类构造函数
* @param in
* @param useSmart
*/
public
IKTokenizer
(
Reader
in
,
boolean
useSmart
){
super
(
in
);
offsetAtt
=
addAttribute
(
OffsetAttribute
.
class
);
termAtt
=
addAttribute
(
CharTermAttribute
.
class
);
typeAtt
=
addAttribute
(
TypeAttribute
.
class
);
_IKImplement
=
new
IKSegmenter
(
input
,
useSmart
);
}
//IK分词器实现
/* (non-Javadoc)
private
IKSegmenter
_IKImplement
;
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
*/
//词元文本属性
@Override
private
final
CharTermAttribute
termAtt
;
public
boolean
incrementToken
()
throws
IOException
{
//词元位移属性
//清除所有的词元属性
private
final
OffsetAttribute
offsetAtt
;
clearAttributes
();
//词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量)
Lexeme
nextLexeme
=
_IKImplement
.
next
();
private
final
TypeAttribute
typeAtt
;
if
(
nextLexeme
!=
null
){
//记录最后一个词元的结束位置
//将Lexeme转成Attributes
private
int
endPosition
;
//设置词元文本
termAtt
.
append
(
nextLexeme
.
getLexemeText
());
/**
//设置词元长度
* Lucene 4.0 Tokenizer适配器类构造函数
termAtt
.
setLength
(
nextLexeme
.
getLength
());
* @param in
//设置词元位移
* @param useSmart
offsetAtt
.
setOffset
(
nextLexeme
.
getBeginPosition
(),
nextLexeme
.
getEndPosition
());
*/
//记录分词的最后位置
public
IKTokenizer
(
Reader
in
,
boolean
useSmart
){
endPosition
=
nextLexeme
.
getEndPosition
();
super
(
in
);
//记录词元分类
offsetAtt
=
addAttribute
(
OffsetAttribute
.
class
);
typeAtt
.
setType
(
nextLexeme
.
getLexemeTypeString
());
termAtt
=
addAttribute
(
CharTermAttribute
.
class
);
//返会true告知还有下个词元
typeAtt
=
addAttribute
(
TypeAttribute
.
class
);
return
true
;
_IKImplement
=
new
IKSegmenter
(
input
,
useSmart
);
}
}
//返会false告知词元输出完毕
return
false
;
/* (non-Javadoc)
}
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
*/
/*
@Override
* (non-Javadoc)
public
boolean
incrementToken
()
throws
IOException
{
* @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
//清除所有的词元属性
*/
clearAttributes
();
@Override
Lexeme
nextLexeme
=
_IKImplement
.
next
();
public
void
reset
()
throws
IOException
{
if
(
nextLexeme
!=
null
){
super
.
reset
();
//将Lexeme转成Attributes
_IKImplement
.
reset
(
input
);
//设置词元文本
}
termAtt
.
append
(
nextLexeme
.
getLexemeText
());
//设置词元长度
@Override
termAtt
.
setLength
(
nextLexeme
.
getLength
());
public
final
void
end
()
{
//设置词元位移
// set final offset
offsetAtt
.
setOffset
(
nextLexeme
.
getBeginPosition
(),
nextLexeme
.
getEndPosition
());
int
finalOffset
=
correctOffset
(
this
.
endPosition
);
//记录分词的最后位置
offsetAtt
.
setOffset
(
finalOffset
,
finalOffset
);
endPosition
=
nextLexeme
.
getEndPosition
();
}
//记录词元分类
typeAtt
.
setType
(
nextLexeme
.
getLexemeTypeString
());
//返会true告知还有下个词元
return
true
;
}
//返会false告知词元输出完毕
return
false
;
}
/*
* (non-Javadoc)
* @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
*/
@Override
public
void
reset
()
throws
IOException
{
super
.
reset
();
_IKImplement
.
reset
(
input
);
}
@Override
public
final
void
end
()
{
// set final offset
int
finalOffset
=
correctOffset
(
this
.
endPosition
);
offsetAtt
.
setOffset
(
finalOffset
,
finalOffset
);
}
}
}
src/main/java/org/wltea/analyzer/query/IKQueryExpressionParser.java
浏览文件 @
43c8bc9f
此差异已折叠。
点击以展开。
src/main/java/org/wltea/analyzer/query/SWMCQueryBuilder.java
浏览文件 @
43c8bc9f
/
//
**
/**
//
* IK 中文分词 版本 5.0
* IK 中文分词 版本 5.0
//
* IK Analyzer release 5.0
* IK Analyzer release 5.0
// *
*
//
* Licensed to the Apache Software Foundation (ASF) under one or more
* Licensed to the Apache Software Foundation (ASF) under one or more
//
* contributor license agreements. See the NOTICE file distributed with
* contributor license agreements. See the NOTICE file distributed with
//
* this work for additional information regarding copyright ownership.
* this work for additional information regarding copyright ownership.
//
* The ASF licenses this file to You under the Apache License, Version 2.0
* The ASF licenses this file to You under the Apache License, Version 2.0
//
* (the "License"); you may not use this file except in compliance with
* (the "License"); you may not use this file except in compliance with
//
* the License. You may obtain a copy of the License at
* the License. You may obtain a copy of the License at
//
*
*
//
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
//
*
*
//
* Unless required by applicable law or agreed to in writing, software
* Unless required by applicable law or agreed to in writing, software
//
* distributed under the License is distributed on an "AS IS" BASIS,
* distributed under the License is distributed on an "AS IS" BASIS,
//
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//
* See the License for the specific language governing permissions and
* See the License for the specific language governing permissions and
//
* limitations under the License.
* limitations under the License.
//
*
*
//
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 源代码由林良益(linliangyi2005@gmail.com)提供
//
* 版权声明 2012,乌龙茶工作室
* 版权声明 2012,乌龙茶工作室
//
* provided by Linliangyi and copyright 2012 by Oolong studio
* provided by Linliangyi and copyright 2012 by Oolong studio
// *
*
//
*/
*/
//
package org.wltea.analyzer.query;
package
org.wltea.analyzer.query
;
//
//
import java.io.IOException;
import
java.io.IOException
;
//
import java.io.StringReader;
import
java.io.StringReader
;
//
import java.util.ArrayList;
import
java.util.ArrayList
;
//
import java.util.List;
import
java.util.List
;
//
//
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import
org.apache.lucene.analysis.standard.StandardAnalyzer
;
//
import org.apache.lucene.queryparser.classic.ParseException;
import
org.apache.lucene.queryparser.classic.ParseException
;
//
import org.apache.lucene.queryparser.classic.QueryParser;
import
org.apache.lucene.queryparser.classic.QueryParser
;
//
import org.apache.lucene.search.Query;
import
org.apache.lucene.search.Query
;
//
import org.apache.lucene.util.Version;
import
org.apache.lucene.util.Version
;
//
import org.wltea.analyzer.core.IKSegmenter;
import
org.wltea.analyzer.core.IKSegmenter
;
//
import org.wltea.analyzer.core.Lexeme;
import
org.wltea.analyzer.core.Lexeme
;
//
/
//
**
/**
//
* Single Word Multi Char Query Builder
* Single Word Multi Char Query Builder
//
* IK分词算法专用
* IK分词算法专用
//
* @author linliangyi
* @author linliangyi
//
*
*
//
*/
*/
//
public class SWMCQueryBuilder {
public
class
SWMCQueryBuilder
{
//
//
/**
/**
//
* 生成SWMCQuery
* 生成SWMCQuery
//
* @param fieldName
* @param fieldName
//
* @param keywords
* @param keywords
//
* @param quickMode
* @param quickMode
//
* @return Lucene Query
* @return Lucene Query
//
*/
*/
//
public static Query create(String fieldName ,String keywords , boolean quickMode){
public
static
Query
create
(
String
fieldName
,
String
keywords
,
boolean
quickMode
){
//
if(fieldName == null || keywords == null){
if
(
fieldName
==
null
||
keywords
==
null
){
//
throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
throw
new
IllegalArgumentException
(
"参数 fieldName 、 keywords 不能为null."
);
//
}
}
//
//1.对keywords进行分词处理
//1.对keywords进行分词处理
//
List<Lexeme> lexemes = doAnalyze(keywords);
List
<
Lexeme
>
lexemes
=
doAnalyze
(
keywords
);
//
//2.根据分词结果,生成SWMCQuery
//2.根据分词结果,生成SWMCQuery
//
Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
Query
_SWMCQuery
=
getSWMCQuery
(
fieldName
,
lexemes
,
quickMode
);
//
return _SWMCQuery;
return
_SWMCQuery
;
//
}
}
//
//
/**
/**
//
* 分词切分,并返回结链表
* 分词切分,并返回结链表
//
* @param keywords
* @param keywords
//
* @return
* @return
//
*/
*/
//
private static List<Lexeme> doAnalyze(String keywords){
private
static
List
<
Lexeme
>
doAnalyze
(
String
keywords
){
//
List<Lexeme> lexemes = new ArrayList<Lexeme>();
List
<
Lexeme
>
lexemes
=
new
ArrayList
<
Lexeme
>();
//
IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);
IKSegmenter
ikSeg
=
new
IKSegmenter
(
new
StringReader
(
keywords
)
,
true
);
//
try{
try
{
//
Lexeme l = null;
Lexeme
l
=
null
;
//
while( (l = ikSeg.next()) != null){
while
(
(
l
=
ikSeg
.
next
())
!=
null
){
//
lexemes.add(l);
lexemes
.
add
(
l
);
//
}
}
//
}catch(IOException e){
}
catch
(
IOException
e
){
//
e.printStackTrace();
e
.
printStackTrace
();
//
}
}
//
return lexemes;
return
lexemes
;
//
}
}
//
//
//
/**
 * Generates the SWMC (Single Word Multi Char) search query from the analyzed lexemes.
 *
 * One pass over the lexemes builds two query strings:
 * - keywordBuffer: the full expression; consecutive single-char lexemes whose
 *   positions are adjacent are concatenated into one token, otherwise lexemes
 *   are space-separated.
 * - keywordBuffer_Short: the condensed expression containing only lexemes
 *   longer than one char.
 * When quickMode is set and multi-char lexemes cover more than half of the
 * total analyzed length, the condensed expression is parsed; otherwise the
 * full expression is parsed with an AND default operator and auto phrase
 * queries enabled.
 *
 * @param fieldName the Lucene field the query targets
 * @param lexemes   lexemes produced by IK analysis of the keywords
 * @param quickMode when true, allow the condensed expression to be used
 * @return the parsed Query, or null if the expression was empty or failed to parse
 */
private static Query getSWMCQuery(String fieldName , List<Lexeme> lexemes , boolean quickMode){
	//full SWMC query expression
	StringBuffer keywordBuffer = new StringBuffer();
	//condensed SWMC query expression (multi-char lexemes only)
	StringBuffer keywordBuffer_Short = new StringBuffer();
	//length of the previous lexeme
	int lastLexemeLength = 0;
	//end position of the previous lexeme
	int lastLexemeEnd = -1;

	int shortCount = 0;
	int totalCount = 0;
	for(Lexeme l : lexemes){
		totalCount += l.getLength();
		//condensed expression: keep only lexemes longer than one char
		if(l.getLength() > 1){
			keywordBuffer_Short.append(' ').append(l.getLexemeText());
			shortCount += l.getLength();
		}

		if(lastLexemeLength == 0){
			// first lexeme: no separator
			keywordBuffer.append(l.getLexemeText());
		}else if(lastLexemeLength == 1 && l.getLength() == 1
				&& lastLexemeEnd == l.getBeginPosition()){//adjacent single-char lexemes: merge into one token
			keywordBuffer.append(l.getLexemeText());
		}else{
			// otherwise separate tokens with a space
			keywordBuffer.append(' ').append(l.getLexemeText());

		}
		lastLexemeLength = l.getLength();
		lastLexemeEnd = l.getEndPosition();
	}

	//use the Lucene QueryParser to produce the SWMC Query
	QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, new StandardAnalyzer(Version.LUCENE_40));
	qp.setDefaultOperator(QueryParser.AND_OPERATOR);
	qp.setAutoGeneratePhraseQueries(true);

	// NOTE: with an empty lexeme list, totalCount is 0 and the float ratio is
	// NaN, so the comparison is false and the else branch (empty-buffer check)
	// runs, returning null.
	if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
		try {
			//System.out.println(keywordBuffer.toString());
			Query q = qp.parse(keywordBuffer_Short.toString());
			return q;

		} catch (ParseException e) {
			e.printStackTrace();
		}

	}else{
		if(keywordBuffer.length() > 0){
			try {
				//System.out.println(keywordBuffer.toString());
				Query q = qp.parse(keywordBuffer.toString());
				return q;

			} catch (ParseException e) {
				e.printStackTrace();
			}
		}
	}
	// fall through on parse failure or empty expression
	return null;
}
//
}
}
src/main/java/org/wltea/analyzer/sample/LuceneIndexAndSearchDemo.java
浏览文件 @
43c8bc9f
/
//
**
/**
//
* IK 中文分词 版本 5.0
* IK 中文分词 版本 5.0
//
* IK Analyzer release 5.0
* IK Analyzer release 5.0
// *
*
//
* Licensed to the Apache Software Foundation (ASF) under one or more
* Licensed to the Apache Software Foundation (ASF) under one or more
//
* contributor license agreements. See the NOTICE file distributed with
* contributor license agreements. See the NOTICE file distributed with
//
* this work for additional information regarding copyright ownership.
* this work for additional information regarding copyright ownership.
//
* The ASF licenses this file to You under the Apache License, Version 2.0
* The ASF licenses this file to You under the Apache License, Version 2.0
//
* (the "License"); you may not use this file except in compliance with
* (the "License"); you may not use this file except in compliance with
//
* the License. You may obtain a copy of the License at
* the License. You may obtain a copy of the License at
//
*
*
//
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
//
*
*
//
* Unless required by applicable law or agreed to in writing, software
* Unless required by applicable law or agreed to in writing, software
//
* distributed under the License is distributed on an "AS IS" BASIS,
* distributed under the License is distributed on an "AS IS" BASIS,
//
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//
* See the License for the specific language governing permissions and
* See the License for the specific language governing permissions and
//
* limitations under the License.
* limitations under the License.
//
*
*
//
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 源代码由林良益(linliangyi2005@gmail.com)提供
//
* 版权声明 2012,乌龙茶工作室
* 版权声明 2012,乌龙茶工作室
//
* provided by Linliangyi and copyright 2012 by Oolong studio
* provided by Linliangyi and copyright 2012 by Oolong studio
// *
*
// *
*
//
*/
*/
//
package org.wltea.analyzer.sample;
package
org.wltea.analyzer.sample
;
//
//
import java.io.IOException;
import
java.io.IOException
;
//
//
import org.apache.lucene.analysis.Analyzer;
import
org.apache.lucene.analysis.Analyzer
;
//
import org.apache.lucene.document.Document;
import
org.apache.lucene.document.Document
;
//
import org.apache.lucene.document.Field;
import
org.apache.lucene.document.Field
;
//
import org.apache.lucene.document.StringField;
import
org.apache.lucene.document.StringField
;
//
import org.apache.lucene.document.TextField;
import
org.apache.lucene.document.TextField
;
//
import org.apache.lucene.index.CorruptIndexException;
import
org.apache.lucene.index.CorruptIndexException
;
//
import org.apache.lucene.index.DirectoryReader;
import
org.apache.lucene.index.DirectoryReader
;
//
import org.apache.lucene.index.IndexReader;
import
org.apache.lucene.index.IndexReader
;
//
import org.apache.lucene.index.IndexWriter;
import
org.apache.lucene.index.IndexWriter
;
//
import org.apache.lucene.index.IndexWriterConfig;
import
org.apache.lucene.index.IndexWriterConfig
;
//
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import
org.apache.lucene.index.IndexWriterConfig.OpenMode
;
//
import org.apache.lucene.queryparser.classic.ParseException;
import
org.apache.lucene.queryparser.classic.ParseException
;
//
import org.apache.lucene.queryparser.classic.QueryParser;
import
org.apache.lucene.queryparser.classic.QueryParser
;
//
import org.apache.lucene.search.IndexSearcher;
import
org.apache.lucene.search.IndexSearcher
;
//
import org.apache.lucene.search.Query;
import
org.apache.lucene.search.Query
;
//
import org.apache.lucene.search.ScoreDoc;
import
org.apache.lucene.search.ScoreDoc
;
//
import org.apache.lucene.search.TopDocs;
import
org.apache.lucene.search.TopDocs
;
//
import org.apache.lucene.store.Directory;
import
org.apache.lucene.store.Directory
;
//
import org.apache.lucene.store.LockObtainFailedException;
import
org.apache.lucene.store.LockObtainFailedException
;
//
import org.apache.lucene.store.RAMDirectory;
import
org.apache.lucene.store.RAMDirectory
;
//
import org.apache.lucene.util.Version;
import
org.apache.lucene.util.Version
;
//
import org.wltea.analyzer.lucene.IKAnalyzer;
import
org.wltea.analyzer.lucene.IKAnalyzer
;
//
//
//
//
/
//
**
/**
//
* 使用IKAnalyzer进行Lucene索引和查询的演示
* 使用IKAnalyzer进行Lucene索引和查询的演示
//
* 2012-3-2
* 2012-3-2
// *
*
//
* 以下是结合Lucene4.0 API的写法
* 以下是结合Lucene4.0 API的写法
//
*
*
//
*/
*/
//
public class LuceneIndexAndSearchDemo {
public
class
LuceneIndexAndSearchDemo
{
//
//
//
/**
/**
//
* 模拟:
* 模拟:
//
* 创建一个单条记录的索引,并对其进行搜索
* 创建一个单条记录的索引,并对其进行搜索
//
* @param args
* @param args
//
*/
*/
//
public static void main(String[] args){
public
static
void
main
(
String
[]
args
){
//
//Lucene Document的域名
//Lucene Document的域名
//
String fieldName = "text";
String
fieldName
=
"text"
;
//
//检索内容
//检索内容
//
String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
String
text
=
"IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。"
;
//
//
//实例化IKAnalyzer分词器
//实例化IKAnalyzer分词器
//
Analyzer analyzer = new IKAnalyzer(true);
Analyzer
analyzer
=
new
IKAnalyzer
(
true
);
//
//
Directory directory = null;
Directory
directory
=
null
;
//
IndexWriter iwriter = null;
IndexWriter
iwriter
=
null
;
//
IndexReader ireader = null;
IndexReader
ireader
=
null
;
//
IndexSearcher isearcher = null;
IndexSearcher
isearcher
=
null
;
//
try {
try
{
//
//建立内存索引对象
//建立内存索引对象
// directory = new RAMDirectory();
directory
=
new
RAMDirectory
();
//
//
//配置IndexWriterConfig
//配置IndexWriterConfig
//
IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40 , analyzer);
IndexWriterConfig
iwConfig
=
new
IndexWriterConfig
(
Version
.
LUCENE_40
,
analyzer
);
//
iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
iwConfig
.
setOpenMode
(
OpenMode
.
CREATE_OR_APPEND
);
//
iwriter = new IndexWriter(directory , iwConfig);
iwriter
=
new
IndexWriter
(
directory
,
iwConfig
);
//
//写入索引
//写入索引
//
Document doc = new Document();
Document
doc
=
new
Document
();
//
doc.add(new StringField("ID", "10000", Field.Store.YES));
doc
.
add
(
new
StringField
(
"ID"
,
"10000"
,
Field
.
Store
.
YES
));
//
doc.add(new TextField(fieldName, text, Field.Store.YES));
doc
.
add
(
new
TextField
(
fieldName
,
text
,
Field
.
Store
.
YES
));
//
iwriter.addDocument(doc);
iwriter
.
addDocument
(
doc
);
//
iwriter.close();
iwriter
.
close
();
//
//
//
//搜索过程**********************************
//搜索过程**********************************
// //实例化搜索器
//实例化搜索器
//
ireader = DirectoryReader.open(directory);
ireader
=
DirectoryReader
.
open
(
directory
);
// isearcher = new IndexSearcher(ireader);
isearcher
=
new
IndexSearcher
(
ireader
);
//
// String keyword = "中文分词工具包";
String
keyword
=
"中文分词工具包"
;
//
//使用QueryParser查询分析器构造Query对象
//使用QueryParser查询分析器构造Query对象
//
QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer);
QueryParser
qp
=
new
QueryParser
(
Version
.
LUCENE_40
,
fieldName
,
analyzer
);
//
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
qp
.
setDefaultOperator
(
QueryParser
.
AND_OPERATOR
);
//
Query query = qp.parse(keyword);
Query
query
=
qp
.
parse
(
keyword
);
//
System.out.println("Query = " + query);
System
.
out
.
println
(
"Query = "
+
query
);
//
//
//搜索相似度最高的5条记录
//搜索相似度最高的5条记录
//
TopDocs topDocs = isearcher.search(query , 5);
TopDocs
topDocs
=
isearcher
.
search
(
query
,
5
);
//
System.out.println("命中:" + topDocs.totalHits);
System
.
out
.
println
(
"命中:"
+
topDocs
.
totalHits
);
//
//输出结果
//输出结果
//
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
ScoreDoc
[]
scoreDocs
=
topDocs
.
scoreDocs
;
//
for (int i = 0; i < topDocs.totalHits; i++){
for
(
int
i
=
0
;
i
<
topDocs
.
totalHits
;
i
++){
//
Document targetDoc = isearcher.doc(scoreDocs[i].doc);
Document
targetDoc
=
isearcher
.
doc
(
scoreDocs
[
i
].
doc
);
//
System.out.println("内容:" + targetDoc.toString());
System
.
out
.
println
(
"内容:"
+
targetDoc
.
toString
());
// }
}
//
//
} catch (CorruptIndexException e) {
}
catch
(
CorruptIndexException
e
)
{
//
e.printStackTrace();
e
.
printStackTrace
();
//
} catch (LockObtainFailedException e) {
}
catch
(
LockObtainFailedException
e
)
{
//
e.printStackTrace();
e
.
printStackTrace
();
//
} catch (IOException e) {
}
catch
(
IOException
e
)
{
//
e.printStackTrace();
e
.
printStackTrace
();
//
} catch (ParseException e) {
}
catch
(
ParseException
e
)
{
//
e.printStackTrace();
e
.
printStackTrace
();
//
} finally{
}
finally
{
//
if(ireader != null){
if
(
ireader
!=
null
){
//
try {
try
{
//
ireader.close();
ireader
.
close
();
//
} catch (IOException e) {
}
catch
(
IOException
e
)
{
//
e.printStackTrace();
e
.
printStackTrace
();
//
}
}
//
}
}
//
if(directory != null){
if
(
directory
!=
null
){
//
try {
try
{
//
directory.close();
directory
.
close
();
//
} catch (IOException e) {
}
catch
(
IOException
e
)
{
//
e.printStackTrace();
e
.
printStackTrace
();
//
}
}
//
}
}
//
}
}
//
}
}
//
}
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录