Commit 43c8bc9f, authored by weixin_43283383

Merge pull request #10 from wyhw/ik_lucene4

elasticsearch ik 0.20.x => 0.90.x
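
The jump from elasticsearch 0.20.x to 0.90.x crosses the Lucene 3.x to 4.x boundary, and that API break drives most of the edits below: in Lucene 4, Analyzer.tokenStream() is final and analyzers override createComponents() instead, consumers must follow the reset()/incrementToken()/end()/close() lifecycle, and this plugin additionally moves its Dictionary from static methods to a singleton accessor. A minimal sketch of the new Analyzer shape (not taken from this commit):

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.wltea.analyzer.lucene.IKTokenizer;

public class Lucene4StyleAnalyzer extends Analyzer {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // Lucene 4 creates the components once per thread and reuses them
        // via reset(Reader); the single-argument TokenStreamComponents
        // exposes the tokenizer itself as the token stream
        Tokenizer tokenizer = new IKTokenizer(reader, true);
        return new TokenStreamComponents(tokenizer);
    }
}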
pom.xml
@@ -31,7 +31,7 @@
     </parent>
 
     <properties>
-        <elasticsearch.version>0.20.2</elasticsearch.version>
+        <elasticsearch.version>0.90.0</elasticsearch.version>
     </properties>
 
     <repositories>
@@ -132,4 +132,4 @@
         </plugin>
     </plugins>
 </build>
 </project>
\ No newline at end of file
IkAnalyzer.java
@@ -2,19 +2,32 @@ package org.elasticsearch.index.analysis;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.wltea.analyzer.lucene.IKTokenizer;
+//import org.wltea.lucene.IKTokenizer;
 
 import java.io.Reader;
 
 public class IkAnalyzer extends Analyzer {
+//    private boolean isMaxWordLength = false;
 
-    @Override public TokenStream tokenStream(String fieldName, Reader reader) {
-        return new IKTokenizer(reader,true);
-    }
+//    @Override public TokenStream tokenStream(String fieldName, Reader reader) {
+//        return new IKTokenizer(reader,true);
+//    }
 
     public IkAnalyzer() {
         super();
     }
+
+    @Override
+    protected TokenStreamComponents createComponents(String s, Reader reader) {
+        Tokenizer tokenizer = new IKTokenizer(reader, true);
+        return new TokenStreamComponents(tokenizer, null);
+    }
+
+//    public boolean isMaxWordLength() {
+//        return isMaxWordLength;
+//    }
 }
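
One detail worth flagging in the hunk above: new TokenStreamComponents(tokenizer, null) passes null as the result stream. In stock Lucene 4 the two-argument constructor stores its second argument as the sink returned by Analyzer.tokenStream(), so a null there yields a null token stream; the single-argument form, which the IKAnalyzer rewrite later in this diff uses, is the conventional way to expose the tokenizer directly. A hedged correction of the method above:

    @Override
    protected TokenStreamComponents createComponents(String s, Reader reader) {
        Tokenizer tokenizer = new IKTokenizer(reader, true);
        // the tokenizer itself becomes the sink that tokenStream() returns
        return new TokenStreamComponents(tokenizer);
    }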
AnalyzeContext.java
@@ -24,11 +24,16 @@
  */
 package org.wltea.analyzer.core;
 
+import org.wltea.analyzer.dic.Dictionary;
+
 import java.io.IOException;
 import java.io.Reader;
-import java.util.*;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.Map;
+import java.util.Set;
 
-import org.wltea.analyzer.cfg.Configuration;
-import org.wltea.analyzer.dic.Dictionary;
 
 /**
  *
@@ -68,12 +73,12 @@ class AnalyzeContext {
     private Map<Integer , LexemePath> pathMap;
     // final segmentation result set
     private LinkedList<Lexeme> results;
+    private boolean useSmart;
 
     // segmenter configuration
-    private boolean useSmart;
+    // private Configuration cfg;
 
     public AnalyzeContext(boolean useSmart){
         this.useSmart = useSmart;
         this.segmentBuff = new char[BUFF_SIZE];
         this.charTypes = new int[BUFF_SIZE];
         this.buffLocker = new HashSet<String>();
@@ -313,7 +318,7 @@ class AnalyzeContext {
         while(result != null){
             // merge numerals and classifiers
             this.compound(result);
-            if(Dictionary.isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
+            if(Dictionary.getSingleton().isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
                 // stopword: move on to the next lexeme in the list
                 result = this.results.pollFirst();
             }else{
@@ -344,6 +349,7 @@ class AnalyzeContext {
      * compound lexemes
      */
     private void compound(Lexeme result){
+
         if(!this.useSmart){
             return ;
         }
...
CJKSegmenter.java
@@ -25,12 +25,12 @@
  */
 package org.wltea.analyzer.core;
 
+import org.wltea.analyzer.dic.Dictionary;
+import org.wltea.analyzer.dic.Hit;
+
 import java.util.LinkedList;
 import java.util.List;
 
-import org.wltea.analyzer.dic.Dictionary;
-import org.wltea.analyzer.dic.Hit;
 
 /**
  * Chinese and Japanese/Korean (CJK) sub-segmenter
@@ -58,7 +58,7 @@ class CJKSegmenter implements ISegmenter {
             // process the queue of pending hits
             Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
             for(Hit hit : tmpArray){
-                hit = Dictionary.matchInMainDictWithHit(context.getSegmentBuff(), context.getCursor() , hit);
+                hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
                 if(hit.isMatch()){
                     // emit the current word
                     Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
@@ -77,7 +77,7 @@ class CJKSegmenter implements ISegmenter {
             //*********************************
             // then try a single-character match at the current cursor
-            Hit singleCharHit = Dictionary.matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
+            Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
             if(singleCharHit.isMatch()){// a single char forms a word
                 // emit the current word
                 Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
...
CN_QuantifierSegmenter.java
@@ -24,14 +24,14 @@
  */
 package org.wltea.analyzer.core;
 
+import org.wltea.analyzer.dic.Dictionary;
+import org.wltea.analyzer.dic.Hit;
+
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Set;
 
-import org.wltea.analyzer.dic.Dictionary;
-import org.wltea.analyzer.dic.Hit;
 
 /**
  *
  * Chinese numeral/classifier sub-segmenter
@@ -155,7 +155,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
             // process the queue of pending hits
             Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
             for(Hit hit : tmpArray){
-                hit = Dictionary.matchInMainDictWithHit(context.getSegmentBuff(), context.getCursor() , hit);
+                hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
                 if(hit.isMatch()){
                     // emit the current word
                     Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
@@ -174,7 +174,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
             //*********************************
             // try a single-character match at the current cursor
-            Hit singleCharHit = Dictionary.matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
+            Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
             if(singleCharHit.isMatch()){// a single char forms a classifier
                 // emit the current word
                 Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
...
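
The recurring change in the hunks above swaps static Dictionary calls for calls through Dictionary.getSingleton(). The Dictionary class itself is outside this diff, so the following is only a sketch of the shape those call sites imply; every name beyond getSingleton() and the match/isStopWord methods seen above is an assumption:

package org.wltea.analyzer.dic;

public class Dictionary {

    private static volatile Dictionary singleton;

    private Dictionary() {
        // load the main, quantifier and stopword dictionaries here
    }

    // call sites now share one loaded instance instead of static state
    public static Dictionary getSingleton() {
        if (singleton == null) {
            synchronized (Dictionary.class) {
                if (singleton == null) {
                    singleton = new Dictionary();
                }
            }
        }
        return singleton;
    }

    public boolean isStopWord(char[] buff, int begin, int length) {
        // in the real class: a trie lookup against the stopword dictionary
        return false;
    }
}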
IKArbitrator.java
@@ -38,7 +38,7 @@ class IKArbitrator {
     /**
      * ambiguity resolution
-     * @param orgLexemes
+     * // @param orgLexemes
      * @param useSmart
      */
     void process(AnalyzeContext context , boolean useSmart){
@@ -87,7 +87,6 @@ class IKArbitrator {
      * ambiguity detection
      * @param lexemeCell head of the ambiguous-path linked list
      * @param fullTextLength text length of the ambiguous path
-     * @param option candidate result path
      * @return
      */
     private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){
@@ -120,7 +119,7 @@ class IKArbitrator {
     /**
      * walk forward, adding lexemes to build an unambiguous lexeme combination
-     * @param LexemePath path
+     * // @param LexemePath path
      * @return
      */
     private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){
@@ -140,7 +139,7 @@ class IKArbitrator {
     /**
      * roll the lexeme chain back until it can accept the given lexeme
-     * @param lexeme
+     * // @param lexeme
      * @param l
      */
     private void backPath(Lexeme l , LexemePath option){
...
IKSegmenter.java
@@ -23,14 +23,15 @@
  */
 package org.wltea.analyzer.core;
 
-import org.elasticsearch.common.logging.ESLogger;
-import org.elasticsearch.common.logging.Loggers;
-
 import java.io.IOException;
 import java.io.Reader;
 import java.util.ArrayList;
 import java.util.List;
 
+import org.wltea.analyzer.cfg.Configuration;
+//import org.wltea.analyzer.cfg.DefaultConfig;
+import org.wltea.analyzer.dic.Dictionary;
 
 /**
  * IK segmenter main class
  *
@@ -39,16 +40,18 @@ public final class IKSegmenter {
     // source text Reader
     private Reader input;
+    // segmenter configuration
+    private Configuration cfg;
     // segmentation context
     private AnalyzeContext context;
     // list of sub-segmenters
     private List<ISegmenter> segmenters;
     // ambiguity arbitrator
     private IKArbitrator arbitrator;
-    private ESLogger logger=null;
-    private final boolean useSmart;
+    private boolean useSmart = false;
 
     /**
      * IK segmenter constructor
      * @param input
      * @param useSmart true enables the smart segmentation strategy
@@ -57,16 +60,31 @@ public final class IKSegmenter {
      * smart segmentation: merges numerals and classifiers, resolves ambiguity in the results
      */
     public IKSegmenter(Reader input , boolean useSmart){
-        logger = Loggers.getLogger("ik-analyzer");
         this.input = input;
+        // this.cfg = DefaultConfig.getInstance();
         this.useSmart=useSmart;
        this.init();
     }
+
+    /**
+     * IK segmenter constructor
+     * @param input
+     * @param cfg build the segmenter with a custom Configuration
+     *
+     */
+    public IKSegmenter(Reader input , Configuration cfg){
+        this.input = input;
+        this.cfg = cfg;
+        this.init();
+    }
 
     /**
      * initialization
     */
     private void init(){
+        // initialize the dictionary singleton
+        // Dictionary.initial(this.cfg);
+        // Dictionary.getSingleton();
         // initialize the segmentation context
         this.context = new AnalyzeContext(useSmart);
         // load the sub-segmenters
...
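
For reference, IKSegmenter can also be driven directly, outside Lucene, through the same pull-style next() API that SWMCQueryBuilder further down relies on. A minimal sketch, assuming the dictionaries load from the classpath:

import java.io.IOException;
import java.io.StringReader;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKSegmenterDemo {
    public static void main(String[] args) throws IOException {
        // smart mode: numerals and classifiers merged, ambiguity resolved
        IKSegmenter seg = new IKSegmenter(new StringReader("中华人民共和国"), true);
        Lexeme lex;
        while ((lex = seg.next()) != null) {  // next() returns null when exhausted
            System.out.println(lex.getLexemeText()
                    + " [" + lex.getBeginPosition() + "," + lex.getEndPosition() + ")");
        }
    }
}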
LetterSegmenter.java
@@ -120,7 +120,7 @@ class LetterSegmenter implements ISegmenter {
     /**
      * handle mixed alphanumeric output
      * e.g. windos2000 | linliangyi2005@gmail.com
-     * @param input
+     * // @param input
      * @param context
      * @return
      */
...
DictSegment.java
@@ -326,13 +326,5 @@ class DictSegment implements Comparable<DictSegment>{
         // compare the char stored on the current node
         return this.nodeChar.compareTo(o.nodeChar);
     }
-
-    public int getDicNum(){
-        if(charMap!=null)
-        {
-            return charMap.size();
-        }
-        return 0;
-    }
 }
Hit.java
@@ -58,7 +58,9 @@ public class Hit {
     public boolean isMatch() {
         return (this.hitState & MATCH) > 0;
     }
-
+    /**
+     *
+     */
     public void setMatch() {
         this.hitState = this.hitState | MATCH;
     }
@@ -69,7 +71,9 @@ public class Hit {
     public boolean isPrefix() {
         return (this.hitState & PREFIX) > 0;
     }
-
+    /**
+     *
+     */
     public void setPrefix() {
         this.hitState = this.hitState | PREFIX;
     }
@@ -79,7 +83,9 @@ public class Hit {
     public boolean isUnmatch() {
         return this.hitState == UNMATCH ;
     }
-
+    /**
+     *
+     */
     public void setUnmatch() {
         this.hitState = UNMATCH;
     }
...
IKAnalyzer.java
 /**
+ * IK Chinese Analyzer, version 5.0.1
+ * IK Analyzer release 5.0.1
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * source code provided by Lin Liangyi (linliangyi2005@gmail.com)
+ * copyright 2012, Oolong Studio
+ * provided by Linliangyi and copyright 2012 by Oolong studio
  *
  */
 package org.wltea.analyzer.lucene;
 
+import java.io.Reader;
+
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.elasticsearch.common.settings.Settings;
 import org.wltea.analyzer.dic.Dictionary;
 
-import java.io.Reader;
-
-public final class IKAnalyzer extends Analyzer {
+/**
+ * IK analyzer: Lucene Analyzer implementation,
+ * compatible with Lucene 4.0
+ */
+public final class IKAnalyzer extends Analyzer{
 
-    private boolean isMaxWordLength = false;
-    private boolean useSmart=false;
+    private boolean useSmart;
+
+    public boolean useSmart() {
+        return useSmart;
+    }
+
+    public void setUseSmart(boolean useSmart) {
+        this.useSmart = useSmart;
+    }
 
+    /**
+     * IK analyzer, Lucene Analyzer interface implementation
+     *
+     * defaults to fine-grained segmentation
+     */
     public IKAnalyzer(){
         this(false);
     }
 
-    public IKAnalyzer(boolean isMaxWordLength){
+    /**
+     * IK analyzer, Lucene Analyzer interface implementation
+     *
+     * @param useSmart when true, the analyzer uses smart segmentation
+     */
+    public IKAnalyzer(boolean useSmart){
         super();
-        this.setMaxWordLength(isMaxWordLength);
+        this.useSmart = useSmart;
     }
 
     public IKAnalyzer(Settings indexSetting,Settings settings1) {
         super();
         Dictionary.getInstance().Init(indexSetting);
         if(settings1.get("use_smart", "true").equals("true")){
             useSmart = true;
         }
     }
 
+    /**
+     * override Analyzer to build the token stream components
+     */
     @Override
-    public TokenStream tokenStream(String fieldName, Reader reader) {
-        return new IKTokenizer(reader , useSmart);
-    }
-
-    public void setMaxWordLength(boolean isMaxWordLength) {
-        this.isMaxWordLength = isMaxWordLength;
-    }
-
-    public boolean isMaxWordLength() {
-        return isMaxWordLength;
-    }
+    protected TokenStreamComponents createComponents(String fieldName, final Reader in) {
+        Tokenizer _IKTokenizer = new IKTokenizer(in , this.useSmart());
+        return new TokenStreamComponents(_IKTokenizer);
+    }
 }
IKTokenizer.java
 /**
  * IK Chinese Analyzer, version 5.0.1
  * IK Analyzer release 5.0.1
  *
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
@@ -20,94 +20,95 @@
  * source code provided by Lin Liangyi (linliangyi2005@gmail.com)
  * copyright 2012, Oolong Studio
  * provided by Linliangyi and copyright 2012 by Oolong studio
  *
  *
  */
 package org.wltea.analyzer.lucene;
 
-import java.io.IOException;
-import java.io.Reader;
-
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.wltea.analyzer.core.IKSegmenter;
 import org.wltea.analyzer.core.Lexeme;
 
+import java.io.IOException;
+import java.io.Reader;
+
 /**
  * IK tokenizer: Lucene Tokenizer adapter,
  * compatible with Lucene 4.0
  */
 public final class IKTokenizer extends Tokenizer {
 
     // the underlying IK segmenter
     private IKSegmenter _IKImplement;
 
     // term text attribute
     private final CharTermAttribute termAtt;
     // term offset attribute
     private final OffsetAttribute offsetAtt;
     // term type attribute (see the type constants in org.wltea.analyzer.core.Lexeme)
     private final TypeAttribute typeAtt;
     // end position of the last lexeme
     private int endPosition;
 
     /**
      * Lucene 4.0 Tokenizer adapter constructor
      * @param in
      * @param useSmart
      */
     public IKTokenizer(Reader in , boolean useSmart){
         super(in);
         offsetAtt = addAttribute(OffsetAttribute.class);
         termAtt = addAttribute(CharTermAttribute.class);
         typeAtt = addAttribute(TypeAttribute.class);
         _IKImplement = new IKSegmenter(input , useSmart);
     }
 
     /* (non-Javadoc)
      * @see org.apache.lucene.analysis.TokenStream#incrementToken()
      */
     @Override
     public boolean incrementToken() throws IOException {
         // clear all term attributes
         clearAttributes();
         Lexeme nextLexeme = _IKImplement.next();
         if(nextLexeme != null){
             // convert the Lexeme into Attributes
             // set the term text
             termAtt.append(nextLexeme.getLexemeText());
             // set the term length
             termAtt.setLength(nextLexeme.getLength());
             // set the term offsets
             offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
             // record the end position of this lexeme
             endPosition = nextLexeme.getEndPosition();
             // record the lexeme type
             typeAtt.setType(nextLexeme.getLexemeTypeString());
             // return true to signal that another token is available
             return true;
         }
         // return false to signal that token output is finished
         return false;
     }
 
     /*
      * (non-Javadoc)
      * @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
      */
     @Override
     public void reset() throws IOException {
         super.reset();
         _IKImplement.reset(input);
     }
 
     @Override
     public final void end() {
         // set final offset
         int finalOffset = correctOffset(this.endPosition);
         offsetAtt.setOffset(finalOffset, finalOffset);
     }
 }
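
The reset() and end() overrides above exist to satisfy the Lucene 4 consumer contract, which drives every TokenStream through a fixed lifecycle: reset(), then incrementToken() until it returns false, then end() and close(). A minimal consumption sketch (not part of this commit, and assuming the IK dictionaries are available):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class TokenStreamContractDemo {
    public static void main(String[] args) throws IOException {
        IKAnalyzer analyzer = new IKAnalyzer(true);
        TokenStream ts = analyzer.tokenStream("text", new StringReader("中华人民共和国"));
        CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
        OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
        ts.reset();                 // must be called before the first incrementToken()
        while (ts.incrementToken()) {
            System.out.println(term.toString()
                    + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
        }
        ts.end();                   // writes the final offset, see end() above
        ts.close();
    }
}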
SWMCQueryBuilder.java
(The whole file had been commented out with leading // markers; this commit re-enables it. The resulting file:)
/**
 * IK Chinese Analyzer, version 5.0
 * IK Analyzer release 5.0
 * (Apache License 2.0 header, identical to the files above;
 *  source code provided by Lin Liangyi, copyright 2012 by Oolong studio)
 */
package org.wltea.analyzer.query;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

/**
 * Single Word Multi Char Query Builder
 * dedicated to the IK segmentation algorithm
 * @author linliangyi
 *
 */
public class SWMCQueryBuilder {

    /**
     * build a SWMCQuery
     * @param fieldName
     * @param keywords
     * @param quickMode
     * @return Lucene Query
     */
    public static Query create(String fieldName ,String keywords , boolean quickMode){
        if(fieldName == null || keywords == null){
            throw new IllegalArgumentException("parameters fieldName and keywords must not be null.");
        }
        // 1. analyze the keywords
        List<Lexeme> lexemes = doAnalyze(keywords);
        // 2. build the SWMCQuery from the analysis result
        Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
        return _SWMCQuery;
    }

    /**
     * analyze the text and return the lexeme list
     * @param keywords
     * @return
     */
    private static List<Lexeme> doAnalyze(String keywords){
        List<Lexeme> lexemes = new ArrayList<Lexeme>();
        IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);
        try{
            Lexeme l = null;
            while( (l = ikSeg.next()) != null){
                lexemes.add(l);
            }
        }catch(IOException e){
            e.printStackTrace();
        }
        return lexemes;
    }


    /**
     * build the SWMC query from the analysis result
     * @param fieldName
     * // @param pathOption
     * @param quickMode
     * @return
     */
    private static Query getSWMCQuery(String fieldName , List<Lexeme> lexemes , boolean quickMode){
        // the SWMC query expression
        StringBuffer keywordBuffer = new StringBuffer();
        // the condensed SWMC query expression
        StringBuffer keywordBuffer_Short = new StringBuffer();
        // length of the last lexeme
        int lastLexemeLength = 0;
        // end position of the last lexeme
        int lastLexemeEnd = -1;

        int shortCount = 0;
        int totalCount = 0;
        for(Lexeme l : lexemes){
            totalCount += l.getLength();
            // condensed expression
            if(l.getLength() > 1){
                keywordBuffer_Short.append(' ').append(l.getLexemeText());
                shortCount += l.getLength();
            }

            if(lastLexemeLength == 0){
                keywordBuffer.append(l.getLexemeText());
            }else if(lastLexemeLength == 1 && l.getLength() == 1
                    && lastLexemeEnd == l.getBeginPosition()){// adjacent single-char lexemes are merged
                keywordBuffer.append(l.getLexemeText());
            }else{
                keywordBuffer.append(' ').append(l.getLexemeText());

            }
            lastLexemeLength = l.getLength();
            lastLexemeEnd = l.getEndPosition();
        }

        // use the Lucene QueryParser to build the SWMC Query
        QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, new StandardAnalyzer(Version.LUCENE_40));
        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
        qp.setAutoGeneratePhraseQueries(true);

        if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
            try {
                //System.out.println(keywordBuffer.toString());
                Query q = qp.parse(keywordBuffer_Short.toString());
                return q;
            } catch (ParseException e) {
                e.printStackTrace();
            }

        }else{
            if(keywordBuffer.length() > 0){
                try {
                    //System.out.println(keywordBuffer.toString());
                    Query q = qp.parse(keywordBuffer.toString());
                    return q;
                } catch (ParseException e) {
                    e.printStackTrace();
                }
            }
        }
        return null;
    }
}
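
A short usage sketch for the re-enabled builder; the field name and keywords are illustrative:

import org.apache.lucene.search.Query;
import org.wltea.analyzer.query.SWMCQueryBuilder;

public class SWMCQueryDemo {
    public static void main(String[] args) {
        // quickMode=true prefers the condensed expression when multi-char
        // lexemes cover more than half of the analyzed keywords
        Query query = SWMCQueryBuilder.create("text", "中文分词工具包", true);
        System.out.println(query);
    }
}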
LuceneIndexAndSearchDemo.java
(Likewise previously commented out in full and re-enabled by this commit. The resulting file:)
/**
 * IK Chinese Analyzer, version 5.0
 * IK Analyzer release 5.0
 * (Apache License 2.0 header, identical to the files above;
 *  source code provided by Lin Liangyi, copyright 2012 by Oolong studio)
 */
package org.wltea.analyzer.sample;

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;


/**
 * Demo of indexing and searching with IKAnalyzer in Lucene
 * 2012-3-2
 *
 * written against the Lucene 4.0 API
 *
 */
public class LuceneIndexAndSearchDemo {


    /**
     * simulation:
     * create a single-document index and search it
     * @param args
     */
    public static void main(String[] args){
        // Lucene Document field name
        String fieldName = "text";
        // content to index
        String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";

        // instantiate the IKAnalyzer
        Analyzer analyzer = new IKAnalyzer(true);

        Directory directory = null;
        IndexWriter iwriter = null;
        IndexReader ireader = null;
        IndexSearcher isearcher = null;
        try {
            // create an in-memory index
            directory = new RAMDirectory();

            // configure the IndexWriterConfig
            IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40 , analyzer);
            iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
            iwriter = new IndexWriter(directory , iwConfig);
            // write the index
            Document doc = new Document();
            doc.add(new StringField("ID", "10000", Field.Store.YES));
            doc.add(new TextField(fieldName, text, Field.Store.YES));
            iwriter.addDocument(doc);
            iwriter.close();


            // search **********************************
            // instantiate the searcher
            ireader = DirectoryReader.open(directory);
            isearcher = new IndexSearcher(ireader);

            String keyword = "中文分词工具包";
            // build the Query with the QueryParser
            QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer);
            qp.setDefaultOperator(QueryParser.AND_OPERATOR);
            Query query = qp.parse(keyword);
            System.out.println("Query = " + query);

            // fetch the 5 highest-scoring documents
            TopDocs topDocs = isearcher.search(query , 5);
            System.out.println("hits: " + topDocs.totalHits);
            // print the results
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
            for (int i = 0; i < topDocs.totalHits; i++){
                Document targetDoc = isearcher.doc(scoreDocs[i].doc);
                System.out.println("content: " + targetDoc.toString());
            }

        } catch (CorruptIndexException e) {
            e.printStackTrace();
        } catch (LockObtainFailedException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        } finally{
            if(ireader != null){
                try {
                    ireader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if(directory != null){
                try {
                    directory.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}