Commit 5e14e3d6 authored by wangweihua

elasticsearch ik 0.20.x => 0.90.x

Parent a2dc3c78
......@@ -31,7 +31,7 @@
</parent>
<properties>
<elasticsearch.version>0.20.2</elasticsearch.version>
<elasticsearch.version>0.90.0</elasticsearch.version>
</properties>
<repositories>
......@@ -132,4 +132,4 @@
</plugin>
</plugins>
</build>
</project>
\ No newline at end of file
</project>
......@@ -2,19 +2,32 @@ package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.wltea.analyzer.lucene.IKTokenizer;
//import org.wltea.lucene.IKTokenizer;
import java.io.Reader;
public class IkAnalyzer extends Analyzer {
@Override public TokenStream tokenStream(String fieldName, Reader reader) {
return new IKTokenizer(reader,true);
}
// private boolean isMaxWordLength = false;
// @Override public TokenStream tokenStream(String fieldName, Reader reader) {
// return new IKTokenizer(reader,true);
// }
public IkAnalyzer() {
super();
}
@Override
protected TokenStreamComponents createComponents(String s, Reader reader) {
// new TokenStreamComponents
Tokenizer tokenizer = new IKTokenizer(reader, true);
return new TokenStreamComponents(tokenizer, null); //To change body of implemented methods use File | Settings | File Templates.
}
// public boolean isMaxWordLength() {
// return isMaxWordLength;
// }
}
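This hunk captures the core Lucene 3.x to 4.x Analyzer change: the old tokenStream(String, Reader) override is replaced by createComponents(String, Reader), which hands Lucene a reusable TokenStreamComponents wrapper around the tokenizer. Below is a minimal sketch of the new shape, assuming Lucene 4.x and the IKTokenizer(Reader, boolean) constructor used in this commit; the class name IkAnalyzerSketch is illustrative only, not part of the plugin.

import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.wltea.analyzer.lucene.IKTokenizer;

public class IkAnalyzerSketch extends Analyzer {

    // Lucene 3.x overrode tokenStream(String, Reader); Lucene 4.x instead asks for
    // createComponents(String, Reader), which wraps the tokenizer (and any filter chain)
    // in a TokenStreamComponents object that Lucene can reuse per thread.
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new IKTokenizer(reader, true); // true = smart segmentation
        return new TokenStreamComponents(tokenizer);         // no additional TokenFilter chain
    }
}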
......@@ -24,11 +24,16 @@
*/
package org.wltea.analyzer.core;
import org.wltea.analyzer.dic.Dictionary;
import java.io.IOException;
import java.io.Reader;
import java.util.*;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;
/**
*
......@@ -68,12 +73,12 @@ class AnalyzeContext {
private Map<Integer , LexemePath> pathMap;
//最终分词结果集
private LinkedList<Lexeme> results;
private boolean useSmart;
//分词器配置项
private boolean useSmart;
// private Configuration cfg;
public AnalyzeContext(boolean useSmart){
this.useSmart = useSmart;
this.useSmart = useSmart;
this.segmentBuff = new char[BUFF_SIZE];
this.charTypes = new int[BUFF_SIZE];
this.buffLocker = new HashSet<String>();
......@@ -313,7 +318,7 @@ class AnalyzeContext {
while(result != null){
//数量词合并
this.compound(result);
if(Dictionary.isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
if(Dictionary.getSingleton().isStopWord(this.segmentBuff , result.getBegin() , result.getLength())){
//是停止词继续取列表的下一个
result = this.results.pollFirst();
}else{
......@@ -344,6 +349,7 @@ class AnalyzeContext {
* 组合词元
*/
private void compound(Lexeme result){
if(!this.useSmart){
return ;
}
......
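The stop-word check here (and the dictionary matches in the segmenters below) moves from static Dictionary methods to an instance obtained via Dictionary.getSingleton(). Only getSingleton(), isStopWord(...), matchWithHit(...), matchInMainDict(...) and matchInQuantifierDict(...) are confirmed by this diff; the following is a generic double-checked-locking sketch of what such an accessor typically looks like, not the plugin's actual implementation (which, per the commented-out line in IKSegmenter further down, may require an explicit initialization call first).

public class DictionarySketch {

    private static volatile DictionarySketch singleton;

    private DictionarySketch() {
        // load main, quantifier and stop-word dictionaries here
    }

    // Lazily creates the shared instance; the real IK Dictionary may instead be
    // initialized explicitly and reject getSingleton() calls made too early.
    public static DictionarySketch getSingleton() {
        if (singleton == null) {
            synchronized (DictionarySketch.class) {
                if (singleton == null) {
                    singleton = new DictionarySketch();
                }
            }
        }
        return singleton;
    }

    // Same shape as the call sites in AnalyzeContext: check a buffer slice against the stop-word set.
    public boolean isStopWord(char[] charArray, int begin, int length) {
        return false; // placeholder; the real lookup walks the stop-word trie
    }
}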
......@@ -25,12 +25,12 @@
*/
package org.wltea.analyzer.core;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;
import java.util.LinkedList;
import java.util.List;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;
/**
* 中文-日韩文子分词器
......@@ -58,7 +58,7 @@ class CJKSegmenter implements ISegmenter {
//处理词段队列
Hit[] tmpArray = this.tmpHits.toArray(new Hit[this.tmpHits.size()]);
for(Hit hit : tmpArray){
hit = Dictionary.matchInMainDictWithHit(context.getSegmentBuff(), context.getCursor() , hit);
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
if(hit.isMatch()){
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_CNWORD);
......@@ -77,7 +77,7 @@ class CJKSegmenter implements ISegmenter {
//*********************************
//再对当前指针位置的字符进行单字匹配
Hit singleCharHit = Dictionary.matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
Hit singleCharHit = Dictionary.getSingleton().matchInMainDict(context.getSegmentBuff(), context.getCursor(), 1);
if(singleCharHit.isMatch()){//首字成词
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_CNWORD);
......
......@@ -24,14 +24,14 @@
*/
package org.wltea.analyzer.core;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.dic.Hit;
/**
*
* 中文数量词子分词器
......@@ -155,7 +155,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
//处理词段队列
Hit[] tmpArray = this.countHits.toArray(new Hit[this.countHits.size()]);
for(Hit hit : tmpArray){
hit = Dictionary.matchInMainDictWithHit(context.getSegmentBuff(), context.getCursor() , hit);
hit = Dictionary.getSingleton().matchWithHit(context.getSegmentBuff(), context.getCursor() , hit);
if(hit.isMatch()){
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , hit.getBegin() , context.getCursor() - hit.getBegin() + 1 , Lexeme.TYPE_COUNT);
......@@ -174,7 +174,7 @@ class CN_QuantifierSegmenter implements ISegmenter{
//*********************************
//对当前指针位置的字符进行单字匹配
Hit singleCharHit = Dictionary.matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
Hit singleCharHit = Dictionary.getSingleton().matchInQuantifierDict(context.getSegmentBuff(), context.getCursor(), 1);
if(singleCharHit.isMatch()){//首字成量词词
//输出当前的词
Lexeme newLexeme = new Lexeme(context.getBufferOffset() , context.getCursor() , 1 , Lexeme.TYPE_COUNT);
......
......@@ -38,7 +38,7 @@ class IKArbitrator {
/**
* 分词歧义处理
* @param orgLexemes
// * @param orgLexemes
* @param useSmart
*/
void process(AnalyzeContext context , boolean useSmart){
......@@ -87,7 +87,6 @@ class IKArbitrator {
* 歧义识别
* @param lexemeCell 歧义路径链表头
* @param fullTextLength 歧义路径文本长度
* @param option 候选结果路径
* @return
*/
private LexemePath judge(QuickSortSet.Cell lexemeCell , int fullTextLength){
......@@ -120,7 +119,7 @@ class IKArbitrator {
/**
* 向前遍历,添加词元,构造一个无歧义词元组合
* @param LexemePath path
// * @param LexemePath path
* @return
*/
private Stack<QuickSortSet.Cell> forwardPath(QuickSortSet.Cell lexemeCell , LexemePath option){
......@@ -140,7 +139,7 @@ class IKArbitrator {
/**
* 回滚词元链,直到它能够接受指定的词元
* @param lexeme
// * @param lexeme
* @param l
*/
private void backPath(Lexeme l , LexemePath option){
......
......@@ -23,14 +23,15 @@
*/
package org.wltea.analyzer.core;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import org.wltea.analyzer.cfg.Configuration;
//import org.wltea.analyzer.cfg.DefaultConfig;
import org.wltea.analyzer.dic.Dictionary;
/**
* IK分词器主类
*
......@@ -39,16 +40,18 @@ public final class IKSegmenter {
//字符窜reader
private Reader input;
//分词器配置项
private Configuration cfg;
//分词器上下文
private AnalyzeContext context;
//分词处理器列表
private List<ISegmenter> segmenters;
//分词歧义裁决器
private IKArbitrator arbitrator;
private ESLogger logger=null;
private final boolean useSmart;
private boolean useSmart = false;
/**
/**
* IK分词器构造函数
* @param input
* @param useSmart 为true,使用智能分词策略
......@@ -57,16 +60,31 @@ public final class IKSegmenter {
* 智能分词: 合并数词和量词,对分词结果进行歧义判断
*/
public IKSegmenter(Reader input , boolean useSmart){
logger = Loggers.getLogger("ik-analyzer");
this.input = input;
// this.cfg = DefaultConfig.getInstance();
this.useSmart=useSmart;
this.init();
this.init();
}
/**
* IK分词器构造函数
* @param input
* @param cfg 使用自定义的Configuration构造分词器
*
*/
public IKSegmenter(Reader input , Configuration cfg){
this.input = input;
this.cfg = cfg;
this.init();
}
/**
* 初始化
*/
private void init(){
//初始化词典单例
// Dictionary.initial(this.cfg);
// Dictionary.getSingleton();
//初始化分词上下文
this.context = new AnalyzeContext(useSmart);
//加载子分词器
......
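The segmenter keeps the Reader/useSmart constructor and gains a Reader/Configuration one, with the dictionary singleton initialization commented out for now. A minimal standalone usage sketch follows, relying only on the IKSegmenter(Reader, boolean), next() and Lexeme.getLexemeText() members visible in this diff; the sample text is arbitrary.

import java.io.IOException;
import java.io.StringReader;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKSegmenterUsageSketch {
    public static void main(String[] args) throws IOException {
        // true = smart mode: merge numerals/quantifiers and arbitrate ambiguous paths
        IKSegmenter segmenter = new IKSegmenter(new StringReader("中文分词测试"), true);
        Lexeme lexeme;
        while ((lexeme = segmenter.next()) != null) {
            System.out.println(lexeme.getLexemeText());
        }
    }
}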
......@@ -120,7 +120,7 @@ class LetterSegmenter implements ISegmenter {
/**
* 处理数字字母混合输出
* 如:windos2000 | linliangyi2005@gmail.com
* @param input
// * @param input
* @param context
* @return
*/
......
......@@ -326,13 +326,5 @@ class DictSegment implements Comparable<DictSegment>{
//对当前节点存储的char进行比较
return this.nodeChar.compareTo(o.nodeChar);
}
public int getDicNum(){
if(charMap!=null)
{
return charMap.size();
}
return 0;
}
}
......@@ -58,7 +58,9 @@ public class Hit {
public boolean isMatch() {
return (this.hitState & MATCH) > 0;
}
/**
*
*/
public void setMatch() {
this.hitState = this.hitState | MATCH;
}
......@@ -69,7 +71,9 @@ public class Hit {
public boolean isPrefix() {
return (this.hitState & PREFIX) > 0;
}
/**
*
*/
public void setPrefix() {
this.hitState = this.hitState | PREFIX;
}
......@@ -79,7 +83,9 @@ public class Hit {
public boolean isUnmatch() {
return this.hitState == UNMATCH ;
}
/**
*
*/
public void setUnmatch() {
this.hitState = UNMATCH;
}
......
/**
* IK 中文分词 版本 5.0.1
* IK Analyzer release 5.0.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package org.wltea.analyzer.lucene;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.settings.Settings;
import org.wltea.analyzer.dic.Dictionary;
import java.io.Reader;
public final class IKAnalyzer extends Analyzer {
/**
* IK分词器,Lucene Analyzer接口实现
* 兼容Lucene 4.0版本
*/
public final class IKAnalyzer extends Analyzer{
private boolean isMaxWordLength = false;
private boolean useSmart=false;
private boolean useSmart;
public boolean useSmart() {
return useSmart;
}
public IKAnalyzer(){
public void setUseSmart(boolean useSmart) {
this.useSmart = useSmart;
}
/**
* IK分词器Lucene Analyzer接口实现类
*
* 默认细粒度切分算法
*/
public IKAnalyzer(){
this(false);
}
public IKAnalyzer(boolean isMaxWordLength){
/**
* IK分词器Lucene Analyzer接口实现类
*
* @param useSmart 当为true时,分词器进行智能切分
*/
public IKAnalyzer(boolean useSmart){
super();
this.setMaxWordLength(isMaxWordLength);
this.useSmart = useSmart;
}
public IKAnalyzer(Settings indexSetting,Settings settings1) {
super();
Dictionary.getInstance().Init(indexSetting);
Dictionary.getInstance().Init(indexSetting);
if(settings1.get("use_smart", "true").equals("true")){
useSmart=true;
useSmart = true;
}
}
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new IKTokenizer(reader , useSmart);
}
public void setMaxWordLength(boolean isMaxWordLength) {
this.isMaxWordLength = isMaxWordLength;
}
public boolean isMaxWordLength() {
return isMaxWordLength;
/**
* 重载Analyzer接口,构造分词组件
*/
@Override
protected TokenStreamComponents createComponents(String fieldName, final Reader in) {
Tokenizer _IKTokenizer = new IKTokenizer(in , this.useSmart());
return new TokenStreamComponents(_IKTokenizer);
}
}
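The analyzer drops the old isMaxWordLength switch in favor of a single useSmart flag and now builds its token stream through createComponents. A short usage sketch covering only the constructors shown in this hunk; the Settings-based wiring into Elasticsearch happens outside this diff.

import org.apache.lucene.analysis.Analyzer;
import org.wltea.analyzer.lucene.IKAnalyzer;

public class IKAnalyzerUsageSketch {
    public static void main(String[] args) {
        Analyzer fineGrained = new IKAnalyzer();   // default: fine-grained segmentation
        Analyzer smart = new IKAnalyzer(true);     // smart mode: merge quantifiers, arbitrate ambiguity
        // The Elasticsearch-facing constructor IKAnalyzer(Settings indexSetting, Settings settings1)
        // initializes the dictionary from the index settings and reads "use_smart" (default "true")
        // from settings1; registering it as an index analyzer is done by the plugin's provider, not shown here.
    }
}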
/**
* IK 中文分词 版本 5.0.1
* IK Analyzer release 5.0.1
*
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
......@@ -20,94 +20,95 @@
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*
*
*
*/
package org.wltea.analyzer.lucene;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
import java.io.IOException;
import java.io.Reader;
/**
* IK分词器 Lucene Tokenizer适配器类
* 兼容Lucene 4.0版本
*/
public final class IKTokenizer extends Tokenizer {
//IK分词器实现
private IKSegmenter _IKImplement;
//词元文本属性
private final CharTermAttribute termAtt;
//词元位移属性
private final OffsetAttribute offsetAtt;
//词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量)
private final TypeAttribute typeAtt;
//记录最后一个词元的结束位置
private int endPosition;
/**
* Lucene 4.0 Tokenizer适配器类构造函数
* @param in
* @param useSmart
*/
public IKTokenizer(Reader in , boolean useSmart){
super(in);
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(CharTermAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
_IKImplement = new IKSegmenter(input , useSmart);
}
//IK分词器实现
private IKSegmenter _IKImplement;
//词元文本属性
private final CharTermAttribute termAtt;
//词元位移属性
private final OffsetAttribute offsetAtt;
//词元分类属性(该属性分类参考org.wltea.analyzer.core.Lexeme中的分类常量)
private final TypeAttribute typeAtt;
//记录最后一个词元的结束位置
private int endPosition;
/**
* Lucene 4.0 Tokenizer适配器类构造函数
* @param in
* @param useSmart
*/
public IKTokenizer(Reader in , boolean useSmart){
super(in);
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(CharTermAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
_IKImplement = new IKSegmenter(input , useSmart);
}
/* (non-Javadoc)
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
*/
@Override
public boolean incrementToken() throws IOException {
//清除所有的词元属性
clearAttributes();
Lexeme nextLexeme = _IKImplement.next();
if(nextLexeme != null){
//将Lexeme转成Attributes
//设置词元文本
termAtt.append(nextLexeme.getLexemeText());
//设置词元长度
termAtt.setLength(nextLexeme.getLength());
//设置词元位移
offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
//记录分词的最后位置
endPosition = nextLexeme.getEndPosition();
//记录词元分类
typeAtt.setType(nextLexeme.getLexemeTypeString());
//返会true告知还有下个词元
return true;
}
//返会false告知词元输出完毕
return false;
}
/*
* (non-Javadoc)
* @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
*/
@Override
public void reset() throws IOException {
super.reset();
_IKImplement.reset(input);
}
@Override
public final void end() {
// set final offset
int finalOffset = correctOffset(this.endPosition);
offsetAtt.setOffset(finalOffset, finalOffset);
}
/* (non-Javadoc)
* @see org.apache.lucene.analysis.TokenStream#incrementToken()
*/
@Override
public boolean incrementToken() throws IOException {
//清除所有的词元属性
clearAttributes();
Lexeme nextLexeme = _IKImplement.next();
if(nextLexeme != null){
//将Lexeme转成Attributes
//设置词元文本
termAtt.append(nextLexeme.getLexemeText());
//设置词元长度
termAtt.setLength(nextLexeme.getLength());
//设置词元位移
offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
//记录分词的最后位置
endPosition = nextLexeme.getEndPosition();
//记录词元分类
typeAtt.setType(nextLexeme.getLexemeTypeString());
//返会true告知还有下个词元
return true;
}
//返会false告知词元输出完毕
return false;
}
/*
* (non-Javadoc)
* @see org.apache.lucene.analysis.Tokenizer#reset(java.io.Reader)
*/
@Override
public void reset() throws IOException {
super.reset();
_IKImplement.reset(input);
}
@Override
public final void end() {
// set final offset
int finalOffset = correctOffset(this.endPosition);
offsetAtt.setOffset(finalOffset, finalOffset);
}
}
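The tokenizer body is essentially re-indented rather than changed: each Lexeme is copied into Lucene's CharTerm, Offset and Type attributes, and end() reports the last offset. Below is a hedged sketch of consuming it directly under the standard Lucene 4.x reset/incrementToken/end/close contract; the sample text is arbitrary.

import java.io.StringReader;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.wltea.analyzer.lucene.IKTokenizer;

public class IKTokenizerUsageSketch {
    public static void main(String[] args) throws Exception {
        IKTokenizer tokenizer = new IKTokenizer(new StringReader("中文分词测试"), true);
        CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
        OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class);

        tokenizer.reset();                        // mandatory before the first incrementToken() in Lucene 4.x
        while (tokenizer.incrementToken()) {
            System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
        }
        tokenizer.end();                          // records the final offset from endPosition
        tokenizer.close();
    }
}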
///**
// * IK 中文分词 版本 5.0
// * IK Analyzer release 5.0
// *
// * Licensed to the Apache Software Foundation (ASF) under one or more
// * contributor license agreements. See the NOTICE file distributed with
// * this work for additional information regarding copyright ownership.
// * The ASF licenses this file to You under the Apache License, Version 2.0
// * (the "License"); you may not use this file except in compliance with
// * the License. You may obtain a copy of the License at
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// * 源代码由林良益(linliangyi2005@gmail.com)提供
// * 版权声明 2012,乌龙茶工作室
// * provided by Linliangyi and copyright 2012 by Oolong studio
// *
// */
//package org.wltea.analyzer.query;
//
//import java.io.IOException;
//import java.io.StringReader;
//import java.util.ArrayList;
//import java.util.List;
//
//import org.apache.lucene.analysis.standard.StandardAnalyzer;
//import org.apache.lucene.queryparser.classic.ParseException;
//import org.apache.lucene.queryparser.classic.QueryParser;
//import org.apache.lucene.search.Query;
//import org.apache.lucene.util.Version;
//import org.wltea.analyzer.core.IKSegmenter;
//import org.wltea.analyzer.core.Lexeme;
//
///**
// * Single Word Multi Char Query Builder
// * IK分词算法专用
// * @author linliangyi
// *
// */
//public class SWMCQueryBuilder {
//
// /**
// * 生成SWMCQuery
// * @param fieldName
// * @param keywords
// * @param quickMode
// * @return Lucene Query
// */
// public static Query create(String fieldName ,String keywords , boolean quickMode){
// if(fieldName == null || keywords == null){
// throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
// }
// //1.对keywords进行分词处理
// List<Lexeme> lexemes = doAnalyze(keywords);
// //2.根据分词结果,生成SWMCQuery
// Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
// return _SWMCQuery;
// }
//
// /**
// * 分词切分,并返回结链表
// * @param keywords
// * @return
// */
// private static List<Lexeme> doAnalyze(String keywords){
// List<Lexeme> lexemes = new ArrayList<Lexeme>();
// IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);
// try{
// Lexeme l = null;
// while( (l = ikSeg.next()) != null){
// lexemes.add(l);
// }
// }catch(IOException e){
// e.printStackTrace();
// }
// return lexemes;
// }
//
//
// /**
// * 根据分词结果生成SWMC搜索
// * @param fieldName
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*/
package org.wltea.analyzer.query;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
/**
* Single Word Multi Char Query Builder
* IK分词算法专用
* @author linliangyi
*
*/
public class SWMCQueryBuilder {
/**
* 生成SWMCQuery
* @param fieldName
* @param keywords
* @param quickMode
* @return Lucene Query
*/
public static Query create(String fieldName ,String keywords , boolean quickMode){
if(fieldName == null || keywords == null){
throw new IllegalArgumentException("参数 fieldName 、 keywords 不能为null.");
}
//1.对keywords进行分词处理
List<Lexeme> lexemes = doAnalyze(keywords);
//2.根据分词结果,生成SWMCQuery
Query _SWMCQuery = getSWMCQuery(fieldName , lexemes , quickMode);
return _SWMCQuery;
}
/**
* 分词切分,并返回结链表
* @param keywords
* @return
*/
private static List<Lexeme> doAnalyze(String keywords){
List<Lexeme> lexemes = new ArrayList<Lexeme>();
IKSegmenter ikSeg = new IKSegmenter(new StringReader(keywords) , true);
try{
Lexeme l = null;
while( (l = ikSeg.next()) != null){
lexemes.add(l);
}
}catch(IOException e){
e.printStackTrace();
}
return lexemes;
}
/**
* 根据分词结果生成SWMC搜索
* @param fieldName
// * @param pathOption
// * @param quickMode
// * @return
// */
// private static Query getSWMCQuery(String fieldName , List<Lexeme> lexemes , boolean quickMode){
// //构造SWMC的查询表达式
// StringBuffer keywordBuffer = new StringBuffer();
// //精简的SWMC的查询表达式
// StringBuffer keywordBuffer_Short = new StringBuffer();
// //记录最后词元长度
// int lastLexemeLength = 0;
// //记录最后词元结束位置
// int lastLexemeEnd = -1;
//
// int shortCount = 0;
// int totalCount = 0;
// for(Lexeme l : lexemes){
// totalCount += l.getLength();
// //精简表达式
// if(l.getLength() > 1){
// keywordBuffer_Short.append(' ').append(l.getLexemeText());
// shortCount += l.getLength();
// }
//
// if(lastLexemeLength == 0){
// keywordBuffer.append(l.getLexemeText());
// }else if(lastLexemeLength == 1 && l.getLength() == 1
// && lastLexemeEnd == l.getBeginPosition()){//单字位置相邻,长度为一,合并)
// keywordBuffer.append(l.getLexemeText());
// }else{
// keywordBuffer.append(' ').append(l.getLexemeText());
//
// }
// lastLexemeLength = l.getLength();
// lastLexemeEnd = l.getEndPosition();
// }
//
// //借助lucene queryparser 生成SWMC Query
// QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, new StandardAnalyzer(Version.LUCENE_40));
// qp.setDefaultOperator(QueryParser.AND_OPERATOR);
// qp.setAutoGeneratePhraseQueries(true);
//
// if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
// try {
// //System.out.println(keywordBuffer.toString());
// Query q = qp.parse(keywordBuffer_Short.toString());
// return q;
// } catch (ParseException e) {
// e.printStackTrace();
// }
//
// }else{
// if(keywordBuffer.length() > 0){
// try {
// //System.out.println(keywordBuffer.toString());
// Query q = qp.parse(keywordBuffer.toString());
// return q;
// } catch (ParseException e) {
// e.printStackTrace();
// }
// }
// }
// return null;
// }
//}
* @param quickMode
* @return
*/
private static Query getSWMCQuery(String fieldName , List<Lexeme> lexemes , boolean quickMode){
//构造SWMC的查询表达式
StringBuffer keywordBuffer = new StringBuffer();
//精简的SWMC的查询表达式
StringBuffer keywordBuffer_Short = new StringBuffer();
//记录最后词元长度
int lastLexemeLength = 0;
//记录最后词元结束位置
int lastLexemeEnd = -1;
int shortCount = 0;
int totalCount = 0;
for(Lexeme l : lexemes){
totalCount += l.getLength();
//精简表达式
if(l.getLength() > 1){
keywordBuffer_Short.append(' ').append(l.getLexemeText());
shortCount += l.getLength();
}
if(lastLexemeLength == 0){
keywordBuffer.append(l.getLexemeText());
}else if(lastLexemeLength == 1 && l.getLength() == 1
&& lastLexemeEnd == l.getBeginPosition()){//单字位置相邻,长度为一,合并)
keywordBuffer.append(l.getLexemeText());
}else{
keywordBuffer.append(' ').append(l.getLexemeText());
}
lastLexemeLength = l.getLength();
lastLexemeEnd = l.getEndPosition();
}
//借助lucene queryparser 生成SWMC Query
QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, new StandardAnalyzer(Version.LUCENE_40));
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
qp.setAutoGeneratePhraseQueries(true);
if(quickMode && (shortCount * 1.0f / totalCount) > 0.5f){
try {
//System.out.println(keywordBuffer.toString());
Query q = qp.parse(keywordBuffer_Short.toString());
return q;
} catch (ParseException e) {
e.printStackTrace();
}
}else{
if(keywordBuffer.length() > 0){
try {
//System.out.println(keywordBuffer.toString());
Query q = qp.parse(keywordBuffer.toString());
return q;
} catch (ParseException e) {
e.printStackTrace();
}
}
}
return null;
}
}
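This hunk restores SWMCQueryBuilder from its previously commented-out state, targeting the Lucene 4.0 QueryParser. Typical usage, assuming only the public create(String, String, boolean) signature shown above; the field name is illustrative, not one used by the plugin.

import org.apache.lucene.search.Query;
import org.wltea.analyzer.query.SWMCQueryBuilder;

public class SWMCQuerySketch {
    public static Query build() {
        // quickMode = true lets the builder fall back to the shortened expression when
        // multi-character lexemes cover more than half of the keyword text.
        return SWMCQueryBuilder.create("content", "中文分词工具包", true);
    }
}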
///**
// * IK 中文分词 版本 5.0
// * IK Analyzer release 5.0
// *
// * Licensed to the Apache Software Foundation (ASF) under one or more
// * contributor license agreements. See the NOTICE file distributed with
// * this work for additional information regarding copyright ownership.
// * The ASF licenses this file to You under the Apache License, Version 2.0
// * (the "License"); you may not use this file except in compliance with
// * the License. You may obtain a copy of the License at
// *
// * http://www.apache.org/licenses/LICENSE-2.0
// *
// * Unless required by applicable law or agreed to in writing, software
// * distributed under the License is distributed on an "AS IS" BASIS,
// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// * See the License for the specific language governing permissions and
// * limitations under the License.
// *
// * 源代码由林良益(linliangyi2005@gmail.com)提供
// * 版权声明 2012,乌龙茶工作室
// * provided by Linliangyi and copyright 2012 by Oolong studio
// *
// *
// */
//package org.wltea.analyzer.sample;
//
//import java.io.IOException;
//
//import org.apache.lucene.analysis.Analyzer;
//import org.apache.lucene.document.Document;
//import org.apache.lucene.document.Field;
//import org.apache.lucene.document.StringField;
//import org.apache.lucene.document.TextField;
//import org.apache.lucene.index.CorruptIndexException;
//import org.apache.lucene.index.DirectoryReader;
//import org.apache.lucene.index.IndexReader;
//import org.apache.lucene.index.IndexWriter;
//import org.apache.lucene.index.IndexWriterConfig;
//import org.apache.lucene.index.IndexWriterConfig.OpenMode;
//import org.apache.lucene.queryparser.classic.ParseException;
//import org.apache.lucene.queryparser.classic.QueryParser;
//import org.apache.lucene.search.IndexSearcher;
//import org.apache.lucene.search.Query;
//import org.apache.lucene.search.ScoreDoc;
//import org.apache.lucene.search.TopDocs;
//import org.apache.lucene.store.Directory;
//import org.apache.lucene.store.LockObtainFailedException;
//import org.apache.lucene.store.RAMDirectory;
//import org.apache.lucene.util.Version;
//import org.wltea.analyzer.lucene.IKAnalyzer;
//
//
//
//
///**
// * 使用IKAnalyzer进行Lucene索引和查询的演示
// * 2012-3-2
// *
// * 以下是结合Lucene4.0 API的写法
// *
// */
//public class LuceneIndexAndSearchDemo {
//
//
// /**
// * 模拟:
// * 创建一个单条记录的索引,并对其进行搜索
// * @param args
// */
// public static void main(String[] args){
// //Lucene Document的域名
// String fieldName = "text";
// //检索内容
// String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
//
// //实例化IKAnalyzer分词器
// Analyzer analyzer = new IKAnalyzer(true);
//
// Directory directory = null;
// IndexWriter iwriter = null;
// IndexReader ireader = null;
// IndexSearcher isearcher = null;
// try {
// //建立内存索引对象
// directory = new RAMDirectory();
//
// //配置IndexWriterConfig
// IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40 , analyzer);
// iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
// iwriter = new IndexWriter(directory , iwConfig);
// //写入索引
// Document doc = new Document();
// doc.add(new StringField("ID", "10000", Field.Store.YES));
// doc.add(new TextField(fieldName, text, Field.Store.YES));
// iwriter.addDocument(doc);
// iwriter.close();
//
//
// //搜索过程**********************************
// //实例化搜索器
// ireader = DirectoryReader.open(directory);
// isearcher = new IndexSearcher(ireader);
//
// String keyword = "中文分词工具包";
// //使用QueryParser查询分析器构造Query对象
// QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer);
// qp.setDefaultOperator(QueryParser.AND_OPERATOR);
// Query query = qp.parse(keyword);
// System.out.println("Query = " + query);
//
// //搜索相似度最高的5条记录
// TopDocs topDocs = isearcher.search(query , 5);
// System.out.println("命中:" + topDocs.totalHits);
// //输出结果
// ScoreDoc[] scoreDocs = topDocs.scoreDocs;
// for (int i = 0; i < topDocs.totalHits; i++){
// Document targetDoc = isearcher.doc(scoreDocs[i].doc);
// System.out.println("内容:" + targetDoc.toString());
// }
//
// } catch (CorruptIndexException e) {
// e.printStackTrace();
// } catch (LockObtainFailedException e) {
// e.printStackTrace();
// } catch (IOException e) {
// e.printStackTrace();
// } catch (ParseException e) {
// e.printStackTrace();
// } finally{
// if(ireader != null){
// try {
// ireader.close();
// } catch (IOException e) {
// e.printStackTrace();
// }
// }
// if(directory != null){
// try {
// directory.close();
// } catch (IOException e) {
// e.printStackTrace();
// }
// }
// }
// }
//}
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*
*/
package org.wltea.analyzer.sample;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
* 使用IKAnalyzer进行Lucene索引和查询的演示
* 2012-3-2
*
* 以下是结合Lucene4.0 API的写法
*
*/
public class LuceneIndexAndSearchDemo {
/**
* 模拟:
* 创建一个单条记录的索引,并对其进行搜索
* @param args
*/
public static void main(String[] args){
//Lucene Document的域名
String fieldName = "text";
//检索内容
String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
//实例化IKAnalyzer分词器
Analyzer analyzer = new IKAnalyzer(true);
Directory directory = null;
IndexWriter iwriter = null;
IndexReader ireader = null;
IndexSearcher isearcher = null;
try {
//建立内存索引对象
directory = new RAMDirectory();
//配置IndexWriterConfig
IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_40 , analyzer);
iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
iwriter = new IndexWriter(directory , iwConfig);
//写入索引
Document doc = new Document();
doc.add(new StringField("ID", "10000", Field.Store.YES));
doc.add(new TextField(fieldName, text, Field.Store.YES));
iwriter.addDocument(doc);
iwriter.close();
//搜索过程**********************************
//实例化搜索器
ireader = DirectoryReader.open(directory);
isearcher = new IndexSearcher(ireader);
String keyword = "中文分词工具包";
//使用QueryParser查询分析器构造Query对象
QueryParser qp = new QueryParser(Version.LUCENE_40, fieldName, analyzer);
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
Query query = qp.parse(keyword);
System.out.println("Query = " + query);
//搜索相似度最高的5条记录
TopDocs topDocs = isearcher.search(query , 5);
System.out.println("命中:" + topDocs.totalHits);
//输出结果
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
for (int i = 0; i < topDocs.totalHits; i++){
Document targetDoc = isearcher.doc(scoreDocs[i].doc);
System.out.println("内容:" + targetDoc.toString());
}
} catch (CorruptIndexException e) {
e.printStackTrace();
} catch (LockObtainFailedException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} catch (ParseException e) {
e.printStackTrace();
} finally{
if(ireader != null){
try {
ireader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if(directory != null){
try {
directory.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}