提交 1f80b50a 编写于 作者: weixin_43283383's avatar weixin_43283383

fix offset problem when creating the custom analyzer combined with html_strip filter, @xpandan

上级 44b63715
......@@ -78,6 +78,11 @@
<version>4.10</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>4.9.0</version>
</dependency>
</dependencies>
<build>
......
......@@ -37,6 +37,7 @@ import org.wltea.analyzer.core.Lexeme;
import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
/**
* IK分词器 Lucene Tokenizer适配器类
......@@ -55,8 +56,13 @@ public final class IKTokenizer extends Tokenizer {
private final TypeAttribute typeAtt;
//记录最后一个词元的结束位置
private int endPosition;
/**
private int skippedPositions;
private PositionIncrementAttribute posIncrAtt;
/**
* Lucene 4.0 Tokenizer适配器类构造函数
* @param in
*/
......@@ -65,8 +71,9 @@ public final class IKTokenizer extends Tokenizer {
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(CharTermAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
_IKImplement = new IKSegmenter(input , settings, environment);
_IKImplement = new IKSegmenter(input , settings, environment);
}
/* (non-Javadoc)
......@@ -76,16 +83,22 @@ public final class IKTokenizer extends Tokenizer {
public boolean incrementToken() throws IOException {
//清除所有的词元属性
clearAttributes();
Lexeme nextLexeme = _IKImplement.next();
skippedPositions = 0;
Lexeme nextLexeme = _IKImplement.next();
if(nextLexeme != null){
posIncrAtt.setPositionIncrement(skippedPositions +1 );
//将Lexeme转成Attributes
//设置词元文本
termAtt.append(nextLexeme.getLexemeText());
//设置词元长度
termAtt.setLength(nextLexeme.getLength());
//设置词元位移
offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
//记录分词的最后位置
// offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
offsetAtt.setOffset(correctOffset(nextLexeme.getBeginPosition()), correctOffset(nextLexeme.getEndPosition()));
//记录分词的最后位置
endPosition = nextLexeme.getEndPosition();
//记录词元分类
typeAtt.setType(nextLexeme.getLexemeTypeString());
......@@ -104,12 +117,15 @@ public final class IKTokenizer extends Tokenizer {
public void reset() throws IOException {
super.reset();
_IKImplement.reset(input);
skippedPositions = 0;
}
@Override
public final void end() {
public final void end() throws IOException {
super.end();
// set final offset
int finalOffset = correctOffset(this.endPosition);
offsetAtt.setOffset(finalOffset, finalOffset);
posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
}
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册