/**
 * IK 中文分词  版本 5.0
 * IK Analyzer release 5.0
 * 
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * 源代码由林良益(linliangyi2005@gmail.com)提供
 * 版权声明 2012,乌龙茶工作室
 * provided by Linliangyi and copyright 2012 by Oolong studio
 */
package org.wltea.analyzer.core;

import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

/**
 * Main class of the IK segmenter
 *
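 * <p>A minimal usage sketch; {@code reader}, {@code settings} and {@code environment}
 * are placeholders assumed to be supplied by the embedding Elasticsearch plugin:</p>
 * <pre>
 *     IKSegmenter ik = new IKSegmenter(reader, settings, environment); // placeholder arguments
 *     for (Lexeme lexeme = ik.next(); lexeme != null; lexeme = ik.next()) {
 *         // consume the lexeme (text, offsets, type)
 *     }
 * </pre>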
 */
public final class IKSegmenter {
	
	//character stream reader
	private Reader input;
	//segmenter configuration
	private Configuration cfg;
	//analysis context
	private AnalyzeContext context;
	//list of sub-segmenters
	private List<ISegmenter> segmenters;
	//ambiguity arbitrator
	private IKArbitrator arbitrator;
	private boolean useSmart = false;
	

	/**
	 * IK segmenter constructor
	 * @param input the Reader to tokenize
	 * @param settings analyzer settings; the "use_smart" flag is read from here
	 * @param environment Elasticsearch environment, passed through to {@link Configuration}
	 */
	public IKSegmenter(Reader input, Settings settings, Environment environment){
		this.input = input;
		this.cfg = new Configuration(environment);
		//default to smart mode when no settings are supplied
		this.useSmart = settings == null || "true".equals(settings.get("use_smart", "true"));
		this.init();
	}
	
	public IKSegmenter(Reader input){
		//delegate to the full constructor with default settings
		this(input, null, null);
	}
	
//	/**
//	 * IK segmenter constructor
//	 * @param input
//	 * @param cfg build the segmenter with a custom Configuration
//	 *
//	 */
//	public IKSegmenter(Reader input , Configuration cfg){
//		this.input = input;
//		this.cfg = cfg;
//		this.init();
//	}
	
	/**
	 * Initialization
	 */
	private void init(){
		//initialize the dictionary singleton
		Dictionary.initial(this.cfg);
		//initialize the analysis context
		this.context = new AnalyzeContext(useSmart);
		//load the sub-segmenters
		this.segmenters = this.loadSegmenters();
		//load the ambiguity arbitrator
		this.arbitrator = new IKArbitrator();
	}
	
	/**
	 * Load the sub-segmenter implementations
	 * @return List<ISegmenter>
	 */
	private List<ISegmenter> loadSegmenters(){
		List<ISegmenter> segmenters = new ArrayList<ISegmenter>(4);
		//sub-segmenter for letters
		segmenters.add(new LetterSegmenter());
		//sub-segmenter for Chinese quantifier words
		segmenters.add(new CN_QuantifierSegmenter());
		//sub-segmenter for Chinese (CJK) words
		segmenters.add(new CJKSegmenter());
		return segmenters;
	}
	
	/**
	 * Tokenize: get the next lexeme
	 * @return Lexeme the next lexeme, or null when the input is exhausted
	 * @throws java.io.IOException
	 */
	public synchronized Lexeme next() throws IOException{
		Lexeme l = null;
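		//keep refilling the buffer and running the sub-segmenters until the
		//context can emit a lexeme; return null once the reader is exhausted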
		while((l = context.getNextLexeme()) == null ){
			/*
			 * Read data from the reader to fill the buffer.
			 * If the reader is consumed in several passes, the buffer must be shifted,
			 * carrying over data that was read previously but not yet processed.
			 */
			int available = context.fillBuffer(this.input);
			if(available <= 0){
				//the reader has been fully consumed
				context.reset();
				return null;
				
			}else{
				//initialize the cursor
				context.initCursor();
				do{
        			//run every sub-segmenter over the current position
        			for(ISegmenter segmenter : segmenters){
        				segmenter.analyze(context);
        			}
        			//the character buffer is nearly exhausted; new characters must be read in
        			if(context.needRefillBuffer()){
        				break;
        			}
   				//advance the cursor
				}while(context.moveCursor());
				//reset the sub-segmenters to prepare for the next round
				for(ISegmenter segmenter : segmenters){
					segmenter.reset();
				}
			}
			//resolve segmentation ambiguities
			this.arbitrator.process(context, useSmart);
			//output the segmentation results to the result set and handle single CJK characters left unsegmented
			context.outputToResult();
			//record the buffer offset of this pass
			context.markBufferOffset();			
		}
		return l;
	}

	/**
     * Reset the segmenter to its initial state
     * @param input
     */
	public synchronized void reset(Reader input) {
		this.input = input;
		context.reset();
		for(ISegmenter segmenter : segmenters){
			segmenter.reset();
		}
	}
}