/**
 * IK 中文分词  版本 5.0
 * IK Analyzer release 5.0
 * 
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * 源代码由林良益(linliangyi2005@gmail.com)提供
 * 版权声明 2012,乌龙茶工作室
 * provided by Linliangyi and copyright 2012 by Oolong studio
 * 
 */
package org.wltea.analyzer.core;

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;

import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;

/**
 * Analyzer context state.
 */
class AnalyzeContext {
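	/*
	 * Typical processing flow: fillBuffer() loads characters from the Reader;
	 * initCursor()/moveCursor() normalize each character and record its type;
	 * sub-segmenters register candidate words through addLexeme()/addLexemePath();
	 * outputToResult() flattens the paths in pathMap into the result list;
	 * getNextLexeme() hands lexemes back to the caller, filtering stopwords and
	 * merging numerals; reset() clears all state so the context can be reused.
	 */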
	
	//default buffer size
	private static final int BUFF_SIZE = 4096;
	//threshold at which the buffer is considered nearly exhausted
	private static final int BUFF_EXHAUST_CRITICAL = 100;	
	
 
	//character read buffer
    private char[] segmentBuff;
    //character type array
    private int[] charTypes;
    
    
    //total length of text from the Reader that has already been analyzed
    //when the input is analyzed in multiple passes, this accumulates the offset of the current segmentBuff relative to the start of the reader
	private int buffOffset;	
    //current position pointer within the buffer
    private int cursor;
    //length of the most recently read, processable character sequence
	private int available;

	
	//sub-segmenter locks
    //when this set is non-empty, some sub-segmenter is still using segmentBuff
    private Set<String> buffLocker;
    
    //raw segmentation results, before ambiguity resolution
    private QuickSortSet orgLexemes;    
    //map of LexemePath objects indexed by their start position
    private Map<Integer , LexemePath> pathMap;    
    //final segmentation result list
    private LinkedList<Lexeme> results;
	//analyzer configuration
	private Configuration cfg;

    public AnalyzeContext(Configuration configuration){
        this.cfg = configuration;
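    	//allocate the fixed-size working buffers and empty result containers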
    	this.segmentBuff = new char[BUFF_SIZE];
    	this.charTypes = new int[BUFF_SIZE];
    	this.buffLocker = new HashSet<String>();
    	this.orgLexemes = new QuickSortSet();
    	this.pathMap = new HashMap<Integer , LexemePath>();    	
    	this.results = new LinkedList<Lexeme>();
    }
    
    int getCursor(){
    	return this.cursor;
    }
    
    char[] getSegmentBuff(){
    	return this.segmentBuff;
    }
    
    char getCurrentChar(){
    	return this.segmentBuff[this.cursor];
    }
    
    int getCurrentCharType(){
    	return this.charTypes[this.cursor];
    }
    
    int getBufferOffset(){
    	return this.buffOffset;
    }
	
    /**
     * Fill segmentBuff according to the current context state.
     * @param reader
     * @return the length of the (valid) character sequence available for analysis
     * @throws java.io.IOException
     */
    int fillBuffer(Reader reader) throws IOException{
    	int readCount = 0;
    	if(this.buffOffset == 0){
    		//first read from the reader
    		readCount = reader.read(segmentBuff);
    	}else{
    		int offset = this.available - this.cursor;
    		if(offset > 0){
    			//more was read last time than has been processed; copy the unprocessed characters to the head of segmentBuff
    			System.arraycopy(this.segmentBuff , this.cursor , this.segmentBuff , 0 , offset);
    			readCount = offset;
    		}
    		//continue reading from the reader, filling the remainder of segmentBuff starting at offset (just after the carried-over characters)
    		readCount += reader.read(this.segmentBuff , offset , BUFF_SIZE - offset);
    	}            	
    	//record the number of usable characters obtained from the Reader in this pass
    	this.available = readCount;
    	//reset the current pointer
    	this.cursor = 0;
    	return readCount;
    }

    /**
     * Initialize the buffer pointer and process the first character.
     */
    void initCursor(){
    	this.cursor = 0;
    	this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor],cfg.isEnableLowercase());
    	this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
    }
    
    /**
     * Advance the pointer by one and process the character at the new position.
     * Returns true on success; returns false when the pointer has already reached
     * the end of the buffer and cannot advance.
     */
    boolean moveCursor(){
    	if(this.cursor < this.available - 1){
    		this.cursor++;
        	this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor],cfg.isEnableLowercase());
        	this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
    		return true;
    	}else{
    		return false;
    	}
    }
	
    /**
     * Mark the current segmentBuff as locked.
     * Records the name of the sub-segmenter that is occupying segmentBuff.
     * @param segmenterName
     */
	void lockBuffer(String segmenterName){
		this.buffLocker.add(segmenterName);
	}
	
	/**
	 * Remove the given sub-segmenter name, releasing its hold on segmentBuff.
	 * @param segmenterName
	 */
	void unlockBuffer(String segmenterName){
		this.buffLocker.remove(segmenterName);
	}
	
	/**
	 * The buffer is locked as long as any segmenterName remains in buffLocker.
	 * @return boolean whether the buffer is locked
	 */
	boolean isBufferLocked(){
		return this.buffLocker.size() > 0;
	}

	/**
	 * Check whether the current segmentBuff has been fully consumed,
	 * i.e. the cursor has moved to the end of segmentBuff at this.available - 1.
	 * @return
	 */
	boolean isBufferConsumed(){
		return this.cursor == this.available - 1;
	}
	
	/**
	 * Check whether segmentBuff needs to be refilled with new data.
	 * 
	 * The current loop should be interrupted (the buffer shifted and more data read)
	 * when all of the following hold:
	 * 1. available == BUFF_SIZE, i.e. the buffer is fully loaded
	 * 2. cursor < available - 1 && cursor > available - BUFF_EXHAUST_CRITICAL, i.e. the pointer is inside the critical zone
	 * 3. !context.isBufferLocked(), i.e. no segmenter is still occupying the buffer
	 * @return
	 */
	boolean needRefillBuffer(){
		return this.available == BUFF_SIZE 
			&& this.cursor < this.available - 1   
			&& this.cursor  > this.available - BUFF_EXHAUST_CRITICAL
			&& !this.isBufferLocked();
	}
	
	/**
	 * Accumulate the offset of the current segmentBuff relative to the start of the reader.
	 */
	void markBufferOffset(){
		this.buffOffset += this.cursor;
	}
	
	/**
	 * Add a lexeme to the raw segmentation result set.
	 * @param lexeme
	 */
	void addLexeme(Lexeme lexeme){
		this.orgLexemes.addLexeme(lexeme);
	}
	
	/**
	 * Add a segmentation result path to the
	 * path start position ---> path map.
	 * @param path
	 */
	void addLexemePath(LexemePath path){
		if(path != null){
			this.pathMap.put(path.getPathBegin(), path);
		}
	}
	
	
	/**
	 * Return the raw segmentation results.
	 * @return
	 */
	QuickSortSet getOrgLexemes(){
		return this.orgLexemes;
	}
	
	/**
	 * Push the segmentation results into the result collection:
	 * 1. walk the buffer from the head up to the processed position this.cursor
	 * 2. push segmentation results found in the path map into results
	 * 3. push CJK characters not covered by the map into results as single characters
	 */
	void outputToResult(){
		int index = 0;
		for( ; index <= this.cursor ;){
			//skip non-CJK (useless) characters
			if(CharacterUtil.CHAR_USELESS == this.charTypes[index]){
				index++;
				continue;
			}
			//look up the LexemePath starting at this index in pathMap
			LexemePath path = this.pathMap.get(index);
			if(path != null){
				//output the lexemes of this LexemePath into the results collection
				Lexeme l = path.pollFirst();
				while(l != null){
					this.results.add(l);
					//the dictionary has no single-character entry, but the lexemes overlap:
					//emit the single characters of the earlier of the overlapping lexemes
					/*int innerIndex = index + 1;
					for (; innerIndex < index + l.getLength(); innerIndex++) {
						Lexeme innerL = path.peekFirst();
						if (innerL != null && innerIndex == innerL.getBegin()) {
							this.outputSingleCJK(innerIndex - 1);
						}
					}*/
					
					//move index to just past this lexeme
					index = l.getBegin() + l.getLength();					
					l = path.pollFirst();
					if(l != null){
						//output single characters missed in the gaps between lexemes inside the path
						for(;index < l.getBegin();index++){
							this.outputSingleCJK(index);
						}
					}
				}
			}else{//pathMap has no LexemePath for this index
				//output it as a single character
				this.outputSingleCJK(index);
				index++;
			}
		}
		//clear the current map
		this.pathMap.clear();
	}
	
	/**
	 * Output a CJK character as a single-character lexeme.
	 * @param index
	 */
	private void outputSingleCJK(int index){
		if(CharacterUtil.CHAR_CHINESE == this.charTypes[index]){			
			Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_CNCHAR);
			this.results.add(singleCharLexeme);
		}else if(CharacterUtil.CHAR_OTHER_CJK == this.charTypes[index]){
			Lexeme singleCharLexeme = new Lexeme(this.buffOffset , index , 1 , Lexeme.TYPE_OTHER_CJK);
			this.results.add(singleCharLexeme);
		}
	}
		
	/**
	 * Return the next lexeme.
	 * 
	 * Also handles merging.
	 * @return
	 */
	Lexeme getNextLexeme(){
		//take and remove the first Lexeme from the result list
		Lexeme result = this.results.pollFirst();
		while(result != null){
    		//merge numerals and classifiers
    		this.compound(result);
    		if(Dictionary.getSingleton().isStopWord(this.segmentBuff ,  result.getBegin() , result.getLength())){
       			//it is a stopword; take the next entry in the list
    			result = this.results.pollFirst(); 				
    		}else{
	 			//not a stopword; generate the lexeme text for output
	    		result.setLexemeText(String.valueOf(segmentBuff , result.getBegin() , result.getLength()));
	    		break;
    		}
		}
		return result;
	}
	
	/**
	 * Reset the analyzer context state.
	 */
	void reset(){		
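		//release locks, drop accumulated results, and reallocate the buffers
		//so that no stale characters survive into the next analysis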
		this.buffLocker.clear();
        this.orgLexemes = new QuickSortSet();
        this.available =0;
        this.buffOffset = 0;
    	this.charTypes = new int[BUFF_SIZE];
    	this.cursor = 0;
    	this.results.clear();
    	this.segmentBuff = new char[BUFF_SIZE];
    	this.pathMap.clear();
	}
	
	/**
	 * Combine lexemes (numeral and classifier merging).
	 */
	private void compound(Lexeme result){
		if(!this.cfg.isUseSmart()){
			return ;
		}
   		//numeral and classifier merging
		if(!this.results.isEmpty()){

			if(Lexeme.TYPE_ARABIC == result.getLexemeType()){
				Lexeme nextLexeme = this.results.peekFirst();
				boolean appendOk = false;
				if(Lexeme.TYPE_CNUM == nextLexeme.getLexemeType()){
					//merge an Arabic numeral with a Chinese numeral
					appendOk = result.append(nextLexeme, Lexeme.TYPE_CNUM);
				}else if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){
					//merge an Arabic numeral with a Chinese classifier
					appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
				}
				if(appendOk){
					//pop it from the results
					this.results.pollFirst(); 
				}
			}
			
			//a second round of merging may be possible
			if(Lexeme.TYPE_CNUM == result.getLexemeType() && !this.results.isEmpty()){
				Lexeme nextLexeme = this.results.peekFirst();
				boolean appendOk = false;
				 if(Lexeme.TYPE_COUNT == nextLexeme.getLexemeType()){
					 //merge a Chinese numeral with a Chinese classifier
 					appendOk = result.append(nextLexeme, Lexeme.TYPE_CQUAN);
 				}  
				if(appendOk){
					//pop it from the results
					this.results.pollFirst();   				
				}
			}

		}
	}
	
}