Commit 341b5863 authored by weixin_43283383

add config to enable/disable lowercase and remote_dict, Closes #241

Parent b6625969
......@@ -230,7 +230,12 @@ mvn compile
mvn package
```
copy & unzip the release file #{project_path}/elasticsearch-analysis-ik/target/releases/elasticsearch-analysis-ik-*.zip into your elasticsearch plugin folder, e.g. plugins/ik
restart elasticsearch
3. Tokenization test fails
Please test by calling the analyze API under a specific index, not by calling the analyze API directly.
e.g. http://localhost:9200/your_index/_analyze?text=中华人民共和国MN&tokenizer=my_ik (a minimal Java sketch of this call follows after this section)
Thanks
......
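As a companion to the FAQ entry above, here is a minimal, hedged Java sketch (not part of this commit) of calling the _analyze API under an index; `your_index` and `my_ik` are placeholder names taken from the example URL:

```java
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;

// Sketch: run the analyze test against a specific index, as the FAQ recommends.
public class AnalyzeCheck {
    public static void main(String[] args) throws Exception {
        String text = URLEncoder.encode("中华人民共和国MN", "UTF-8");
        URL url = new URL("http://localhost:9200/your_index/_analyze?tokenizer=my_ik&text=" + text);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(conn.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line); // JSON list of tokens produced by the my_ik tokenizer
            }
        }
    }
}
```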
package org.elasticsearch.index.analysis;
@Deprecated
public class IkAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {
......
......@@ -10,17 +10,16 @@ import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.lucene.IKAnalyzer;
@Deprecated
public class IkAnalyzerProvider extends AbstractIndexAnalyzerProvider<IKAnalyzer> {
private final IKAnalyzer analyzer;
private boolean useSmart=false;
@Inject
public IkAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettingsService.getSettings(), name, settings);
Dictionary.initial(new Configuration(env));
useSmart = settings.get("use_smart", "false").equals("true");
analyzer=new IKAnalyzer(useSmart);
Configuration configuration=new Configuration(env,settings);
analyzer=new IKAnalyzer(configuration);
}
@Override public IKAnalyzer get() {
......
......@@ -8,25 +8,18 @@ import org.elasticsearch.env.Environment;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.settings.IndexSettingsService;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.lucene.IKTokenizer;
@Deprecated
public class IkTokenizerFactory extends AbstractTokenizerFactory {
private final Settings settings;
private boolean useSmart=false;
private Configuration configuration;
@Inject
public IkTokenizerFactory(Index index, IndexSettingsService indexSettingsService,Environment env, @Assisted String name, @Assisted Settings settings) {
super(index, indexSettingsService.getSettings(), name, settings);
this.settings=settings;
Dictionary.initial(new Configuration(env));
configuration=new Configuration(env,settings);
}
@Override
public Tokenizer create() {
this.useSmart = settings.get("use_smart", "false").equals("true");
return new IKTokenizer(useSmart); }
return new IKTokenizer(configuration); }
}
......@@ -3,6 +3,7 @@ package org.elasticsearch.indices.analysis;
import org.apache.lucene.analysis.Tokenizer;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalyzerScope;
......@@ -26,21 +27,20 @@ public class IKIndicesAnalysis extends AbstractComponent {
public IKIndicesAnalysis(final Settings settings,
IndicesAnalysisService indicesAnalysisService,Environment env) {
super(settings);
Dictionary.initial(new Configuration(env));
this.useSmart = settings.get("use_smart", "false").equals("true");
final Configuration configuration=new Configuration(env,settings).setUseSmart(false);
final Configuration smartConfiguration=new Configuration(env,settings).setUseSmart(true);
indicesAnalysisService.analyzerProviderFactories().put("ik",
new PreBuiltAnalyzerProviderFactory("ik", AnalyzerScope.GLOBAL,
new IKAnalyzer(useSmart)));
new IKAnalyzer(configuration)));
indicesAnalysisService.analyzerProviderFactories().put("ik_smart",
new PreBuiltAnalyzerProviderFactory("ik_smart", AnalyzerScope.GLOBAL,
new IKAnalyzer(true)));
new IKAnalyzer(smartConfiguration)));
indicesAnalysisService.analyzerProviderFactories().put("ik_max_word",
new PreBuiltAnalyzerProviderFactory("ik_max_word", AnalyzerScope.GLOBAL,
new IKAnalyzer(false)));
new IKAnalyzer(configuration)));
indicesAnalysisService.tokenizerFactories().put("ik",
new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
......@@ -51,7 +51,7 @@ public class IKIndicesAnalysis extends AbstractComponent {
@Override
public Tokenizer create() {
return new IKTokenizer(false);
return new IKTokenizer(configuration);
}
}));
......@@ -64,7 +64,7 @@ public class IKIndicesAnalysis extends AbstractComponent {
@Override
public Tokenizer create() {
return new IKTokenizer(true);
return new IKTokenizer(smartConfiguration);
}
}));
......@@ -77,8 +77,8 @@ public class IKIndicesAnalysis extends AbstractComponent {
@Override
public Tokenizer create() {
return new IKTokenizer(false);
return new IKTokenizer(configuration);
}
}));
}
}
\ No newline at end of file
}
......@@ -7,8 +7,10 @@ import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.io.PathUtils;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
import org.wltea.analyzer.dic.Dictionary;
import java.io.*;
import java.net.URL;
......@@ -20,132 +22,61 @@ import java.util.Properties;
public class Configuration {
private static String FILE_NAME = "IKAnalyzer.cfg.xml";
private static final String EXT_DICT = "ext_dict";
private static final String REMOTE_EXT_DICT = "remote_ext_dict";
private static final String EXT_STOP = "ext_stopwords";
private static final String REMOTE_EXT_STOP = "remote_ext_stopwords";
private static ESLogger logger = Loggers.getLogger("ik-analyzer");
private Path conf_dir;
private Properties props;
private Environment environment;
private Settings settings;
//whether smart (coarse-grained) segmentation is enabled
private boolean useSmart;
//whether remote dictionary loading is enabled
private boolean enableRemoteDict=false;
//whether lowercase folding is enabled
private boolean enableLowercase=true;
@Inject
public Configuration(Environment env) {
props = new Properties();
environment = env;
conf_dir = environment.configFile().resolve(AnalysisIkPlugin.PLUGIN_NAME);
Path configFile = conf_dir.resolve(FILE_NAME);
InputStream input = null;
try {
logger.info("try load config from {}", configFile);
input = new FileInputStream(configFile.toFile());
} catch (FileNotFoundException e) {
conf_dir = this.getConfigInPluginDir();
configFile = conf_dir.resolve(FILE_NAME);
try {
logger.info("try load config from {}", configFile);
input = new FileInputStream(configFile.toFile());
} catch (FileNotFoundException ex) {
// We should report the original exception
logger.error("ik-analyzer", e);
}
}
if (input != null) {
try {
props.loadFromXML(input);
} catch (InvalidPropertiesFormatException e) {
logger.error("ik-analyzer", e);
} catch (IOException e) {
logger.error("ik-analyzer", e);
}
}
public Configuration(Environment env,Settings settings) {
this.environment = env;
this.settings=settings;
this.useSmart = settings.get("use_smart", "false").equals("true");
this.enableLowercase = settings.get("enable_lowercase", "true").equals("true");
this.enableRemoteDict = settings.get("enable_remote_dict", "true").equals("true");
Dictionary.initial(this);
}
public List<String> getExtDictionarys() {
List<String> extDictFiles = new ArrayList<String>(2);
String extDictCfg = props.getProperty(EXT_DICT);
if (extDictCfg != null) {
String[] filePaths = extDictCfg.split(";");
if (filePaths != null) {
for (String filePath : filePaths) {
if (filePath != null && !"".equals(filePath.trim())) {
Path file = PathUtils.get(filePath.trim());
extDictFiles.add(file.toString());
}
}
}
}
return extDictFiles;
public Path getConfigInPluginDir() {
return PathUtils
.get(new File(AnalysisIkPlugin.class.getProtectionDomain().getCodeSource().getLocation().getPath())
.getParent(), "config")
.toAbsolutePath();
}
public List<String> getRemoteExtDictionarys() {
List<String> remoteExtDictFiles = new ArrayList<String>(2);
String remoteExtDictCfg = props.getProperty(REMOTE_EXT_DICT);
if (remoteExtDictCfg != null) {
String[] filePaths = remoteExtDictCfg.split(";");
if (filePaths != null) {
for (String filePath : filePaths) {
if (filePath != null && !"".equals(filePath.trim())) {
remoteExtDictFiles.add(filePath);
}
}
}
}
return remoteExtDictFiles;
public boolean isUseSmart() {
return useSmart;
}
public List<String> getExtStopWordDictionarys() {
List<String> extStopWordDictFiles = new ArrayList<String>(2);
String extStopWordDictCfg = props.getProperty(EXT_STOP);
if (extStopWordDictCfg != null) {
String[] filePaths = extStopWordDictCfg.split(";");
if (filePaths != null) {
for (String filePath : filePaths) {
if (filePath != null && !"".equals(filePath.trim())) {
Path file = PathUtils.get(filePath.trim());
extStopWordDictFiles.add(file.toString());
}
}
}
}
return extStopWordDictFiles;
public Configuration setUseSmart(boolean useSmart) {
this.useSmart = useSmart;
return this;
}
public List<String> getRemoteExtStopWordDictionarys() {
List<String> remoteExtStopWordDictFiles = new ArrayList<String>(2);
String remoteExtStopWordDictCfg = props.getProperty(REMOTE_EXT_STOP);
if (remoteExtStopWordDictCfg != null) {
String[] filePaths = remoteExtStopWordDictCfg.split(";");
if (filePaths != null) {
for (String filePath : filePaths) {
if (filePath != null && !"".equals(filePath.trim())) {
remoteExtStopWordDictFiles.add(filePath);
}
}
}
}
return remoteExtStopWordDictFiles;
public Environment getEnvironment() {
return environment;
}
public String getDictRoot() {
return conf_dir.toAbsolutePath().toString();
public Settings getSettings() {
return settings;
}
private Path getConfigInPluginDir() {
return PathUtils
.get(new File(AnalysisIkPlugin.class.getProtectionDomain().getCodeSource().getLocation().getPath())
.getParent(), "config")
.toAbsolutePath();
public boolean isEnableRemoteDict() {
return enableRemoteDict;
}
public boolean isEnableLowercase() {
return enableLowercase;
}
}
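For clarity, a self-contained sketch of the flag parsing the new `Configuration(env, settings)` constructor performs; a plain Map stands in for the Elasticsearch Settings object, and the defaults mirror the code above (use_smart=false, enable_lowercase=true, enable_remote_dict=true):

```java
import java.util.HashMap;
import java.util.Map;

// Sketch of the settings.get(key, default).equals("true") pattern used above.
public class IkFlagParsing {
    public static void main(String[] args) {
        Map<String, String> settings = new HashMap<>();
        settings.put("enable_lowercase", "false"); // e.g. keep the original case of English tokens

        boolean useSmart = "true".equals(settings.getOrDefault("use_smart", "false"));
        boolean enableLowercase = "true".equals(settings.getOrDefault("enable_lowercase", "true"));
        boolean enableRemoteDict = "true".equals(settings.getOrDefault("enable_remote_dict", "true"));

        System.out.println("use_smart=" + useSmart
                + " enable_lowercase=" + enableLowercase
                + " enable_remote_dict=" + enableRemoteDict);
    }
}
```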
......@@ -32,6 +32,7 @@ import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;
/**
......@@ -72,12 +73,11 @@ class AnalyzeContext {
private Map<Integer , LexemePath> pathMap;
//final segmentation results
private LinkedList<Lexeme> results;
private boolean useSmart;
//segmenter configuration
// private Configuration cfg;
private Configuration cfg;
public AnalyzeContext(boolean useSmart){
this.useSmart = useSmart;
public AnalyzeContext(Configuration configuration){
this.cfg = configuration;
this.segmentBuff = new char[BUFF_SIZE];
this.charTypes = new int[BUFF_SIZE];
this.buffLocker = new HashSet<String>();
......@@ -139,7 +139,7 @@ class AnalyzeContext {
*/
void initCursor(){
this.cursor = 0;
this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor],cfg.isEnableLowercase());
this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
}
......@@ -151,7 +151,7 @@ class AnalyzeContext {
boolean moveCursor(){
if(this.cursor < this.available - 1){
this.cursor++;
this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor],cfg.isEnableLowercase());
this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
return true;
}else{
......@@ -345,7 +345,7 @@ class AnalyzeContext {
*/
private void compound(Lexeme result){
if(!this.useSmart){
if(!this.cfg.isUseSmart()){
return ;
}
//merge numerals with quantifier words
......
......@@ -86,14 +86,14 @@ class CharacterUtil {
* @param input
* @return char
*/
static char regularize(char input){
static char regularize(char input,boolean lowercase){
if (input == 12288) {
input = (char) 32;
}else if (input > 65280 && input < 65375) {
input = (char) (input - 65248);
}else if (input >= 'A' && input <= 'Z') {
}else if (input >= 'A' && input <= 'Z' && lowercase) {
input += 32;
}
......
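To make the constants above concrete: 12288 is the full-width space (U+3000), characters 65281–65374 are full-width forms that map to ASCII by subtracting 65248, and A–Z are now folded to lowercase only when the flag is set. A standalone sketch that reproduces this mapping outside the plugin:

```java
// Sketch of CharacterUtil.regularize(char, boolean) as shown in the hunk above.
public class RegularizeSketch {
    static char regularize(char input, boolean lowercase) {
        if (input == 12288) {                        // full-width space U+3000 -> ASCII space
            return (char) 32;
        } else if (input > 65280 && input < 65375) { // full-width forms -> ASCII equivalents
            return (char) (input - 65248);
        } else if (input >= 'A' && input <= 'Z' && lowercase) {
            return (char) (input + 32);              // fold A-Z only when lowercase is enabled
        }
        return input;
    }

    public static void main(String[] args) {
        System.out.println(regularize('\uFF21', true));  // full-width 'A' (65313) -> 'A'
        System.out.println(regularize('Z', true));       // 'Z' -> 'z' when lowercase is enabled
        System.out.println(regularize('Z', false));      // 'Z' stays 'Z' when lowercase is disabled
    }
}
```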
......@@ -23,10 +23,7 @@
*/
package org.wltea.analyzer.core;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;
import java.io.IOException;
import java.io.Reader;
......@@ -47,16 +44,16 @@ public final class IKSegmenter {
private List<ISegmenter> segmenters;
//ambiguity arbitrator for segmentation
private IKArbitrator arbitrator;
private boolean useSmart = false;
private Configuration configuration;
/**
* IKSegmenter constructor
* @param input
*/
public IKSegmenter(Reader input ,boolean useSmart){
public IKSegmenter(Reader input ,Configuration configuration){
this.input = input;
this.useSmart = useSmart;
this.configuration = configuration;
this.init();
}
......@@ -66,7 +63,7 @@ public final class IKSegmenter {
*/
private void init(){
//initialize the analysis context
this.context = new AnalyzeContext(useSmart);
this.context = new AnalyzeContext(configuration);
//load the sub-segmenters
this.segmenters = this.loadSegmenters();
//load the ambiguity arbitrator
......@@ -127,7 +124,7 @@ public final class IKSegmenter {
}
}
//resolve segmentation ambiguities
this.arbitrator.process(context, useSmart);
this.arbitrator.process(context, configuration.isUseSmart());
//output the results and handle any unsegmented single CJK characters
context.outputToResult();
//record the buffer offset of this pass
......
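A hedged usage sketch of the class above: build an IKSegmenter from a Reader and an already-constructed Configuration, then pull lexemes until exhaustion. It assumes IKSegmenter exposes next() returning a Lexeme with getLexemeText(), which is how the tokenizer layer consumes it; building the Configuration itself is elided because it needs an Elasticsearch Environment and Settings:

```java
import java.io.IOException;
import java.io.StringReader;

import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

// Sketch: print every token produced for a short input string.
public class SegmenterSketch {
    static void printLexemes(Configuration configuration, String text) throws IOException {
        IKSegmenter segmenter = new IKSegmenter(new StringReader(text), configuration);
        Lexeme lexeme;
        while ((lexeme = segmenter.next()) != null) {
            System.out.println(lexeme.getLexemeText()); // one token per line
        }
    }
}
```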
......@@ -33,9 +33,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.*;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
......@@ -49,6 +47,7 @@ import org.apache.http.impl.client.HttpClients;
import org.elasticsearch.common.io.PathUtils;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin;
import org.wltea.analyzer.cfg.Configuration;
/**
......@@ -88,10 +87,53 @@ public class Dictionary {
public static final String PATH_DIC_PREP = "preposition.dic";
public static final String PATH_DIC_STOP = "stopword.dic";
private Dictionary() {
private final static String FILE_NAME = "IKAnalyzer.cfg.xml";
private final static String EXT_DICT = "ext_dict";
private final static String REMOTE_EXT_DICT = "remote_ext_dict";
private final static String EXT_STOP = "ext_stopwords";
private final static String REMOTE_EXT_STOP = "remote_ext_stopwords";
private Path conf_dir;
private Properties props;
private Dictionary(Configuration cfg) {
this.configuration = cfg;
this.props = new Properties();
this.conf_dir = cfg.getEnvironment().configFile().resolve(AnalysisIkPlugin.PLUGIN_NAME);
Path configFile = conf_dir.resolve(FILE_NAME);
InputStream input = null;
try {
logger.info("try load config from {}", configFile);
input = new FileInputStream(configFile.toFile());
} catch (FileNotFoundException e) {
conf_dir = cfg.getConfigInPluginDir();
configFile = conf_dir.resolve(FILE_NAME);
try {
logger.info("try load config from {}", configFile);
input = new FileInputStream(configFile.toFile());
} catch (FileNotFoundException ex) {
// We should report the original exception
logger.error("ik-analyzer", e);
}
}
if (input != null) {
try {
props.loadFromXML(input);
} catch (InvalidPropertiesFormatException e) {
logger.error("ik-analyzer", e);
} catch (IOException e) {
logger.error("ik-analyzer", e);
}
}
}
public String getProperty(String key){
if(props!=null){
return props.getProperty(key);
}
return null;
}
/**
* Dictionary initialization. IK Analyzer initializes its dictionaries through static methods of the Dictionary class,
* so they are only loaded when the class is actually used, which lengthens the first segmentation call. This method allows the dictionaries to be initialized during application startup instead.
......@@ -102,8 +144,8 @@ public class Dictionary {
if (singleton == null) {
synchronized (Dictionary.class) {
if (singleton == null) {
singleton = new Dictionary();
singleton.configuration = cfg;
singleton = new Dictionary(cfg);
singleton.loadMainDict();
singleton.loadSurnameDict();
singleton.loadQuantifierDict();
......@@ -111,13 +153,15 @@ public class Dictionary {
singleton.loadPrepDict();
singleton.loadStopWordDict();
// start the monitor threads
for (String location : cfg.getRemoteExtDictionarys()) {
// the initial delay is 10 seconds (adjustable); the interval is 60 seconds
pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
}
for (String location : cfg.getRemoteExtStopWordDictionarys()) {
pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
if(cfg.isEnableRemoteDict()){
// start the monitor threads
for (String location : singleton.getRemoteExtDictionarys()) {
// the initial delay is 10 seconds (adjustable); the interval is 60 seconds
pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
}
for (String location : singleton.getRemoteExtStopWordDictionarys()) {
pool.scheduleAtFixedRate(new Monitor(location), 10, 60, TimeUnit.SECONDS);
}
}
return singleton;
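The block above now schedules the remote-dictionary Monitor tasks only when enable_remote_dict is true, with a 10-second initial delay and a 60-second period. A minimal standalone sketch of that gating, with a Runnable standing in for the plugin's Monitor class and an example URL as a placeholder location:

```java
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

// Sketch: poll remote dictionary locations every 60 seconds, but only if the feature is enabled.
public class RemoteDictPolling {
    public static void main(String[] args) {
        boolean enableRemoteDict = true; // mirrors cfg.isEnableRemoteDict()
        List<String> locations = Arrays.asList("http://example.com/ext.dic");
        ScheduledExecutorService pool = Executors.newScheduledThreadPool(1);

        if (enableRemoteDict) {
            for (String location : locations) {
                // initial delay 10 s, period 60 s, matching Dictionary.initial()
                pool.scheduleAtFixedRate(
                        () -> System.out.println("checking " + location), 10, 60, TimeUnit.SECONDS);
            }
        }
    }
}
```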
......@@ -127,6 +171,77 @@ public class Dictionary {
return singleton;
}
public List<String> getExtDictionarys() {
List<String> extDictFiles = new ArrayList<String>(2);
String extDictCfg = getProperty(EXT_DICT);
if (extDictCfg != null) {
String[] filePaths = extDictCfg.split(";");
for (String filePath : filePaths) {
if (filePath != null && !"".equals(filePath.trim())) {
Path file = PathUtils.get(filePath.trim());
extDictFiles.add(file.toString());
}
}
}
return extDictFiles;
}
public List<String> getRemoteExtDictionarys() {
List<String> remoteExtDictFiles = new ArrayList<String>(2);
String remoteExtDictCfg = getProperty(REMOTE_EXT_DICT);
if (remoteExtDictCfg != null) {
String[] filePaths = remoteExtDictCfg.split(";");
for (String filePath : filePaths) {
if (filePath != null && !"".equals(filePath.trim())) {
remoteExtDictFiles.add(filePath);
}
}
}
return remoteExtDictFiles;
}
public List<String> getExtStopWordDictionarys() {
List<String> extStopWordDictFiles = new ArrayList<String>(2);
String extStopWordDictCfg = getProperty(EXT_STOP);
if (extStopWordDictCfg != null) {
String[] filePaths = extStopWordDictCfg.split(";");
for (String filePath : filePaths) {
if (filePath != null && !"".equals(filePath.trim())) {
Path file = PathUtils.get(filePath.trim());
extStopWordDictFiles.add(file.toString());
}
}
}
return extStopWordDictFiles;
}
public List<String> getRemoteExtStopWordDictionarys() {
List<String> remoteExtStopWordDictFiles = new ArrayList<String>(2);
String remoteExtStopWordDictCfg = getProperty(REMOTE_EXT_STOP);
if (remoteExtStopWordDictCfg != null) {
String[] filePaths = remoteExtStopWordDictCfg.split(";");
for (String filePath : filePaths) {
if (filePath != null && !"".equals(filePath.trim())) {
remoteExtStopWordDictFiles.add(filePath);
}
}
}
return remoteExtStopWordDictFiles;
}
public String getDictRoot() {
return conf_dir.toAbsolutePath().toString();
}
/**
* Get the dictionary singleton instance
*
......@@ -139,6 +254,7 @@ public class Dictionary {
return singleton;
}
/**
* Batch-load new entries
*
......@@ -224,7 +340,7 @@ public class Dictionary {
_MainDict = new DictSegment((char) 0);
// read the main dictionary file
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_MAIN);
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_MAIN);
InputStream is = null;
try {
......@@ -267,13 +383,13 @@ public class Dictionary {
*/
private void loadExtDict() {
// load the extension dictionary configuration
List<String> extDictFiles = configuration.getExtDictionarys();
List<String> extDictFiles = getExtDictionarys();
if (extDictFiles != null) {
InputStream is = null;
for (String extDictName : extDictFiles) {
// read the extension dictionary file
logger.info("[Dict Loading] " + extDictName);
Path file = PathUtils.get(configuration.getDictRoot(), extDictName);
Path file = PathUtils.get(getDictRoot(), extDictName);
try {
is = new FileInputStream(file.toFile());
} catch (FileNotFoundException e) {
......@@ -315,7 +431,7 @@ public class Dictionary {
* Load remote extension dictionaries into the main dictionary
*/
private void loadRemoteExtDict() {
List<String> remoteExtDictFiles = configuration.getRemoteExtDictionarys();
List<String> remoteExtDictFiles = getRemoteExtDictionarys();
for (String location : remoteExtDictFiles) {
logger.info("[Dict Loading] " + location);
List<String> lists = getRemoteWords(location);
......@@ -386,7 +502,7 @@ public class Dictionary {
_StopWords = new DictSegment((char) 0);
// read the stopword dictionary file
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_STOP);
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_STOP);
InputStream is = null;
try {
......@@ -420,14 +536,14 @@ public class Dictionary {
}
// load the extension stopword dictionaries
List<String> extStopWordDictFiles = configuration.getExtStopWordDictionarys();
List<String> extStopWordDictFiles = getExtStopWordDictionarys();
if (extStopWordDictFiles != null) {
is = null;
for (String extStopWordDictName : extStopWordDictFiles) {
logger.info("[Dict Loading] " + extStopWordDictName);
// read the extension stopword dictionary file
file = PathUtils.get(configuration.getDictRoot(), extStopWordDictName);
file = PathUtils.get(getDictRoot(), extStopWordDictName);
try {
is = new FileInputStream(file.toFile());
} catch (FileNotFoundException e) {
......@@ -465,7 +581,7 @@ public class Dictionary {
}
// load the remote stopword dictionaries
List<String> remoteExtStopWordDictFiles = configuration.getRemoteExtStopWordDictionarys();
List<String> remoteExtStopWordDictFiles = getRemoteExtStopWordDictionarys();
for (String location : remoteExtStopWordDictFiles) {
logger.info("[Dict Loading] " + location);
List<String> lists = getRemoteWords(location);
......@@ -492,7 +608,7 @@ public class Dictionary {
// create a quantifier dictionary instance
_QuantifierDict = new DictSegment((char) 0);
// read the quantifier dictionary file
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_QUANTIFIER);
InputStream is = null;
try {
is = new FileInputStream(file.toFile());
......@@ -527,7 +643,7 @@ public class Dictionary {
private void loadSurnameDict() {
_SurnameDict = new DictSegment((char) 0);
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_SURNAME);
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SURNAME);
InputStream is = null;
try {
is = new FileInputStream(file.toFile());
......@@ -563,7 +679,7 @@ public class Dictionary {
private void loadSuffixDict() {
_SuffixDict = new DictSegment((char) 0);
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_SUFFIX);
InputStream is = null;
try {
is = new FileInputStream(file.toFile());
......@@ -598,7 +714,7 @@ public class Dictionary {
private void loadPrepDict() {
_PrepDict = new DictSegment((char) 0);
Path file = PathUtils.get(configuration.getDictRoot(), Dictionary.PATH_DIC_PREP);
Path file = PathUtils.get(getDictRoot(), Dictionary.PATH_DIC_PREP);
InputStream is = null;
try {
is = new FileInputStream(file.toFile());
......@@ -634,7 +750,7 @@ public class Dictionary {
public void reLoadMainDict() {
logger.info("重新加载词典...");
// load into a fresh instance so the reload does not disturb the dictionary currently in use
Dictionary tmpDict = new Dictionary();
Dictionary tmpDict = new Dictionary(configuration);
tmpDict.configuration = getSingleton().configuration;
tmpDict.loadMainDict();
tmpDict.loadStopWordDict();
......@@ -643,4 +759,4 @@ public class Dictionary {
logger.info("重新加载词典完毕...");
}
}
\ No newline at end of file
}
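reLoadMainDict above follows a load-then-swap approach: the new dictionaries are built in a temporary Dictionary instance so readers are not disturbed while loading. A generic sketch of that idea using a single volatile reference (the real code swaps individual dictionary fields rather than one holder):

```java
// Sketch of the reload-then-publish pattern: build new state off to the side,
// then make it visible with one volatile write.
public class HotSwapHolder<T> {
    private volatile T current;

    public HotSwapHolder(T initial) {
        this.current = initial;
    }

    public T get() {
        return current; // readers always see a fully built value
    }

    public void reload(T freshlyLoaded) {
        this.current = freshlyLoaded; // publish the newly loaded state
    }
}
```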
......@@ -26,6 +26,7 @@ package org.wltea.analyzer.lucene;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.wltea.analyzer.cfg.Configuration;
/**
* IK analyzer, implementation of the Lucene Analyzer interface
......@@ -33,15 +34,7 @@ import org.apache.lucene.analysis.Tokenizer;
*/
public final class IKAnalyzer extends Analyzer{
private boolean useSmart;
public boolean useSmart() {
return useSmart;
}
public void setUseSmart(boolean useSmart) {
this.useSmart = useSmart;
}
private Configuration configuration;
/**
* IK analyzer implementation of the Lucene Analyzer interface
......@@ -54,11 +47,11 @@ public final class IKAnalyzer extends Analyzer{
/**
* IK analyzer implementation of the Lucene Analyzer interface
*
* @param useSmart when true, the analyzer uses smart (coarse-grained) segmentation
* @param configuration the IK configuration
*/
public IKAnalyzer(boolean useSmart){
public IKAnalyzer(Configuration configuration){
super();
this.useSmart = useSmart;
this.configuration = configuration;
}
......@@ -67,7 +60,7 @@ public final class IKAnalyzer extends Analyzer{
*/
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer _IKTokenizer = new IKTokenizer(useSmart);
Tokenizer _IKTokenizer = new IKTokenizer(configuration);
return new TokenStreamComponents(_IKTokenizer);
}
......
......@@ -32,6 +32,7 @@ import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;
......@@ -64,16 +65,15 @@ public final class IKTokenizer extends Tokenizer {
/**
* Lucene 4.0 Tokenizer adapter constructor
* @param in
*/
public IKTokenizer(boolean useSmart){
public IKTokenizer(Configuration configuration){
super();
offsetAtt = addAttribute(OffsetAttribute.class);
termAtt = addAttribute(CharTermAttribute.class);
typeAtt = addAttribute(TypeAttribute.class);
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
_IKImplement = new IKSegmenter(input,useSmart);
_IKImplement = new IKSegmenter(input,configuration);
}
/* (non-Javadoc)
......
/**
* IK 中文分词 版本 5.0.1
* IK Analyzer release 5.0.1
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*
*/
package org.wltea.analyzer.sample;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
* Demo of tokenization with IKAnalyzer
* 2012-10-22
*
*/
public class IKAnalzyerDemo {
public static ESLogger logger= Loggers.getLogger("ik-analyzer");
public static void main(String[] args){
//build the IK analyzer, using smart segmentation mode
Analyzer analyzer = new IKAnalyzer(true);
//obtain Lucene's TokenStream
TokenStream ts = null;
try {
ts = analyzer.tokenStream("myfield", new StringReader("WORLD ,.. html DATA</html>HELLO"));
// ts = analyzer.tokenStream("myfield", new StringReader("这是一个中文分词的例子,你可以直接运行它!IKAnalyer can analysis english text too"));
//get the lexeme offset attribute
OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
//get the lexeme term text attribute
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
//get the lexeme type attribute
TypeAttribute type = ts.addAttribute(TypeAttribute.class);
//reset the TokenStream (resets the StringReader)
ts.reset();
//iterate over the segmentation results
while (ts.incrementToken()) {
System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type());
}
//finish the TokenStream (closes the StringReader)
ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
} catch (IOException e) {
logger.error(e.getMessage(), e);
} finally {
//release all TokenStream resources
if(ts != null){
try {
ts.close();
} catch (IOException e) {
logger.error(e.getMessage(), e);
}
}
}
}
}
\ No newline at end of file
/**
* IK 中文分词 版本 5.0
* IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* 源代码由林良益(linliangyi2005@gmail.com)提供
* 版权声明 2012,乌龙茶工作室
* provided by Linliangyi and copyright 2012 by Oolong studio
*
*
*/
package org.wltea.analyzer.sample;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
* Demo of Lucene indexing and search with IKAnalyzer
* 2012-3-2
*
* Written against the Lucene 4.0 API
*
*/
public class LuceneIndexAndSearchDemo {
public static ESLogger logger= Loggers.getLogger("ik-analyzer");
/**
* Simulation:
* build an index containing a single document and then search it
* @param args
*/
public static void main(String[] args){
//field name of the Lucene Document
String fieldName = "text";
//content to index and search
String text = "IK Analyzer是一个结合词典分词和文法分词的中文分词开源工具包。它使用了全新的正向迭代最细粒度切分算法。";
//instantiate the IKAnalyzer
Analyzer analyzer = new IKAnalyzer(true);
Directory directory = null;
IndexWriter iwriter = null;
IndexReader ireader = null;
IndexSearcher isearcher = null;
try {
//create an in-memory index
directory = new RAMDirectory();
//configure the IndexWriterConfig
IndexWriterConfig iwConfig = new IndexWriterConfig(analyzer);
iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
iwriter = new IndexWriter(directory , iwConfig);
//write the document to the index
Document doc = new Document();
doc.add(new StringField("ID", "10000", Field.Store.YES));
doc.add(new TextField(fieldName, text, Field.Store.YES));
iwriter.addDocument(doc);
iwriter.close();
//search phase **********************************
//instantiate the searcher
ireader = DirectoryReader.open(directory);
isearcher = new IndexSearcher(ireader);
String keyword = "中文分词工具包";
//build the Query object with the QueryParser
QueryParser qp = new QueryParser(fieldName, analyzer);
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
Query query = qp.parse(keyword);
System.out.println("Query = " + query);
//retrieve the 5 most relevant hits
TopDocs topDocs = isearcher.search(query , 5);
System.out.println("命中:" + topDocs.totalHits);
//print the results
ScoreDoc[] scoreDocs = topDocs.scoreDocs;
for (int i = 0; i < topDocs.totalHits; i++){
Document targetDoc = isearcher.doc(scoreDocs[i].doc);
System.out.println("内容:" + targetDoc.toString());
}
} catch (CorruptIndexException e) {
logger.error(e.getMessage(), e);
} catch (LockObtainFailedException e) {
logger.error(e.getMessage(), e);
} catch (IOException e) {
logger.error(e.getMessage(), e);
} catch (ParseException e) {
logger.error(e.getMessage(), e);
} finally{
if(ireader != null){
try {
ireader.close();
} catch (IOException e) {
logger.error(e.getMessage(), e);
}
}
if(directory != null){
try {
directory.close();
} catch (IOException e) {
logger.error(e.getMessage(), e);
}
}
}
}
}
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<Diagram>
<ID>JAVA</ID>
<OriginalElement>org.elasticsearch.index.analysis.IKAnalysisBinderProcessor</OriginalElement>
<nodes>
<node x="1244.0" y="553.0">org.elasticsearch.index.analysis.IKAnalysisBinderProcessor</node>
<node x="2212.0" y="489.0">org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.AnalyzersBindings</node>
<node x="1316.0" y="0.0">java.lang.Object</node>
<node x="1244.0" y="329.0">org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor</node>
<node x="616.0" y="510.0">org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.TokenFiltersBindings</node>
<node x="0.0" y="510.0">org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.CharFiltersBindings</node>
<node x="1608.0" y="510.0">org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.TokenizersBindings</node>
</nodes>
<notes />
<edges>
<edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.TokenFiltersBindings" target="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor">
<point x="152.0" y="-77.0" />
<point x="1072.0" y="469.0" />
<point x="1347.2" y="469.0" />
<point x="-68.79999999999995" y="55.0" />
</edge>
<edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.CharFiltersBindings" target="java.lang.Object">
<point x="-149.0" y="-77.0" />
<point x="149.0" y="299.0" />
<point x="1336.0" y="299.0" />
<point x="-80.0" y="139.5" />
</edge>
<edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor" target="java.lang.Object">
<point x="0.0" y="-55.0" />
<point x="0.0" y="139.5" />
</edge>
<edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.AnalyzersBindings" target="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor">
<point x="-180.5" y="-98.0" />
<point x="2392.5" y="459.0" />
<point x="1553.6" y="459.0" />
<point x="137.5999999999999" y="55.0" />
</edge>
<edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.CharFiltersBindings" target="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor">
<point x="149.0" y="-77.0" />
<point x="447.0" y="459.0" />
<point x="1278.4" y="459.0" />
<point x="-137.5999999999999" y="55.0" />
</edge>
<edge source="org.elasticsearch.index.analysis.IKAnalysisBinderProcessor" target="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor">
<point x="0.0" y="-34.0" />
<point x="0.0" y="55.0" />
</edge>
<edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.TokenFiltersBindings" target="java.lang.Object">
<point x="-152.0" y="-77.0" />
<point x="768.0" y="309.0" />
<point x="1376.0" y="309.0" />
<point x="-40.0" y="139.5" />
</edge>
<edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.AnalyzersBindings" target="java.lang.Object">
<point x="180.5" y="-98.0" />
<point x="2753.5" y="299.0" />
<point x="1496.0" y="299.0" />
<point x="80.0" y="139.5" />
</edge>
<edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.TokenizersBindings" target="java.lang.Object">
<point x="146.0" y="-77.0" />
<point x="2046.0" y="309.0" />
<point x="1456.0" y="309.0" />
<point x="40.0" y="139.5" />
</edge>
<edge source="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor.TokenizersBindings" target="org.elasticsearch.index.analysis.AnalysisModule.AnalysisBinderProcessor">
<point x="-146.0" y="-77.0" />
<point x="1754.0" y="469.0" />
<point x="1484.8" y="469.0" />
<point x="68.79999999999995" y="55.0" />
</edge>
</edges>
<settings layout="Hierarchic Group" zoom="1.0" x="110.5" y="89.0" />
<SelectedNodes />
<Categories>
<Category>Fields</Category>
<Category>Methods</Category>
<Category>Constructors</Category>
<Category>Inner Classes</Category>
<Category>Properties</Category>
</Categories>
</Diagram>