提交 a2070598 编写于 作者: weixin_43283383's avatar weixin_43283383

Merge pull request #40 from RickyHu/master

提供远程加载词典配置
IK Analysis for ElasticSearch IK Analysis for ElasticSearch
================================== ==================================
更新说明:
对于使用 ES 集群并以 IK 作为分词插件的场景,自定义词典往往需要频繁修改。本次更新增加了远程加载词典的功能:远程词典每次更新后都会自动重新加载,无需重启 ES 服务。
The IK Analysis plugin integrates Lucene IK analyzer into elasticsearch, support customized dictionary. The IK Analysis plugin integrates Lucene IK analyzer into elasticsearch, support customized dictionary.
Tokenizer: `ik` Tokenizer: `ik`
...@@ -52,7 +57,11 @@ https://github.com/medcl/elasticsearch-analysis-ik/blob/master/config/ik/IKAnaly ...@@ -52,7 +57,11 @@ https://github.com/medcl/elasticsearch-analysis-ik/blob/master/config/ik/IKAnaly
<!--用户可以在这里配置自己的扩展字典 --> <!--用户可以在这里配置自己的扩展字典 -->
<entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry> <entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry>
<!--用户可以在这里配置自己的扩展停止词字典--> <!--用户可以在这里配置自己的扩展停止词字典-->
<entry key="ext_stopwords">custom/ext_stopword.dic</entry> <entry key="ext_stopwords">custom/ext_stopword.dic</entry>
<!--用户可以在这里配置远程扩展字典 -->
<entry key="remote_ext_dict">location</entry>
<!--用户可以在这里配置远程扩展停止词字典-->
<entry key="remote_ext_stopwords">location</entry>
</properties> </properties>
</pre> </pre>
......
...@@ -5,5 +5,9 @@ ...@@ -5,5 +5,9 @@
<!--用户可以在这里配置自己的扩展字典 --> <!--用户可以在这里配置自己的扩展字典 -->
<entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry> <entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry>
<!--用户可以在这里配置自己的扩展停止词字典--> <!--用户可以在这里配置自己的扩展停止词字典-->
<entry key="ext_stopwords">custom/ext_stopword.dic</entry> <entry key="ext_stopwords">custom/ext_stopword.dic</entry>
</properties> <!--用户可以在这里配置远程扩展字典 -->
\ No newline at end of file <entry key="remote_ext_dict">words_location</entry>
<!--用户可以在这里配置远程扩展停止词字典-->
<entry key="remote_ext_stopwords">words_location</entry>
</properties>
...@@ -51,6 +51,13 @@ ...@@ -51,6 +51,13 @@
<version>${elasticsearch.version}</version> <version>${elasticsearch.version}</version>
<scope>compile</scope> <scope>compile</scope>
</dependency> </dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.3.5</version>
<scope>compile</scope>
</dependency>
<dependency> <dependency>
<groupId>log4j</groupId> <groupId>log4j</groupId>
......
...@@ -17,7 +17,9 @@ public class Configuration { ...@@ -17,7 +17,9 @@ public class Configuration {
private static String FILE_NAME = "ik/IKAnalyzer.cfg.xml"; private static String FILE_NAME = "ik/IKAnalyzer.cfg.xml";
private static final String EXT_DICT = "ext_dict"; private static final String EXT_DICT = "ext_dict";
private static final String REMOTE_EXT_DICT = "remote_ext_dict";
private static final String EXT_STOP = "ext_stopwords"; private static final String EXT_STOP = "ext_stopwords";
private static final String REMOTE_EXT_STOP = "remote_ext_stopwords";
private static ESLogger logger = null; private static ESLogger logger = null;
private Properties props; private Properties props;
private Environment environment; private Environment environment;
...@@ -64,6 +66,24 @@ public class Configuration { ...@@ -64,6 +66,24 @@ public class Configuration {
} }
return extDictFiles; return extDictFiles;
} }
/**
 * Returns the remote extension dictionary locations configured under the
 * {@code remote_ext_dict} property.
 *
 * The property value is a {@code ;}-separated list. Blank entries are skipped and
 * every returned location is trimmed, so that whitespace around a separator
 * (e.g. {@code "http://a/x.dic; http://b/y.dic"}) does not end up inside the URL.
 *
 * @return the configured locations in declaration order; empty (never null) when
 *         the property is absent or contains only blank entries
 */
public List<String> getRemoteExtDictionarys(){
    List<String> remoteExtDictFiles = new ArrayList<String>(2);
    String remoteExtDictCfg = props.getProperty(REMOTE_EXT_DICT);
    if(remoteExtDictCfg == null){
        return remoteExtDictFiles;
    }
    // String.split never returns null and never yields null elements,
    // so only the blank check is required here.
    for(String filePath : remoteExtDictCfg.split(";")){
        String location = filePath.trim();
        if(location.length() > 0){
            remoteExtDictFiles.add(location);
        }
    }
    return remoteExtDictFiles;
}
public List<String> getExtStopWordDictionarys(){ public List<String> getExtStopWordDictionarys(){
List<String> extStopWordDictFiles = new ArrayList<String>(2); List<String> extStopWordDictFiles = new ArrayList<String>(2);
...@@ -83,6 +103,24 @@ public class Configuration { ...@@ -83,6 +103,24 @@ public class Configuration {
} }
return extStopWordDictFiles; return extStopWordDictFiles;
} }
/**
 * Returns the remote stop-word dictionary locations configured under the
 * {@code remote_ext_stopwords} property.
 *
 * The property value is a {@code ;}-separated list. Blank entries are skipped and
 * every returned location is trimmed, so that whitespace around a separator does
 * not end up inside the URL.
 *
 * @return the configured locations in declaration order; empty (never null) when
 *         the property is absent or contains only blank entries
 */
public List<String> getRemoteExtStopWordDictionarys(){
    List<String> remoteExtStopWordDictFiles = new ArrayList<String>(2);
    String remoteExtStopWordDictCfg = props.getProperty(REMOTE_EXT_STOP);
    if(remoteExtStopWordDictCfg == null){
        return remoteExtStopWordDictFiles;
    }
    // String.split never returns null and never yields null elements,
    // so only the blank check is required here.
    for(String filePath : remoteExtStopWordDictCfg.split(";")){
        String location = filePath.trim();
        if(location.length() > 0){
            remoteExtStopWordDictFiles.add(location);
        }
    }
    return remoteExtStopWordDictFiles;
}
public File getDictRoot() { public File getDictRoot() {
return environment.configFile(); return environment.configFile();
......
...@@ -25,11 +25,18 @@ ...@@ -25,11 +25,18 @@
*/ */
package org.wltea.analyzer.dic; package org.wltea.analyzer.dic;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.elasticsearch.common.logging.ESLogger; import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.common.logging.Loggers;
import org.wltea.analyzer.cfg.Configuration; import org.wltea.analyzer.cfg.Configuration;
import java.io.*; import java.io.*;
import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
...@@ -92,6 +99,17 @@ public class Dictionary { ...@@ -92,6 +99,17 @@ public class Dictionary {
singleton.loadSuffixDict(); singleton.loadSuffixDict();
singleton.loadPrepDict(); singleton.loadPrepDict();
singleton.loadStopWordDict(); singleton.loadStopWordDict();
//建立监控线程
for(String location:cfg.getRemoteExtDictionarys()){
Thread monitor = new Thread(new Monitor(location));
monitor.start();
}
for(String location:cfg.getRemoteExtStopWordDictionarys()){
Thread monitor = new Thread(new Monitor(location));
monitor.start();
}
return singleton; return singleton;
} }
} }
...@@ -224,6 +242,8 @@ public class Dictionary { ...@@ -224,6 +242,8 @@ public class Dictionary {
} }
//加载扩展词典 //加载扩展词典
this.loadExtDict(); this.loadExtDict();
//加载远程自定义词库
this.loadRemoteExtDict();
} }
/** /**
...@@ -275,6 +295,76 @@ public class Dictionary { ...@@ -275,6 +295,76 @@ public class Dictionary {
} }
} }
/**
 * Loads the words of every configured remote extension dictionary into the
 * main dictionary.
 *
 * Each location is logged, fetched with {@code getRemoteWords}, and every
 * non-blank line is trimmed, lower-cased and inserted into {@code _MainDict}.
 */
private void loadRemoteExtDict(){
    for(String location : configuration.getRemoteExtDictionarys()){
        logger.info("[Dict Loading]" + location);
        List<String> remoteWords = getRemoteWords(location);
        if(remoteWords != null){
            for(String candidate : remoteWords){
                // Skip null and blank lines.
                if(candidate == null || "".equals(candidate.trim())){
                    continue;
                }
                // Each accepted word is logged before it is added to the in-memory main dictionary.
                logger.info(candidate);
                _MainDict.fillSegment(candidate.trim().toLowerCase().toCharArray());
            }
        }else{
            // No word list was obtained for this location: report it and move on to the next one.
            logger.error("[Dict Loading]"+location+"加载失败");
        }
    }
}
/**
 * Downloads a remote dictionary over HTTP GET and returns its lines.
 *
 * The response body is decoded with the charset named in the Content-Type header,
 * falling back to UTF-8 when the header or its {@code charset=} parameter is absent.
 * The HTTP client, the response and the reader are always released, whether the
 * download succeeds or fails.
 *
 * @param location URL of the remote dictionary
 * @return the lines read from the response body; an empty list when the server does
 *         not answer 200, and the lines read so far when an I/O error interrupts the
 *         download. Never null.
 */
private static List<String> getRemoteWords(String location){
    List<String> buffer = new ArrayList<String>();
    RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
            .setConnectTimeout(10*1000).setSocketTimeout(60*1000).build();
    CloseableHttpClient httpclient = HttpClients.createDefault();
    CloseableHttpResponse response = null;
    BufferedReader in = null;
    try {
        // Built inside the try block so the client is released even when the
        // location is rejected as a URI; that exception still reaches the caller.
        HttpGet get = new HttpGet(location);
        get.setConfig(rc);
        response = httpclient.execute(get);
        if(response.getStatusLine().getStatusCode()==200 && response.getEntity() != null){
            String charset = "UTF-8";
            // The Content-Type header is optional; only consult it when present.
            if(response.getEntity().getContentType() != null){
                charset = parseRemoteCharset(response.getEntity().getContentType().getValue(), charset);
            }
            in = new BufferedReader(new InputStreamReader(response.getEntity().getContent(),charset));
            String line;
            while((line = in.readLine())!=null){
                buffer.add(line);
            }
        }
    } catch (ClientProtocolException e) {
        e.printStackTrace();
    } catch (IllegalStateException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        closeRemoteResource(in);
        closeRemoteResource(response);
        closeRemoteResource(httpclient);
    }
    return buffer;
}

/**
 * Extracts the value of the {@code charset=} parameter from a Content-Type value,
 * ignoring any parameters that follow it and any surrounding double quotes.
 *
 * @param contentType raw Content-Type header value, may be null
 * @param fallback    charset name returned when no usable parameter is found
 * @return the declared charset name, or {@code fallback}
 */
private static String parseRemoteCharset(String contentType, String fallback){
    if(contentType == null){
        return fallback;
    }
    String marker = "charset=";
    int start = contentType.indexOf(marker);
    if(start < 0){
        return fallback;
    }
    String value = contentType.substring(start + marker.length());
    int end = value.indexOf(';');
    if(end >= 0){
        value = value.substring(0, end);
    }
    value = value.replace("\"", "").trim();
    return value.length() == 0 ? fallback : value;
}

/**
 * Closes a resource if it was opened, reporting (but not propagating) close failures.
 */
private static void closeRemoteResource(Closeable resource){
    if(resource == null){
        return;
    }
    try {
        resource.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/** /**
* 加载用户扩展的停止词词典 * 加载用户扩展的停止词词典
*/ */
...@@ -360,7 +450,28 @@ public class Dictionary { ...@@ -360,7 +450,28 @@ public class Dictionary {
} }
} }
} }
} }
//加载远程停用词典
List<String> remoteExtStopWordDictFiles = configuration.getRemoteExtStopWordDictionarys();
for(String location:remoteExtStopWordDictFiles){
logger.info("[Dict Loading]" + location);
List<String> lists = getRemoteWords(location);
//如果找不到扩展的字典,则忽略
if(lists == null){
logger.error("[Dict Loading]"+location+"加载失败");
continue;
}
for(String theWord:lists){
if (theWord != null && !"".equals(theWord.trim())) {
//加载远程词典数据到主内存中
logger.info(theWord);
_StopWords.fillSegment(theWord.trim().toLowerCase().toCharArray());
}
}
}
} }
/** /**
...@@ -511,6 +622,11 @@ public class Dictionary { ...@@ -511,6 +622,11 @@ public class Dictionary {
} }
} }
} }
/**
 * Reloads the main dictionary and the stop-word dictionary.
 *
 * This method is invoked from the Monitor threads, and one such thread is started
 * per configured remote location, so two remote changes can be detected at the same
 * time. The method is synchronized so that those reloads run one after another
 * instead of interleaving.
 */
public synchronized void reLoadMainDict(){
    logger.info("重新加载词典...");
    loadMainDict();
    loadStopWordDict();
}
} }
package org.wltea.analyzer.dic;
import java.io.IOException;
import org.apache.http.Header;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpHead;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.wltea.analyzer.help.Sleep;
import org.wltea.analyzer.help.Sleep.Type;
/**
 * Polls one remote dictionary location with HTTP HEAD requests and triggers a
 * dictionary reload whenever the resource's validators change.
 */
public class Monitor implements Runnable {

    private static CloseableHttpClient httpclient = HttpClients.createDefault();

    /*
     * Last-Modified value seen on the most recent reload, or null if none yet.
     */
    private String last_modified;

    /*
     * ETag value seen on the most recent reload, or null if none yet.
     */
    private String eTags;

    /*
     * URL that is polled.
     */
    private String location;

    public Monitor(String location) {
        this.location = location;
        this.last_modified = null;
        this.eTags = null;
    }

    /**
     * Monitoring loop:
     * 1. send a HEAD request to the dictionary server, passing the previously
     *    seen validators as If-Modified-Since / If-None-Match;
     * 2. on 200, read the Last-Modified and ETag response headers and compare
     *    them with the stored values;
     * 3. if either differs, reload the dictionaries and remember the new values;
     * 4. sleep 60 seconds and start over (also after 304 or any failure).
     *
     * A header that the server does not send is treated as an absent (null)
     * value rather than as an error, and a failure in one round never stops
     * the following rounds.
     */
    public void run() {
        // Timeout settings.
        RequestConfig rc = RequestConfig.custom().setConnectionRequestTimeout(10*1000)
                .setConnectTimeout(10*1000).setSocketTimeout(15*1000).build();

        while (true) {
            CloseableHttpResponse response = null;
            try {
                HttpHead head = new HttpHead(location);
                head.setConfig(rc);

                // Conditional request headers, so an unchanged resource answers 304.
                if (last_modified != null) {
                    head.setHeader("If-Modified-Since", last_modified);
                }
                if (eTags != null) {
                    head.setHeader("If-None-Match", eTags);
                }

                response = httpclient.execute(head);

                // 304 Not Modified (or any other non-200 status): nothing to do this round.
                if (response.getStatusLine().getStatusCode() == 200) {
                    String currentLastModified = headerValue(response, "Last-Modified");
                    // The standard entity-tag response header is named "ETag".
                    String currentETag = headerValue(response, "ETag");

                    if (!sameValue(currentLastModified, last_modified)
                            || !sameValue(currentETag, eTags)) {
                        // The remote dictionary changed: reload and remember the new validators.
                        Dictionary.getSingleton().reLoadMainDict();
                        last_modified = currentLastModified;
                        eTags = currentETag;
                    }
                }
            } catch (ClientProtocolException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } catch (RuntimeException e) {
                // Keep monitoring even if one round (request or reload) fails unexpectedly.
                e.printStackTrace();
            } finally {
                // response stays null when the request itself failed.
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                Sleep.sleep(Type.SEC, 60);
            }
        }
    }

    /**
     * Returns the value of the last header with the given name, or null when
     * the response does not carry that header.
     */
    private static String headerValue(CloseableHttpResponse response, String name) {
        Header header = response.getLastHeader(name);
        return header == null ? null : header.getValue();
    }

    /**
     * Null-safe, case-insensitive comparison of two header values.
     */
    private static boolean sameValue(String current, String previous) {
        return current == null ? previous == null : current.equalsIgnoreCase(previous);
    }
}
package org.wltea.analyzer.help;
/**
 * Helper for pausing the current thread for a duration given in a chosen unit.
 */
public class Sleep {

    /** Unit in which the duration passed to {@link #sleep(Type, int)} is expressed. */
    public enum Type{MSEC,SEC,MIN,HOUR};

    /**
     * Pauses the current thread for {@code num} units of {@code type}.
     *
     * The duration is converted to milliseconds using {@code long} arithmetic, so
     * large values (for example several hundred hours) do not overflow into a
     * negative timeout. If the thread is interrupted while sleeping, the stack
     * trace is printed and the method returns early.
     *
     * @param type unit of {@code num}
     * @param num  amount of time to sleep, in units of {@code type}
     */
    public static void sleep(Type type,int num){
        try {
            switch(type){
                case MSEC:
                    Thread.sleep(num);
                    return;
                case SEC:
                    Thread.sleep(num*1000L);
                    return;
                case MIN:
                    Thread.sleep(num*60L*1000L);
                    return;
                case HOUR:
                    Thread.sleep(num*60L*60L*1000L);
                    return;
                default:
                    System.err.println("输入类型错误,应为MSEC,SEC,MIN,HOUR之一");
                    return;
            }
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册