Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
南宫伊儿
elasticsearch-analysis-ik
提交
a2070598
E
elasticsearch-analysis-ik
项目概览
南宫伊儿
/
elasticsearch-analysis-ik
与 Fork 源项目一致
从无法访问的项目Fork
通知
3
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
E
elasticsearch-analysis-ik
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
a2070598
编写于
9月 24, 2014
作者:
weixin_43283383
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #40 from RickyHu/master
提供远程加载词典配置
上级
1f80b50a
8a91d7c9
变更
7
隐藏空白更改
内联
并排
Showing
7 changed file
with
306 addition
and
6 deletion
+306
-6
README.textile
README.textile
+10
-1
config/ik/IKAnalyzer.cfg.xml
config/ik/IKAnalyzer.cfg.xml
+6
-2
pom.xml
pom.xml
+7
-0
src/main/java/org/wltea/analyzer/cfg/Configuration.java
src/main/java/org/wltea/analyzer/cfg/Configuration.java
+38
-0
src/main/java/org/wltea/analyzer/dic/Dictionary.java
src/main/java/org/wltea/analyzer/dic/Dictionary.java
+119
-3
src/main/java/org/wltea/analyzer/dic/Monitor.java
src/main/java/org/wltea/analyzer/dic/Monitor.java
+96
-0
src/main/java/org/wltea/analyzer/help/Sleep.java
src/main/java/org/wltea/analyzer/help/Sleep.java
+30
-0
未找到文件。
README.textile
浏览文件 @
a2070598
IK Analysis for ElasticSearch
IK Analysis for ElasticSearch
==================================
==================================
更新说明:
针对在 ES 集群中使用 ik 作为分词插件、并且需要经常修改自定义词典的场景,新增了远程加载词典的功能:远程词典每次更新后都会自动重新加载,无需重启 ES 服务。
The IK Analysis plugin integrates Lucene IK analyzer into elasticsearch, support customized dictionary.
The IK Analysis plugin integrates Lucene IK analyzer into elasticsearch, support customized dictionary.
Tokenizer: `ik`
Tokenizer: `ik`
...
@@ -52,7 +57,11 @@ https://github.com/medcl/elasticsearch-analysis-ik/blob/master/config/ik/IKAnaly
...
@@ -52,7 +57,11 @@ https://github.com/medcl/elasticsearch-analysis-ik/blob/master/config/ik/IKAnaly
<!--用户可以在这里配置自己的扩展字典 -->
<!--用户可以在这里配置自己的扩展字典 -->
<entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry>
<entry key="ext_dict">custom/mydict.dic;custom/single_word_low_freq.dic</entry>
<!--用户可以在这里配置自己的扩展停止词字典-->
<!--用户可以在这里配置自己的扩展停止词字典-->
<entry key="ext_stopwords">custom/ext_stopword.dic</entry>
<entry key="ext_stopwords">custom/ext_stopword.dic</entry>
<!--用户可以在这里配置远程扩展字典 -->
<entry key="remote_ext_dict">location</entry>
<!--用户可以在这里配置远程扩展停止词字典-->
<entry key="remote_ext_stopwords">location</entry>
</properties>
</properties>
</pre>
</pre>
...
...
config/ik/IKAnalyzer.cfg.xml
浏览文件 @
a2070598
...
@@ -5,5 +5,9 @@
...
@@ -5,5 +5,9 @@
<!--用户可以在这里配置自己的扩展字典 -->
<!--用户可以在这里配置自己的扩展字典 -->
<entry
key=
"ext_dict"
>
custom/mydict.dic;custom/single_word_low_freq.dic
</entry>
<entry
key=
"ext_dict"
>
custom/mydict.dic;custom/single_word_low_freq.dic
</entry>
<!--用户可以在这里配置自己的扩展停止词字典-->
<!--用户可以在这里配置自己的扩展停止词字典-->
<entry
key=
"ext_stopwords"
>
custom/ext_stopword.dic
</entry>
<entry
key=
"ext_stopwords"
>
custom/ext_stopword.dic
</entry>
</properties>
<!--用户可以在这里配置远程扩展字典 -->
\ No newline at end of file
<entry
key=
"remote_ext_dict"
>
words_location
</entry>
<!--用户可以在这里配置远程扩展停止词字典-->
<entry
key=
"remote_ext_stopwords"
>
words_location
</entry>
</properties>
pom.xml
浏览文件 @
a2070598
...
@@ -51,6 +51,13 @@
...
@@ -51,6 +51,13 @@
<version>
${elasticsearch.version}
</version>
<version>
${elasticsearch.version}
</version>
<scope>
compile
</scope>
<scope>
compile
</scope>
</dependency>
</dependency>
<dependency>
<groupId>
org.apache.httpcomponents
</groupId>
<artifactId>
httpclient
</artifactId>
<version>
4.3.5
</version>
<scope>
compile
</scope>
</dependency>
<dependency>
<dependency>
<groupId>
log4j
</groupId>
<groupId>
log4j
</groupId>
...
...
src/main/java/org/wltea/analyzer/cfg/Configuration.java
浏览文件 @
a2070598
...
@@ -17,7 +17,9 @@ public class Configuration {
...
@@ -17,7 +17,9 @@ public class Configuration {
private
static
String
FILE_NAME
=
"ik/IKAnalyzer.cfg.xml"
;
private
static
String
FILE_NAME
=
"ik/IKAnalyzer.cfg.xml"
;
private
static
final
String
EXT_DICT
=
"ext_dict"
;
private
static
final
String
EXT_DICT
=
"ext_dict"
;
private
static
final
String
REMOTE_EXT_DICT
=
"remote_ext_dict"
;
private
static
final
String
EXT_STOP
=
"ext_stopwords"
;
private
static
final
String
EXT_STOP
=
"ext_stopwords"
;
private
static
final
String
REMOTE_EXT_STOP
=
"remote_ext_stopwords"
;
private
static
ESLogger
logger
=
null
;
private
static
ESLogger
logger
=
null
;
private
Properties
props
;
private
Properties
props
;
private
Environment
environment
;
private
Environment
environment
;
...
@@ -64,6 +66,24 @@ public class Configuration {
...
@@ -64,6 +66,24 @@ public class Configuration {
}
}
return
extDictFiles
;
return
extDictFiles
;
}
}
/**
 * Returns the remote extension dictionary locations configured under the
 * {@code remote_ext_dict} property.
 *
 * The property value is split on {@code ';'}. Blank entries are skipped and
 * every remaining entry is trimmed, so whitespace around a location in the
 * configuration file does not end up inside the returned location string.
 *
 * @return a new mutable list of locations; empty when the property is absent
 *         or contains only blank entries, never {@code null}
 */
public List<String> getRemoteExtDictionarys() {
    List<String> remoteExtDictFiles = new ArrayList<String>(2);
    String remoteExtDictCfg = props.getProperty(REMOTE_EXT_DICT);
    if (remoteExtDictCfg == null) {
        return remoteExtDictFiles;
    }
    // String.split never returns null and never yields null elements,
    // so only blank entries need to be filtered out.
    for (String filePath : remoteExtDictCfg.split(";")) {
        String trimmed = filePath.trim();
        if (!"".equals(trimmed)) {
            // Store the trimmed value: the emptiness check was already made on it.
            remoteExtDictFiles.add(trimmed);
        }
    }
    return remoteExtDictFiles;
}
public
List
<
String
>
getExtStopWordDictionarys
(){
public
List
<
String
>
getExtStopWordDictionarys
(){
List
<
String
>
extStopWordDictFiles
=
new
ArrayList
<
String
>(
2
);
List
<
String
>
extStopWordDictFiles
=
new
ArrayList
<
String
>(
2
);
...
@@ -83,6 +103,24 @@ public class Configuration {
...
@@ -83,6 +103,24 @@ public class Configuration {
}
}
return
extStopWordDictFiles
;
return
extStopWordDictFiles
;
}
}
/**
 * Returns the remote extension stop-word dictionary locations configured
 * under the {@code remote_ext_stopwords} property.
 *
 * The property value is split on {@code ';'}. Blank entries are skipped and
 * every remaining entry is trimmed, so whitespace around a location in the
 * configuration file does not end up inside the returned location string.
 *
 * @return a new mutable list of locations; empty when the property is absent
 *         or contains only blank entries, never {@code null}
 */
public List<String> getRemoteExtStopWordDictionarys() {
    List<String> remoteExtStopWordDictFiles = new ArrayList<String>(2);
    String remoteExtStopWordDictCfg = props.getProperty(REMOTE_EXT_STOP);
    if (remoteExtStopWordDictCfg == null) {
        return remoteExtStopWordDictFiles;
    }
    // String.split never returns null and never yields null elements,
    // so only blank entries need to be filtered out.
    for (String filePath : remoteExtStopWordDictCfg.split(";")) {
        String trimmed = filePath.trim();
        if (!"".equals(trimmed)) {
            // Store the trimmed value: the emptiness check was already made on it.
            remoteExtStopWordDictFiles.add(trimmed);
        }
    }
    return remoteExtStopWordDictFiles;
}
public
File
getDictRoot
()
{
public
File
getDictRoot
()
{
return
environment
.
configFile
();
return
environment
.
configFile
();
...
...
src/main/java/org/wltea/analyzer/dic/Dictionary.java
浏览文件 @
a2070598
...
@@ -25,11 +25,18 @@
...
@@ -25,11 +25,18 @@
*/
*/
package
org.wltea.analyzer.dic
;
package
org.wltea.analyzer.dic
;
import
org.apache.http.client.ClientProtocolException
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpGet
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.HttpClients
;
import
org.elasticsearch.common.logging.ESLogger
;
import
org.elasticsearch.common.logging.ESLogger
;
import
org.elasticsearch.common.logging.Loggers
;
import
org.elasticsearch.common.logging.Loggers
;
import
org.wltea.analyzer.cfg.Configuration
;
import
org.wltea.analyzer.cfg.Configuration
;
import
java.io.*
;
import
java.io.*
;
import
java.util.ArrayList
;
import
java.util.Collection
;
import
java.util.Collection
;
import
java.util.List
;
import
java.util.List
;
...
@@ -92,6 +99,17 @@ public class Dictionary {
...
@@ -92,6 +99,17 @@ public class Dictionary {
singleton
.
loadSuffixDict
();
singleton
.
loadSuffixDict
();
singleton
.
loadPrepDict
();
singleton
.
loadPrepDict
();
singleton
.
loadStopWordDict
();
singleton
.
loadStopWordDict
();
//建立监控线程
for
(
String
location:
cfg
.
getRemoteExtDictionarys
()){
Thread
monitor
=
new
Thread
(
new
Monitor
(
location
));
monitor
.
start
();
}
for
(
String
location:
cfg
.
getRemoteExtStopWordDictionarys
()){
Thread
monitor
=
new
Thread
(
new
Monitor
(
location
));
monitor
.
start
();
}
return
singleton
;
return
singleton
;
}
}
}
}
...
@@ -224,6 +242,8 @@ public class Dictionary {
...
@@ -224,6 +242,8 @@ public class Dictionary {
}
}
//加载扩展词典
//加载扩展词典
this
.
loadExtDict
();
this
.
loadExtDict
();
//加载远程自定义词库
this
.
loadRemoteExtDict
();
}
}
/**
/**
...
@@ -275,6 +295,76 @@ public class Dictionary {
...
@@ -275,6 +295,76 @@ public class Dictionary {
}
}
}
}
/**
 * Loads every configured remote extension dictionary into the main
 * dictionary.
 *
 * For each location returned by the configuration, the word list is fetched
 * with {@code getRemoteWords}; every non-blank word is logged, then trimmed,
 * lower-cased and inserted into {@code _MainDict}.
 */
private void loadRemoteExtDict() {
    for (String location : configuration.getRemoteExtDictionarys()) {
        logger.info("[Dict Loading]" + location);
        List<String> remoteWords = getRemoteWords(location);
        // A null list means the dictionary could not be obtained: report and move on.
        if (remoteWords == null) {
            logger.error("[Dict Loading]" + location + "加载失败");
            continue;
        }
        for (String candidate : remoteWords) {
            // Skip null and blank lines.
            if (candidate == null || "".equals(candidate.trim())) {
                continue;
            }
            // Add the remote word to the in-memory main dictionary.
            logger.info(candidate);
            char[] normalized = candidate.trim().toLowerCase().toCharArray();
            _MainDict.fillSegment(normalized);
        }
    }
}
/**
 * Downloads a custom word list from a remote server with an HTTP GET and
 * returns it as one entry per line of the response body.
 *
 * Timeouts: 10 s to obtain a connection from the pool, 10 s to connect and
 * 60 s socket timeout. The body is decoded with the charset declared in the
 * Content-Type header, falling back to UTF-8 when no charset is declared.
 *
 * The HTTP client, the response and the reader are always released, also
 * when the request or the read fails part-way.
 *
 * @param location URL of the remote word list
 * @return the lines read so far; empty when the status is not 200, when the
 *         response has no body, or when the request failed before any line
 *         was read. Never {@code null}.
 */
private static List<String> getRemoteWords(String location) {
    List<String> buffer = new ArrayList<String>();
    RequestConfig rc = RequestConfig.custom()
            .setConnectionRequestTimeout(10 * 1000)
            .setConnectTimeout(10 * 1000)
            .setSocketTimeout(60 * 1000)
            .build();
    CloseableHttpClient httpclient = HttpClients.createDefault();
    CloseableHttpResponse response = null;
    BufferedReader in = null;
    try {
        HttpGet get = new HttpGet(location);
        get.setConfig(rc);
        response = httpclient.execute(get);
        if (response.getStatusLine().getStatusCode() == 200) {
            org.apache.http.HttpEntity entity = response.getEntity();
            // A 200 response may legitimately carry no entity; treat it as an empty list.
            if (entity != null) {
                String charset = "UTF-8";
                // Use the declared charset when there is one, UTF-8 otherwise.
                org.apache.http.Header contentTypeHeader = entity.getContentType();
                String contentType = contentTypeHeader == null ? null : contentTypeHeader.getValue();
                if (contentType != null) {
                    int pos = contentType.indexOf("charset=");
                    if (pos >= 0) {
                        String declared = contentType.substring(pos + "charset=".length());
                        // Drop any parameter that follows the charset, and optional quotes.
                        int end = declared.indexOf(';');
                        if (end >= 0) {
                            declared = declared.substring(0, end);
                        }
                        declared = declared.replace("\"", "").trim();
                        if (declared.length() > 0) {
                            charset = declared;
                        }
                    }
                }
                in = new BufferedReader(new InputStreamReader(entity.getContent(), charset));
                String line;
                while ((line = in.readLine()) != null) {
                    buffer.add(line);
                }
            }
        }
    } catch (ClientProtocolException e) {
        e.printStackTrace();
    } catch (IllegalStateException e) {
        e.printStackTrace();
    } catch (IOException e) {
        // Also covers an unsupported charset name in the Content-Type header.
        e.printStackTrace();
    } finally {
        // Release in reverse order of acquisition; each close is independent.
        closeRemoteResource(in);
        closeRemoteResource(response);
        closeRemoteResource(httpclient);
    }
    return buffer;
}

/**
 * Closes {@code resource} if it is non-null, reporting but not propagating a close failure.
 */
private static void closeRemoteResource(Closeable resource) {
    if (resource == null) {
        return;
    }
    try {
        resource.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
/**
* 加载用户扩展的停止词词典
* 加载用户扩展的停止词词典
*/
*/
...
@@ -360,7 +450,28 @@ public class Dictionary {
...
@@ -360,7 +450,28 @@ public class Dictionary {
}
}
}
}
}
}
}
}
//加载远程停用词典
List
<
String
>
remoteExtStopWordDictFiles
=
configuration
.
getRemoteExtStopWordDictionarys
();
for
(
String
location:
remoteExtStopWordDictFiles
){
logger
.
info
(
"[Dict Loading]"
+
location
);
List
<
String
>
lists
=
getRemoteWords
(
location
);
//如果找不到扩展的字典,则忽略
if
(
lists
==
null
){
logger
.
error
(
"[Dict Loading]"
+
location
+
"加载失败"
);
continue
;
}
for
(
String
theWord:
lists
){
if
(
theWord
!=
null
&&
!
""
.
equals
(
theWord
.
trim
()))
{
//加载远程词典数据到主内存中
logger
.
info
(
theWord
);
_StopWords
.
fillSegment
(
theWord
.
trim
().
toLowerCase
().
toCharArray
());
}
}
}
}
}
/**
/**
...
@@ -511,6 +622,11 @@ public class Dictionary {
...
@@ -511,6 +622,11 @@ public class Dictionary {
}
}
}
}
}
}
/**
 * Logs that a reload is starting, then reloads the main dictionary followed
 * by the stop-word dictionary.
 */
public void reLoadMainDict() {
    logger.info("重新加载词典...");
    this.loadMainDict();
    this.loadStopWordDict();
}
}
}
src/main/java/org/wltea/analyzer/dic/Monitor.java
0 → 100644
浏览文件 @
a2070598
package
org.wltea.analyzer.dic
;
import
java.io.IOException
;
import
org.apache.http.Header
;
import
org.apache.http.client.ClientProtocolException
;
import
org.apache.http.client.config.RequestConfig
;
import
org.apache.http.client.methods.CloseableHttpResponse
;
import
org.apache.http.client.methods.HttpHead
;
import
org.apache.http.impl.client.CloseableHttpClient
;
import
org.apache.http.impl.client.HttpClients
;
import
org.wltea.analyzer.help.Sleep
;
import
org.wltea.analyzer.help.Sleep.Type
;
/**
 * Background task that polls one remote dictionary URL and triggers a
 * dictionary reload when the remote resource has changed.
 *
 * Change detection relies on the HTTP validators {@code Last-Modified} and
 * {@code ETag}: the values seen at the last reload are sent back as
 * {@code If-Modified-Since} / {@code If-None-Match}, and a 200 response whose
 * validators differ from the remembered ones triggers a reload.
 */
public class Monitor implements Runnable {

    private static CloseableHttpClient httpclient = HttpClients.createDefault();

    /*
     * Last-Modified value recorded at the last reload, or null if none yet.
     */
    private String last_modified;

    /*
     * ETag value recorded at the last reload, or null if none yet.
     */
    private String eTags;

    /*
     * URL that is polled.
     */
    private String location;

    /**
     * @param location URL of the remote dictionary to watch
     */
    public Monitor(String location) {
        this.location = location;
        this.last_modified = null;
        this.eTags = null;
    }

    /**
     * Polling loop:
     * 1. send a HEAD request to the dictionary server;
     * 2. read Last-Modified and ETag from the response and compare them with
     *    the remembered values;
     * 3. if nothing changed (or the server answered 304), do nothing;
     * 4. if something changed, reload the dictionary and remember the new values;
     * 5. sleep 60 seconds and start again.
     *
     * A server that sends neither validator never triggers a reload. The loop
     * ends when the thread's interrupt flag is observed.
     */
    public void run() {
        // Timeouts: 10 s pool lease, 10 s connect, 15 s socket.
        RequestConfig rc = RequestConfig.custom()
                .setConnectionRequestTimeout(10 * 1000)
                .setConnectTimeout(10 * 1000)
                .setSocketTimeout(15 * 1000)
                .build();

        while (!Thread.currentThread().isInterrupted()) {
            HttpHead head = new HttpHead(location);
            head.setConfig(rc);

            // Conditional request headers, only once a validator is known.
            if (last_modified != null) {
                head.setHeader("If-Modified-Since", last_modified);
            }
            if (eTags != null) {
                head.setHeader("If-None-Match", eTags);
            }

            CloseableHttpResponse response = null;
            try {
                response = httpclient.execute(head);
                int status = response.getStatusLine().getStatusCode();

                // 304 Not Modified means the dictionary is unchanged; only 200 is inspected.
                if (status == 200) {
                    // Either header may be absent, so never dereference without a check.
                    // The standard header name is "ETag" (singular).
                    Header lastModifiedHeader = response.getLastHeader("Last-Modified");
                    Header eTagHeader = response.getLastHeader("ETag");
                    String newLastModified = lastModifiedHeader == null ? null : lastModifiedHeader.getValue();
                    String newETag = eTagHeader == null ? null : eTagHeader.getValue();

                    boolean changed =
                            (newLastModified != null && !newLastModified.equalsIgnoreCase(last_modified))
                            || (newETag != null && !newETag.equalsIgnoreCase(eTags));

                    if (changed) {
                        // Remote dictionary was updated: reload, then remember the new validators.
                        Dictionary.getSingleton().reLoadMainDict();
                        last_modified = newLastModified;
                        eTags = newETag;
                    }
                }
            } catch (ClientProtocolException e) {
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // response is still null when execute() itself failed.
                if (response != null) {
                    try {
                        response.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                Sleep.sleep(Type.SEC, 60);
            }
        }
    }
}
src/main/java/org/wltea/analyzer/help/Sleep.java
0 → 100644
浏览文件 @
a2070598
package
org.wltea.analyzer.help
;
/**
 * Helper for pausing the current thread for a duration given as a count of
 * milliseconds, seconds, minutes or hours.
 */
public class Sleep {

    /** Time units accepted by {@link #sleep(Type, int)}. */
    public enum Type {
        MSEC, SEC, MIN, HOUR
    }

    /**
     * Pauses the calling thread for {@code num} units of {@code type}.
     *
     * If the thread is interrupted while sleeping (or was already interrupted
     * on entry), the method returns early and leaves the thread's interrupt
     * flag set so that callers can observe the interruption.
     *
     * @param type unit of {@code num}
     * @param num  number of units to sleep; the conversion to milliseconds is
     *             done in {@code long} arithmetic so large values do not overflow
     */
    public static void sleep(Type type, int num) {
        try {
            switch (type) {
                case MSEC:
                    Thread.sleep(num);
                    return;
                case SEC:
                    Thread.sleep(num * 1000L);
                    return;
                case MIN:
                    Thread.sleep(num * 60L * 1000L);
                    return;
                case HOUR:
                    Thread.sleep(num * 60L * 60L * 1000L);
                    return;
                default:
                    System.err.println("输入类型错误,应为MSEC,SEC,MIN,HOUR之一");
                    return;
            }
        } catch (InterruptedException e) {
            // Catching InterruptedException clears the flag; restore it so the
            // interruption is not silently lost.
            Thread.currentThread().interrupt();
        }
    }
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录