From 6c8b826debfc07b485cfdeb8e5310ac1e78c1fc8 Mon Sep 17 00:00:00 2001
From: nicky <362330721@qq.com>
Date: Wed, 18 Apr 2018 16:23:36 +0800
Subject: [PATCH] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E5=85=A8=E5=B1=80=E6=90=9C?=
=?UTF-8?q?=E7=B4=A2=E5=BC=95=E6=93=8E=E5=B7=A5=E7=A8=8B=EF=BC=8C=E9=87=87?=
=?UTF-8?q?=E7=94=A8Apache=20Lucene=E6=90=AD=E5=BB=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
src/project-search/ReadMe.txt | 41 ++++++
src/project-search/pom.xml | 95 ++++++++++++
.../java/com/test/lucene/LuceneConstant.java | 9 ++
.../java/com/test/lucene/LuceneIndexer.java | 138 ++++++++++++++++++
.../test/java/com/test/lucene/LuceneTest.java | 5 +
.../java/com/test/lucene/SearchBuilder.java | 5 +
6 files changed, 293 insertions(+)
create mode 100644 src/project-search/ReadMe.txt
create mode 100644 src/project-search/pom.xml
create mode 100644 src/project-search/src/test/java/com/test/lucene/LuceneConstant.java
create mode 100644 src/project-search/src/test/java/com/test/lucene/LuceneIndexer.java
create mode 100644 src/project-search/src/test/java/com/test/lucene/LuceneTest.java
create mode 100644 src/project-search/src/test/java/com/test/lucene/SearchBuilder.java
diff --git a/src/project-search/ReadMe.txt b/src/project-search/ReadMe.txt
new file mode 100644
index 0000000..4e1528f
--- /dev/null
+++ b/src/project-search/ReadMe.txt
@@ -0,0 +1,41 @@
+全局搜索引擎
+
+
+### 数据库索引和Lucene检索对比
+
+|比较项 |Lucene检索| 数据库检索|
+|:--------|:--------|:--------|
+|数据检索 | 从Lucene的索引文件中检出 | 由数据库索引检索记录|
+|索引结构 | Document(文档)| Record(记录)|
+|全文检索 | 支持 | 不支持|
+|模糊查询 | 支持(基于相似度) | 仅支持LIKE通配符匹配|
+|结果排序 | 支持按相关度排序 | 不能按相关度排序|
+
+Lucene搜索的API类主要有4个:IndexSearcher、Query、QueryParser、TopDocs(旧版本中为Hits)
+
+### Lucene搜索过程
+Lucene的索引结构是文档(Document)形式的,下面简单介绍一下Lucene搜索的过程
+1. 将文档传给分词组件(Tokenizer),分词组件根据标点符号和停词将文档分成词元(Token),并将标点符号和停词去掉。
+
+停词(Stop word)是指没有特别意义的词,例如英语中的a、the、is等单词
+
+文章1内容:Tom favorite fruit is apple.
+
+经过分词处理后,变成[Tom][favorite][fruit][apple]
+
+
+2. 再将词元传给语言处理组件(Linguistic Processor)
+
+英语的单词经过语言处理组件处理后,字母变为小写,词元会变成最基本的词根形式,比如likes变成like
+
+经过语言处理后,变成[tom][favorite][fruit][apple]
+
+3. 然后将得到的词元传给索引组件(Indexer),索引组件处理后得到索引结构:关键字、出现频率、出现位置分别作为词典文件(Term Dictionary)、频率文件(frequencies)和位置文件(positions)保存起来,检索时通过二分搜索算法快速查找关键字
+
+|关键字 |文章号[出现频率]| 出现位置|
+|:--------|:--------|:--------|
+|tom | 1[1] | 1 |
+|favorite| 1[1] | 2 |
+|fruit| 1[1] | 3 |
+|apple| 1[1] | 4 |
+
diff --git a/src/project-search/pom.xml b/src/project-search/pom.xml
new file mode 100644
index 0000000..baeafb3
--- /dev/null
+++ b/src/project-search/pom.xml
@@ -0,0 +1,95 @@
+
+
+
+/**
+ * Service class that builds the Lucene index used by the global search feature.
+ *
+ * <p>{@link #createIndex(String, String)} adds one document per regular file found directly
+ * inside a data directory. Each document carries three text fields: {@code contents} (read
+ * from the file), {@code fileName} and {@code fullPath} (both stored in the index via
+ * {@code Field.Store.YES}).
+ *
+ * <p>NOTE(review): the class is annotated with {@code @Component} and also exposes a static
+ * singleton through {@link #getInstance()}; confirm which of the two the project relies on.
+ *
+ * @author nicky
+ * @version 1.00.00
+ *
+ * <pre>
+ * Change log
+ *    Version after change:     Changed by:  Date: 2018-04-18     Change:
+ * </pre>
+ */
+@Component
+public class LuceneIndexer {
+
+    /** Index directory used by {@link #main(String[])}. */
+    private final static String INDEX_DIR = "D:\\lucene";
+
+    /** Directory whose files are indexed by {@link #main(String[])}. */
+    private final static String DATA_DIR = "D:\\lucene\\data";
+
+    /**
+     * Writer used by the instance-level helpers. Nothing in this class assigns it, so
+     * {@link #addDocument(File)} fails fast instead of throwing a bare NullPointerException.
+     */
+    private IndexWriter indexWriter;
+
+    /** Initialization-on-demand holder: the instance is created when the holder is first used. */
+    private static class SingletonHolder{
+        private final static LuceneIndexer instance=new LuceneIndexer();
+    }
+
+    /**
+     * Returns the shared instance held by {@link SingletonHolder}.
+     *
+     * @return the singleton instance
+     */
+    public static LuceneIndexer getInstance(){
+        return SingletonHolder.instance;
+    }
+
+    /**
+     * Indexes every regular file found directly inside {@code dataDir} into the index located
+     * at {@code indexDir}, then prints the document count and the elapsed time.
+     *
+     * @param indexDir directory that holds the Lucene index
+     * @param dataDir directory whose files are indexed; sub-directories are skipped
+     * @return {@code true} when indexing completed; failures are reported by exception
+     * @throws IOException if {@code dataDir} cannot be listed, or if a file or the index cannot
+     *     be read or written
+     */
+    public static boolean createIndex(String indexDir , String dataDir) throws IOException{
+        long startTime = System.currentTimeMillis(); // start of the indexing run
+
+        // listFiles() returns null when dataDir does not exist or is not a directory.
+        File[] files = new File(dataDir).listFiles();
+        if (files == null) {
+            throw new IOException("Cannot list data directory: " + dataDir);
+        }
+
+        // try-with-resources closes the writer, the directory and the analyzer even when
+        // indexing one of the files fails.
+        try (Analyzer analyzer = new SmartChineseAnalyzer();
+                Directory directory = FSDirectory.open(Paths.get(indexDir));
+                IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyzer))) {
+            for (File file : files) {
+                // A FileReader cannot be opened on a directory, so only regular files are indexed.
+                if (file.isFile()) {
+                    indexFile(writer, file);
+                }
+            }
+
+            System.out.println("共索引了"+writer.numDocs()+"个文件");
+            writer.commit();
+        }
+
+        System.out.println("创建索引所用时间:"+(System.currentTimeMillis()-startTime));
+
+        return true;
+    }
+
+    /**
+     * Adds one document describing {@code file} to {@code writer}.
+     *
+     * @param writer writer that receives the document
+     * @param file file to index
+     * @throws IOException if the file cannot be read or the document cannot be written
+     */
+    private static void indexFile(IndexWriter writer, File file) throws IOException{
+        // NOTE(review): FileReader decodes with the platform default charset on older JDKs;
+        // confirm the data files are stored in that encoding.
+        try (FileReader reader = new FileReader(file)) {
+            Document doc = new Document();
+            doc.add(new TextField("contents", reader)); // file contents
+            doc.add(new TextField("fileName", file.getName(), Field.Store.YES)); // file name, stored
+            doc.add(new TextField("fullPath", file.getCanonicalPath(), Field.Store.YES)); // file path, stored
+
+            // The reader is handed to the writer here, so it is closed right after this call.
+            writer.addDocument(doc);
+        }
+    }
+
+    /**
+     * Adds one document describing {@code file} through the instance-level writer.
+     *
+     * @param file file to index
+     * @throws IOException if the file cannot be read or the document cannot be written
+     * @throws IllegalStateException if the instance-level writer has not been set
+     */
+    private void addDocument(File file) throws IOException{
+        if (indexWriter == null) {
+            throw new IllegalStateException("indexWriter has not been initialized");
+        }
+        indexFile(indexWriter, file);
+    }
+
+    /**
+     * Closes the instance-level writer if one is set.
+     *
+     * @throws IOException if closing the writer fails
+     */
+    private void closeWriter() throws IOException{
+        if (indexWriter != null) {
+            indexWriter.close();
+        }
+    }
+
+    /**
+     * Builds the index from {@code DATA_DIR} into {@code INDEX_DIR} and prints the outcome.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        try {
+            // createIndex is static, so it is called on the class rather than on an instance.
+            boolean r = LuceneIndexer.createIndex(INDEX_DIR,DATA_DIR);
+            if(r){
+                System.out.println("创建成功!");
+            }else{
+                System.out.println("创建失败!");
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+
+    }
+
+}
diff --git a/src/project-search/src/test/java/com/test/lucene/LuceneTest.java b/src/project-search/src/test/java/com/test/lucene/LuceneTest.java
new file mode 100644
index 0000000..2b17cd5
--- /dev/null
+++ b/src/project-search/src/test/java/com/test/lucene/LuceneTest.java
@@ -0,0 +1,5 @@
+package com.test.lucene;
+
+public class LuceneTest {
+
+}
diff --git a/src/project-search/src/test/java/com/test/lucene/SearchBuilder.java b/src/project-search/src/test/java/com/test/lucene/SearchBuilder.java
new file mode 100644
index 0000000..7aff7d2
--- /dev/null
+++ b/src/project-search/src/test/java/com/test/lucene/SearchBuilder.java
@@ -0,0 +1,5 @@
+package com.test.lucene;
+
+public class SearchBuilder {
+
+}
--
GitLab