From 6c8b826debfc07b485cfdeb8e5310ac1e78c1fc8 Mon Sep 17 00:00:00 2001 From: nicky <362330721@qq.com> Date: Wed, 18 Apr 2018 16:23:36 +0800 Subject: [PATCH] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E5=85=A8=E5=B1=80=E6=90=9C?= =?UTF-8?q?=E7=B4=A2=E5=BC=95=E6=93=8E=E5=B7=A5=E7=A8=8B=EF=BC=8C=E9=87=87?= =?UTF-8?q?=E7=94=A8Apache=20Lucene=E6=90=AD=E5=BB=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/project-search/ReadMe.txt | 41 ++++++ src/project-search/pom.xml | 95 ++++++++++++ .../java/com/test/lucene/LuceneConstant.java | 9 ++ .../java/com/test/lucene/LuceneIndexer.java | 138 ++++++++++++++++++ .../test/java/com/test/lucene/LuceneTest.java | 5 + .../java/com/test/lucene/SearchBuilder.java | 5 + 6 files changed, 293 insertions(+) create mode 100644 src/project-search/ReadMe.txt create mode 100644 src/project-search/pom.xml create mode 100644 src/project-search/src/test/java/com/test/lucene/LuceneConstant.java create mode 100644 src/project-search/src/test/java/com/test/lucene/LuceneIndexer.java create mode 100644 src/project-search/src/test/java/com/test/lucene/LuceneTest.java create mode 100644 src/project-search/src/test/java/com/test/lucene/SearchBuilder.java diff --git a/src/project-search/ReadMe.txt b/src/project-search/ReadMe.txt new file mode 100644 index 0000000..4e1528f --- /dev/null +++ b/src/project-search/ReadMe.txt @@ -0,0 +1,41 @@ +全局搜索引擎 + + +### 数据库索引和Lucene检索对比 + +|比较项 |Lucene检索| 数据库检索| +|:--------|:--------|:--------| +|数据检索 | 从Lucene的索引文件中检出 | 由数据库索引检索记录| +|索引结构 | Document(文档)| Record(记录)| +|全文检索 | 支持 | 不支持| +|模糊查询 | 支持 | 不支持| +|结果排序 | 支持排序 | 不能排序| + +Lucene搜索的API类主要有4个 IndexSearcher,Query,QueryParser,Hits + +### Lucene搜索过程 +Lucene的索引结构是文档(Document)形式的,下面简单介绍一下Lucene搜索的过程 +1. 将文档传给分词组件(Tokenizer),分词组件根据标点符号和停词将文档分成词元(Token),并将标点符号和停词去掉。 + +停词是指没有特别意思的词,例如英语中的a、the等单词 + +文章1内容:Tom favorite fruit is apple. + +经过分词处理后,变成[Tom][favorite][fruit][apple] + + +2. 
再将词元传给语言处理组件(Linguistic Processor) + +英语的单词经过语言处理组件处理后,字母变为小写,词元会变成最基本的词根形式,比如likes变成like + +经过分词处理后,变成[tom][favorite][fruit][apple] + +3. 然后得到的词元传给索引组件(Indexer),索引组件处理得到索引结构,得到关键字、出现频率、出现位置分别作为词典文件(Term Dictionary)、频率文件(frequencies)和位置文件(positions)保存起来,然后通过二元搜索算法快速查找关键字 + +|关键字 |文章号[出现频率]| 出现位置| +|:--------|:--------|:--------| +|tom | 1[1] | 1 | +|favorite| 1[2] | 2 | +|fruit| 1[3] | 3 | +[apple| 1[4] | 4 | + diff --git a/src/project-search/pom.xml b/src/project-search/pom.xml new file mode 100644 index 0000000..baeafb3 --- /dev/null +++ b/src/project-search/pom.xml @@ -0,0 +1,95 @@ + + + + + taoshop + org.muses + 1.0-SNAPSHOT + + 4.0.0 + + project-search + jar + + project-search Maven Webapp + + http://www.example.com + + + UTF-8 + 1.7 + 1.7 + + + + + junit + junit + 4.11 + test + + + org.apache.lucene + lucene-core + 5.3.1 + + + org.apache.lucene + lucene-analyzers-common + 5.3.1 + + + org.apache.lucene + lucene-queryparser + 5.3.1 + + + org.apache.lucene + lucene-analyzers-smartcn + 5.3.1 + + + org.apache.lucene + lucene-highlighter + 5.3.1 + + + + +
diff --git a/src/project-search/src/test/java/com/test/lucene/LuceneConstant.java b/src/project-search/src/test/java/com/test/lucene/LuceneConstant.java
new file mode 100644
index 0000000..3eb22db
--- /dev/null
+++ b/src/project-search/src/test/java/com/test/lucene/LuceneConstant.java
@@ -0,0 +1,21 @@
+package com.test.lucene;
+
+/**
+ * Default filesystem locations for the Lucene demo in this package.
+ *
+ * <p>The values mirror the defaults hard-coded in {@code LuceneIndexer}. The constants are
+ * public so that other classes can actually reference them; the class itself is a pure
+ * constant holder and cannot be instantiated.
+ */
+public final class LuceneConstant {
+
+    /** Directory the Lucene index is written to. */
+    public static final String INDEX_DIR = "D:\\lucene";
+
+    /** Directory containing the files to be indexed. */
+    public static final String INDEX_DATA_DIR = "D:\\lucene\\data";
+
+    private LuceneConstant() {
+        // Constant holder: no instances.
+    }
+}
diff --git a/src/project-search/src/test/java/com/test/lucene/LuceneIndexer.java b/src/project-search/src/test/java/com/test/lucene/LuceneIndexer.java
new file mode 100644
index 0000000..a06a6a4
--- /dev/null
+++ b/src/project-search/src/test/java/com/test/lucene/LuceneIndexer.java
@@ -0,0 +1,138 @@
+package com.test.lucene;
+
+
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import 
java.nio.file.Paths; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.springframework.stereotype.Component; + + +/** + *
+ * Service class that creates the Lucene index used by the global search feature.
+ *
+ * Change log:
+ *    2018-04-18  initial version
+ *
+ * @author nicky
+ * @version 1.00.00
+ */
+@Component
+public class LuceneIndexer {
+
+    /** Default index directory used by {@link #main(String[])}. */
+    private static final String INDEX_DIR = "D:\\lucene";
+
+    /** Default directory of files to index, used by {@link #main(String[])}. */
+    private static final String DATA_DIR = "D:\\lucene\\data";
+
+    /**
+     * Holder for the shared instance; the instance is created when this nested class is
+     * first initialized, i.e. on the first {@link #getInstance()} call.
+     */
+    private static class SingletonHolder {
+        private static final LuceneIndexer instance = new LuceneIndexer();
+    }
+
+    /**
+     * Returns the shared instance kept in {@link SingletonHolder}.
+     *
+     * @return the singleton indexer
+     */
+    public static LuceneIndexer getInstance() {
+        return SingletonHolder.instance;
+    }
+
+    /**
+     * Indexes every regular file located directly inside {@code dataDir} into the Lucene index
+     * at {@code indexDir}. Each file becomes one document holding its tokenized contents plus
+     * the stored file name and canonical path. Progress and timing are printed to standard out.
+     *
+     * @param indexDir directory the index is written to
+     * @param dataDir directory whose files are indexed; sub-directories are skipped
+     * @return {@code true} once the index has been committed; {@code false} if {@code dataDir}
+     *         cannot be listed (it does not exist or is not a directory)
+     * @throws IOException if a file cannot be read or the index cannot be written
+     */
+    public static boolean createIndex(String indexDir, String dataDir) throws IOException {
+        long startTime = System.currentTimeMillis(); // start of indexing, for the timing report
+
+        File[] files = new File(dataDir).listFiles();
+        if (files == null) {
+            // listFiles() returns null when dataDir is missing or is not a directory.
+            return false;
+        }
+
+        // try-with-resources closes the writer, the directory and the analyzer (in that order)
+        // even when indexing fails, so no handle stays open after an error.
+        try (Analyzer analyzer = new SmartChineseAnalyzer();
+                Directory directory = FSDirectory.open(Paths.get(indexDir));
+                IndexWriter indexWriter =
+                        new IndexWriter(directory, new IndexWriterConfig(analyzer))) {
+            for (File file : files) {
+                // A directory cannot be opened by FileReader, so only regular files are indexed.
+                if (file.isFile()) {
+                    addDocument(indexWriter, file);
+                }
+            }
+
+            System.out.println("共索引了"+indexWriter.numDocs()+"个文件");
+            indexWriter.commit();
+        }
+
+        System.out.println("创建索引所用时间:"+(System.currentTimeMillis()-startTime));
+        return true;
+    }
+
+    /**
+     * Adds {@code file} to the index as one document with three fields: {@code contents}
+     * (read from the file, not stored), {@code fileName} (stored) and {@code fullPath} (stored).
+     *
+     * @param indexWriter open writer that receives the document
+     * @param file regular file to index
+     * @throws IOException if the file cannot be read or the document cannot be added
+     */
+    private static void addDocument(IndexWriter indexWriter, File file) throws IOException {
+        // The reader is closed here so the file handle is released even if adding fails.
+        // NOTE(review): FileReader decodes with the platform default charset - confirm the
+        // encoding of the data files before pinning an explicit charset.
+        try (FileReader contents = new FileReader(file)) {
+            Document doc = new Document();
+            doc.add(new TextField("contents", contents)); // file contents
+            doc.add(new TextField("fileName", file.getName(), Field.Store.YES)); // file name, stored in the index
+            doc.add(new TextField("fullPath", file.getCanonicalPath(), Field.Store.YES)); // file path
+            indexWriter.addDocument(doc);
+        }
+    }
+
+    /**
+     * Builds the index for the default directories and prints whether it succeeded.
+     *
+     * @param args unused
+     */
+    public static void main(String[] args) {
+        try {
+            // createIndex is static, so it is called on the class rather than on an instance.
+            if (LuceneIndexer.createIndex(INDEX_DIR, DATA_DIR)) {
+                System.out.println("创建成功!");
+            } else {
+                System.out.println("创建失败!");
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+}
diff --git a/src/project-search/src/test/java/com/test/lucene/LuceneTest.java b/src/project-search/src/test/java/com/test/lucene/LuceneTest.java
new file mode 100644
index 0000000..2b17cd5
--- /dev/null
+++ b/src/project-search/src/test/java/com/test/lucene/LuceneTest.java
@@ -0,0 +1,5 @@
+package com.test.lucene;
+
+public class LuceneTest {
+
+}
diff --git a/src/project-search/src/test/java/com/test/lucene/SearchBuilder.java b/src/project-search/src/test/java/com/test/lucene/SearchBuilder.java
new file mode 100644
index 0000000..7aff7d2
--- /dev/null
+++ b/src/project-search/src/test/java/com/test/lucene/SearchBuilder.java
@@ -0,0 +1,5 @@
+package com.test.lucene;
+
+public class SearchBuilder {
+
+}
-- 
GitLab