提交 6c8b826d 编写于 作者: N nicky

提交全局搜索引擎工程,采用Apache Lucene搭建

上级 2cd8b481
全局搜索引擎
### 数据库索引和Lucene检索对比
|比较项 |Lucene检索| 数据库检索|
|:--------|:--------|:--------|
|数据检索 | 从Lucene的索引文件中检出 | 由数据库索引检索记录|
|索引结构 | Document(文档)| Record(记录)|
|全文检索 | 支持 | 不支持|
|模糊查询 | 支持 | 不支持|
|结果排序 | 支持排序 | 不能排序|
Lucene搜索的API类主要有4个:IndexSearcher、Query、QueryParser、TopDocs(旧版本中为Hits)
### Lucene搜索过程
Lucene的索引结构是文档(Document)形式的,下面简单介绍一下Lucene搜索的过程
1. 将文档传给分词组件(Tokenizer),分词组件根据标点符号和停词将文档分成词元(Token),并将标点符号和停词去掉。
停词(Stop Word)是指没有特别含义的词,例如英语中的a、the、is等单词
文章1内容:Tom favorite fruit is apple.
经过分词处理后,变成[Tom][favorite][fruit][apple]
2. 再将词元传给语言处理组件(Linguistic Processor)
英语的单词经过语言处理组件处理后,字母变为小写,词元会变成最基本的词根形式,比如likes变成like
经过语言处理后,变成[tom][favorite][fruit][apple]
3. 然后得到的词元传给索引组件(Indexer),索引组件处理得到索引结构,得到关键字、出现频率、出现位置分别作为词典文件(Term Dictionary)、频率文件(frequencies)和位置文件(positions)保存起来,然后通过二元搜索算法快速查找关键字
|关键字 |文章号[出现频率]| 出现位置|
|:--------|:--------|:--------|
|tom | 1[1] | 1 |
|favorite| 1[1] | 2 |
|fruit| 1[1] | 3 |
|apple| 1[1] | 4 |
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Maven module "project-search": declares the Apache Lucene libraries used by the search code.
     Inherits from the org.muses:taoshop parent POM. -->
<parent>
<artifactId>taoshop</artifactId>
<groupId>org.muses</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>project-search</artifactId>
<packaging>jar</packaging>
<!-- NOTE(review): the name says "Maven Webapp" while the packaging above is jar; confirm which is intended. -->
<name>project-search Maven Webapp</name>
<!-- FIXME change it to the project's website -->
<url>http://www.example.com</url>
<!-- Source file encoding and the Java source/target level handed to the compiler. -->
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<maven.compiler.source>1.7</maven.compiler.source>
<maven.compiler.target>1.7</maven.compiler.target>
</properties>
<!-- NOTE(review): LuceneIndexer imports org.springframework.stereotype.Component, but no Spring
     dependency is declared in this file; presumably it is provided by the parent POM. Confirm. -->
<dependencies>
<!-- Unit testing; test scope only. -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
<scope>test</scope>
</dependency>
<!-- Apache Lucene modules. All five are pinned to 5.3.1 and should be upgraded together. -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>5.3.1</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>5.3.1</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>5.3.1</version>
</dependency>
<!-- Provides org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer, which LuceneIndexer uses. -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-smartcn</artifactId>
<version>5.3.1</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>5.3.1</version>
</dependency>
</dependencies>
<!-- The build/pluginManagement section below is commented out, so this module pins no plugin versions itself. -->
<!-- <build>
<finalName>project-search</finalName>
<pluginManagement>&lt;!&ndash; lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) &ndash;&gt;
<plugins>
<plugin>
<artifactId>maven-clean-plugin</artifactId>
<version>3.0.0</version>
</plugin>
&lt;!&ndash; see http://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_war_packaging &ndash;&gt;
<plugin>
<artifactId>maven-resources-plugin</artifactId>
<version>3.0.2</version>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.7.0</version>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.20.1</version>
</plugin>
<plugin>
<artifactId>maven-war-plugin</artifactId>
<version>3.2.0</version>
</plugin>
<plugin>
<artifactId>maven-install-plugin</artifactId>
<version>2.5.2</version>
</plugin>
<plugin>
<artifactId>maven-deploy-plugin</artifactId>
<version>2.8.2</version>
</plugin>
</plugins>
</pluginManagement>
</build>-->
</project>
package com.test.lucene;
/**
 * Filesystem locations shared by the Lucene indexing code.
 *
 * <p>The constants are {@code public} so that other classes can reference them instead of
 * repeating the literal paths; as {@code private} members they were unreachable from outside.
 */
public class LuceneConstant {

    /** Directory in which the Lucene index files are written. */
    public static final String INDEX_DIR = "D:\\lucene";

    /** Directory containing the source files that are fed to the indexer. */
    public static final String INDEX_DATA_DIR = "D:\\lucene\\data";
}
package com.test.lucene;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.springframework.stereotype.Component;
/**
 * Builds a Lucene full-text index from the files found in a data directory.
 *
 * <p>Every regular file directly inside the data directory becomes one document with three
 * fields: {@code contents} (the file text, read through a {@link FileReader}), {@code fileName}
 * and {@code fullPath} (both stored in the index). Text is analyzed with
 * {@link SmartChineseAnalyzer}.
 *
 * <p>NOTE(review): the class is annotated with {@code @Component} and also exposes its own
 * {@link #getInstance()} singleton, so a Spring context would create a second instance next to
 * the holder's one. All indexing logic is static, so this is harmless today; confirm which
 * mechanism is intended.
 *
 * <pre>
 * Change log
 * Revised version:   Revised by:   Date: 2018-04-18   Description:
 * </pre>
 *
 * @author nicky
 * @version 1.00.00
 */
@Component
public class LuceneIndexer {

    /** Default index location used by {@link #main(String[])}. */
    private static final String INDEX_DIR = "D:\\lucene";

    /** Default data location used by {@link #main(String[])}. */
    private static final String DATA_DIR = "D:\\lucene\\data";

    /** Holder class: the singleton is created when this nested class is first initialized. */
    private static class SingletonHolder {
        private static final LuceneIndexer instance = new LuceneIndexer();
    }

    /**
     * Returns the shared {@code LuceneIndexer} instance.
     *
     * @return the singleton held by {@link SingletonHolder}
     */
    public static LuceneIndexer getInstance() {
        return SingletonHolder.instance;
    }

    /**
     * Indexes every regular file directly inside {@code dataDir} into the index at
     * {@code indexDir}, then prints the document count and the elapsed time.
     *
     * <p>The analyzer, directory and writer are released even when indexing fails, so the index
     * write lock is not left behind. Sub-directories and other non-regular entries are skipped
     * instead of aborting the run.
     *
     * <p>NOTE(review): the writer uses the default {@link IndexWriterConfig} open mode, so running
     * this twice on the same index presumably appends the same files again, and the printed
     * {@code numDocs()} then includes documents from earlier runs. Confirm whether the index
     * should be recreated on each run.
     *
     * <p>NOTE(review): closing the writer is expected to commit pending documents by default, so
     * a failure half-way through may leave a partially built index. Confirm whether a rollback is
     * wanted instead.
     *
     * @param indexDir directory in which the index files are stored
     * @param dataDir directory whose files are indexed
     * @return {@code true} when indexing completed; failures are reported by exception
     * @throws IOException if {@code dataDir} cannot be listed, a file cannot be read, or the
     *     index cannot be written
     */
    public static boolean createIndex(String indexDir, String dataDir) throws IOException {
        long startTime = System.currentTimeMillis(); // start of the timed section

        // listFiles() returns null when the path is not a directory or cannot be read.
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            throw new IOException("Data directory does not exist or cannot be listed: " + dataDir);
        }

        // Resources are closed in reverse order: writer first, then directory, then analyzer.
        try (Analyzer analyzer = new SmartChineseAnalyzer();
                Directory directory = FSDirectory.open(Paths.get(indexDir));
                IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer))) {
            for (File file : files) {
                if (!file.isFile()) {
                    // FileReader cannot open a directory; skip it rather than fail the whole run.
                    continue;
                }
                addDocument(indexWriter, file);
            }
            System.out.println("共索引了" + indexWriter.numDocs() + "个文件");
            indexWriter.commit();
        }

        System.out.println("创建索引所用时间:" + (System.currentTimeMillis() - startTime));
        return true;
    }

    /**
     * Adds one file to the index as a document with {@code contents}, {@code fileName} and
     * {@code fullPath} fields.
     *
     * @param writer open writer that receives the document
     * @param file regular file to index
     * @throws IOException if the file cannot be opened or the document cannot be written
     */
    private static void addDocument(IndexWriter writer, File file) throws IOException {
        // The reader is consumed while addDocument() runs, so it has to stay open until that call
        // returns; try-with-resources guarantees it is released on every path afterwards.
        try (FileReader reader = new FileReader(file)) {
            Document doc = new Document();
            doc.add(new TextField("contents", reader)); // file content
            doc.add(new TextField("fileName", file.getName(), Field.Store.YES)); // file name, stored in the index
            doc.add(new TextField("fullPath", file.getCanonicalPath(), Field.Store.YES)); // file path, stored in the index
            writer.addDocument(doc);
        }
    }

    /**
     * Builds the index for the default {@code INDEX_DIR} / {@code DATA_DIR} locations and prints
     * the outcome.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try {
            boolean created = createIndex(INDEX_DIR, DATA_DIR);
            if (created) {
                System.out.println("创建成功!");
            } else {
                System.out.println("创建失败!");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
package com.test.lucene;
/**
 * Empty placeholder class; it declares no members yet.
 *
 * <p>NOTE(review): presumably intended to hold tests for the Lucene indexing code. Confirm, and
 * consider moving it under the test source tree.
 */
public class LuceneTest {
}
package com.test.lucene;
/**
 * Empty placeholder class; it declares no members yet.
 *
 * <p>NOTE(review): presumably intended to build search queries against the Lucene index. Confirm
 * the intended responsibility before adding members.
 */
public class SearchBuilder {
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册