1.Maven Dependency
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Project coordinates -->
    <groupId>Lucene3.6</groupId>
    <artifactId>Lucene3.6</artifactId>
    <version>1</version>

    <!-- Versions referenced by the dependency declarations below -->
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <lucene-version>3.6.2</lucene-version>
        <junit-version>4.11</junit-version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>${lucene-version}</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>${junit-version}</version>
        </dependency>
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.4</version>
        </dependency>
    </dependencies>

    <build>
        <!-- Sources are read from "src" instead of the Maven default location -->
        <sourceDirectory>src</sourceDirectory>
        <plugins>
            <plugin>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.0</version>
                <configuration>
                    <!-- Java 7 language level for both source and bytecode -->
                    <source>1.7</source>
                    <target>1.7</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
2.全文搜索
分为三部分:
索引
1、创建Directory
2、创建IndexWriter
3、创建Document对象
4、为Document对象添加Field
5、通过IndexWriter添加文档到索引中
分词
搜索
1、创建Directory
2、创建IndexReader
3、根据IndexReader创建IndexSearcher
4、创建搜索的Query
5、根据searcher搜索并且返回TopDocs
6、根据TopDocs获取ScoreDocs对象
7、根据searcher和ScoreDocs对象获取具体的Document对象
8、根据Document对象获取需要的值
3.src
HelloLucene.java
package org.fool.lucene;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.Version;
/**
 * Minimal Lucene 3.6 example: {@link #index()} writes one document per file
 * found in {@code C:/lucene} into an on-disk index at {@code C:/index}, and
 * {@link #search()} queries that index and prints the matching files.
 * <p>
 * Both methods report failures by printing the stack trace; they do not
 * propagate checked exceptions to the caller.
 */
public class HelloLucene
{
    /** Location of the on-disk Lucene index. */
    private static final String INDEX_DIR = "C:/index";

    /** Directory whose files are added to the index. */
    private static final String SOURCE_DIR = "C:/lucene";

    /**
     * Builds the index.
     * <p>
     * Every regular file directly inside the source directory becomes one
     * document. Sub-directories are skipped, and a missing or unreadable
     * source directory is reported on {@code System.err} instead of failing
     * with a {@code NullPointerException}.
     */
    public void index()
    {
        Directory directory = null;
        IndexWriter writer = null;
        try
        {
            // 1. Create the Directory (stored on disk)
            directory = FSDirectory.open(new File(INDEX_DIR));
            // 2. Create the IndexWriter
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36,
                    new StandardAnalyzer(Version.LUCENE_36));
            writer = new IndexWriter(directory, iwc);

            // File.listFiles() returns null (not an empty array) when the path
            // does not exist, is not a directory, or cannot be read.
            File[] files = new File(SOURCE_DIR).listFiles();
            if (files == null)
            {
                System.err.println("Cannot list files in " + SOURCE_DIR
                        + " (missing, not a directory, or unreadable)");
                return;
            }

            for (File file : files)
            {
                // Opening a FileReader on a directory throws, which would
                // abort indexing of all remaining files.
                if (file.isFile())
                {
                    addFile(writer, file);
                }
            }
        }
        catch (IOException e)
        {
            // CorruptIndexException, LockObtainFailedException and
            // FileNotFoundException are all IOException subclasses.
            e.printStackTrace();
        }
        finally
        {
            // Close each resource independently so that one failing close()
            // does not prevent the next one from running.
            if (writer != null)
            {
                try
                {
                    writer.close();
                }
                catch (IOException e)
                {
                    e.printStackTrace();
                }
            }
            if (directory != null)
            {
                try
                {
                    directory.close();
                }
                catch (IOException e)
                {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Adds a single file to the index as one document with three fields:
     * {@code content} (read from the file), {@code fileName} and
     * {@code filePath} (both stored, indexed without analysis).
     *
     * @param writer open writer that receives the document
     * @param file   regular file to index
     * @throws IOException if the file cannot be opened or the document cannot
     *                     be written
     */
    private void addFile(IndexWriter writer, File file) throws IOException
    {
        // FileReader decodes with the platform default charset. The reader is
        // closed here even if addDocument() fails before consuming it.
        try (FileReader content = new FileReader(file))
        {
            // 3. Create the Document object
            Document doc = new Document();
            // 4. Add Fields to the Document
            doc.add(new Field("content", content));
            doc.add(new Field("fileName", file.getName(), Field.Store.YES,
                    Field.Index.NOT_ANALYZED));
            doc.add(new Field("filePath", file.getAbsolutePath(),
                    Field.Store.YES, Field.Index.NOT_ANALYZED));
            // 5. Add the document to the index through the IndexWriter
            writer.addDocument(doc);
        }
    }

    /**
     * Searches the {@code content} field for {@code "World"} and prints
     * {@code fileName(filePath)} for each of the top 10 hits.
     */
    public void search()
    {
        Directory directory = null;
        IndexReader reader = null;
        IndexSearcher searcher = null;
        try
        {
            // 1. Create the Directory (stored on disk)
            directory = FSDirectory.open(new File(INDEX_DIR));
            // 2. Create the IndexReader
            reader = IndexReader.open(directory);
            // 3. Create the IndexSearcher from the IndexReader
            searcher = new IndexSearcher(reader);
            // 4. Create the Query to search with
            QueryParser parser = new QueryParser(Version.LUCENE_36, "content",
                    new StandardAnalyzer(Version.LUCENE_36));
            Query query = parser.parse("World");
            // 5. Run the search and obtain the TopDocs
            TopDocs tds = searcher.search(query, 10);
            // 6. Obtain the ScoreDoc array from the TopDocs
            ScoreDoc[] sds = tds.scoreDocs;
            for (ScoreDoc sd : sds)
            {
                // 7. Load the concrete Document via the searcher and ScoreDoc
                Document document = searcher.doc(sd.doc);
                // 8. Read the required values from the Document
                System.out.println(document.get("fileName") + "("
                        + document.get("filePath") + ")");
            }
        }
        catch (IOException | ParseException e)
        {
            e.printStackTrace();
        }
        finally
        {
            // 9. Release resources. Any of them may still be null if opening
            // the index failed, and each close() is attempted independently.
            if (searcher != null)
            {
                try
                {
                    searcher.close();
                }
                catch (IOException e)
                {
                    e.printStackTrace();
                }
            }
            if (reader != null)
            {
                try
                {
                    reader.close();
                }
                catch (IOException e)
                {
                    e.printStackTrace();
                }
            }
            if (directory != null)
            {
                try
                {
                    directory.close();
                }
                catch (IOException e)
                {
                    e.printStackTrace();
                }
            }
        }
    }
}
4.test
TestHelloLucene.java
package org.fool.lucene;
import org.junit.Before;
import org.junit.Test;
/**
 * Smoke tests for {@link HelloLucene}: each test only invokes one method and
 * makes no assertions about the result.
 * <p>
 * NOTE(review): {@code testSearch} presumably expects the index written by
 * {@code testIndex} to exist already; nothing here enforces that execution
 * order - confirm before relying on it.
 */
public class TestHelloLucene
{
    /** Object under test; replaced with a fresh instance before every test. */
    private HelloLucene helloLucene;

    /** Creates the instance used by the test methods. */
    @Before
    public void setUp() throws Exception
    {
        this.helloLucene = new HelloLucene();
    }

    /** Runs the indexing step. */
    @Test
    public void testIndex()
    {
        this.helloLucene.index();
    }

    /** Runs the search step. */
    @Test
    public void testSearch()
    {
        this.helloLucene.search();
    }
}
5.Details
Field.Store.YES/NO(存储域选项)
设置为YES表示会把这个域中的内容完全存储到文件中,方便进行文本的还原
设置为NO表示把这个域的内容不存储到文件中,但是可以被索引,此时内容无法完全还原(doc.get)
Field.Index.*(索引域选项)
Index.ANALYZED:进行分词和索引,适用于标题、内容等
Index.NOT_ANALYZED:进行索引,但是不进行分词,如身份证号、姓名、ID等,适用于精确搜索
Index.ANALYZED_NO_NORMS:进行分词但是不存储norms信息,norms中保存了索引时的加权(boost)和字段长度归一化等信息
Index.NOT_ANALYZED_NO_NORMS:既不进行分词也不存储norms信息
Index.NO:不进行索引
最佳实践
Index.NOT_ANALYZED_NO_NORMS YES 标识符(主键、文件名),电话号码,身份证号,姓名,日期
Index.ANALYZED YES 文档标题和摘要
Index.ANALYZED NO 文档正文
Index.NO YES 文档类型,数据库主键(不进行索引)
Index.NOT_ANALYZED NO 隐藏关键字
更多细节可以参考Lucene实战(第二版)