Lucene介绍
Lucene的定位是提供索引和搜索功能的核心类库,而并非完整的搜索程序。
搜索程序的组件
其中阴影部分可以由Lucene完成。
索引组件
为何需要索引操作?
假设有一个3GB大小的文本,需要找出“程序”两个字,一种最简单的方式是顺序扫描,但这种扫描明显效率很低。所以就要选用一种高效的数据结构来存储,使得查询速度达到最快,这个过程就叫做索引操作,它的输出就叫索引。
索引的流程
搜索组件
搜索质量
搜索的流程
索引过程核心类
搜索过程核心类
实例程序
创建索引
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Builds a Lucene index from the files found directly inside a data directory.
 *
 * <p>Each accepted file becomes one document with three fields: {@code contents}
 * (tokenized file body, not stored), {@code filename} and {@code fullpath}
 * (both stored and indexed as single untokenized terms).
 */
public class IndexTest {
    /** Directory holding the index files; closed together with the writer. */
    private final Directory directory;

    /** Writer used to add documents to the index. */
    private final IndexWriter writer;

    /**
     * Indexes every {@code *.txt} file under the hard-coded data directory and
     * prints how long it took.
     *
     * @param args ignored
     * @throws Exception if the index cannot be opened or a file cannot be indexed
     */
    public static void main(String[] args) throws Exception {
        String rootPath = "/searchengine/surfbird-search";
        String indexDir = rootPath + "/index"; // 1: directory in which the index is created
        String dataDir = rootPath + "/data";   // 2: directory whose *.txt files are indexed
        long start = System.currentTimeMillis();
        IndexTest indexer = new IndexTest(indexDir);
        int numIndexed;
        try {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());
        } finally {
            // Always release the index, even when indexing fails part-way.
            indexer.close();
        }
        long end = System.currentTimeMillis();
        System.out.println("Indexing " + numIndexed + " files took "
                + (end - start) + " milliseconds");
    }

    /**
     * Opens (or creates) the index stored in {@code indexDir}.
     *
     * @param indexDir file-system path of the index directory
     * @throws IOException if the directory or the index writer cannot be opened
     */
    public IndexTest(String indexDir) throws IOException {
        // FSDirectory = file-system backed storage for the index files.
        Directory dir = FSDirectory.open(Paths.get(indexDir));
        IndexWriter indexWriter;
        try {
            // StandardAnalyzer drops whitespace/punctuation; Chinese text is split
            // into single-character tokens. Whether English stop words ("is", "a",
            // "the") are removed depends on the Lucene version's default stop set.
            Analyzer analyzer = new StandardAnalyzer();
            // Attach the analyzer to the writer configuration.
            IndexWriterConfig config = new IndexWriterConfig(analyzer);
            indexWriter = new IndexWriter(dir, config);
        } catch (IOException | RuntimeException e) {
            // Do not leak the directory handle when the writer cannot be created.
            try {
                dir.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
        this.directory = dir;
        this.writer = indexWriter;
    }

    /**
     * Closes the index writer and then the underlying directory.
     *
     * @throws IOException if closing either resource fails
     */
    public void close() throws IOException {
        try {
            writer.close(); // 4: close the IndexWriter
        } finally {
            directory.close();
        }
    }

    /**
     * Indexes every regular, visible, readable file directly inside {@code dataDir}
     * that is accepted by {@code filter}. Sub-directories are not descended into.
     *
     * @param dataDir directory containing the files to index
     * @param filter  optional filter; {@code null} accepts every file
     * @return the value of {@code writer.numDocs()} after indexing; this is the
     *         writer's total document count, not only the files added by this call
     * @throws Exception if {@code dataDir} cannot be listed or a file cannot be indexed
     */
    public int index(String dataDir, FileFilter filter)
            throws Exception {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IOException("Cannot list files in data directory: " + dataDir);
        }
        for (File f : files) {
            if (!f.isDirectory()
                    && !f.isHidden()
                    && f.exists()
                    && f.canRead()
                    && (filter == null || filter.accept(f))) {
                indexFile(f);
            }
        }
        return writer.numDocs(); // 5: number of documents reported by the writer
    }

    /** Accepts only files whose name ends with {@code .txt} (case-insensitive). */
    private static class TextFilesFilter implements FileFilter {
        @Override
        public boolean accept(File path) {
            return path.getName().toLowerCase() // 6: only index *.txt files
                    .endsWith(".txt");
        }
    }

    /**
     * Builds the Lucene document for one file.
     *
     * @param f file to convert
     * @return a document with {@code contents}, {@code filename} and {@code fullpath} fields
     * @throws Exception if the file cannot be opened or its canonical path cannot be resolved
     */
    protected Document getDocument(File f) throws Exception {
        // Resolve the path before opening the reader so a failure here cannot leak it.
        String fullPath = f.getCanonicalPath();
        Document doc = new Document();
        // 7: file body - tokenized and indexed, not stored.
        // NOTE(review): FileReader decodes with the JVM default charset; confirm it
        // matches the encoding of the data files.
        doc.add(new TextField("contents", new FileReader(f)));
        // 8: file name - stored, indexed verbatim as a single term.
        doc.add(new StringField("filename", f.getName(), Field.Store.YES));
        // 9: canonical path - stored, indexed verbatim as a single term.
        doc.add(new StringField("fullpath", fullPath, Field.Store.YES));
        return doc;
    }

    /**
     * Logs the file being processed and adds its document to the index.
     *
     * @param f file to index
     * @throws Exception if the document cannot be built or added
     */
    private void indexFile(File f) throws Exception {
        System.out.println("Indexing " + f.getCanonicalPath());
        Document doc = getDocument(f);
        writer.addDocument(doc); // 10: add the document to the Lucene index
    }
}
创建搜索程序
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import java.io.IOException;
import java.nio.file.Paths;
/**
 * Runs a query against the {@code contents} field of an existing Lucene index
 * and prints the {@code fullpath} stored field of each matching document.
 */
public class SearchTest {
    /**
     * Searches the index in {@code indexDir} for {@code q} and prints up to ten hits.
     *
     * <p>The hit summary is written to {@code System.err}; the matching paths are
     * written to {@code System.out}.
     *
     * @param indexDir file-system path of the index directory
     * @param q        query string in Lucene classic query-parser syntax
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if {@code q} is not a valid query
     */
    public static void search(String indexDir, String q)
            throws IOException, ParseException {
        // try-with-resources closes the reader and the directory even when
        // parsing or searching throws.
        try (Directory dir = FSDirectory.open(Paths.get(indexDir));
                IndexReader reader = DirectoryReader.open(dir)) {
            // Same analyzer class that IndexTest uses when writing the index.
            Analyzer analyzer = new StandardAnalyzer();
            // Parse the query text against the "contents" field.
            QueryParser parser = new QueryParser("contents", analyzer);
            Query query = parser.parse(q); // 4: parse the query string

            long start = System.currentTimeMillis();
            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs hits = searcher.search(query, 10); // 5: search the index, top 10 hits
            long end = System.currentTimeMillis();

            System.err.println("Found " + hits.totalHits + // 6: report search statistics
                    " document(s) (in " + (end - start) +
                    " milliseconds) that matched query '" +
                    q + "':");

            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc); // 7: load the matching document
                System.out.println(doc.get("fullpath"));   // 8: print the stored full path
            }
        } // 9: the IndexReader and the Directory are closed here
    }

    /**
     * Searches the hard-coded index directory for a fixed sample query.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String rootPath = "/searchengine/surfbird-search";
        String indexDir = rootPath + "/index"; // 1: directory that holds the index
        String q = "周瑜"; // query string to look up
        try {
            search(indexDir, q);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
参考文档
《Lucene实战》