Lucene实战 ——（一）初识Lucene

最新推荐文章于 2020-10-26 14:01:34 发布

coffejoy

最新推荐文章于 2020-10-26 14:01:34 发布

阅读量312

点赞数

分类专栏： Lucene Lucene实战

本文链接：https://blog.youkuaiyun.com/weixin_42142408/article/details/89166523

版权

Lucene 同时被 2 个专栏收录

2 篇文章

订阅专栏

Lucene实战

2 篇文章

订阅专栏

本文介绍了Lucene作为核心索引和搜索模块的定位，详细讲解了搜索程序的组件，包括索引和搜索组件的工作原理。讨论了为何需要进行索引操作以提高查询效率，并概述了索引和搜索的流程。同时，文章提供了创建索引和搜索程序的实例程序，是学习Lucene实战的入门指南。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

Lucene介绍

Lucene的定位是核心的索引和搜索模块，而不是一个完整的搜索程序。

本章思维导图

搜索程序的组件

在这里插入图片描述
其中阴影部分可以由Lucene完成。

索引组件

为何需要索引操作？

假设有一个3GB大小的文本，需要从中找出“程序”两个字。最简单的方式是顺序扫描，但这种方式的效率明显很低。所以需要选择一种高效的数据结构来存储文本内容，使查询速度尽可能快。这个过程就叫做索引操作，它的输出就叫索引。

索引的流程

在这里插入图片描述

搜索组件

搜索质量

在这里插入图片描述

搜索的流程

在这里插入图片描述

索引过程核心类

在这里插入图片描述

搜索过程核心类

在这里插入图片描述

实例程序

创建索引


import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;

/**
 * Indexes the {@code *.txt} files found directly inside a data directory into a
 * Lucene index stored on the file system.
 *
 * <p>An instance owns the {@link IndexWriter} and the {@link Directory} it writes to.
 * Call {@link #close()} (or use try-with-resources) once indexing is finished.
 */
public class IndexTest implements AutoCloseable {

    /** File-system directory that stores the index; closed together with the writer. */
    private final Directory directory;

    /** Writer used to add documents to the index. */
    private final IndexWriter writer;

    /**
     * Indexes the text files under a fixed data directory and prints how many
     * documents the index holds and how long the run took.
     *
     * @param args unused
     * @throws Exception if the index cannot be opened or a file cannot be indexed
     */
    public static void main(String[] args) throws Exception {

        String rootPath = "/searchengine/surfbird-search";
        String indexDir = rootPath + "/index";         // 1 directory in which the index is created
        String dataDir = rootPath + "/data";           // 2 directory whose *.txt files are indexed

        long start = System.currentTimeMillis();
        int numIndexed;
        // try-with-resources closes the writer and directory even when indexing fails.
        try (IndexTest indexer = new IndexTest(indexDir)) {
            numIndexed = indexer.index(dataDir, new TextFilesFilter());
        }
        long end = System.currentTimeMillis();

        System.out.println("Indexing " + numIndexed + " files took "
                + (end - start) + " milliseconds");
    }

    /**
     * Opens (or creates) the index stored under {@code indexDir}.
     *
     * @param indexDir path of the directory that holds the index files
     * @throws IOException if the directory or the index writer cannot be opened
     */
    public IndexTest(String indexDir) throws IOException {
        // FSDirectory keeps the index in the file system at the given path.
        directory = FSDirectory.open(Paths.get(indexDir));
        // Standard analyzer: per the original author's note, Chinese text ends up
        // tokenized one character at a time. NOTE(review): whether English stop
        // words are dropped depends on the Lucene version in use - confirm.
        Analyzer analyzer = new StandardAnalyzer();
        // The writer configuration carries the analyzer applied to analyzed fields.
        IndexWriterConfig config = new IndexWriterConfig(analyzer);
        writer = new IndexWriter(directory, config);   // 3 create the IndexWriter
    }

    /**
     * Closes the index writer and then the underlying directory.
     *
     * @throws IOException if closing either resource fails
     */
    @Override
    public void close() throws IOException {
        try {
            writer.close();                            // 4 close the IndexWriter
        } finally {
            // Closing the writer does not close the directory it was given.
            directory.close();
        }
    }

    /**
     * Indexes every readable, non-hidden regular file directly inside {@code dataDir}
     * that the filter accepts. Sub-directories are not descended into.
     *
     * @param dataDir directory containing the files to index
     * @param filter  decides which files are indexed; {@code null} accepts every file
     * @return the document count reported by the writer; presumably this includes
     *         documents already present when an existing index was opened
     * @throws IOException if {@code dataDir} cannot be listed
     * @throws Exception   if a file cannot be read or added to the index
     */
    public int index(String dataDir, FileFilter filter)
            throws Exception {

        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            throw new IOException("Cannot list data directory: " + dataDir);
        }

        for (File f : files) {
            if (!f.isDirectory() &&
                    !f.isHidden() &&
                    f.exists() &&
                    f.canRead() &&
                    (filter == null || filter.accept(f))) {
                indexFile(f);
            }
        }

        return writer.numDocs();                       // 5 number of documents in the index
    }

    /** Accepts only files whose name ends with {@code .txt}, ignoring case. */
    private static class TextFilesFilter implements FileFilter {
        @Override
        public boolean accept(File path) {
            return path.getName().toLowerCase()        // 6 index *.txt files only
                    .endsWith(".txt");
        }
    }

    /**
     * Builds the Lucene document for one file.
     *
     * <p>The document has three fields: {@code contents} (analyzed, not stored, read
     * from the file as UTF-8), and {@code filename} and {@code fullpath} (both stored
     * and indexed verbatim as a single term).
     *
     * @param f file to turn into a document
     * @return the document to add to the index
     * @throws Exception if the file cannot be opened or its canonical path resolved
     */
    protected Document getDocument(File f) throws Exception {
        // Resolve the plain string values first so a failure here cannot leak an open reader.
        String fileName = f.getName();
        String fullPath = f.getCanonicalPath();

        Document doc = new Document();
        // Decode explicitly as UTF-8 instead of the platform default so the index
        // content does not depend on the machine that built it. The reader is left
        // open here; presumably Lucene closes it once the field has been analyzed.
        Reader contents = new InputStreamReader(new FileInputStream(f), StandardCharsets.UTF_8);
        doc.add(new TextField("contents", contents));                     // 7 index file content
        doc.add(new StringField("filename", fileName, Field.Store.YES));  // 8 index file name
        doc.add(new StringField("fullpath", fullPath, Field.Store.YES));  // 9 index full path
        return doc;
    }

    /**
     * Logs the file being processed and adds its document to the index.
     *
     * @param f file to index
     * @throws Exception if the document cannot be built or added
     */
    private void indexFile(File f) throws Exception {
        System.out.println("Indexing " + f.getCanonicalPath());
        Document doc = getDocument(f);
        writer.addDocument(doc);                       // 10 add the document to the Lucene index
    }
}

创建搜索程序

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import java.io.IOException;
import java.nio.file.Paths;

/**
 * Runs a query against the {@code contents} field of a file-system Lucene index
 * and prints the {@code fullpath} value of each matching document.
 */
public class SearchTest {

    /**
     * Parses {@code q}, searches the index under {@code indexDir} for the top 10 hits,
     * and prints the hit count, the elapsed search time and each hit's stored
     * {@code fullpath} value.
     *
     * @param indexDir path of the directory holding the index
     * @param q        query string in the classic query parser syntax
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if {@code q} is not a valid query
     */
    public static void search(String indexDir, String q)
            throws IOException, ParseException {

        // try-with-resources releases the reader and directory even when parsing
        // or searching throws.
        try (Directory dir = FSDirectory.open(Paths.get(indexDir));   // 2 open the index directory
             IndexReader reader = DirectoryReader.open(dir)) {        // 3 open the index for reading

            // Same analyzer type the index was built with, so query terms are
            // tokenized the same way as the indexed text.
            Analyzer analyzer = new StandardAnalyzer();
            QueryParser parser = new QueryParser("contents", analyzer);
            Query query = parser.parse(q);              // 4 parse the query string
            long start = System.currentTimeMillis();

            IndexSearcher searcher = new IndexSearcher(reader);
            TopDocs hits = searcher.search(query, 10);  // 5 search the index
            long end = System.currentTimeMillis();

            System.err.println("Found " + hits.totalHits +   // 6 report search statistics
                    " document(s) (in " + (end - start) +
                    " milliseconds) that matched query '" +
                    q + "':");

            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = searcher.doc(scoreDoc.doc);   // 7 load the matching document
                System.out.println(doc.get("fullpath"));     // 8 print the stored full path
            }
        }                                               // 9 reader and directory closed here
    }

    /**
     * Searches the fixed index directory for a sample query and prints any failure.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String rootPath = "/searchengine/surfbird-search";
        String indexDir = rootPath + "/index";         // 1 directory that holds the index
        String q = "周瑜"; // the string to search for
        try {
            search(indexDir, q);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

}

参考文档

《Lucene实战》