Lucene实战：文本索引与检索-CSDN博客

Luke是用来查看Lucene生成的索引文件的工具

lucene api文档地址:http://tool.oschina.net/apidocs/apidoc?api=lucene-3.6.0 （注意：该文档对应3.6.0版本，本文使用的是7.7.1，部分API有差异）

Analyzer是分词器（抽象类，有多个实现）

IndexWriterConfig是索引编写配置

IndexWriter用来创建和维护索引（添加、删除、更新文档）

Document是文档对象，里面存放的是要被索引的各个字段（Field）

QueryParser是查询解析器，用来把查询字符串解析成针对指定搜索域的Query对象

Query是表示查询条件的对象

IndexableField是可被索引的字段的接口

TopDocs是排名靠前的查询结果集

ScoreDoc是结果集中的单条命中记录，包含文档编号(doc)和得分(score)

CharTermAttribute获取词元文本属性

lucene版本:7.7.1

lucene pom文件引入

        <!-- Lucene core library: https://mvnrepository.com/artifact/org.apache.lucene/lucene-core -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>7.7.1</version>
        </dependency>
        <!-- Lucene query parser library -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>7.7.1</version>
        </dependency>
        <!-- Additional Lucene analysis library (common analyzers) -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>7.7.1</version>
        </dependency>

package spring.lucene.deom;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.jupiter.api.Test;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.Iterator;
import java.util.List;

/**
 * Demonstrates basic Lucene operations on a directory of plain-text files:
 * building an index, searching it, deleting and updating documents, and
 * driving an analyzer by hand.
 *
 * <p>Each {@code @Test} method is an independent entry point. All index
 * operations work against the hard-coded directory {@link #INDEX_DIR};
 * {@link #addIndex()} reads its input files from {@link #DATA_DIR}.
 */
public class TxtFileIndexer {

    /** Directory that holds the Lucene index files. */
    private static final String INDEX_DIR = "d:\\luceneIndex";

    /** Directory that holds the .txt files to be indexed. */
    private static final String DATA_DIR = "d:\\luceneData";

    /** Field holding the tokenized file contents. */
    private static final String FIELD_CONTENTS = "contents";

    /** Field holding the stored file name. */
    private static final String FIELD_FILE_NAME = "fileName";

    /** Field holding the stored canonical path of the file. */
    private static final String FIELD_FULL_PATH = "fullPath";

    /**
     * Indexes every {@code .txt} file directly inside {@link #DATA_DIR}.
     *
     * <p>For each file one document is added with three fields: the file
     * contents (read through a {@link FileReader}), the file name, and the
     * canonical path. The elapsed time in milliseconds is printed at the end.
     *
     * @throws IllegalStateException if {@link #DATA_DIR} cannot be listed
     * @throws Exception if reading a file or writing the index fails
     */
    @Test
    public void addIndex() throws Exception {
        File luceneData = new File(DATA_DIR);
        File[] dataFiles = luceneData.listFiles();
        // listFiles() returns null when the path is not a directory or cannot be read;
        // fail with a clear message instead of a NullPointerException in the loop.
        if (dataFiles == null) {
            throw new IllegalStateException("Cannot list data directory: " + luceneData);
        }

        long startTime = System.currentTimeMillis();
        // try-with-resources closes the writer (then analyzer and directory) even when
        // indexing a file throws.
        try (Directory directory = FSDirectory.open(Paths.get(INDEX_DIR));
             Analyzer luceneAnalyzer = new StandardAnalyzer();
             IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(luceneAnalyzer))) {
            for (File dataFile : dataFiles) {
                // Only regular files whose name ends with ".txt" are indexed.
                if (!dataFile.isFile() || !dataFile.getName().endsWith(".txt")) {
                    continue;
                }
                String fullPath = dataFile.getCanonicalPath();
                System.out.println("indexing file:" + fullPath);

                // NOTE(review): FileReader decodes with the platform default charset;
                // confirm that matches the encoding of the data files.
                try (FileReader contentReader = new FileReader(dataFile)) {
                    Document document = new Document();
                    document.add(new TextField(FIELD_CONTENTS, contentReader));
                    // Field.Store.YES keeps the original value in the index so it can be
                    // read back from search hits.
                    document.add(new TextField(FIELD_FILE_NAME, dataFile.getName(), Field.Store.YES));
                    document.add(new TextField(FIELD_FULL_PATH, fullPath, Field.Store.YES));
                    writer.addDocument(document);
                }
            }
        }
        long endTime = System.currentTimeMillis();
        System.out.println("用时:" + (endTime - startTime));
    }

    /**
     * Searches the {@code contents} field for the query {@code "你"} and prints
     * the file name and full path of up to ten hits.
     *
     * @throws Exception if the index cannot be read or the query cannot be parsed
     */
    @Test
    public void search() throws Exception {
        try (Directory directory = FSDirectory.open(Paths.get(INDEX_DIR));
             IndexReader indexReader = DirectoryReader.open(directory);
             Analyzer luceneAnalyzer = new StandardAnalyzer()) {
            IndexSearcher indexSearcher = new IndexSearcher(indexReader);
            // The first QueryParser argument is the default field to search.
            QueryParser queryParser = new QueryParser(FIELD_CONTENTS, luceneAnalyzer);
            Query query = queryParser.parse("你");
            TopDocs topDocs = indexSearcher.search(query, 10);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document document = indexSearcher.doc(scoreDoc.doc);
                // Document.get yields null for a missing field rather than throwing.
                System.out.println("文件名称:" + document.get(FIELD_FILE_NAME));
                System.out.println("文件路径:" + document.get(FIELD_FULL_PATH));
            }
        }
    }

    /**
     * Deletes every document whose {@code contents} field contains the term
     * {@code "你"}.
     *
     * @throws IOException if the index cannot be opened or written
     */
    @Test
    public void deleteIndex() throws IOException {
        try (Directory directory = FSDirectory.open(Paths.get(INDEX_DIR));
             Analyzer luceneAnalyzer = new StandardAnalyzer();
             IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(luceneAnalyzer))) {
            writer.deleteDocuments(new Term(FIELD_CONTENTS, "你"));
        }
    }

    /**
     * Looks up to ten documents matching {@code "world"} in the
     * {@code contents} field and passes each one back to
     * {@link IndexWriter#updateDocument}, keyed on the same term.
     *
     * <p>The writer is closed once, after all hits have been processed;
     * closing it inside the loop would make the second hit fail on a closed
     * writer.
     *
     * @throws Exception if the index cannot be read or written, or the query
     *                   cannot be parsed
     */
    @Test
    public void updateIndex() throws Exception {
        // Same open order as before: writer first, then a reader over the directory.
        try (Directory directory = FSDirectory.open(Paths.get(INDEX_DIR));
             Analyzer luceneAnalyzer = new StandardAnalyzer();
             IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(luceneAnalyzer));
             IndexReader indexReader = DirectoryReader.open(directory)) {
            IndexSearcher indexSearcher = new IndexSearcher(indexReader);
            Term term = new Term(FIELD_CONTENTS, "world");
            Query query = new QueryParser(FIELD_CONTENTS, luceneAnalyzer).parse("world");
            TopDocs topDocs = indexSearcher.search(query, 10);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                Document document = indexSearcher.doc(scoreDoc.doc);
                // One separator line is printed per field of the loaded document.
                for (IndexableField ignored : document.getFields()) {
                    System.out.println("**************************分割线****************************TxtFileIndexer.java");
                }
                // NOTE(review): a document loaded from the index presumably carries only
                // its stored fields, so "contents" (indexed from a Reader in addIndex)
                // would be missing from the re-added document - confirm this is intended.
                writer.updateDocument(term, document);
            }
        }
    }


    /**
     * Runs {@link StandardAnalyzer} over a fixed sample string and prints each
     * produced token wrapped in square brackets on a single line.
     *
     * @throws IOException if the token stream fails
     */
    @Test
    public void anlyzer() throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream stream = analyzer.tokenStream("", "你好啊 早上好")) {
            // The attribute instance is updated in place on every incrementToken() call.
            CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.print("[" + cta + "]");
            }
            // Complete the consumer workflow: end() after the last token, close() via try.
            stream.end();
            System.out.println();
        }
    }
}

转载于:https://my.oschina.net/undwin/blog/3018641