lucene索引查询

导入依赖

<dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>6.0.0</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-queryparser -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>6.0.0</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-analyzers-common -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>6.0.0</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-queries -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queries</artifactId>
            <version>6.0.0</version>
        </dependency>

        <!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-highlighter -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-highlighter</artifactId>
            <version>6.0.0</version>
        </dependency>

        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.5</version>
        </dependency>

        <dependency>
            <groupId>com.hankcs.nlp</groupId>
            <artifactId>hanlp-lucene-plugin</artifactId>
            <version>1.1.2</version>
        </dependency>

代码实现

package com.chao;

import com.hankcs.lucene.HanLPAnalyzer;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.FileSystems;

/**
 * Builds a Lucene full-text index from a directory of plain-text files,
 * tokenizing Chinese content with the HanLP analyzer.
 */
public class IndexUtil {
    public void index() throws IOException {
        //创建索引的目录对象
        Directory directory= FSDirectory.open(FileSystems.getDefault().
                getPath("/Users/dllo/Desktop/0402wordspace/0830Lucene/src/main/resources/tar"));

        //创建分词器
        // hanLPAnalyzer中文分词器
        // StandardAnalyzer英文文分词器
        Analyzer analyzer = new HanLPAnalyzer();

        //写入设置
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);

        //索引写入器
        IndexWriter indexWriter = new IndexWriter(directory,indexWriterConfig);

        //删除所有的索引(生产环境慎用)
        indexWriter.deleteAll();

        //获取目标文件夹内的所有TXT,遍历写入索引
        File sourceFile = new File("/Users/dllo/Desktop/0402wordspace/0830Lucene/src/main/resources/source");
        File[] listFiles = sourceFile.listFiles();
        for (File file : listFiles) {
            //每一次获取文件,都创建一个新的document
            Document document = new Document();

            //将文件内容写入到索引中
            document.add(new Field("content", FileUtils.readFileToString(file,"utf-8"), TextField.TYPE_STORED));

            //文件标题
            document.add(new Field("fileName",file.getName(),TextField.TYPE_STORED));

            document.add(new Field("filePath",file.getAbsolutePath(),TextField.TYPE_STORED));

            indexWriter.addDocument(document);

        }

        indexWriter.close();

    }

 

package com.chao;

import com.hankcs.lucene.HanLPAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.IOException;
import java.io.StringReader;
import java.nio.file.FileSystems;

/**
 * Searches the Lucene index built by {@code IndexUtil}, printing matching
 * document contents and an HTML-highlighted best fragment for each hit.
 */
public class SearchUtil {

    /** Directory where the index to search lives (must match IndexUtil's output). */
    private static final String INDEX_DIR =
            "/Users/dllo/Desktop/0402wordspace/0830Lucene/src/main/resources/tar";

    /**
     * Parses {@code keyword} with the HanLP analyzer, searches the "content"
     * field, prints the raw content of the top hits, then prints each hit's
     * best fragment with the matched terms wrapped in a red {@code <font>} tag.
     *
     * @param keyword user query; tokenized by the same analyzer used at index time
     * @throws IOException                   if the index cannot be opened or read
     * @throws ParseException                if the keyword cannot be parsed
     * @throws InvalidTokenOffsetsException  if highlighting hits inconsistent offsets
     */
    public void search(String keyword)
            throws IOException, ParseException, InvalidTokenOffsetsException {
        // try-with-resources closes the reader and directory even on error —
        // the original leaked both.
        try (Directory directory = FSDirectory.open(
                     FileSystems.getDefault().getPath(INDEX_DIR));
             DirectoryReader directoryReader = DirectoryReader.open(directory)) {

            IndexSearcher indexSearcher = new IndexSearcher(directoryReader);

            // Must match the analyzer used at index time, otherwise query
            // tokens will not line up with indexed tokens.
            Analyzer analyzer = new HanLPAnalyzer();

            // Parse the user input against the "content" field.
            QueryParser queryParser = new QueryParser("content", analyzer);
            Query query = queryParser.parse(keyword);

            // Retrieve at most the top 2 scoring documents.
            TopDocs hits = indexSearcher.search(query, 2);

            int i = hits.totalHits;
            System.out.println("查询出" + i + "条结果");

            // First pass: print the raw stored content of each hit.
            ScoreDoc[] scoreDocs = hits.scoreDocs;
            for (ScoreDoc scoreDoc : scoreDocs) {
                Document document = indexSearcher.doc(scoreDoc.doc);
                String s = document.get("content");
                System.out.println(s);
            }

            // Second pass: print a highlighted best fragment per hit.
            // Scorer ranks fragments by how well they match the query.
            QueryScorer queryScorer = new QueryScorer(query);

            // Fragment sizing follows the scorer; matches are wrapped in a
            // red <font> tag for HTML display.
            Fragmenter fragmenter = new SimpleSpanFragmenter(queryScorer);
            SimpleHTMLFormatter simpleHTMLFormatter =
                    new SimpleHTMLFormatter("<font color='red'>", "</font>");

            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, queryScorer);
            highlighter.setTextFragmenter(fragmenter);

            for (ScoreDoc scoreDoc : scoreDocs) {
                Document document = indexSearcher.doc(scoreDoc.doc);
                String s = document.get("content");
                if (s != null) {
                    // Re-tokenize the stored content so the highlighter can map
                    // query terms back to character offsets in the text.
                    TokenStream tokenStream =
                            analyzer.tokenStream("content", new StringReader(s));
                    String h = highlighter.getBestFragment(tokenStream, s);
                    System.out.println(h);
                }
            }
        }
    }

    /** Demo entry point: runs a sample Chinese query against the index. */
    public static void main(String[] args)
            throws IOException, ParseException, InvalidTokenOffsetsException {
        SearchUtil searchUtil = new SearchUtil();
        searchUtil.search("民政局淑女");
    }

}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值