I recently wanted to learn Lucene, so I downloaded the latest 3.5 release. It turns out to be pretty nice and well worth playing with.
The whole thing boils down to two steps.
1. Building the index
Let's start with how to build an index. The core classes are:
IndexWriterConfig: the configuration object for index creation; it holds an Analyzer.
IndexWriter: the class that actually writes the index.
Those two classes are really all you need to build an index.
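Before the full class, here is a minimal sketch of how the two classes fit together (the index path is just a placeholder):

package com.mingming.xue.lecene;
import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class MinimalIndexer {
    public static void main(String[] args) throws Exception {
        // The config carries the analyzer (plus open mode, merge policy and so on)
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35));
        // The writer takes a Directory (where the index lives) and the config
        IndexWriter writer = new IndexWriter(FSDirectory.open(new File("/tmp/lucene/index")), config);
        // ... add documents here ...
        writer.close();
    }
}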
The complete code is below. It relies on a small FileSearchConstant helper (part of the attached project) for the index path, the directory to index and the field names.
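If you want to try the listing without the attachment, a minimal stand-in for FileSearchConstant could look like this (the paths and field names are placeholders, not the values from the attached project):

package com.mingming.xue.lecene;
public class FileSearchConstant {
    // Placeholder paths: point these at real directories on your machine
    public static final String FILE_INDEX = "/tmp/lucene/index"; // where the index is written
    public static final String FILE_DIR = "/tmp/lucene/docs";    // the files to be indexed
    // Field names shared by the indexer and the searcher
    public static final String PATH = "path";
    public static final String CONTENTS = "contents";
}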
package com.mingming.xue.lecene;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class FileIndexer {
private IndexWriter indexWriter;
// Directory where the index files are stored
private File fileIndex = new File(FileSearchConstant.FILE_INDEX);
// Directory containing the files to be indexed
private File fileDir = new File(FileSearchConstant.FILE_DIR);
// private static Logger logger = LoggerFactory.getLogger(FileIndexer.class);
public static void main(String[] args) throws IOException {
FileIndexer fileIndexer = new FileIndexer();
fileIndexer.buildIndex();
}
public void buildIndex() throws IOException {
boolean isCreate = true;
// Configuration for index creation; it holds the analyzer
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_35,
new StandardAnalyzer(Version.LUCENE_35));
// Set whether the writer creates a fresh index or appends to / updates an existing one
setModel(isCreate, indexWriterConfig);
// The index writer: the first argument is where the index is stored, the second is the configuration
indexWriter = new IndexWriter(FSDirectory.open(fileIndex), indexWriterConfig);
long startTime = System.currentTimeMillis();
// Build the index
indexDocs(fileDir, indexWriter);
// Very useful when documents have been added incrementally: it merges the scattered index segments into one
indexWriter.forceMerge(1);
// indexWriter.commit();
// Close the index writer (closing also commits)
indexWriter.close();
long endTime = System.currentTimeMillis();
System.out.println("cost :" + (endTime - startTime) + "seconds");
}
private void setModel(boolean isCreate, IndexWriterConfig indexWriterConfig) {
if (isCreate) {
indexWriterConfig.setOpenMode(OpenMode.CREATE);
} else {
indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
}
}
private void indexDocs(File fileDir, IndexWriter indexWriter) throws CorruptIndexException, IOException {
if (fileDir.canRead()) {
if (fileDir.isDirectory()) {
String[] listFiles = fileDir.list();
for (String file : listFiles) {
indexDocs(new File(fileDir, file), indexWriter);
}
} else {
// A Document represents one item in the index
Document doc = new Document();
// A Field is one indexed piece of the document; here we index the file path
Field pathField = new Field(FileSearchConstant.PATH, fileDir.getPath(), Field.Store.YES,
Field.Index.NOT_ANALYZED_NO_NORMS);
/* Two factors matter for how a term is weighted at search time:
* Term Frequency (tf): how often the term occurs within a document
* Document Frequency (df): how many documents contain the term; the higher df is, the less important the term
* DOCS_ONLY records only which documents contain each term, so term frequencies and positions are not stored
*/
pathField.setIndexOptions(IndexOptions.DOCS_ONLY);
doc.add(pathField);
// NumericField modifyField = new NumericField(IndexerConstant.MODIFIED);
// modifyField.setLongValue(fileDir.lastModified());
// doc.add(modifyField);
// Index the file contents
Field contentField = new Field(FileSearchConstant.CONTENTS, getContentByUtils(fileDir), Field.Store.YES,
Field.Index.ANALYZED);
// FileInputStream fileInputStream = getFileInputStream(fileDir);
// doc.add(new Field(IndexerConstant.CONTENTS, new BufferedReader(new InputStreamReader(fileInputStream,
// "UTF-8"))));
// fileInputStream.close();
contentField.setIndexOptions(IndexOptions.DOCS_ONLY);
doc.add(contentField);
// Add a new document, or update the existing one keyed by its path
if (indexWriter.getConfig().getOpenMode() == OpenMode.CREATE) {
indexWriter.addDocument(doc);
} else if (indexWriter.getConfig().getOpenMode() == OpenMode.CREATE_OR_APPEND) {
indexWriter.updateDocument(new Term(FileSearchConstant.PATH, fileDir.getPath()), doc);
}
}
}
}
public String getContentByUtils(File fileDir) {
String content = null;
try {
content = IOUtils.toString(new FileInputStream(fileDir), "UTF-8");
} catch (Exception e) {
e.printStackTrace();
}
return content;
}
/*
* Not recommended: readLine() drops the line separators, so the analyzer cannot tokenize the content
* correctly. Prefer getContentByUtils above.
*/
public String getContents(File fileDir) {
StringBuffer result = new StringBuffer();
BufferedReader reader = null;
try {
reader = new BufferedReader(new InputStreamReader(new FileInputStream(fileDir), "UTF-8"));
String temp = null;
while ((temp = reader.readLine()) != null) {
result.append(temp);
}
} catch (Exception e) {
e.printStackTrace();
return null;
} finally {
if (null != reader) {
try {
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
return result.toString();
}
}
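The commented-out NumericField lines in indexDocs() hint at also indexing each file's last-modified time. A possible sketch, assuming a MODIFIED field-name constant is added to FileSearchConstant (it is not part of the listing above):

// Inside indexDocs(), next to the path and contents fields
// requires: import org.apache.lucene.document.NumericField;
// NumericField stores numbers in a trie-encoded form so they can be range-queried efficiently
NumericField modifiedField = new NumericField(FileSearchConstant.MODIFIED); // MODIFIED is an assumed constant
modifiedField.setLongValue(fileDir.lastModified());
doc.add(modifiedField);

On the search side such a field could then be filtered with NumericRangeQuery.newLongRange(...).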
2. Searching the index
The code pretty much speaks for itself:
package com.mingming.xue.lecene;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class FileSearcher {
private String content = "haha";
public void search() throws IOException, ParseException {
// FSDirectory.open(new File(FileSearchConstant.FILE_INDEX)) picks the Directory implementation best suited to your OS: plain file I/O, memory-mapped, or NIO
IndexReader indexReader = IndexReader.open(FSDirectory.open(new File(FileSearchConstant.FILE_INDEX)));
IndexSearcher searcher = new IndexSearcher(indexReader);
// Build a parser for queries against the contents field
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);
QueryParser queryParser = new QueryParser(Version.LUCENE_35, FileSearchConstant.CONTENTS, analyzer);
// Parse the query string
Query query = queryParser.parse(content);
System.out.println("Searching for: " + query.toString());
// TopDocs holds the results; 10 is the maximum number of hits to return. Real applications usually need paging here (see the sketch after this class), so watch this spot!
TopDocs topDocs = searcher.search(query, 10);
ScoreDoc[] docs = topDocs.scoreDocs;
if (null != docs) {
for (int i = 0; i < docs.length; i++) {
ScoreDoc scoreDoc = docs[i];
// The key step: fetch the stored document for this hit
Document doc = searcher.doc(scoreDoc.doc);
// Pull out the stored fields we want
String contents = doc.get(FileSearchConstant.CONTENTS);
String path = doc.get(FileSearchConstant.PATH);
System.out.println(contents);
System.out.println(path);
}
}
// Release the searcher and the underlying reader
searcher.close();
indexReader.close();
}
public static void main(String[] args) throws IOException, ParseException {
FileSearcher fileSearcher = new FileSearcher();
fileSearcher.search();
}
}
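As noted next to searcher.search(query, 10), real applications usually page through results. In Lucene 3.5 a simple approach is to request enough hits to cover the requested page and then slice the ScoreDoc array; a minimal sketch that could be dropped into FileSearcher (pageIndex and pageSize are illustrative parameters, not part of the original code):

// pageIndex is zero-based; this reuses the imports already present in FileSearcher
public void searchPage(IndexSearcher searcher, Query query, int pageIndex, int pageSize) throws IOException {
    // Ask for enough hits to cover everything up to and including the requested page
    TopDocs topDocs = searcher.search(query, (pageIndex + 1) * pageSize);
    ScoreDoc[] hits = topDocs.scoreDocs;
    int start = pageIndex * pageSize;
    int end = Math.min(hits.length, start + pageSize);
    for (int i = start; i < end; i++) {
        // Fetch the stored document for each hit on this page
        Document doc = searcher.doc(hits[i].doc);
        System.out.println(doc.get(FileSearchConstant.PATH));
    }
}

Collecting (pageIndex + 1) * pageSize hits gets wasteful for deep pages; Lucene 3.5 also introduced IndexSearcher.searchAfter for deeper paging.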
The full code is in the attachment. The project is built with maven, so install maven first and then run:
mvn install
mvn eclipse:eclipse -DdownloadSources=true
After that you can browse the code (and the library sources) in Eclipse.