【Lucene学习笔记】索引的增删改查与查看工具
一、基本增删改查
添加和查询的方法在上一篇笔记中已经提到过了,这里就不做说明了,删除、恢复、强制优化等基本方法和说明都注释在代码里了,直接mark在这里吧
package test;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Date;
import java.util.Scanner;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class IndexUtil {
/**索引建立**/
public void index() throws IOException {
//1、创建Directory
Directory directory = FSDirectory.open(new File("F:\\luceneIndex"));
Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
//2、创建IndexWriter
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_CURRENT,luceneAnalyzer);
IndexWriter indexWriter = new IndexWriter(directory, config);
long startTime = new Date().getTime();
//dataDir is the directory that hosts the text files that to be indexed
File dataDir = new File("F:\\luceneData");
File[] dataFiles = dataDir.listFiles();
for(int i = 0; i < dataFiles.length; i++){
if(dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".txt")){
System.out.println("Indexing file " + dataFiles[i].getCanonicalPath());
//3、创建Document对象
Document document = new Document();
Reader txtReader = new FileReader(dataFiles[i]);
//4、为Document对象添加Field
document.add(new Field("num",""+(i+1),StringField.TYPE_STORED));
document.add(new Field("content",txtReader,TextField.TYPE_NOT_STORED));
document.add(new Field("path",dataFiles[i].getCanonicalPath(),StringField.TYPE_STORED));
String title = dataFiles[i].getName();
document.add(new Field("title",title.substring(0, title.length()-4),TextField.TYPE_STORED));
//5、通过IndexWriter添加文档到索引中
indexWriter.addDocument(document);
}
}
//6、关闭IndexWriter和Directory
indexWriter.close();
directory.close();
long endTime = new Date().getTime();
System.out.println("It takes " + (endTime - startTime)
+ " milliseconds to create index for the files in directory "
+ dataDir.getPath());
}
/**查询操作**/
public void query() throws IOException, ParseException {
//1、创建Directory
File indexDir = new File("F:\\luceneIndex");
FSDirectory directory = FSDirectory.open(indexDir);
//2、根据Directory创建DirectoryReader
DirectoryReader reader = DirectoryReader.open(directory);
System.out.println("numDocs: "+reader.numDocs());
System.out.println("maxDocs: "+reader.maxDoc());
System.out.println("deleteDocs: "+reader.numDeletedDocs());
//3、根据IndexReader创建IndexSearcher
IndexSearcher searcher = new IndexSearcher(reader);
//4、创建QueryParser对象
Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "content", luceneAnalyzer);
if(!indexDir.exists()){
System.out.println("The Lucene index is not exist");
return;
}
System.out.print("是否进一步查询?(1/0):");
Scanner readScanner = new Scanner(System.in);
int x = readScanner.nextInt();
if(x==1) {
readScanner.nextLine();
int n = 10;
while(n--!=0) {
String str = readScanner.nextLine();
//5、创建搜索的Query
Query query = parser.parse(str);
//6、根据Searcher搜索返回TopDocs然后获取ScoreDoc
ScoreDoc[] sd = searcher.search(query,1000).scoreDocs;
System.out.println("查找共有"+sd.length+"个结果");
//7、根据ScoreDoc获取具体Document对象并得到所需值
for (int i = 0; i < sd.length; i++) {
Document doc = searcher.doc(sd[i].doc);
System.out.println(doc.get("title")+" : "+doc.get("path"));
}
}
}
//8、关闭IndexReader和Directory
reader.close();
directory.close();
}
/**删除索引**/
public void delete() throws IOException {
Directory directory = FSDirectory.open(new File("F:\\luceneIndex"));
Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_CURRENT,luceneAnalyzer);
IndexWriter indexWriter = new IndexWriter(directory, config);
//删除参数是一个选项,可以是一个Query或者是一个term,term是一个精确查找值
//此时删除的文档并不会被完全删除,而是存储在一个"回收站"(del文件)中,是可以恢复的
indexWriter.deleteDocuments(new Term("title","test01"));
indexWriter.close();
directory.close();
}
/**索引恢复**/
public void undelete() throws IOException {
//使用IndexReader进行恢复,但是在4.10.2版本中已经没有了
System.out.println("目前版本无法恢复!");
}
/**强制优化**/
public void merge() throws IOException {
Directory directory = FSDirectory.open(new File("F:\\luceneIndex"));
Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_CURRENT,luceneAnalyzer);
IndexWriter indexWriter = new IndexWriter(directory, config);
//会将索引合并为1段,这1段中被删除的数据会被清空(强制删除)
//特别注意:此处Lucene在3.5之后不建议使用,因为会消耗大量的开销,Lucene会根据情况自动优化
indexWriter.forceMerge(1);
indexWriter.close();
directory.close();
}
/**更新索引**/
public void update() throws IOException {
Directory directory = FSDirectory.open(new File("F:\\luceneIndex"));
Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_CURRENT,luceneAnalyzer);
IndexWriter indexWriter = new IndexWriter(directory, config);
//update先删除后添加
File dataDir = new File("F:\\luceneData");
File[] dataFiles = dataDir.listFiles();
System.out.println("Indexing file " + dataFiles[1].getCanonicalPath());
Document document = new Document();
Reader txtReader = new FileReader(dataFiles[1]);
document.add(new Field("num",""+(1+2),StringField.TYPE_STORED));
document.add(new Field("content",txtReader,TextField.TYPE_NOT_STORED));
document.add(new Field("path",dataFiles[1].getCanonicalPath(),StringField.TYPE_STORED));
String title = dataFiles[1].getName();
document.add(new Field("title",title.substring(0, title.length()-4),TextField.TYPE_STORED));
indexWriter.updateDocument(new Term("title", "test02"), document);
indexWriter.close();
directory.close();
}
}
写一个test类方便测试,可以通过查询来看索引内容和文件的变化
package test;
import java.io.IOException;
import java.util.Scanner;
import org.apache.lucene.queryparser.classic.ParseException;
/**
 * Interactive console driver for {@link IndexUtil}: loops forever showing a
 * numbered menu and dispatching the chosen operation.
 */
public class Test {
    /**
     * Entry point. Reads an operation code from stdin on each iteration and
     * invokes the matching IndexUtil method. Runs until the process is killed.
     *
     * @param args unused
     * @throws IOException    propagated from the index operations
     * @throws ParseException propagated from query parsing
     */
    public static void main(String[] args) throws IOException, ParseException {
        IndexUtil iu = new IndexUtil();
        // Fix: create the Scanner ONCE. The original allocated a fresh
        // Scanner over System.in on every loop iteration.
        Scanner read = new Scanner(System.in);
        while (true) {
            System.out.println("1.索引建立");
            System.out.println("2.查询");
            System.out.println("3.删除索引");
            System.out.println("4.恢复删除");
            System.out.println("5.强制优化");
            System.out.println("6.更新索引");
            System.out.print("输入操作代号:");
            int o = read.nextInt();
            switch (o) {
                case 1:
                    iu.index();
                    break;
                case 2:
                    iu.query();
                    break;
                case 3:
                    iu.delete();
                    break;
                case 4:
                    iu.undelete();
                    break;
                case 5:
                    iu.merge();
                    break;
                case 6:
                    iu.update();
                    break;
                default:
                    // Unknown code: fall through and redisplay the menu.
                    break;
            }
        }
    }
}
二、索引查看工具Luke
这是一个很好用的索引查看工具,注意使用与Lucene对应的版本,这里我使用最新的4.10.2版本
选择好索引所在的文件夹就可以看到Lucene索引中分词、document等等各种信息了,可以很方便的核对自己的操作是否正确
三、关于Lucene4.10.2中的FieldType
在这里补充一点,关于以前版本所使用的new Field方法中使用的是否索引是否分词等信息的参数如下:
Field.Index | Field.Store | 说明 |
---|---|---|
TOKENIZED(分词) | YES | 被分词索引且存储 |
TOKENIZED | NO | 被分词索引但不存储 |
NO | YES | 这是不能被搜索的,它只是被搜索内容的附属物。如URL等 |
UN_TOKENIZED | YES/NO | 不被分词,它作为一个整体被搜索,搜一部分是搜不出来的 |
NO | NO | 没有这种用法 |
FieldType | 说明 |
---|---|
TextField.TYPE_STORED | 被分词索引且存储 |
TextField.TYPE_NOT_STORED | 被分词索引但不存储 |
StringField.TYPE_STORED | 不被分词,它作为一个整体被搜索,索引且存储 |
StringField.TYPE_NOT_STORED | 不被分词,它作为一个整体被搜索,索引但不存储 |
StoredField.TYPE | 这是不能被搜索的,它只是被搜索内容的附属物。如URL等 |
本文固定连接:http://blog.youkuaiyun.com/fyfmfof/article/details/42014201