【Lucene学习笔记】索引的增删改查与查看工具
一、基本增删改查
添加和查询的方法在上一篇笔记中已经提到过了,这里就不做说明了,删除、恢复、强制优化等基本方法和说明都注释在代码里了,直接mark在这里吧
package test;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Date;
import java.util.Scanner;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class IndexUtil {
/**索引建立**/
public void index() throws IOException {
//1、创建Directory
Directory directory = FSDirectory.open(new File("F:\\luceneIndex"));
Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
//2、创建IndexWriter
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_CURRENT,luceneAnalyzer);
IndexWriter indexWriter = new IndexWriter(directory, config);
long startTime = new Date().getTime();
//dataDir is the directory that hosts the text files that to be indexed
File dataDir = new File("F:\\luceneData");
File[] dataFiles = dataDir.listFiles();
for(int i = 0; i < dataFiles.length; i++){
if(dataFiles[i].isFile() && dataFiles[i].getName().endsWith(".txt")){
System.out.println("Indexing file " + dataFiles[i].getCanonicalPath());
//3、创建Document对象
Document document = new Document();
Reader txtReader = new FileReader(dataFiles[i]);
//4、为Document对象添加Field
document.add(new Field("num",""+(i+1),StringField.TYPE_STORED));
document.add(new Field("content",txtReader,TextField.TYPE_NOT_STORED));
document.add(new Field("path",dataFiles[i].getCanonicalPath(),StringField.TYPE_STORED));
String title = dataFiles[i].getName();
document.add(new Field("title",title.substring(0, title.length()-4),TextField.TYPE_STORED));
//5、通过IndexWriter添加文档到索引中
indexWriter.addDocument(document);
}
}
//6、关闭IndexWriter和Directory
indexWriter.close();
directory.close();
long endTime = new Date().getTime();
System.out.println("It takes " + (endTime - startTime)
+ " milliseconds to create index for the files in directory "
+ dataDir.getPath());
}
/**查询操作**/
public void query() throws IOException, ParseException {
//1、创建Directory
File indexDir = new File("F:\\luceneIndex");
FSDirectory directory = FSDirectory.open(indexDir);
//2、根据Directory创建DirectoryReader
DirectoryReader reader = DirectoryReader.open(directory);
System.out.println("numDocs: "+reader.numDocs());
System.out.println("maxDocs: "+reader.maxDoc());
System.out.println("deleteDocs: "+reader.numDeletedDocs());
//3、根据IndexReader创建IndexSearcher
IndexSearcher searcher = new IndexSearcher(reader);
//4、创建QueryParser对象
Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "content", luceneAnalyzer);
if(!indexDir.exists()){
System.out.println("The Lucene index is not exist");
return;
}
System.out.print("是否进一步查询?(1/0):");
Scanner readScanner = new Scanner(System.in);
int x = readScanner.nextInt();
if(x==1) {
readScanner.nextLine();
int n = 10;
while(n--!=0) {
String str = readScanner.nextLine();
//5、创建搜索的Query
Query query = parser.parse(str);
//6、根据Searcher搜索返回TopDocs然后获取ScoreDoc
ScoreDoc[] sd = searcher.search(query,1000).scoreDocs;
System.out.println("查找共有"+sd.length+"个结果");
//7、根据ScoreDoc获取具体Document对象并得到所需值
for (int i = 0; i < sd.length; i++) {
Document doc = searcher.doc(sd[i].doc);
System.out.println(doc.get("title")+" : "+doc.get("path"));
}
}
}
//8、关闭IndexReader和Directory
reader.close();
directory.close();
}
/**删除索引**/
public void delete() throws IOException {
Directory directory = FSDirectory.open(new File("F:\\luceneIndex"));
Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_CURRENT,luceneAnalyzer);
IndexWriter indexWriter = new IndexWriter(directory, config);
//删除参数是一个选项,可以是一个Query或者是一个term,term是一个精确查找值
//此时删除的文档并不会被完全删除,而是存储在一个"回收站"(del文件)中,是可以恢复的
indexWriter.deleteDocuments(new Term("title","test01"));
indexWriter.close();
directory.close();
}
/**索引恢复**/
public void undelete() throws IOException {
//使用IndexReader进行恢复,但是在4.10.2版本中已经没有了
System.out.println("目前版本无法恢复!");
}
/**强制优化**/
public void merge() throws IOException {
Directory directory = FSDirectory.open(new File("F:\\luceneIndex"));
Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_CURRENT,luceneAnalyzer);
IndexWriter indexWriter = new IndexWriter(directory, config);
//会将索引合并为1段,这1段中被删除的数据会被清空(强制删除)
//特别注意:此处Lucene在3.5之后不建议使用,因为会消耗大量的开销,Lucene会根据情况自动优化
indexWriter.forceMerge(1);
indexWriter.close();
directory.close();
}
/**更新索引**/
public void update() throws IOException {
Directory directory = FSDirectory.open(new File("F:\\luceneIndex"));
Analyzer luceneAnalyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_CURRENT,luceneAnalyzer);
IndexWriter indexWriter = new IndexWriter(directory, config);
//update先删除后添加
File dataDir = new File("F:\\luceneData");
File[] dataFiles = dataDir.listFiles();
System.out.println("Indexing file " + dataFiles[1].getCanonicalPath());
Document document = new Document();
Reader txtReader = new FileReader(dataFiles[1]);
document.add(new Field("num",""+(1+2),StringField.TYPE_STORED));
document.add(new Field("content",txtReader,TextField.TYPE_NOT_STORED));
document.add(new Field("path",dataFiles[1].getCanonicalPath(),StringField.TYPE_STORED));
String title = dataFiles[1].getName();
document.add(new Field("title",title.substring(0, title.length()-4),TextField.TYPE_STORED));
indexWriter.updateDocument(new Term("title", "test02"), document);
indexWriter.close();
directory.close();
}
}
写一个test类方便测试,可以通过查询来看索引内容和文件的变化
package test;
import java.io.IOException;
import java.util.Scanner;
import org.apache.lucene.queryparser.classic.ParseException;
/**
 * Interactive console driver for {@link IndexUtil}: loops forever showing a
 * numbered menu and dispatching the chosen operation.
 */
public class Test {
    /**
     * Entry point. Reads an operation code from stdin on each iteration and
     * invokes the matching IndexUtil method. Runs until the process is killed.
     *
     * @param args unused
     * @throws IOException    propagated from the index operations
     * @throws ParseException propagated from query parsing
     */
    public static void main(String[] args) throws IOException, ParseException {
        IndexUtil iu = new IndexUtil();
        // Fix: create the Scanner ONCE. The original allocated a fresh
        // Scanner over System.in on every loop iteration.
        Scanner read = new Scanner(System.in);
        while (true) {
            System.out.println("1.索引建立");
            System.out.println("2.查询");
            System.out.println("3.删除索引");
            System.out.println("4.恢复删除");
            System.out.println("5.强制优化");
            System.out.println("6.更新索引");
            System.out.print("输入操作代号:");
            int o = read.nextInt();
            switch (o) {
                case 1:
                    iu.index();
                    break;
                case 2:
                    iu.query();
                    break;
                case 3:
                    iu.delete();
                    break;
                case 4:
                    iu.undelete();
                    break;
                case 5:
                    iu.merge();
                    break;
                case 6:
                    iu.update();
                    break;
                default:
                    // Unknown code: fall through and redisplay the menu.
                    break;
            }
        }
    }
}
二、索引查看工具Luke
这是一个很好用的索引查看工具,注意使用与Lucene对应的版本,这里我使用最新的4.10.2版本
选择好索引所在的文件夹就可以看到Lucene索引中分词、document等等各种信息了,可以很方便的核对自己的操作是否正确
三、关于Lucene4.10.2中的FieldType
在这里补充一点,关于以前版本所使用的new Field方法中使用的是否索引是否分词等信息的参数如下:
Field.Index | Field.Store | 说明 |
---|---|---|
TOKENIZED(分词) | YES | 被分词索引且存储 |
TOKENIZED | NO | 被分词索引但不存储 |
NO | YES | 这是不能被搜索的,它只是被搜索内容的附属物。如URL等 |
UN_TOKENIZED | YES/NO | 不被分词,它作为一个整体被搜索,搜一部分是搜不出来的 |
NO | NO | 没有这种用法 |
FieldType | 说明 |
---|---|
TextField.TYPE_STORED | 被分词索引且存储 |
TextField.TYPE_NOT_STORED | 被分词索引但不存储 |
StringField.TYPE_STORED | 不被分词,它作为一个整体被搜索,索引且存储 |
StringField.TYPE_NOT_STORED | 不被分词,它作为一个整体被搜索,索引但不存储 |
StoredField.TYPE | 这是不能被搜索的,它只是被搜索内容的附属物。如URL等 |
本文固定连接:http://blog.youkuaiyun.com/fyfmfof/article/details/42014201