I recently wanted to learn Lucene, so I downloaded the latest 3.5 release. It turns out to be pretty nice and well worth playing with.
The whole thing boils down to two steps.
1. Building the index
Let's start with how to build an index. The core classes are:
IndexWriterConfig: the configuration object for index creation; it holds an Analyzer.
IndexWriter: the class that actually writes the index.
Those two classes are really all you need to build an index.
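Before the full class, here is a minimal sketch of how the two classes fit together (the index path is just a placeholder):

package com.mingming.xue.lecene;
import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class MinimalIndexer {
    public static void main(String[] args) throws Exception {
        // The config carries the analyzer (plus open mode, merge policy and so on)
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_35, new StandardAnalyzer(Version.LUCENE_35));
        // The writer takes a Directory (where the index lives) and the config
        IndexWriter writer = new IndexWriter(FSDirectory.open(new File("/tmp/lucene/index")), config);
        // ... add documents here ...
        writer.close();
    }
}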
The complete code is below. It relies on a small FileSearchConstant helper (part of the attached project) for the index path, the directory to index and the field names.
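If you want to try the listing without the attachment, a minimal stand-in for FileSearchConstant could look like this (the paths and field names are placeholders, not the values from the attached project):

package com.mingming.xue.lecene;
public class FileSearchConstant {
    // Placeholder paths: point these at real directories on your machine
    public static final String FILE_INDEX = "/tmp/lucene/index"; // where the index is written
    public static final String FILE_DIR = "/tmp/lucene/docs";    // the files to be indexed
    // Field names shared by the indexer and the searcher
    public static final String PATH = "path";
    public static final String CONTENTS = "contents";
}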
package com.mingming.xue.lecene;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class FileIndexer {
private IndexWriter indexWriter;
// Directory where the index files are stored
private File fileIndex = new File(FileSearchConstant.FILE_INDEX);
// Directory containing the files to be indexed
private File fileDir = new File(FileSearchConstant.FILE_DIR);
// private static Logger logger = LoggerFactory.getLogger(FileIndexer.class);
public static void main(String[] args) throws IOException {
FileIndexer fileIndexer = new FileIndexer();
fileIndexer.buildIndex();
}
public void buildIndex() throws IOException {
boolean isCreate = true;
// Configuration for index creation; it holds the analyzer
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_35,
new StandardAnalyzer(Version.LUCENE_35));
// Set whether the writer creates a fresh index or appends to / updates an existing one
setModel(isCreate, indexWriterConfig);
// The index writer: the first argument is where the index is stored, the second is the configuration
indexWriter = new IndexWriter(FSDirectory.open(fileIndex), indexWriterConfig);
long startTime = System.currentTimeMillis();
// Build the index
indexDocs(fileDir, indexWriter);
// Very useful when documents have been added incrementally: it merges the scattered index segments into one
indexWriter.forceMerge(1);
// indexWriter.commit();
// Close the index writer (closing also commits)
indexWriter.close();
long endTime = System.currentTimeMillis();
System.out.println("cost :" + (endTime - startTime) + "seconds");
}
private void setModel(boolean isCreate, IndexWriterConfig indexWriterConfig) {
if (isCreate) {
indexWriterConfig.setOpenMode(OpenMode.CREATE);
} else {
indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
}
}
private void indexDocs(File fileDir, IndexWriter indexWriter) throws CorruptIndexException, IOException {
if (fileDir.canRead()) {
if (fileDir.isDirectory()) {
String[] listFiles = fileDir.list();
for (String file : listFiles) {
indexDocs(new File(fileDir, file), indexWriter);
}
} else {
// A Document represents one item in the index
Document doc = new Document();
// A Field is one indexed piece of the document; here we index the file path
Field pathField = new Field(FileSearchConstant.PATH, fileDir.getPath(), Field.Store.YES,
Field.Index.NOT_ANALYZED_NO_NORMS);
/* Two factors matter for how a term is weighted at search time:
* Term Frequency (tf): how often the term occurs within a document
* Document Frequency (df): how many documents contain the term; the higher df is, the less important the term
* DOCS_ONLY records only which documents contain each term, so term frequencies and positions are not stored
*/
pathField.setIndexOptions(IndexOptions.DOCS_ONLY);
doc.add(pathField);
// NumericField modifyField = new NumericField(IndexerConstant.MODIFIED);
// modifyField.setLongValue(fileDir.lastModified());
// doc.add(modifyField);
// Index the file contents
Field contentField = new Field(FileSearchConstant.CONTENTS, getContentByUtils(fileDir), Field.Store.YES,
Field.Index.ANALYZED);
// FileInputStream fileInputStream = getFileInputStream(fileDir);
// doc.add(new Field(IndexerConstant.CONTENTS, new BufferedReader(new InputStreamReader(fileInputStream,
// "UTF-8"))));
// fileInputStream.close();
contentField.setIndexOptions(IndexOptions.DOCS_ONLY);
doc.add(contentField);
// Add a new document, or update the existing one keyed by its path
if (indexWriter.getConfig().getOpenMode() == OpenMode.CREATE) {
indexWriter.addDocument(doc);
} else if (indexWriter.getConfig().getOpenMode() == OpenMode.CREATE_OR_APPEND) {
indexWriter.updateDocument(new Term(FileSearchConstant.PATH, fileDir.getPath()), doc);
}
}
}
}
public String getContentByUtils(File fileDir) {
String content = null;
try {
content = IOUtils.toString(new FileInputStream(fileDir), "UTF-8");
} catch (Exception e) {
e.printStackTrace();
}
return content;
}
/*
* Not recommended: readLine() drops the line separators, so the analyzer cannot tokenize the content
* correctly. Prefer getContentByUtils above.
*/
public String getContents(File fileDir) {
StringBuffer result = new StringBuffer();
BufferedReader reader = null;
try {
reader = new BufferedReader(new InputStreamReader(new FileInputStream(fileDir), "UTF-8"));
String temp = null;
while ((temp = reader.readLine()) != null) {
result.append(temp);
}
} catch (Exception e) {
e.printStackTrace();
return null;
} finally {
if (null != reader) {
try {
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
return result.toString();
}
}
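The commented-out NumericField lines in indexDocs() hint at also indexing each file's last-modified time. A possible sketch, assuming a MODIFIED field-name constant is added to FileSearchConstant (it is not part of the listing above):

// Inside indexDocs(), next to the path and contents fields
// requires: import org.apache.lucene.document.NumericField;
// NumericField stores numbers in a trie-encoded form so they can be range-queried efficiently
NumericField modifiedField = new NumericField(FileSearchConstant.MODIFIED); // MODIFIED is an assumed constant
modifiedField.setLongValue(fileDir.lastModified());
doc.add(modifiedField);

On the search side such a field could then be filtered with NumericRangeQuery.newLongRange(...).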
2. Searching the index
The code pretty much speaks for itself:
package com.mingming.xue.lecene;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class FileSearcher {
private String content = "haha";
public void search() throws IOException, ParseException {
// FSDirectory.open(new File(FileSearchConstant.FILE_INDEX)) picks the Directory implementation best suited to your OS: plain file I/O, memory-mapped, or NIO
IndexReader indexReader = IndexReader.open(FSDirectory.open(new File(FileSearchConstant.FILE_INDEX)));
IndexSearcher searcher = new IndexSearcher(indexReader);
// Build a parser for queries against the contents field
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);
QueryParser queryParser = new QueryParser(Version.LUCENE_35, FileSearchConstant.CONTENTS, analyzer);
// Parse the query string
Query query = queryParser.parse(content);
System.out.println("Searching for: " + query.toString());
// TopDocs holds the results; 10 is the maximum number of hits to return. Real applications usually need paging here (see the sketch after this class), so watch this spot!
TopDocs topDocs = searcher.search(query, 10);
ScoreDoc[] docs = topDocs.scoreDocs;
if (null != docs) {
for (int i = 0; i < docs.length; i++) {
ScoreDoc scoreDoc = docs[i];
// The key step: fetch the stored document for this hit
Document doc = searcher.doc(scoreDoc.doc);
// Pull out the stored fields we want
String contents = doc.get(FileSearchConstant.CONTENTS);
String path = doc.get(FileSearchConstant.PATH);
System.out.println(contents);
System.out.println(path);
}
}
// Release the searcher and the underlying reader
searcher.close();
indexReader.close();
}
public static void main(String[] args) throws IOException, ParseException {
FileSearcher fileSearcher = new FileSearcher();
fileSearcher.search();
}
}
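As noted next to searcher.search(query, 10), real applications usually page through results. In Lucene 3.5 a simple approach is to request enough hits to cover the requested page and then slice the ScoreDoc array; a minimal sketch that could be dropped into FileSearcher (pageIndex and pageSize are illustrative parameters, not part of the original code):

// pageIndex is zero-based; this reuses the imports already present in FileSearcher
public void searchPage(IndexSearcher searcher, Query query, int pageIndex, int pageSize) throws IOException {
    // Ask for enough hits to cover everything up to and including the requested page
    TopDocs topDocs = searcher.search(query, (pageIndex + 1) * pageSize);
    ScoreDoc[] hits = topDocs.scoreDocs;
    int start = pageIndex * pageSize;
    int end = Math.min(hits.length, start + pageSize);
    for (int i = start; i < end; i++) {
        // Fetch the stored document for each hit on this page
        Document doc = searcher.doc(hits[i].doc);
        System.out.println(doc.get(FileSearchConstant.PATH));
    }
}

Collecting (pageIndex + 1) * pageSize hits gets wasteful for deep pages; Lucene 3.5 also introduced IndexSearcher.searchAfter for deeper paging.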
The full code is in the attachment. The project is built with maven, so install maven first and then run:
mvn install
mvn eclipse:eclipse -DdownloadSources=true
After that you can browse the code (and the library sources) in Eclipse.