第一个Lucene例子,使用lucene-4.0.0,中文查询没有结果。
1.创建索引
package lucene.index;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Builds a Lucene index over the {@code .txt} files of a directory.
 * <p>
 * Step 1: create the Lucene {@link IndexWriter}.<br>
 * Step 2: index the documents.
 */
public class Indexer {
    /** Directory in which the index is stored. */
    private final String indexDir = "F:/project/Lucene/index";

    /** Directory containing the documents to index. */
    private final String dataDir = "F:/project/Lucene/docs";

    /**
     * Whether the index is built from scratch ({@code true}) or an existing
     * index is opened and updated ({@code false}).
     */
    private final boolean create = true;

    /** Storage location of the index; closed together with the writer. */
    private final Directory directory;

    /**
     * Creates or opens the index and adds, deletes or updates documents in it.
     * It only writes to the index; it cannot read or search it.
     */
    private final IndexWriter writer;

    /**
     * Creates the Lucene index writer.
     * <ol>
     * <li>Open the {@link Directory} where the index lives.</li>
     * <li>Create the {@link Analyzer}.</li>
     * <li>Configure an {@link IndexWriterConfig} with that analyzer.</li>
     * <li>Create the {@link IndexWriter} from the directory and the config.</li>
     * </ol>
     *
     * @throws IOException if the index directory cannot be opened or the
     *                     writer cannot be created
     */
    public Indexer() throws IOException {
        // Directory is abstract; the concrete subclass decides how and where
        // the index is stored.
        Directory dir = FSDirectory.open(new File(indexDir));
        IndexWriter opened = null;
        try {
            // The analyzer extracts tokens from the text being indexed and
            // discards everything else.
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer);
            if (create) {
                iwc.setOpenMode(OpenMode.CREATE);
            } else {
                iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
            }
            iwc.setInfoStream(System.out);
            opened = new IndexWriter(dir, iwc);
        } finally {
            // Do not leave the directory open when the writer could not be created.
            if (opened == null) {
                dir.close();
            }
        }
        directory = dir;
        writer = opened;
    }

    /**
     * Closes the index writer and then the underlying directory.
     *
     * @throws IOException if closing the writer or the directory fails
     */
    public void close() throws IOException {
        try {
            writer.close();
        } finally {
            // IndexWriter.close() does not close the Directory it writes to.
            directory.close();
        }
    }

    /**
     * Indexes every readable, non-hidden {@code .txt} file directly inside
     * the data directory.
     * <ol>
     * <li>List the files of the data directory.</li>
     * <li>Skip everything that is not an indexable {@code .txt} file.</li>
     * <li>Open the file and build a {@link Document} with several fields.</li>
     * <li>Add the document (fresh index) or replace the previous version of it.</li>
     * <li>Close the input stream.</li>
     * <li>Return the document count reported by the writer.</li>
     * </ol>
     *
     * @return the value of {@code writer.numDocs()} after indexing
     * @throws IOException if the data directory cannot be listed
     * @throws Exception   if reading a file or writing to the index fails
     */
    public int index() throws Exception {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or an
            // I/O error occurs; fail with a message instead of an NPE.
            throw new IOException("Cannot list data directory: " + dataDir);
        }
        boolean freshIndex = writer.getConfig().getOpenMode() == OpenMode.CREATE;
        for (File f : files) {
            if (!isIndexable(f)) {
                continue;
            }
            System.out.println("Indexing " + f.getCanonicalPath());
            FileInputStream fis = new FileInputStream(f);
            try {
                Document doc = buildDocument(f, fis);
                if (freshIndex) {
                    System.out.println("adding " + f);
                    writer.addDocument(doc);
                } else {
                    System.out.println("updating " + f);
                    // The delete term must name a field that is actually
                    // indexed with exactly this value; "fullpath" holds the
                    // canonical path as a single un-analyzed term.
                    writer.updateDocument(new Term("fullpath", f.getCanonicalPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
        return writer.numDocs();
    }

    /** Tells whether the given entry is a readable, visible {@code .txt} file. */
    private boolean isIndexable(File f) {
        return !f.isDirectory()
                && !f.isHidden()
                && f.exists()
                && f.canRead()
                && f.getName().toLowerCase().endsWith(".txt");
    }

    /**
     * Builds the Lucene document for one file.
     * <p>
     * A {@link Document} is a collection of fields. Each field has a name, a
     * value and options controlling how Lucene indexes it.
     *
     * @param f   the file being indexed
     * @param fis an open stream over the contents of {@code f}; the caller closes it
     * @return the document to hand to the index writer
     * @throws IOException if the canonical path cannot be resolved
     */
    private Document buildDocument(File f, FileInputStream fis) throws IOException {
        Document doc = new Document();
        // The file bytes are always decoded as UTF-8.
        // NOTE(review): a file saved in another encoding (e.g. GBK on a Chinese
        // Windows system) would be indexed as garbled text - presumably one
        // reason non-ASCII queries find nothing; verify the file encodings.
        doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));
        doc.add(new StringField("filename", f.getName(), Field.Store.YES));
        doc.add(new StringField("fullpath", f.getCanonicalPath(), Field.Store.YES));
        doc.add(new LongField("modified", f.lastModified(), Field.Store.NO));
        return doc;
    }

    /**
     * Indexes the data directory and prints the document count and the
     * elapsed time.
     *
     * @param args ignored
     * @throws Exception if indexing fails
     */
    public static void main(String[] args) throws Exception {
        Indexer indexer = null;
        int numIndexed;
        long start = System.currentTimeMillis();
        try {
            indexer = new Indexer();
            numIndexed = indexer.index();
        } finally {
            if (indexer != null) {
                indexer.close();
            }
        }
        long end = System.currentTimeMillis();
        System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds");
    }
}
2.搜索
package lucene.index;
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/**
 * Searches the index.
 * <p>
 * Step 1: create the {@link IndexReader}.<br>
 * Step 2: create the {@link IndexSearcher}.<br>
 * Step 3: create the {@link Query}.<br>
 * Step 4: run {@code searcher.search}.
 */
public class Searcher {
    /** Directory in which the index is stored. */
    private String indexDir = "F:/project/Lucene/index";

    /**
     * Runs a query against one field and prints the stored {@code fullpath}
     * and {@code filename} of up to ten matching documents.
     *
     * @param where name of the field to search, e.g. {@code contents} or
     *              {@code filename}
     * @param q     the query text, in the classic query parser syntax
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if {@code q} is not a valid query
     */
    public void search(String where, String q) throws IOException, ParseException {
        Directory dir = FSDirectory.open(new File(indexDir));
        try {
            IndexReader reader = DirectoryReader.open(dir);
            try {
                // IndexSearcher searches the index through the reader.
                IndexSearcher searcher = new IndexSearcher(reader);

                // The query text is analyzed with StandardAnalyzer before
                // it is turned into a Query.
                // NOTE(review): the Indexer shown earlier in this post writes
                // "filename" and "fullpath" as un-analyzed StringFields, so
                // analyzed query text (lower-cased, possibly split into
                // several tokens) presumably cannot match such values when
                // they contain upper-case or CJK characters. An exact-match
                // new TermQuery(new Term(where, q)) may be the better choice
                // for those fields - confirm before switching.
                Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
                QueryParser parser = new QueryParser(Version.LUCENE_40, where, analyzer);
                Query query = parser.parse(q);

                long start = System.currentTimeMillis();
                // TopDocs is a simple container pointing at the top N ranked hits.
                TopDocs hits = searcher.search(query, null, 10);
                long end = System.currentTimeMillis();

                System.err.println("Found " + hits.totalHits + " document(s) (in " + (end - start) + " milliseconds) that matched query '" + q + "':");
                for (ScoreDoc scoreDoc : hits.scoreDocs) {
                    Document doc = searcher.doc(scoreDoc.doc);
                    System.out.println(doc.get("fullpath"));
                    System.out.println(doc.get("filename"));
                }
            } finally {
                // Release the index files held by the reader.
                reader.close();
            }
        } finally {
            // Closing the reader does not close the Directory it was opened on.
            dir.close();
        }
    }

    /**
     * Runs two sample searches: one on the file name, one on the contents.
     *
     * @param args ignored
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if a sample query cannot be parsed
     */
    public static void main(String[] args) throws IOException, ParseException {
        Searcher searcher = new Searcher();
        searcher.search("filename", "b.txt");
        searcher.search("contents", "abc");
    }
}

被折叠的 条评论
为什么被折叠?



