import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.IOException;

public class Indexer {

  public static void main(String[] args) throws Exception {
    if (args.length != 2) {
      throw new IllegalArgumentException("Usage: java " + Indexer.class.getName()
        + " <index dir> <data dir>");
    }
    String indexDir = args[0];    //1
    String dataDir = args[1];     //2

    long start = System.currentTimeMillis();
    Indexer indexer = new Indexer(indexDir);
    int numIndexed;
    try {
      numIndexed = indexer.index(dataDir, new TextFilesFilter());
    } finally {
      indexer.close();
    }
    long end = System.currentTimeMillis();

    System.out.println("Indexing " + numIndexed + " files took "
      + (end - start) + " milliseconds");
  }

  private IndexWriter writer;

  public Indexer(String indexDir) throws IOException {
    Directory dir = FSDirectory.open(new File(indexDir));
    writer = new IndexWriter(dir,                 //3
        new StandardAnalyzer(                     //3
            Version.LUCENE_30),                   //3
        true,                                     //3
        IndexWriter.MaxFieldLength.UNLIMITED);    //3
  }

  public void close() throws IOException {
    writer.close();                               //4
  }

  public int index(String dataDir, FileFilter filter)
      throws Exception {
    File[] files = new File(dataDir).listFiles();
    for (File f : files) {
      if (!f.isDirectory() &&
          !f.isHidden() &&
          f.exists() &&
          f.canRead() &&
          (filter == null || filter.accept(f))) {
        indexFile(f);
      }
    }
    return writer.numDocs();                      //5
  }

  private static class TextFilesFilter implements FileFilter {
    public boolean accept(File path) {
      return path.getName().toLowerCase()         //6
                 .endsWith(".txt");               //6
    }
  }

  protected Document getDocument(File f) throws Exception {
    Document doc = new Document();
    doc.add(new Field("contents", new FileReader(f)));       //7
    doc.add(new Field("filename", f.getName(),               //8
        Field.Store.YES, Field.Index.NOT_ANALYZED));          //8
    doc.add(new Field("fullpath", f.getCanonicalPath(),      //9
        Field.Store.YES, Field.Index.NOT_ANALYZED));          //9
    return doc;
  }

  private void indexFile(File f) throws Exception {
    System.out.println("Indexing " + f.getCanonicalPath());
    Document doc = getDocument(f);
    writer.addDocument(doc);                      //10
  }
}
The indexing process
/*
#1 Create index in this directory
#2 Index *.txt files from this directory
#3 Create Lucene IndexWriter
#4 Close IndexWriter
#5 Return number of documents indexed
#6 Index .txt files only, using FileFilter
#7 Index file content
#8 Index file name
#9 Index file full path
#10 Add document to Lucene index
*/
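To confirm what an indexing run produced, the index can be reopened for reading. The sketch below is a minimal check, assuming the same directory that Indexer wrote to (the path here is only an illustration); IndexReader reports how many documents the index holds.

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import java.io.File;

public class IndexCheck {
  public static void main(String[] args) throws Exception {
    // Hypothetical path: reuse whatever <index dir> was passed to Indexer
    Directory dir = FSDirectory.open(new File("/tmp/index"));
    IndexReader reader = IndexReader.open(dir);             // opens the index read-only
    try {
      System.out.println("numDocs = " + reader.numDocs());  // live documents
      System.out.println("maxDoc  = " + reader.maxDoc());   // including deleted slots
    } finally {
      reader.close();
    }
  }
}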
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.IOException;

public class Searcher {

  public static void main(String[] args) throws IllegalArgumentException,
      IOException, ParseException {
    if (args.length != 2) {
      throw new IllegalArgumentException("Usage: java " + Searcher.class.getName()
        + " <index dir> <query>");
    }
    String indexDir = args[0];    //1
    String q = args[1];           //2
    search(indexDir, q);
  }

  public static void search(String indexDir, String q)
      throws IOException, ParseException {
    Directory dir = FSDirectory.open(new File(indexDir));   //3
    IndexSearcher is = new IndexSearcher(dir);               //3

    QueryParser parser = new QueryParser(Version.LUCENE_30,  //4
                                         "contents",         //4
                                         new StandardAnalyzer(   //4
                                             Version.LUCENE_30)); //4
    Query query = parser.parse(q);                            //4

    long start = System.currentTimeMillis();
    TopDocs hits = is.search(query, 10);                      //5
    long end = System.currentTimeMillis();

    System.err.println("Found " + hits.totalHits +            //6
      " document(s) (in " + (end - start) +                   //6
      " milliseconds) that matched query '" +                 //6
      q + "':");                                              //6

    for (ScoreDoc scoreDoc : hits.scoreDocs) {
      Document doc = is.doc(scoreDoc.doc);                    //7
      System.out.println(doc.get("fullpath"));                //8
    }
    is.close();                                               //9
  }
}
The search process
/*
#1 Parse provided index directory
#2 Parse provided query string
#3 Open index
#4 Parse query
#5 Search index
#6 Write search stats
#7 Retrieve matching document
#8 Display filename
#9 Close IndexSearcher
*/
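Step #4 above hands the raw query string to QueryParser. As a quick illustration of what the parser produces, the sketch below (the query strings are just examples) parses two expressions against the same "contents" field and prints the resulting Query objects:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Version;

public class QueryParserDemo {
  public static void main(String[] args) throws Exception {
    QueryParser parser = new QueryParser(Version.LUCENE_30, "contents",
        new StandardAnalyzer(Version.LUCENE_30));

    // A single term and a boolean expression -- example inputs only
    Query q1 = parser.parse("patent");
    Query q2 = parser.parse("patent AND trademark");

    System.out.println(q1.toString("contents"));  // patent
    System.out.println(q2.toString("contents"));  // +patent +trademark
  }
}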
Core classes of the Lucene indexing process

IndexWriter
The central component of the indexing process. It creates new indexes, opens existing ones, and adds, deletes, or updates documents in the index. It gives you write access to the index but cannot be used to read or search it; the index itself lives in storage allocated through a Directory.
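Besides addDocument(), which Indexer uses, IndexWriter also exposes the delete and update operations mentioned above. A minimal sketch, keyed on a "fullpath" field as in Indexer (the directory and path values are illustrative):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class WriterOps {
  public static void main(String[] args) throws Exception {
    IndexWriter writer = new IndexWriter(new RAMDirectory(),
        new StandardAnalyzer(Version.LUCENE_30),
        true, IndexWriter.MaxFieldLength.UNLIMITED);

    Document doc = new Document();
    doc.add(new Field("fullpath", "/data/a.txt",
        Field.Store.YES, Field.Index.NOT_ANALYZED));
    writer.addDocument(doc);                                     // add

    // Replace the document whose "fullpath" term matches (delete + add in one step)
    writer.updateDocument(new Term("fullpath", "/data/a.txt"), doc);

    // Delete every document matching the term
    writer.deleteDocuments(new Term("fullpath", "/data/a.txt"));

    writer.commit();   // make the changes visible to newly opened readers
    writer.close();
  }
}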
Directory
Its concrete subclasses determine where the index is physically stored. IndexWriter cannot index raw text by itself; the text must first be broken into individual tokens by an Analyzer.
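The Directory abstraction is what lets the same indexing code write either to disk or to memory. A small sketch of the two common subclasses (the on-disk path is only an example):

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;

import java.io.File;

public class DirectoryChoices {
  public static void main(String[] args) throws Exception {
    // Index files stored on disk under the given path (example path)
    Directory onDisk = FSDirectory.open(new File("/tmp/index"));

    // Index held entirely in memory -- handy for tests
    Directory inMemory = new RAMDirectory();

    // Either can be handed to an IndexWriter or IndexSearcher unchanged
    System.out.println(onDisk.getClass().getSimpleName());
    System.out.println(inMemory.getClass().getSimpleName());

    onDisk.close();
    inMemory.close();
  }
}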
Analyzer
Extracts lexical tokens from the text being indexed (rich or binary formats first need to be turned into plain text, for example with Tika).
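To see what "breaking text into tokens" means in practice, the sketch below runs StandardAnalyzer over a sample sentence (the text is made up) and prints each token it emits:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

import java.io.StringReader;

public class AnalyzerDemo {
  public static void main(String[] args) throws Exception {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);

    TokenStream stream = analyzer.tokenStream("contents",
        new StringReader("The quick brown fox jumped over the lazy dog"));
    TermAttribute term = stream.addAttribute(TermAttribute.class);

    // StandardAnalyzer lower-cases tokens and drops stop words such as "the"
    while (stream.incrementToken()) {
      System.out.println(term.term());
    }
    stream.close();
  }
}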
Document
Represents a collection of fields; it stands for a document, or for metadata associated with a document. Lucene only handles text extracted from (possibly binary) source documents in the form of Field instances, which are stored and indexed separately as the document's individual fields.

Field
Holds one unit of a Document's content as a name/value pair; each field independently controls whether its value is stored and whether it is indexed.
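A Document is assembled by adding Fields one at a time, as getDocument() above already shows. A condensed sketch of the common store/index combinations (field names and values are examples):

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

public class DocumentDemo {
  public static void main(String[] args) {
    Document doc = new Document();

    // Analyzed but not stored: searchable full text, value not retrievable later
    doc.add(new Field("contents", "the quick brown fox",
        Field.Store.NO, Field.Index.ANALYZED));

    // Stored and kept as a single token: exact-match lookups, value retrievable
    doc.add(new Field("filename", "fox.txt",
        Field.Store.YES, Field.Index.NOT_ANALYZED));

    // Stored only, not indexed: carried along with the document, not searchable
    doc.add(new Field("note", "example metadata",
        Field.Store.YES, Field.Index.NO));

    System.out.println(doc);
  }
}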
Core classes of the search process

IndexSearcher
The central gateway to the index when searching: it opens the index in read-only mode and works through a Directory instance to reach the previously created index.

Term
The basic unit of search. It consists of a pair of strings: a field name and a word (token).

Query
The common abstract base class of all concrete query types.

TermQuery
The most basic concrete query: it matches documents that contain a given term in a given field.
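Term and TermQuery go together: a Term names a field/word pair, and a TermQuery matches the documents containing exactly that pair, bypassing QueryParser entirely. A minimal sketch (the field name and value are examples); the resulting Query could be passed to IndexSearcher.search() just like the parsed query in Searcher:

import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

public class TermQueryDemo {
  public static void main(String[] args) {
    // A Term pairs a field name with a single, un-analyzed word
    Term t = new Term("filename", "fox.txt");

    // TermQuery matches documents containing exactly that term
    Query query = new TermQuery(t);

    // Prints filename:fox.txt
    System.out.println(query.toString());
  }
}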
TopDocs
A simple container of pointers to the top N ranked search results (the documents that matched the query). For each of the top N results it records an int docID (used to retrieve the document) and a float score.
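Searcher above only prints the stored "fullpath" of each hit; TopDocs also carries the score of every result. The sketch below builds a throwaway in-memory index and reads both (all field names and values are illustrative):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

public class TopDocsDemo {
  public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir,
        new StandardAnalyzer(Version.LUCENE_30),
        true, IndexWriter.MaxFieldLength.UNLIMITED);

    // Two tiny documents with an analyzed body and a stored path
    for (String path : new String[] {"/data/a.txt", "/data/b.txt"}) {
      Document doc = new Document();
      doc.add(new Field("contents", "lucene in action",
          Field.Store.NO, Field.Index.ANALYZED));
      doc.add(new Field("fullpath", path,
          Field.Store.YES, Field.Index.NOT_ANALYZED));
      writer.addDocument(doc);
    }
    writer.close();

    IndexSearcher searcher = new IndexSearcher(dir);
    TopDocs hits = searcher.search(new TermQuery(new Term("contents", "lucene")), 10);

    System.out.println("totalHits = " + hits.totalHits);
    for (ScoreDoc sd : hits.scoreDocs) {
      // docID retrieves the document; score is the float relevance value
      System.out.println(sd.doc + "  " + sd.score + "  "
          + searcher.doc(sd.doc).get("fullpath"));
    }
    searcher.close();
  }
}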