Lucene之Helloworld

本文介绍如何使用Lucene构建一个简易的文本搜索引擎,包括索引建立及搜索功能,并提供了完整的代码示例。
[b]Lucene[/b]不是一个完整的搜索引擎,不具备爬虫、管理界面之类的功能,但有不少项目基于它实现了完整的网站搜索引擎,[b][url=http://lucene.apache.org/nutch/]Nutch[/url][/b]就是其中之一,它是一个基于Lucene实现的搜索引擎应用.

本文记录自己学习过程中的点点滴滴,并实现一个简单的程序:
[b]Hello world 之实现文本搜索[/b]
这里没有用到中文分词,需要的话可以参照[url=http://code.google.com/p/paoding/]庖丁解牛[/url]项目,其SVN中已经上传了代码,其中有针对Lucene 3.0的版本,感兴趣的可自行试验.
SVN地址

svn checkout http://paoding.googlecode.com/svn/trunk/ paoding-read-only

项目是利用Maven构建的,自从开始用Maven就疯狂地爱上了它,个人推荐使用!
Maven pom.xml

<!-- Logging backend. -->
<dependency>
    <groupId>log4j</groupId>
    <artifactId>log4j</artifactId>
    <version>1.2.15</version>
    <!--
      The log4j 1.2.15 POM declares JMS/JMX artifacts that are not published
      to Maven Central, so dependency resolution fails on a clean repository.
      The example code only logs through commons-logging, so exclude them.
    -->
    <exclusions>
        <exclusion>
            <groupId>javax.jms</groupId>
            <artifactId>jms</artifactId>
        </exclusion>
        <exclusion>
            <groupId>com.sun.jdmk</groupId>
            <artifactId>jmxtools</artifactId>
        </exclusion>
        <exclusion>
            <groupId>com.sun.jmx</groupId>
            <artifactId>jmxri</artifactId>
        </exclusion>
    </exclusions>
</dependency>
<!-- Logging facade used by the example classes (Log / LogFactory). -->
<dependency>
    <groupId>commons-logging</groupId>
    <artifactId>commons-logging</artifactId>
    <version>1.1.1</version>
</dependency>
<!-- Lucene core: analysis, indexing and search APIs used by Index and Searcher. -->
<dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-core</artifactId>
    <version>3.0.0</version>
</dependency>


下面是建立索引的代码,数据是自己造的,这里提供[url=http://dl.iteye.com/topics/download/55c8c1a0-cd4d-3834-9a0f-3c70ca8030b1]下载[/url].
Index.java

import java.io.File;
import java.io.FileReader;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a Lucene index from the {@code .txt} files found directly inside a
 * data directory.
 *
 * <p>Every accepted file becomes one document with two fields:
 * {@code "contents"}, fed from a reader over the file, and {@code "filename"},
 * the file's canonical path, stored and not analyzed.
 *
 * @author ruodao
 * @since 1.0 2010-2-23 21:39:10
 */
public class Index {
    /** Index location used by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_INDEX_DIR = "E:\\Temp\\index";

    /** Data location used by {@link #main(String[])} when no second argument is given. */
    private static final String DEFAULT_DATA_DIR = "E:\\Temp\\data";

    private static final Log logger = LogFactory.getLog(Index.class);

    private final Directory directory;
    private final Analyzer analyzer;
    private final IndexWriter writer;

    /**
     * Indexes a data directory and prints how long it took.
     *
     * @param args optional: {@code args[0]} is the index directory and
     *             {@code args[1]} the data directory; the built-in defaults
     *             are used for whichever is missing
     * @throws Exception if the index cannot be opened, written or closed
     */
    public static void main(String[] args) throws Exception {
        String indexDir = args.length > 0 ? args[0] : DEFAULT_INDEX_DIR;
        String dataDir = args.length > 1 ? args[1] : DEFAULT_DATA_DIR;

        long start = System.currentTimeMillis();
        Index indexer = new Index(indexDir);
        int numIndex;
        try {
            numIndex = indexer.index(dataDir);
        } finally {
            // Always close the writer, even when indexing fails part-way.
            indexer.close();
        }
        long end = System.currentTimeMillis();

        System.out.println("Indexing " + numIndex + " files tooks "
                + (end - start) + " millisenconds");
    }

    /**
     * Opens an index writer on the given directory.
     *
     * @param indexDir file-system path of the index directory
     * @throws Exception if the directory or the writer cannot be opened
     */
    public Index(String indexDir) throws Exception {
        directory = FSDirectory.open(new File(indexDir));
        analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        writer = new IndexWriter(directory, analyzer, MaxFieldLength.UNLIMITED);
    }

    /**
     * Closes the index writer and then the underlying directory.
     *
     * @throws Exception if closing either resource fails
     */
    public void close() throws Exception {
        try {
            writer.close();
        } finally {
            directory.close();
        }
    }

    /**
     * Indexes every regular, visible, readable file directly inside
     * {@code dataDir} that {@link #acceptFile(File)} accepts. Sub-directories
     * are not descended into.
     *
     * @param dataDir path of the directory holding the files to index
     * @return the value reported by {@code writer.numDocs()} after indexing
     * @throws IllegalArgumentException if {@code dataDir} cannot be listed
     * @throws Exception if reading a file or writing to the index fails
     */
    public int index(String dataDir) throws Exception {
        File[] files = new File(dataDir).listFiles();
        if (files == null) {
            // listFiles() returns null for a missing path, a non-directory or an I/O error.
            throw new IllegalArgumentException(
                    "Not a readable directory: " + dataDir);
        }
        for (File f : files) {
            if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead()
                    && acceptFile(f)) {
                indexFile(f);
            }
        }
        // NOTE(review): numDocs() is the writer's document count; presumably this
        // exceeds the number of files handled here when the index already held
        // documents. Confirm against the IndexWriter documentation.
        return writer.numDocs();
    }

    /**
     * Decides whether a file should be indexed.
     *
     * @param f candidate file
     * @return {@code true} when the file name ends with {@code ".txt"}
     */
    protected boolean acceptFile(File f) {
        return f.getName().endsWith(".txt");
    }

    /**
     * Builds the document for one file.
     *
     * @param f file to index
     * @return a document with a {@code "contents"} field backed by a reader
     *         over the file and a stored, non-analyzed {@code "filename"}
     *         field holding the canonical path
     * @throws Exception if the path cannot be resolved or the file cannot be opened
     */
    protected Document getDocument(File f) throws Exception {
        // Resolve the path before opening the reader, so a failure here
        // cannot leave an open file handle behind.
        String canonicalPath = f.getCanonicalPath();

        Document doc = new Document();
        doc.add(new Field("contents", new FileReader(f)));
        doc.add(new Field("filename", canonicalPath, Store.YES,
                org.apache.lucene.document.Field.Index.NOT_ANALYZED));
        return doc;
    }

    /**
     * Adds one file to the index and, when debug logging is enabled, logs the
     * terms the analyzer produces for it.
     */
    private void indexFile(File f) throws Exception {
        System.out.println("Index " + f.getCanonicalPath());

        Document doc = getDocument(f);
        if (doc == null) {
            // An overriding getDocument may opt out of a file.
            return;
        }
        writer.addDocument(doc);

        // Optional diagnostics: skip the second read of the file unless the
        // output would actually be logged.
        if (logger.isDebugEnabled()) {
            logTokens(f);
        }
    }

    /** Logs each term the analyzer emits for the {@code "contents"} of the file. */
    private void logTokens(File f) throws Exception {
        FileReader reader = new FileReader(f);
        try {
            TokenStream ts = analyzer.tokenStream("contents", reader);
            TermAttribute ta = ts.addAttribute(TermAttribute.class);
            while (ts.incrementToken()) {
                logger.debug("{" + ta.term() + "}");
            }
        } finally {
            // Release the file handle opened for diagnostics.
            reader.close();
        }
    }
}


索引数据准备好了,也该提供给别人使用了,下面是一个简单的搜索程序.
Searcher.java

import java.io.File;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Runs a query against the {@code "contents"} field of a Lucene index and
 * prints the stored {@code "filename"} of each hit.
 *
 * @author ruodao
 * @since 1.0 2010-2-23 22:19:06
 */
public class Searcher {
    /** Index location used by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_INDEX_DIR = "E:\\Temp\\index";

    /** Query used by {@link #main(String[])} when no second argument is given. */
    private static final String DEFAULT_QUERY = "中";

    /** Maximum number of hits requested from the searcher. */
    private static final int MAX_HITS = 10;

    /**
     * Searches an index and prints the matching file names.
     *
     * @param args optional: {@code args[0]} is the index directory and
     *             {@code args[1]} the query string; the built-in defaults are
     *             used for whichever is missing
     * @throws Exception if the index cannot be opened or the query cannot be parsed
     */
    public static void main(String[] args) throws Exception {
        String indexDir = args.length > 0 ? args[0] : DEFAULT_INDEX_DIR;
        String q = args.length > 1 ? args[1] : DEFAULT_QUERY;

        search(indexDir, q);
    }

    /**
     * Parses {@code q} with a {@link StandardAnalyzer}, requests up to
     * {@link #MAX_HITS} hits and prints a summary line to standard error
     * followed by one file name per hit on standard output.
     */
    private static void search(String indexDir, String q) throws Exception {
        Directory dir = FSDirectory.open(new File(indexDir));
        try {
            IndexSearcher is = new IndexSearcher(dir);
            try {
                Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
                QueryParser parser = new QueryParser(Version.LUCENE_CURRENT,
                        "contents", analyzer);
                Query query = parser.parse(q);

                // Time only the search itself, not parsing or result printing.
                long start = System.currentTimeMillis();
                TopDocs hits = is.search(query, MAX_HITS);
                long end = System.currentTimeMillis();

                System.err.println("Found " + hits.totalHits + " Document(s) (in "
                        + (end - start) + " milliseconds) that matched query '" + q
                        + "':");
                for (ScoreDoc scoreDoc : hits.scoreDocs) {
                    Document doc = is.doc(scoreDoc.doc);
                    System.out.println(doc.get("filename"));
                }
            } finally {
                // Close the searcher even when parsing or searching fails.
                is.close();
            }
        } finally {
            dir.close();
        }
    }
}

一个简单而完整的程序已经完成了,可以动手试验一下.

--EOF--
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值