Lucene的主要逻辑图
Lucene包括两块:一是文本内容经切词后索入库;二是根据查询条件返回结果。
根据这个图来实现非常简单,如下
创建索引:
public String createIndex() throws Exception {
// 实例化分词器,使用的是中文分词器
Analyzer analyzer = new PaodingAnalyzer();
// 指定要保存的文件路径并保存到FSDirectory中
FSDirectory directory = FSDirectory.getDirectory(URLDecoder.decode(AnalyzerAction.class
.getResource("/date/index/article/").toString(),"UTF-8").substring(6));
// true表示覆盖原来已经创建的索引,如果是false表示不覆盖,而是继续添加索引
IndexWriter writer = new IndexWriter(directory, analyzer, true);
List list = articleManager.articleList();
for (Iterator it = list.iterator(); it.hasNext();) {
Document doc = new Document();
Article article = (Article) it.next();
doc.add(new Field("id", String.valueOf(article.getId()), Field.Store.YES,
Field.Index.UN_TOKENIZED));
doc.add(new Field("article_title", article.getArticleTitle(), Field.Store.YES,
Field.Index.TOKENIZED));
String content = FunctionUtil.Html2Text(article.getArticleContent());
doc.add(new Field("article_content", content, Field.Store.YES,
Field.Index.TOKENIZED));
writer.addDocument(doc);
}
writer.optimize();
writer.close();
}
搜索结果:
/**
* 通过关键词 得到结果
*/
public void searchIndex(String path, String keywords) throws Exception {
FSDirectory directory = FSDirectory.getDirectory(path);
IndexReader reader = IndexReader.open(directory);
Searcher searcher = new IndexSearcher(directory);
// MultiFieldQueryParser.parse中的参数分别为:
// 1.关键词
// 2.要查询的字段,字符串类型的数组
String[] field = { "article_title", "article_content" };
// 3.两个字段的关系(与或非)
BooleanClause.Occur[] flags = new BooleanClause.Occur[] {
BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD };
// 4.指明分词的时候所用的分词器
Analyzer analyzer = new PaodingAnalyzer();
Query query = MultiFieldQueryParser.parse(keywords, field, flags,
analyzer);
// 由于我们目前使用的查询是多字段查询,需要匹配度的排序
// QueryScorer内置计分器
query.rewrite(reader);// 用于重写query对象,目的能够让计分器识别当前的query.
// 获得结果集
Hits hits = searcher.search(query);
for (int i = 0; i < hits.length(); i++) {
Document doc = hits.doc(i);
Article article = new Article();
article.setId(Integer.valueOf(doc.get("id")));
// title
String title = doc.get("article_title");
// content
String content = doc.get("article_content");
// 以上两项需要加亮
// Highlighter的构造函数中需要添加两个参数
// 1.高亮文字的格式(这个格式是基于html)
SimpleHTMLFormatter simpleHTMLFOrmatter = new SimpleHTMLFormatter(
"<font color=red>", "</font>");
// 2.计分器
Highlighter highlighter = new Highlighter(simpleHTMLFOrmatter,
new QueryScorer(query));
// 关键字附近字符串的截取,截取120个字
Fragmenter fragmenter = new SimpleFragmenter(120);
highlighter.setTextFragmenter(fragmenter);
// 针对某个字段的加亮以及截取
TokenStream tokenStream = analyzer.tokenStream("article_content",
new StringReader(content));
//将加亮并截取的字符串取出来
String highLightText = highlighter.getBestFragment(tokenStream, content);
article.setArticleContent(highLightText);
// 针对某个字段的加亮以及截取
TokenStream title_tokenStream = analyzer.tokenStream("article_title",
new StringReader(title));
//将加亮并截取的字符串取出来
String title_highLightText = highlighter.getBestFragment(title_tokenStream, title);
article.setArticleTitle(title_highLightText);
searcheResult.add(article);
}
reader.close();
}