lucene小例子

最新推荐文章于 2025-11-25 12:11:25 发布

原创 · 最新推荐文章于 2025-11-25 12:11:25 发布 · 108 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#java

lucene 专栏收录该内容

3 篇文章

订阅专栏

本文介绍了使用 Lucene 2.4 进行文档索引和搜索的操作流程，包括实体与文档的转换、索引目录建立、搜索方法实现以及高亮显示结果。详细阐述了如何通过 Lucene 的查询解析器和搜索器进行全文检索，并展示了高亮显示匹配关键词的文本。此外，还提供了 CRUD 操作示例，包括创建、删除、更新索引和分页搜索功能。

1. 环境 lucene 2.4

a. 实体 Article.java

public class Article {
	private Long id;
	private String title;
	private String content;

	public Long getId() {
		return id;
	}
	public void setId(Long id) {
		this.id = id;
	}
	public String getTitle() {
		return title;
	}
	public void setTitle(String title) {
		this.title = title;
	}
	public String getContent() {
		return content;
	}
	public void setContent(String content) {
		this.content = content;
	}
}

b. 将实体和 document 转换的类

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;

// logForj
/**
 * Converts between {@link Article} entities and Lucene {@link Document}s.
 * <p>
 * Field layout: {@code id} (stored, one un-analyzed term in {@link NumberTools} string form),
 * {@code title} (stored, analyzed, boost 2.0) and {@code content} (stored, analyzed).
 */
public class ArticleDocumentUtils {

	/**
	 * Article --> Document.
	 * <p>
	 * A {@code null} title or content is simply left out of the document, because Lucene's
	 * {@link Field} constructor rejects a null value.
	 *
	 * @param article the entity to convert; its id must not be {@code null}
	 * @return a new document carrying the article's non-null properties
	 * @throws NullPointerException if {@code article} or its id is {@code null}
	 */
	public static Document article2Document(Article article) {
		Long id = article.getId();
		if (id == null) {
			// Same exception type as the former implicit unboxing failure, but with a usable message.
			throw new NullPointerException("article id must not be null");
		}

		Document doc = new Document();

		// The id is written as one exact term; anything that looks an article up by id
		// has to use the same NumberTools.longToString form.
		doc.add(new Field("id", NumberTools.longToString(id.longValue()), Store.YES, Index.NOT_ANALYZED));

		String title = article.getTitle();
		if (title != null) {
			Field titleField = new Field("title", title, Store.YES, Index.ANALYZED);
			titleField.setBoost(2.0F); // the default boost is 1.0F
			doc.add(titleField);
		}

		String content = article.getContent();
		if (content != null) {
			doc.add(new Field("content", content, Store.YES, Index.ANALYZED));
		}

		return doc;
	}

	/**
	 * Document --> Article.
	 * <p>
	 * Fields that are missing from the document leave the matching property {@code null}
	 * instead of failing with a NullPointerException.
	 *
	 * @param doc the document to convert
	 * @return a new article populated from the {@code id}, {@code title} and {@code content} fields
	 */
	public static Article document2Article(Document doc) {
		Article article = new Article();

		// Document.get(name) yields null for an absent field, unlike getField(name).stringValue().
		String idText = doc.get("id");
		if (idText != null) {
			article.setId(Long.valueOf(NumberTools.stringToLong(idText)));
		}
		article.setTitle(doc.get("title"));
		article.setContent(doc.get("content"));

		return article;
	}
}

// Converting a date-typed property (two separate fragments; presumably one belongs in
// article2Document and the other in document2Article -- confirm where they are pasted).
// NOTE(review): the Article class shown above has no postTime property; getPostTime/setPostTime
// would have to be added for these lines to compile.
// Write side: the date is rendered as a string at SECOND resolution and added as a field
// that is stored (Store.YES) but not indexed (Index.NO).
doc.add(new Field("postTime", DateTools.dateToString(article.getPostTime(), Resolution.SECOND), Store.YES, Index.NO));

// Read side: turn the stored string back into a date value.
article.setPostTime(DateTools.stringToDate(doc.get("postTime")));

c. 测试方法 HelloWorld.java

import java.util.ArrayList;
import java.util.List;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.junit.Test;

// Lucene 2.4
/**
 * Minimal Lucene 2.4 walkthrough: {@link #createIndex()} adds one sample article to the index
 * and {@link #search()} queries the title and content fields and prints the matches.
 */
public class HelloWorld {

	// Directory that holds the index
	private String indexPath = "./index/";
	// Analyzer used for both indexing and query parsing
	// (alternative: private Analyzer analyzer = new StandardAnalyzer();)
	private Analyzer analyzer = new MMAnalyzer();

	/**
	 * Builds the index: converts a sample article into a document and adds it to the index directory.
	 *
	 * @throws Exception if the index cannot be opened or written
	 */
	@Test
	public void createIndex() throws Exception {
		// Simulate an article that already exists
		Article article = new Article();
		article.setId(1L);
		article.setTitle("小笑话 -- 牛人发帖");
		article
				.setContent("有一牛人发一帖，然后马上就用发帖id疯狂回复自己的帖子：自己回帖1：楼主太有才了自己回帖2：楼主说的不错，挺有道理的自己回帖3：楼主真是太牛了，好崇拜你.最后终于有人回复他的帖子： 我靠，好歹你也换个id啊");

		// article --> Document
		Document doc = ArticleDocumentUtils.article2Document(article);

		// Add the document to the index
		IndexWriter indexWriter = new IndexWriter(indexPath, analyzer,
				MaxFieldLength.LIMITED);
		try {
			indexWriter.addDocument(doc);
		} finally {
			// Close even when addDocument fails, so the writer (and its lock on the index) is released.
			indexWriter.close();
		}
	}

	/**
	 * Searches the title and content of all indexed articles and prints every match.
	 *
	 * @throws Exception if the query cannot be parsed or the index cannot be read
	 */
	@Test
	public void search() throws Exception {
		// Alternative query: "笑话"
		String queryString = "幽默";

		// =============================================
		// 1. queryString --> Query (comparable to HQL --> Hibernate Query)
		String[] fields = new String[] { "title", "content" };
		QueryParser queryParser = new MultiFieldQueryParser(fields, analyzer);
		Query query = queryParser.parse(queryString);

		// 2. Run the search against the index directory
		List<Article> list = new ArrayList<Article>();
		IndexSearcher indexSearcher = new IndexSearcher(indexPath);
		try {
			// TopDocs wraps the result: totalHits is the number of matches,
			// scoreDocs holds the returned hits (each ScoreDoc only carries the internal document number).
			TopDocs topDocs = indexSearcher.search(query, null, 100);

			// 3. Process the result
			System.out.println("匹配的结果的数量：" + topDocs.totalHits);
			for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
				int docSn = scoreDoc.doc; // internal document number
				Document doc = indexSearcher.doc(docSn); // load the Document by its internal number
				list.add(ArticleDocumentUtils.document2Article(doc));
			}
		} finally {
			// Documents must be loaded before this point; close even when the search fails.
			indexSearcher.close();
		}
		// =============================================

		// Print the result
		for (Article a : list) {
			System.out.println("---------------------------> " + a.getId());
			System.out.println("Id	= " + a.getId());
			System.out.println("Title	= " + a.getTitle());
			System.out.println("Content	= " + a.getContent());
		}
	}
}

d. 高亮器测试 HighLighterTest.java

import java.util.ArrayList;
import java.util.List;

import jeasy.analysis.MMAnalyzer;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.junit.Test;

import cn.itcast.lucene.helloworld.Article;
import cn.itcast.lucene.helloworld.ArticleDocumentUtils;

/**
 * Demonstrates the Lucene highlighter: searches the title and content fields and replaces each
 * hit's content with the best fragment in which the matched keywords are wrapped in markup.
 */
public class HighLighterTest {

	// Directory that holds the index
	private String indexPath = "./index/";
	// Analyzer used for query parsing and for highlighting
	private Analyzer analyzer = new MMAnalyzer();// new StandardAnalyzer();

	/**
	 * Runs one highlighted search and prints the resulting articles.
	 *
	 * @throws Exception if the query cannot be parsed or the index cannot be read
	 */
	@Test
	public void test() throws Exception {
		String queryString = "回帖";
		// Alternative query: "幽默"

		// =============================================
		// 1. queryString --> Query
		QueryParser queryParser = new MultiFieldQueryParser(new String[] { "title", "content" }, analyzer);
		Query query = queryParser.parse(queryString);

		// ===================== Set up the highlighter (only needs the query, not the searcher)
		Formatter formatter = new SimpleHTMLFormatter("<span class='keyword'>", "</span>");// defaults are <b> and </b>
		Scorer scorer = new QueryScorer(query);
		Highlighter highlighter = new Highlighter(formatter, scorer);

		Fragmenter fragmenter = new SimpleFragmenter(50); // default fragment size is 100
		highlighter.setTextFragmenter(fragmenter);
		// =====================

		// 2. Run the search against the index directory
		List<Article> list = new ArrayList<Article>();
		IndexSearcher indexSearcher = new IndexSearcher(indexPath);
		try {
			// TopDocs wraps the result: totalHits is the number of matches,
			// scoreDocs holds the returned hits (each ScoreDoc only carries the internal document number).
			TopDocs topDocs = indexSearcher.search(query, null, 100);

			// 3. Process the result
			System.out.println("匹配的结果的数量：" + topDocs.totalHits);
			for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
				int docSn = scoreDoc.doc; // internal document number
				Document doc = indexSearcher.doc(docSn); // load the Document by its internal number

				// =================== Apply the highlighter
				// Highlighting only changes this in-memory Document, not the indexed data.
				// getBestFragment returns null when the text contains none of the keywords,
				// in which case the original content is kept.
				String ht = highlighter.getBestFragment(analyzer, "content", doc.get("content"));
				if (ht != null) {
					doc.getField("content").setValue(ht);
				}
				// ===================

				list.add(ArticleDocumentUtils.document2Article(doc));
			}
		} finally {
			// Close even when searching or highlighting fails, so the searcher is not leaked.
			indexSearcher.close();
		}
		// =============================================

		// Print the result
		for (Article a : list) {
			System.out.println("---------------------------> " + a.getId());
			System.out.println("Id	= " + a.getId());
			System.out.println("Title	= " + a.getTitle());
			System.out.println("Content	= " + a.getContent());
		}
	}
}

e. CRUD 操作 IndexDao.java

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.NumberTools;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

import cn.itcast.lucene.helloworld.Article;
import cn.itcast.lucene.helloworld.ArticleDocumentUtils;

/**
 * Data-access object for the article index: save, delete, update and paged search.
 * <p>
 * Every failure is rethrown as a {@link RuntimeException} wrapping the original cause.
 */
public class IndexDao {

	// Directory that holds the index
	private String indexPath = "./index/";
	// Analyzer used for indexing and query parsing
	// NOTE(review): the HelloWorld example builds the same index directory with MMAnalyzer;
	// confirm that indexing and searching are meant to use the same analyzer.
	private Analyzer analyzer = new StandardAnalyzer();

	/**
	 * Adds the article to the index.
	 *
	 * @param article the article to index
	 */
	public void save(Article article) {
		// 1. article --> Document
		Document doc = ArticleDocumentUtils.article2Document(article);

		// 2. indexWriter.addDocument(doc)
		IndexWriter indexWriter = null;
		try {
			indexWriter = new IndexWriter(indexPath, analyzer, MaxFieldLength.LIMITED);
			indexWriter.addDocument(doc);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			closeWriter(indexWriter);
		}
	}

	/**
	 * Removes every document whose id field equals the given id.
	 * <p>
	 * Comparable to: delete from table_article WHERE ?(term.name)=?(term.value)
	 *
	 * @param id the id of the article to remove
	 */
	public void delete(Long id) {
		IndexWriter indexWriter = null;
		try {
			indexWriter = new IndexWriter(indexPath, analyzer, MaxFieldLength.LIMITED);

			// All documents containing this term are deleted
			indexWriter.deleteDocuments(idTerm(id));
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			closeWriter(indexWriter);
		}
	}

	/**
	 * Replaces the indexed document of the article with its current state.
	 * <p>
	 * Comparable to: update table_article set xxx=xxx,yy=yyy... WHERE id=?( article.getId() )
	 *
	 * @param article the article whose index entry is refreshed
	 */
	public void update(Article article) {
		IndexWriter indexWriter = null;
		try {
			Term term = idTerm(article.getId());
			Document doc = ArticleDocumentUtils.article2Document(article);

			indexWriter = new IndexWriter(indexPath, analyzer, MaxFieldLength.LIMITED);
			// An update is "delete first, then add": documents containing the term are removed
			// and doc is added in their place.
			indexWriter.updateDocument(term, doc);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			closeWriter(indexWriter);
		}
	}

	/**
	 * Searches title and content and returns one page of the matches.
	 * <p>
	 * Comparable to: select * from table_article limit ?:first,?:max
	 * together with: select count(*) from table_article
	 *
	 * @param queryString the query text, parsed against the title and content fields
	 * @param firstResult zero-based index of the first hit to return; must not be negative
	 * @param maxResults  maximum number of hits to return
	 * @return the total number of matches together with the articles of the requested page
	 * @throws IllegalArgumentException if {@code firstResult} is negative
	 */
	public SearchResult search(String queryString, int firstResult, int maxResults) {
		if (firstResult < 0) {
			throw new IllegalArgumentException("firstResult must not be negative: " + firstResult);
		}

		IndexSearcher indexSearcher = null;
		try {
			// 1. queryString --> Query
			QueryParser queryParser = new MultiFieldQueryParser(new String[] { "title", "content" }, analyzer);
			Query query = queryParser.parse(queryString);

			// 2. Search title and content --> TopDocs(totalHits, scoreDocs).
			// Ask for enough hits to reach the end of the requested page; a fixed limit would leave
			// every page beyond that limit empty. The searcher needs a positive count, so at least
			// one hit is requested even for an empty page (totalHits is still reported).
			int pageEnd = firstResult + maxResults;
			indexSearcher = new IndexSearcher(indexPath);
			TopDocs topDocs = indexSearcher.search(query, null, Math.max(1, pageEnd));

			// 3. Convert the hits of the requested page
			List<Article> list = new ArrayList<Article>();
			int end = Math.min(pageEnd, topDocs.scoreDocs.length);

			for (int i = firstResult; i < end; i++) {
				ScoreDoc scoreDoc = topDocs.scoreDocs[i];
				int docSn = scoreDoc.doc; // internal document number
				Document doc = indexSearcher.doc(docSn); // load the Document by its internal number

				Article article = ArticleDocumentUtils.document2Article(doc);
				list.add(article);
			}

			return new SearchResult(topDocs.totalHits, list);
		} catch (Exception e) {
			throw new RuntimeException(e);
		} finally {
			if (indexSearcher != null) {
				try {
					indexSearcher.close(); // close only after all documents have been loaded
				} catch (Exception e) {
					throw new RuntimeException(e);
				}
			}
		}

	}

	/**
	 * Builds the term that identifies an article in the index.
	 * ArticleDocumentUtils indexes the id as NumberTools.longToString(id), so the term must use
	 * the same encoding; the plain decimal string would never match an indexed document.
	 */
	private static Term idTerm(Long id) {
		return new Term("id", NumberTools.longToString(id.longValue()));
	}

	/** Closes the writer if it was opened, rethrowing any failure as a RuntimeException. */
	private static void closeWriter(IndexWriter indexWriter) {
		if (indexWriter != null) {
			try {
				indexWriter.close();
			} catch (Exception e) {
				throw new RuntimeException(e);
			}
		}
	}

}