转自:http://hi.baidu.com/z57354658/blog/item/b80f524b2c92e1fa82025cbd.html
- public class Test {
- Analyzer analyzer = new SimpleAnalyzer();
- Directory ramDir = new RAMDirectory();
- public void createRamIndex() throws CorruptIndexException, LockObtainFailedException, IOException{
- IndexWriter writer = new IndexWriter(ramDir,analyzer,IndexWriter.MaxFieldLength.LIMITED);
- Document doc1 = new Document();
- doc1.add(new Field("title","java",Store.YES,Index.ANALYZED));
- doc1.add(new Field("author","callan",Store.YES,Index.ANALYZED));
- doc1.add(new Field("subject","java一门编程语言,用java的人很多,编程语言也不少,但是java最流行",Store.YES,Index.ANALYZED,.));
- Document doc2 = new Document();
- doc2.add(new Field("title","english",Store.YES,Index.ANALYZED));
- doc2.add(new Field("author","wcq",Store.YES,Index.ANALYZED));
- doc2.add(new Field("subject","英语用的人很多",Store.YES,Index.ANALYZED,.));
- Document doc3 = new Document();
- doc3.add(new Field("title","asp",Store.YES,Index.ANALYZED));
- doc3.add(new Field("author","ca",Store.YES,Index.ANALYZED));
- doc3.add(new Field("subject","英语用的人很多",Store.YES,Index.ANALYZED,.));
- writer.addDocument(doc1);
- writer.addDocument(doc2);
- writer.addDocument(doc3);
- writer.optimize();
- writer.close();
- }
- public void search() throws CorruptIndexException, IOException{
- IndexReader reader = IndexReader.open(ramDir);
- IndexSearcher searcher = new IndexSearcher(reader);
- Term term = new Term("title","java"); //在title里查询java词条
- TermQuery query = new TermQuery(term);
- Hits hits = searcher.search(query);
- for (int i = 0; i < hits.length(); i++)
- {
- Document doc = hits.doc(i);
- System.out.println(doc.get("title"));
- System.out.println(doc.get("subject"));
- System.out.println("moreLike search: ");
- morelikeSearch(reader,hits.id(i));
- }
- }
- private void morelikeSearch(IndexReader reader,int id) throws IOException
- {
- //根据这个document的id获取这个field的Term Vector 信息,就是这个field分词之后在这个field里的频率、位置、等信息
- TermFreqVector vector = reader.getTermFreqVector(id, "subject");
- BooleanQuery query = new BooleanQuery();
- for (int i = 0; i < vector.size(); i++)
- {
- TermQuery tq = new TermQuery(new Term("subject",
- vector.getTerms()[i])); //获取每个term保存的Token
- query.add(tq, BooleanClause.Occur.SHOULD);
- }
- IndexSearcher searcher = new IndexSearcher(ramDir);
- Hits hits = searcher.search(query);
- //显示代码,略
- }
- //Lucene使用提高高亮显示性能
- public void highterLightSearch() throws CorruptIndexException, IOException{
- IndexReader reader = IndexReader.open(ramDir);
- IndexSearcher searcher = new IndexSearcher(reader);
- TermQuery query = new TermQuery(new Term("subject","java"));
- Hits hits = searcher.search(query);
- //高亮显示设置
- SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'>","</font>");
- Highlighter highlighter =new Highlighter(simpleHTMLFormatter,new QueryScorer(query));
- // 这个100是指定关键字字符串的context的长度,你可以自己设定,因为不可能返回整篇正文内容
- highlighter.setTextFragmenter(new SimpleFragmenter(100));
- for(int i = 0; i < hits.length(); i++){
- Document doc = hits.doc(i);
- TermPositionVector termFreqVector = (TermPositionVector)reader.getTermFreqVector(hits.id(i), "subject");
- TermFreqVector vector = reader.getTermFreqVector(hits.id(i), "subject");
- TokenStream tokenStream = TokenSources.getTokenStream(termFreqVector);
- String result = highlighter.getBestFragment(tokenStream, doc.get("subject"));
- System.out.println(doc.get("title"));
- System.out.println(result);
- }
- }
- public static void main(String[] args) throws CorruptIndexException, IOException
- {
- Test t = new Test();
- t.createRamIndex();
- t.search();
- }
- }
本文通过一个具体的例子展示了如何使用Lucene进行文档索引和检索。包括创建索引、基本搜索、相似文档搜索及利用TermVector实现高亮显示等功能,有助于理解Lucene的工作原理。
1115

被折叠的 条评论
为什么被折叠?



