Lucene TermVector用法:相关搜索功能及提高高亮显示性能


/**
 * Demo of two uses of Lucene term vectors on a small in-memory index:
 * building a "more like this" query from a document's own terms, and
 * feeding the highlighter from stored term vectors.
 *
 * <p>The "subject" field of every demo document is indexed with
 * {@code TermVector.WITH_POSITIONS_OFFSETS}; both features read that vector.
 */
public class TermVectorTest {

    /** Analyzer used when writing the demo index. */
    Analyzer analyzer = new SimpleAnalyzer();

    /** In-memory directory that holds the demo index. */
    Directory ramDir = new RAMDirectory();

    /**
     * Writes three sample documents (title, author, subject) into {@link #ramDir}.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws LockObtainFailedException if the write lock cannot be obtained
     * @throws IOException on any other low-level I/O failure
     */
    public void createRamIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
        IndexWriter writer = new IndexWriter(ramDir, analyzer, IndexWriter.MaxFieldLength.LIMITED);
        try {
            Document doc1 = new Document();
            doc1.add(new Field("title", "java", Store.YES, Index.ANALYZED));
            doc1.add(new Field("author", "callan", Store.YES, Index.ANALYZED));
            doc1.add(new Field("subject", "java一门编程语言,用java的人很多,编程语言也不少,但是java最流行", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));

            Document doc2 = new Document();
            doc2.add(new Field("title", "english", Store.YES, Index.ANALYZED));
            doc2.add(new Field("author", "wcq", Store.YES, Index.ANALYZED));
            doc2.add(new Field("subject", "英语用的人很多", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));

            Document doc3 = new Document();
            doc3.add(new Field("title", "asp", Store.YES, Index.ANALYZED));
            doc3.add(new Field("author", "ca", Store.YES, Index.ANALYZED));
            doc3.add(new Field("subject", "英语用的人很多", Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));

            writer.addDocument(doc1);
            writer.addDocument(doc2);
            writer.addDocument(doc3);

            writer.optimize();
        } finally {
            // Close even when indexing fails so the writer does not stay open on ramDir.
            writer.close();
        }
    }

    /**
     * Looks up the term "java" in the "title" field, prints the title and subject
     * of every hit, and runs a related-documents search for each hit.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException on low-level I/O failure
     */
    public void search() throws CorruptIndexException, IOException {
        IndexReader reader = IndexReader.open(ramDir);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            try {
                Term term = new Term("title", "java"); // look up the term "java" in the title field
                TermQuery query = new TermQuery(term);
                Hits hits = searcher.search(query);
                for (int i = 0; i < hits.length(); i++) {
                    Document doc = hits.doc(i);
                    System.out.println(doc.get("title"));
                    System.out.println(doc.get("subject"));
                    System.out.println("moreLike search: ");

                    morelikeSearch(reader, hits.id(i));
                }
            } finally {
                searcher.close();
            }
        } finally {
            // The searcher was built on an existing reader, so the reader is closed separately.
            reader.close();
        }
    }

    /**
     * Builds an OR query from every term in the "subject" term vector of the
     * given document and runs it against the index.
     *
     * @param reader open reader on the index; it is not closed by this method
     * @param id     Lucene document number whose subject terms seed the query
     * @throws IOException on low-level I/O failure
     */
    private void morelikeSearch(IndexReader reader, int id) throws IOException {
        // Fetch the term vector of the "subject" field for this document: the terms
        // produced by analysis together with their frequency/position information.
        TermFreqVector vector = reader.getTermFreqVector(id, "subject");
        if (vector == null) {
            // No term vector stored for this document/field: nothing to build a query from.
            return;
        }

        // Fetch the term array once instead of on every loop iteration.
        String[] terms = vector.getTerms();

        BooleanQuery query = new BooleanQuery();
        for (int i = 0; i < terms.length; i++) {
            // One optional clause per term stored in the vector.
            TermQuery tq = new TermQuery(new Term("subject", terms[i]));
            query.add(tq, BooleanClause.Occur.SHOULD);
        }

        // Reuse the caller's reader rather than opening a second, never-closed one on ramDir.
        IndexSearcher searcher = new IndexSearcher(reader);
        try {
            Hits hits = searcher.search(query);

            // Display code omitted.
        } finally {
            searcher.close();
        }
    }

    /**
     * Highlights the term "java" in the "subject" field of every matching
     * document, taking the token stream from the stored term vector, and prints
     * each hit's title followed by the best highlighted fragment.
     *
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException on low-level I/O failure
     */
    public void highterLightSearch() throws CorruptIndexException, IOException {
        IndexReader reader = IndexReader.open(ramDir);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            try {
                TermQuery query = new TermQuery(new Term("subject", "java"));

                Hits hits = searcher.search(query);

                // Highlighter setup: wrap matches in a red <font> tag.
                SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");

                Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query));

                // 100 is the fragment size passed to the fragmenter; adjust as needed,
                // since returning the whole body text is not practical.
                highlighter.setTextFragmenter(new SimpleFragmenter(100));

                for (int i = 0; i < hits.length(); i++) {
                    Document doc = hits.doc(i);

                    // Single lookup of the vector; the field was indexed WITH_POSITIONS_OFFSETS,
                    // which is what makes the cast to TermPositionVector valid here.
                    TermPositionVector termFreqVector = (TermPositionVector) reader.getTermFreqVector(hits.id(i), "subject");
                    TokenStream tokenStream = TokenSources.getTokenStream(termFreqVector);

                    String result = highlighter.getBestFragment(tokenStream, doc.get("subject"));

                    System.out.println(doc.get("title"));

                    System.out.println(result);
                }
            } finally {
                searcher.close();
            }
        } finally {
            reader.close();
        }
    }

    /**
     * Builds the in-memory index and runs the title search.
     *
     * @param args unused
     * @throws CorruptIndexException if the index is corrupt
     * @throws IOException on low-level I/O failure
     */
    public static void main(String[] args) throws CorruptIndexException, IOException {
        TermVectorTest t = new TermVectorTest();
        t.createRamIndex();
        t.search();
    }

}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值