Lucene提高搜索排名(文档域加权)

最新推荐文章于 2021-02-04 14:48:50 发布

原创 · 最新推荐文章于 2021-02-04 14:48:50 发布 · 468 阅读

0 ·

CC 4.0 BY-SA版权

Lucene 专栏收录该内容

5 篇文章

订阅专栏

本文介绍了一个使用 Lucene 进行加权搜索的实战案例。通过对比未加权和加权后的搜索结果，展示了如何为特定字段（域）设置权重以优化搜索效果。在示例中，对职位（position）为 boss 的文档，将其标题（title）域的权重设置为 1.5，以提升该文档在搜索结果中的排名。

简单 Demo，用于对比查看（未加权的版本）：

package com.gcx.lucene;

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.junit.Test;

public class IndexTest2 {
	private String ids[]={"1","2","3","4"};
	private String authors[]={"Jack","Marry","John","Json"};
	private String positions[]={"accounting","technician","salesperson","boss"};
	private String titles[]={"Java is a good language.","Java is a cross platform language","Java powerful","You should learn java"};
	private String contents[]={
			"If possible, use the same JRE major version at both index and search time.",
			"When upgrading to a different JRE major version, consider re-indexing. ",
			"Different JRE major versions may implement different versions of Unicode,",
			"For example: with Java 1.4, `LetterTokenizer` will split around the character U+02C6,"
	};
	
	private Directory dir;
	
	/**
	 * 获取IndexWriter实例
	 * @return
	 * @throws Exception
	 */
	private IndexWriter getWriter()throws Exception{
		Analyzer analyzer=new StandardAnalyzer(); // 标准分词器
		IndexWriterConfig iwc=new IndexWriterConfig(analyzer);
		IndexWriter writer=new IndexWriter(dir, iwc);
		return writer;
	}
	/**
	 * 生成索引
	 * @throws Exception
	 */
	@Test
	public void index()throws Exception{
		dir=FSDirectory.open(Paths.get("D:\\lucene3"));
		IndexWriter writer=getWriter();
		for(int i=0;i<ids.length;i++){
			Document doc=new Document();
			doc.add(new StringField("id", ids[i], Field.Store.YES));
			doc.add(new StringField("author",authors[i],Field.Store.YES));
			doc.add(new StringField("position",positions[i],Field.Store.YES));
			
			
			doc.add(new TextField("title", titles[i], Field.Store.YES));
			doc.add(new TextField("content", contents[i], Field.Store.NO));
			writer.addDocument(doc); // 添加文档
		}
		writer.close();
	}
	/**
	 * 查询
	 * @throws IOException 
	 */
	@Test
	public void search() throws IOException{
		dir=FSDirectory.open(Paths.get("D:\\lucene3"));
		IndexReader reader=DirectoryReader.open(dir);
		IndexSearcher search=new IndexSearcher(reader);
		String searchField="title";
		String q="java";
		Term t=new Term(searchField,q);
		Query qu=new TermQuery(t);
		TopDocs hits = search.search(qu, 10);
		System.out.println("匹配:"+q+"总共查询到"+hits.totalHits+"个文档");
		for(ScoreDoc sd:hits.scoreDocs){
			Document doc = search.doc(sd.doc);
			System.out.println(doc.get("author"));
		}
		reader.close();
		
		
	}
}

结果：

经过加权后的版本（只需修改生成索引的 index() 方法，其余代码不变）：

	/**
	 * Builds the index, one document per entry of the parallel arrays, and
	 * applies an index-time boost of 1.5 to the {@code title} field of the
	 * document whose position is {@code "boss"}; all other documents are left
	 * unboosted.
	 *
	 * <p>NOTE: {@code Field.setBoost} is an index-time boost API of Lucene
	 * releases before 7.0; on newer releases a query-time boost has to be used
	 * instead (verify against the Lucene version on the classpath).
	 *
	 * @throws Exception if the index cannot be opened or written
	 */
	@Test
	public void index()throws Exception{
		dir=FSDirectory.open(Paths.get("D:\\lucene3"));
		// try-with-resources closes the writer even if addDocument throws;
		// the finally clause then releases the directory.
		try(IndexWriter writer=getWriter()){
			for(int i=0;i<ids.length;i++){
				Document doc=new Document();
				doc.add(new StringField("id", ids[i], Field.Store.YES));
				doc.add(new StringField("author",authors[i],Field.Store.YES));
				doc.add(new StringField("position",positions[i],Field.Store.YES));
				// Boosting: raise the weight of the title field for the boss's document only
				TextField field=new TextField("title", titles[i], Field.Store.YES);
				if("boss".equals(positions[i])){
					field.setBoost(1.5f);
				}
				doc.add(field);
				doc.add(new TextField("content", contents[i], Field.Store.NO));
				writer.addDocument(doc); // add the document to the index
			}
		}finally{
			dir.close();
		}
	}

结果：