Lucene4.7.2 搜索与高亮显示

最新推荐文章于 2018-01-16 09:39:09 发布

原创最新推荐文章于 2018-01-16 09:39:09 发布 · 420 阅读

1 ·

CC 4.0 BY-SA版权

本文介绍如何使用Lucene进行文档索引创建及搜索，并实现搜索结果的得分排序和高亮显示功能。通过具体代码示例展示了索引创建过程、简单搜索得分情况以及高亮搜索结果的方法。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>cn.et</groupId>
  <artifactId>LuceneScoreSearch</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <!-- Inherit dependency and plugin management from Spring Boot -->
  <parent>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-parent</artifactId>
    <version>1.5.9.RELEASE</version>
  </parent>
  <dependencies>
    <!-- Spring MVC web starter (version managed by the parent) -->
    <dependency>
      <groupId>org.springframework.boot</groupId>
      <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!-- IK Analyzer: Chinese word-segmentation analyzer for Lucene -->
    <dependency>
      <groupId>com.janeluo</groupId>
      <artifactId>ikanalyzer</artifactId>
      <version>2012_u6</version>
    </dependency>
    <!-- Lucene highlighter module; groupId must not contain leading whitespace -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-highlighter</artifactId>
      <version>4.7.2</version>
    </dependency>
    <!-- JUnit: unit-testing framework for Java -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.12</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
  <build>
    <plugins>
      <!-- Compile as Java 1.7 and read sources as UTF-8 -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>1.7</source>
          <target>1.7</target>
          <encoding>UTF-8</encoding>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>cn.et</groupId>
  <artifactId>LuceneScoreSearch</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <!-- Inherit dependency and plugin management from Spring Boot -->
  <parent>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-parent</artifactId>
    <version>1.5.9.RELEASE</version>
  </parent>
  <dependencies>
    <!-- Spring MVC web starter (version managed by the parent) -->
    <dependency>
      <groupId>org.springframework.boot</groupId>
      <artifactId>spring-boot-starter-web</artifactId>
    </dependency>
    <!-- IK Analyzer: Chinese word-segmentation analyzer for Lucene -->
    <dependency>
      <groupId>com.janeluo</groupId>
      <artifactId>ikanalyzer</artifactId>
      <version>2012_u6</version>
    </dependency>
    <!-- Lucene highlighter module; groupId must not contain leading whitespace -->
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-highlighter</artifactId>
      <version>4.7.2</version>
    </dependency>
    <!-- JUnit, available on the test classpath only -->
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.10</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
  <build>
    <plugins>
      <!-- Compile as Java 1.7 and read sources as UTF-8 -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>1.7</source>
          <target>1.7</target>
          <encoding>UTF-8</encoding>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>

package cn.et;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
 * Demo REST controller that builds a small Lucene index of person records and
 * exposes two searches over the {@code BRIEF} field: one returning an HTML
 * string with relevance scores, one returning highlighted snippets.
 *
 * <p>Every method opens the on-disk index itself and closes it again before
 * returning, so no Lucene resource outlives a single call.
 */
@RestController
public class LueneTesting {
    /** Directory on disk holding the index; shared by the writer and both searches. */
    private static final String INDEX_PATH = "H:/Lucene/index";
    /** Names of the stored fields of every indexed person. */
    private static final String FIELD_AGE = "AGE";
    private static final String FIELD_NAME = "NAME";
    private static final String FIELD_BRIEF = "BRIEF";
    /** Fixed query text used by both demo endpoints. */
    private static final String QUERY_TEXT = "中国";
    /** Upper bound on the number of hits requested from the searcher. */
    private static final int MAX_HITS = 10;
    /** Upper bound on the number of fragments requested from the highlighter. */
    private static final int MAX_FRAGMENTS = 10;
    /** Value handed to {@code Highlighter.setMaxDocCharsToAnalyze}. */
    private static final int MAX_HIGHLIGHT_CHARS = 20;

    // IK analyzer used both for indexing and for parsing queries.
    static Analyzer analyzer = new IKAnalyzer();

    /**
     * Adds the five sample person documents to the index and commits them.
     *
     * <p>The person descriptions are fictional sample data.
     *
     * @throws Exception if the index cannot be opened or written
     */
    public static void write() throws Exception {
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer);
        // try-with-resources closes the writer and the directory even when
        // adding a document fails; the original leaked both in that case.
        try (Directory directory = FSDirectory.open(new File(INDEX_PATH));
             IndexWriter iwriter = new IndexWriter(directory, config)) {
            // Each document plays the role of one database record.
            iwriter.addDocument(newPerson("20", "路橙", "来自中国湖南永州，是一名初级Java开发工程师，中国互联网技术博客：http://blog.youkuaiyun.com/phone13144830339"));
            iwriter.addDocument(newPerson("21", "谢飞", "来自中国湖北武汉，是一名语文老师，中国教育网成员，2010年评选为中国10大优秀教师"));
            iwriter.addDocument(newPerson("22", "邓娟", "来自中国四川绵阳，是一名幼儿园老师"));
            iwriter.addDocument(newPerson("23", "曹焰斌", "来自中国广东广州，是一名建筑工人"));
            iwriter.addDocument(newPerson("24", "SMISI", "来自美国底特律，是一名外资企业经理"));
            iwriter.commit();
        }
    }

    /**
     * Searches the {@code BRIEF} field for the fixed demo query and renders
     * each hit (document id, stored fields, score) as an HTML snippet.
     *
     * @return the concatenated HTML for all hits; empty when nothing matches
     * @throws Exception if the index cannot be read or the query cannot be parsed
     */
    @RequestMapping("/simpleSearchScore")
    public static String simpleSearch() throws Exception {
        try (Directory directory = FSDirectory.open(new File(INDEX_PATH));
             DirectoryReader ireader = DirectoryReader.open(directory)) {
            IndexSearcher isearcher = new IndexSearcher(ireader);
            // Parse the query against BRIEF with the same analyzer used at index time.
            QueryParser parser = new QueryParser(Version.LUCENE_47, FIELD_BRIEF, analyzer);
            Query query = parser.parse(QUERY_TEXT);
            TopDocs docs = isearcher.search(query, MAX_HITS);

            StringBuilder html = new StringBuilder();
            for (ScoreDoc hit : docs.scoreDocs) {
                // Load the stored document once per hit instead of once per field.
                Document stored = isearcher.doc(hit.doc);
                html.append("文档ID: ").append(hit.doc)
                    .append("<br/>BRIEF：").append(stored.get(FIELD_BRIEF))
                    .append("<br/>NAME：").append(stored.get(FIELD_NAME))
                    .append("<br/>AGE：").append(stored.get(FIELD_AGE))
                    .append("<br/>得分情况: ").append(hit.score)
                    .append("<hr border='5px' color='red'/>");
            }
            return html.toString();
        }
    }

    /**
     * Searches the {@code BRIEF} field for the fixed demo query and returns one
     * map per hit with the keys {@code name}, {@code brief} and {@code age}.
     * {@code brief} holds the best-scoring highlighted fragment, with matches
     * wrapped in {@code <font color=red>}; it falls back to the plain stored
     * text when no fragment scores above zero.
     *
     * @return the hits in the order returned by the searcher
     * @throws Exception if the index cannot be read, the query cannot be
     *         parsed, or highlighting fails
     */
    @RequestMapping("/highlighterSearch")
    public List<Map<String,String>> highlighterTesting() throws Exception {
        try (Directory directory = FSDirectory.open(new File(INDEX_PATH));
             DirectoryReader ireader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(ireader);
            QueryParser parser = new QueryParser(Version.LUCENE_47, FIELD_BRIEF, analyzer);
            Query query = parser.parse(QUERY_TEXT);
            TopDocs hits = searcher.search(query, MAX_HITS);

            SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter("<font color=red>", "</font>");
            Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query));
            // Limit how many characters of each text the highlighter processes.
            highlighter.setMaxDocCharsToAnalyze(MAX_HIGHLIGHT_CHARS);

            List<Map<String,String>> list = new ArrayList<Map<String,String>>(hits.scoreDocs.length);
            for (ScoreDoc hit : hits.scoreDocs) {
                Document doc = searcher.doc(hit.doc);
                Map<String,String> map = new HashMap<String,String>();
                map.put("name", doc.get(FIELD_NAME));
                map.put("brief", bestFragment(highlighter, ireader, hit.doc, doc.get(FIELD_BRIEF)));
                map.put("age", doc.get(FIELD_AGE));
                list.add(map);
            }
            return list;
        }
    }

    /** Builds one person document whose three fields all use {@code TextField.TYPE_STORED}. */
    private static Document newPerson(String age, String name, String brief) {
        Document person = new Document();
        person.add(new Field(FIELD_AGE, age, TextField.TYPE_STORED));
        person.add(new Field(FIELD_NAME, name, TextField.TYPE_STORED));
        person.add(new Field(FIELD_BRIEF, brief, TextField.TYPE_STORED));
        return person;
    }

    /**
     * Returns the highest-scoring highlighted fragment of {@code text}, or
     * {@code text} itself when no fragment has a positive score.
     */
    private static String bestFragment(Highlighter highlighter, DirectoryReader reader,
            int docId, String text) throws Exception {
        TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, docId, FIELD_BRIEF, analyzer);
        TextFragment[] fragments = highlighter.getBestTextFragments(tokenStream, text, false, MAX_FRAGMENTS);
        // Pick the maximum explicitly rather than relying on the array order;
        // the original kept whichever positive-score fragment came last.
        TextFragment best = null;
        for (TextFragment fragment : fragments) {
            if (fragment != null && fragment.getScore() > 0
                    && (best == null || fragment.getScore() > best.getScore())) {
                best = fragment;
            }
        }
        return best != null ? best.toString() : text;
    }
}

package cn.et;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

/**
 * Entry point of the application: boots Spring with this class as the
 * configuration source.
 */
@SpringBootApplication
public class SpringBootMain {
    /**
     * Starts the Spring application.
     *
     * @param args command-line arguments, passed through to Spring Boot
     */
    public static void main(String[] args) {
        SpringApplication application = new SpringApplication(SpringBootMain.class);
        application.run(args);
    }
}