Lucene学习入门1

最新推荐文章于 2025-11-30 13:59:48 发布

原创最新推荐文章于 2025-11-30 13:59:48 发布 · 133 阅读

0 ·

CC 4.0 BY-SA版权

文章标签：

#Lucene、 #全文检索

lucene 专栏收录该内容

4 篇文章

订阅专栏

一、概述
搜索的方式：
    1、只处理文本(不处理多媒体，多媒体是另外一个领域)
    2、不处理语义，而是按词查询
    3、对于英文，不区分大小写
Lucene(全文检索)：
    指以文本作为检索对象，找出含有指定词汇的文本
    全面、准确和快速是衡量全文检索系统的关键指标
全文检索与数据库搜索
    1、全文检索的搜索效果更加准确
    2、相关排序，数据库没有（全文检索会把最符合要求的放在最前面）
    3、速度更快(全文检索基于事先建立的索引进行查找；而数据库中 like '%关键词%' 这类模糊查询无法利用索引，只能逐行扫描)
应用场景：
    1、系统内搜索(站内搜索)
    2、垂直搜索(利用爬虫等把部分网站的信息拿过来存到自己系统中)
    3、全网搜索(需要软件和硬件的配合，很好的算法等，不常用)
二、Lucene的简介
    全文检索就如同一个ORM，是一个概念，ORM的框架有很多种：Hibernate、TopLink、iBatis等。同样的，全文检索领域中也有很多种框架，Lucene就是其中的一个开源的全文检索框架。
    Lucene的主页为：http://lucene.apache.org/ 。本文使用的是3.0.1版本。

三、做个小例子
   1、开发环境的配置
      加入jar包：
        lucene-core-3.0.1.jar（核心包）
 contrib\analyzers\common\lucene-analyzers-3.0.1.jar（分词器）
 contrib\highlighter\lucene-highlighter-3.0.1.jar（高亮）
 contrib\memory\lucene-memory-3.0.1.jar（高亮器所依赖的内存索引包）
   2、先做个实体类，用来模拟数据

package com.lucene.entity;

public class ArticleEntity {

private Integer  id ;

private String title ;

private String content ;


public void setId(Integer id) {
this.id = id;
}


public Integer getId() {
return id;
}


public String getTitle() {
return title;
}

public void setTitle(String title) {
this.title = title;
}

public String getContent() {
return content;
}

public void setContent(String content) {
this.content = content;
}



}

   3、开始做Lucene搜索（先建立索引，再进行查询）：

package com.lucene.hello;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.testng.annotations.Test;

import com.lucene.entity.ArticleEntity;

/**
 * Minimal Lucene 3.0 walkthrough with two TestNG methods: {@link #createindex()}
 * writes one sample article into a file-system index, and {@link #search()}
 * queries that index and prints the matching articles.
 *
 * <p>Both methods use the same index directory, so {@code search()} only finds
 * something after {@code createindex()} has been run at least once.
 */
public class HelloLucene {

    /** Location of the index on disk, relative to the working directory. */
    private static final String INDEX_PATH = "./filepath";

    /** Maximum number of hits requested from the searcher. */
    private static final int MAX_HITS = 1000;

    /**
     * Builds a {@link Document} from a sample article and adds it to the index.
     *
     * <p>The writer and the directory are released in {@code finally} blocks so
     * that a failure while adding the document does not leave the index locked.
     *
     * @throws Exception if the index cannot be opened or written
     */
    @Test
    public void createindex() throws Exception {

        // Simulate an article object that was loaded from a database.
        ArticleEntity article = new ArticleEntity();
        article.setId(111);
        article.setTitle("Hello Lucene !");
        article.setContent("第一个lucene实例 ");

        // 1>> Convert the article into a Lucene Document.
        Document doc = new Document();
        // The id is stored verbatim and indexed as a single token (not analyzed).
        doc.add(new Field("id", article.getId().toString(), Store.YES, Index.NOT_ANALYZED));
        doc.add(new Field("title", article.getTitle(), Store.YES, Index.ANALYZED));
        // Stored as well: search() reads this field back via doc.get("content"),
        // which returns null for a field that was indexed with Store.NO.
        doc.add(new Field("content", article.getContent(), Store.YES, Index.ANALYZED));

        // 2>> Add the Document to the index.
        Directory directory = FSDirectory.open(new File(INDEX_PATH));
        try {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
            IndexWriter writer = new IndexWriter(directory, analyzer, MaxFieldLength.LIMITED);
            try {
                writer.addDocument(doc);
            } finally {
                // Always close the writer so pending changes are committed and
                // the write lock is released even when addDocument fails.
                writer.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Searches the "title" field for the term {@code Hello}, converts every hit
     * back into an {@link ArticleEntity} and prints id, title and content.
     *
     * <p>Note: documents that were indexed without a stored "content" field
     * (for example by an earlier version of {@link #createindex()}) still yield
     * {@code null} for the content.
     *
     * @throws Exception if the query cannot be parsed or the index cannot be read
     */
    @Test
    public void search() throws Exception {
        // Search condition.
        String queryString = "Hello";

        List<ArticleEntity> list = new ArrayList<ArticleEntity>();

        // Run the search.
        // =============================================================
        Directory directory = FSDirectory.open(new File(INDEX_PATH));
        try {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);

            // 1>> Turn the query string into a Query; "title" is the default field.
            QueryParser parser = new QueryParser(Version.LUCENE_30, "title", analyzer);
            Query query = parser.parse(queryString);

            // 2>> Execute the query and collect the top hits.
            IndexSearcher searcher = new IndexSearcher(directory);
            try {
                // First argument is the query, second the number of top results to return.
                TopDocs topDocs = searcher.search(query, MAX_HITS);

                // 3>> Convert each hit into an ArticleEntity.
                for (ScoreDoc hit : topDocs.scoreDocs) {
                    // hit.doc is Lucene's internal document number; use it to load
                    // the stored fields of the matching Document.
                    Document doc = searcher.doc(hit.doc);

                    ArticleEntity entity = new ArticleEntity();
                    entity.setId(Integer.parseInt(doc.get("id"))); // stored as text, so parse it
                    entity.setTitle(doc.get("title"));
                    entity.setContent(doc.get("content"));
                    list.add(entity);
                }
            } finally {
                searcher.close();
            }
        } finally {
            directory.close();
        }
        // =============================================================

        // Show the results.
        for (ArticleEntity aEntity : list) {
            System.out.println("id：" + aEntity.getId());
            System.out.println("标题：" + aEntity.getTitle());
            System.out.println("内容：" + aEntity.getContent());
        }
    }

}