与lucene亲密接触第一集

最新推荐文章于 2022-12-31 17:50:11 发布

原创最新推荐文章于 2022-12-31 17:50:11 发布 · 1k 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#lucene #string #中文分词库 #file #import #myeclipse

web技术同时被 2 个专栏收录

37 篇文章

订阅专栏

Java

19 篇文章

订阅专栏

本文介绍了Lucene的基本原理及其在Java环境下的应用实践。作者从下载安装到配置MyEclipse环境，再到实现全文检索功能进行了详细说明，并分享了一款优秀的中文分词器IKAnalyzer，通过具体示例展示了如何建立倒排索引并进行搜索业务处理。

这两天开始研究lucene，

可怜连java都没用过的我简直是举步维艰啊，

于是还顺带小小地学习了一下 MyEclipse 等工具的配置。今天收获颇丰，晒一下与 Lucene 第一次亲密接触的成果。

在 Apache 的官网下载了 Lucene 的最新版（发现居然有人人网提供的一个分流镜像，不错~下载速度很快！）。

然后看了大半天 Lucene 的基础教程，大概理清楚了它的框架和工作流程。

lucene我们应该理解为一个搜索引擎的类库，它不是一个完整的产品/软件，而是一个开发包，可以基于它快速开发满足自己需求的搜索引擎。

其可扩展性很强，完全可以根据自己的需求定制，且底层实现效率很高。目前由 Apache 基金会维护。

其功能总体上分为两块：倒排索引的建立与搜索业务处理。

倒排索引及相关度的概念仍然采用 TF-IDF，以及经典的向量空间模型中的“向量余弦相似度”。基本原理都很好理解。

接着我开始在本地部署测试例程，

然后照着各种网上例程自己写了一个。然后集成了一个口碑不错的IKAnalyzer的中文分词器，感觉效果不错！

例如索引中存入的是“中华人民共和国”，我搜“中华共和国”也能搜出来（查询串会被自动切分为 中华|共和国|……）。

建立倒排索引：

import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.util.Date; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.wltea.analyzer.lucene.IKAnalyzer; import org.wltea.analyzer.lucene.IKQueryParser; import org.wltea.analyzer.lucene.IKSimilarity; public class Indexer { public static void main(String[] args) throws Exception { File fileDir = new File("E://lucene//docs"); /* 这里放索引文件的位置 */ Directory indexDir = FSDirectory.open(new File("E://lucene//index")); Analyzer luceneAnalyzer = new IKAnalyzer(); IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer, true,IndexWriter.MaxFieldLength.LIMITED); File[] textFiles = fileDir.listFiles(); long startTime = new Date().getTime(); //增加document到索引去 for (int i = 0; i < textFiles.length; i++) { if (textFiles[i].isFile()) { System.out.println("File " + textFiles[i].getCanonicalPath() + "正在被索引...."); String temp = FileReaderAll(textFiles[i].getCanonicalPath(), "GBK"); System.out.println(temp); Document document = new Document(); Field FieldPath = new Field("path", textFiles[i].getPath(), Field.Store.YES, Field.Index.NO); Field FieldBody = new Field("body", temp, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); document.add(FieldPath); document.add(FieldBody); indexWriter.addDocument(document); } } //optimize()方法是对索引进行优化 indexWriter.optimize(); indexWriter.close(); //测试一下索引的时间 long endTime = new Date().getTime(); System.out .println("这花费了" + (endTime - startTime) + " 毫秒来把文档增加到索引里面去!" 
+ fileDir.getPath()); } public static String FileReaderAll(String FileName, String charset) throws IOException { BufferedReader reader = new BufferedReader(new InputStreamReader( new FileInputStream(FileName), charset)); String line = new String(); String temp = new String(); while ((line = reader.readLine()) != null) { temp += line; } reader.close(); return temp; } }

搜索业务：

import java.io.File; import java.io.IOException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.queryParser.MultiFieldQueryParser; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.apache.lucene.store.LockObtainFailedException; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.search.FuzzyQuery; //模糊查询 import org.apache.lucene.search.PhraseQuery;//短语查询 //中文分词库 import org.wltea.analyzer.lucene.IKAnalyzer; import org.wltea.analyzer.lucene.IKQueryParser; import org.wltea.analyzer.lucene.IKSimilarity; public class Search { /** * 查询字符串 * * @param keyword * 搜索单词 * @param indexDir * 索引文件夹 */ public static void searcher(String keyword, File indexDir) { IndexSearcher isearcher = null; Directory directory = null; try { System.out.println("现在开始搜索关键字【"+keyword+"】"); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); // 创建一个查詢语法分析器 directory = FSDirectory.open(indexDir); QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, "body", analyzer); Query query = IKQueryParser.parse("body",keyword);// 获取查询对象 System.out.println("查寻表达式：" + query.toString()); isearcher = new IndexSearcher(directory, true); // 创建索引搜索器 isearcher.setSimilarity(new IKSimilarity());//在索引器中使用IKSimilarity相似度评估器 TopDocs ts = isearcher.search(query,100); int totalHits = ts.totalHits; // 获取命中数 System.out.println("命中数：" + totalHits); // 获取命中的文档信息对象查询结果信息。它包括符合条件的Document的内部编号(doc)及评分(score) 。 ScoreDoc[] hits = ts.scoreDocs; //老版本中 Hits 
--length for (int i = 0; i < hits.length; i++) { Document hitDoc = isearcher.doc(hits[i].doc); System.out.println(hitDoc.getField("body").stringValue()); } } catch (IOException e) { e.printStackTrace(); } finally { if (isearcher != null) { try { isearcher.close(); // 关闭搜索器 } catch (IOException e) { e.printStackTrace(); } } if (directory != null) { try { directory.close(); // 关闭索引存放目录 } catch (IOException e) { e.printStackTrace(); } } } } public static void main(String[] args) { File src = new File("E://lucene//index"); File destDir = new File("E://lucene//index"); //CreateIndexerDir.index(src, destDir); Search.searcher("中华共和国", destDir); } }