导入依赖（在 Maven pom.xml 中添加如下依赖）
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>6.0.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-queryparser -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queryparser</artifactId>
<version>6.0.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-analyzers-common -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>6.0.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-queries -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-queries</artifactId>
<version>6.0.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.lucene/lucene-highlighter -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>6.0.0</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<dependency>
<groupId>com.hankcs.nlp</groupId>
<artifactId>hanlp-lucene-plugin</artifactId>
<version>1.1.2</version>
</dependency>
代码实现（以下为两个独立的 Java 源文件：IndexUtil.java 与 SearchUtil.java）
package com.chao;
import com.hankcs.lucene.HanLPAnalyzer;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import java.io.File;
import java.io.IOException;
import java.nio.file.FileSystems;
/*
 * IndexUtil — builds a Lucene full-text index from local text files
 * using the HanLP Chinese analyzer.
 */
public class IndexUtil {
public void index() throws IOException {
//创建索引的目录对象
Directory directory= FSDirectory.open(FileSystems.getDefault().
getPath("/Users/dllo/Desktop/0402wordspace/0830Lucene/src/main/resources/tar"));
//创建分词器
// hanLPAnalyzer中文分词器
// StandardAnalyzer英文文分词器
Analyzer analyzer = new HanLPAnalyzer();
//写入设置
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
//索引写入器
IndexWriter indexWriter = new IndexWriter(directory,indexWriterConfig);
//删除所有的索引(生产环境慎用)
indexWriter.deleteAll();
//获取目标文件夹内的所有TXT,遍历写入索引
File sourceFile = new File("/Users/dllo/Desktop/0402wordspace/0830Lucene/src/main/resources/source");
File[] listFiles = sourceFile.listFiles();
for (File file : listFiles) {
//每一次获取文件,都创建一个新的document
Document document = new Document();
//将文件内容写入到索引中
document.add(new Field("content", FileUtils.readFileToString(file,"utf-8"), TextField.TYPE_STORED));
//文件标题
document.add(new Field("fileName",file.getName(),TextField.TYPE_STORED));
document.add(new Field("filePath",file.getAbsolutePath(),TextField.TYPE_STORED));
indexWriter.addDocument(document);
}
indexWriter.close();
}
package com.chao;
import com.hankcs.lucene.HanLPAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import java.io.IOException;
import java.io.StringReader;
import java.nio.file.FileSystems;
/*
 * SearchUtil — queries the Lucene index built by IndexUtil and prints
 * matching documents, with keyword highlighting.
 */
public class SearchUtil {

    /**
     * Searches the {@code content} field of the index for the given keyword
     * string, prints the raw content of the top matches, then prints the
     * same matches again with the query terms highlighted in red HTML
     * {@code <font>} tags.
     *
     * @param keyword user input; tokenized by HanLP before searching
     * @throws IOException                  if the index cannot be read
     * @throws ParseException               if the keyword cannot be parsed
     * @throws InvalidTokenOffsetsException if highlighting hits bad offsets
     */
    public void search(String keyword) throws IOException, ParseException, InvalidTokenOffsetsException {
        // Folder that holds the index produced by IndexUtil.
        Directory directory = FSDirectory.open(FileSystems.getDefault().
                getPath("/Users/dllo/Desktop/0402wordspace/0830Lucene/src/main/resources/tar"));
        // try-with-resources closes the reader (and its file handles) even if
        // parsing, searching, or highlighting throws.
        try (DirectoryReader directoryReader = DirectoryReader.open(directory)) {
            // Searcher over the index reader.
            IndexSearcher indexSearcher = new IndexSearcher(directoryReader);
            // Same Chinese analyzer the index was built with.
            Analyzer analyzer = new HanLPAnalyzer();
            // Parser that tokenizes the user's input against the "content" field.
            QueryParser queryParser = new QueryParser("content", analyzer);
            Query query = queryParser.parse(keyword);
            // NOTE: results are capped at 2, but totalHits below reports the
            // total number of matches in the whole index.
            TopDocs hits = indexSearcher.search(query, 2);
            int i = hits.totalHits;
            System.out.println("查询出" + i + "条结果");
            // Print the raw stored content of each returned document.
            ScoreDoc[] scoreDocs = hits.scoreDocs;
            for (ScoreDoc scoreDoc : scoreDocs) {
                Document document = indexSearcher.doc(scoreDoc.doc);
                String s = document.get("content");
                System.out.println(s);
            }
            // ---- Highlighted output ----
            // Scores fragments by how well they match the query.
            QueryScorer queryScorer = new QueryScorer(query);
            // Fragmenter controls how the text is split into snippets.
            Fragmenter fragmenter = new SimpleSpanFragmenter(queryScorer);
            // Wrap matched terms in red <font> tags.
            SimpleHTMLFormatter simpleHTMLFormatter =
                    new SimpleHTMLFormatter("<font color='red'>", "</font>");
            Highlighter highlighter = new Highlighter(simpleHTMLFormatter, queryScorer);
            highlighter.setTextFragmenter(fragmenter);
            // Re-walk the results, printing the best highlighted fragment.
            for (ScoreDoc scoreDoc : scoreDocs) {
                Document document = indexSearcher.doc(scoreDoc.doc);
                String s = document.get("content");
                if (s != null) {
                    // The TokenStream exposes the analyzer's token units for
                    // this text; it must be closed after use, so wrap it in
                    // try-with-resources.
                    try (TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(s))) {
                        String h = highlighter.getBestFragment(tokenStream, s);
                        System.out.println(h);
                    }
                }
            }
        }
    }

    public static void main(String[] args) throws IOException, ParseException, InvalidTokenOffsetsException {
        SearchUtil searchUtil = new SearchUtil();
        searchUtil.search("民政局淑女");
    }
}