前一段时间,公司让用Lucene检索文档,自己写了些代码,在这里记录一下,以免忘记了。
其实,Lucene的入门还是很简单的,它的整体构造和关系型数据库差不多:一个键对应一个值,先生成索引,然后根据索引去查找文档内容,再将内容通过别的方式显示出来。
Lucene创建、增、删、改索引:
package com.haiyisoft.szgl.file.service.impl;
import java.io.File;
/**
* 档案管理的创建索引
*
* @author haojiahong
*
* <p>Modification History:</p>
* <p>Date Author Description</p>
* <p>--------------------------------------------------------------</p>
* <p>20151027 haojiahong new</p>
* <p> </p>
*/
@Component("schDocForDocBuilderService")
public class SchDocForDocBuilderServiceImpl implements SchDocForDocBuilderService {
@Autowired
public DocService docService;
@Autowired
public FileContentService fileContentService;
private long time = 0;
/**
 * Rebuilds the Lucene index from scratch.
 *
 * <p>Makes sure the index directory ({@code FileManage.searchCenterForDocPath}) exists, deletes
 * the regular files already in it, opens an {@link IndexWriter} backed by an {@code IKAnalyzer}
 * and indexes every record through {@link #createIndex(IndexWriter)}.</p>
 *
 * <p>I/O failures are printed with {@code printStackTrace()} and not rethrown.</p>
 */
public void creatLucene() {
    IndexWriter indexWriter = null;
    try {
        File indexDir = new File(FileManage.searchCenterForDocPath);
        creatFile(indexDir);
        delAllFile(indexDir);
        Directory dir = FSDirectory.open(indexDir);
        Analyzer luceneAnalyzer = new IKAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, luceneAnalyzer);
        indexWriter = new IndexWriter(dir, iwc);
        LogUtil.getAppLoger().debug("开始创建索引");
        long indexcount = this.createIndex(indexWriter);
        LogUtil.getAppLoger().debug("创建索引结束,共处理数据行数" + indexcount + "条");
        indexWriter.commit();
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        // Close on every path: an IndexWriter that is left open keeps the index write lock,
        // which makes every later attempt to open a writer on this directory fail.
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }
}
/**
 * Indexes all records through the given writer.
 *
 * @param indexWriter writer the documents are added to
 * @return number of documents added, or -1 when an exception was caught
 */
private long createIndex(IndexWriter indexWriter) {
    long indexed;
    try {
        this.showtime();
        // Only the record-based indexing is active; the attachment-based initFile() is not called.
        indexed = this.initFileWithDocument(indexWriter);
    } catch (Exception e) {
        e.printStackTrace();
        indexed = -1;
    }
    return indexed;
}
/**
 * Loads every {@code FileManage} record and adds one Lucene document per record.
 *
 * <p>A record whose document cannot be built or added is reported with
 * {@code printStackTrace()} and skipped; the remaining records are still processed.</p>
 *
 * @param indexWriter writer the documents are added to
 * @return number of documents successfully added
 */
private long initFileWithDocument(IndexWriter indexWriter) {
    List<FileManage> records = (List) JPAUtil.find("select file from FileManage file where 1=1");
    long added = 0;
    for (FileManage record : records) {
        try {
            Document luceneDoc = initLuceneDocument(indexWriter, added, record);
            indexWriter.addDocument(luceneDoc);
            added += 1;
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return added;
}
/**
 * Adds the record identified by {@code fmUuid} to the existing Lucene index.
 *
 * <p>I/O failures are printed with {@code printStackTrace()} and not rethrown.</p>
 *
 * @param fmUuid identifier of the {@code FileManage} record to index
 */
@Override
public void insertLucene(String fmUuid) {
    IndexWriter indexWriter = null;
    try {
        File indexDir = new File(FileManage.searchCenterForDocPath);
        Directory dir = FSDirectory.open(indexDir);
        Analyzer luceneAnalyzer = new IKAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, luceneAnalyzer);
        indexWriter = new IndexWriter(dir, iwc);
        LogUtil.getAppLoger().debug("开始增量添加索引");
        long indexcount = this.insertIndex(indexWriter, fmUuid);
        LogUtil.getAppLoger().debug("添加索引结束,共处理数据行数" + indexcount + "条");
        indexWriter.commit();
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        // Close on every path: an IndexWriter that is left open keeps the index write lock,
        // which makes every later attempt to open a writer on this directory fail.
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }
}
/**
 * Adds the document for one record through the given writer.
 *
 * @param indexWriter writer the document is added to
 * @param fmUuid identifier of the record to index
 * @return number of documents added, or -1 when an exception was caught
 */
private long insertIndex(IndexWriter indexWriter, String fmUuid) {
    long inserted;
    try {
        this.showtime();
        inserted = this.insertFileWithDocument(indexWriter, fmUuid);
    } catch (Exception e) {
        e.printStackTrace();
        inserted = -1;
    }
    return inserted;
}
/**
 * Loads the record with the given id and adds its Lucene document to the index.
 *
 * @param indexWriter writer the document is added to
 * @param fmUuid identifier of the record to load
 * @return 1 when the document was added, 0 when building or adding it failed
 */
private long insertFileWithDocument(IndexWriter indexWriter, String fmUuid) {
    FileManage record = JPAUtil.loadById(FileManage.class, fmUuid);
    try {
        Document luceneDoc = initLuceneDocument(indexWriter, 0L, record);
        indexWriter.addDocument(luceneDoc);
        return 1;
    } catch (Exception e) {
        e.printStackTrace();
        return 0;
    }
}
/**
 * Re-indexes the record identified by {@code fmUuid}, replacing its existing Lucene document.
 *
 * <p>I/O failures are printed with {@code printStackTrace()} and not rethrown.</p>
 *
 * @param fmUuid identifier of the {@code FileManage} record to re-index
 */
@Override
public void updateLucene(String fmUuid) {
    IndexWriter indexWriter = null;
    try {
        File indexDir = new File(FileManage.searchCenterForDocPath);
        Directory dir = FSDirectory.open(indexDir);
        Analyzer luceneAnalyzer = new IKAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, luceneAnalyzer);
        indexWriter = new IndexWriter(dir, iwc);
        LogUtil.getAppLoger().debug("开始更新索引");
        long indexcount = this.updateIndex(indexWriter, fmUuid);
        LogUtil.getAppLoger().debug("更新索引结束,共处理数据行数" + indexcount + "条");
        indexWriter.commit();
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        // Close on every path: an IndexWriter that is left open keeps the index write lock,
        // which makes every later attempt to open a writer on this directory fail.
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }
}
/**
 * Replaces the indexed document of one record through the given writer.
 *
 * @param indexWriter writer used for the update
 * @param fmUuid identifier of the record to re-index
 * @return number of documents updated, or -1 when an exception was caught
 */
private long updateIndex(IndexWriter indexWriter, String fmUuid) {
    long updated;
    try {
        this.showtime();
        updated = this.updateFileWithDocument(indexWriter, fmUuid);
    } catch (Exception e) {
        e.printStackTrace();
        updated = -1;
    }
    return updated;
}
/**
 * Loads the record with the given id and replaces the indexed document whose {@code UUID}
 * term equals {@code fmUuid} with a freshly built one.
 *
 * @param indexWriter writer used for the update
 * @param fmUuid identifier of the record; also the value of the {@code UUID} term to replace
 * @return 1 when the update call succeeded, 0 when building or updating failed
 */
private long updateFileWithDocument(IndexWriter indexWriter, String fmUuid) {
    FileManage record = JPAUtil.loadById(FileManage.class, fmUuid);
    try {
        Term key = new Term("UUID", fmUuid);
        Document replacement = initLuceneDocument(indexWriter, 0L, record);
        indexWriter.updateDocument(key, replacement);
        return 1;
    } catch (Exception e) {
        e.printStackTrace();
        return 0;
    }
}
/**
 * Removes the document of the record identified by {@code fmUuid} from the Lucene index.
 *
 * <p>I/O failures are printed with {@code printStackTrace()} and not rethrown.</p>
 *
 * @param fmUuid identifier of the {@code FileManage} record whose document is deleted
 */
@Override
public void deteleLucene(String fmUuid) {
    IndexWriter indexWriter = null;
    try {
        File indexDir = new File(FileManage.searchCenterForDocPath);
        Directory dir = FSDirectory.open(indexDir);
        Analyzer luceneAnalyzer = new IKAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, luceneAnalyzer);
        indexWriter = new IndexWriter(dir, iwc);
        LogUtil.getAppLoger().debug("开始删除索引");
        long indexcount = this.deteleIndex(indexWriter, fmUuid);
        LogUtil.getAppLoger().debug("删除索引结束,共处理数据行数" + indexcount + "条");
        indexWriter.commit();
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        // Close on every path: an IndexWriter that is left open keeps the index write lock,
        // which makes every later attempt to open a writer on this directory fail.
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }
}
/**
 * Deletes the indexed document of one record through the given writer.
 *
 * @param indexWriter writer used for the deletion
 * @param fmUuid identifier of the record whose document is deleted
 * @return number of delete calls that succeeded, or -1 when an exception was caught
 */
private long deteleIndex(IndexWriter indexWriter, String fmUuid) {
    long deleted;
    try {
        this.showtime();
        deleted = this.deleteFileWithDocument(indexWriter, fmUuid);
    } catch (Exception e) {
        e.printStackTrace();
        deleted = -1;
    }
    return deleted;
}
/**
 * Deletes every indexed document whose {@code UUID} term equals {@code fmUuid}.
 *
 * @param indexWriter writer used for the deletion
 * @param fmUuid value of the {@code UUID} term to delete
 * @return 1 when the delete call completed without an exception (whether or not a document
 *         matched), 0 otherwise
 */
private long deleteFileWithDocument(IndexWriter indexWriter, String fmUuid) {
    try {
        Term key = new Term("UUID", fmUuid);
        indexWriter.deleteDocuments(key);
        return 1;
    } catch (Exception e) {
        e.printStackTrace();
        return 0;
    }
}
/**
 * Removes the documents of all given records from the Lucene index.
 *
 * <p>All deletions go through a single {@link IndexWriter} and are committed once, instead of
 * opening, committing and closing a writer per record. A {@code null} or empty list is a no-op
 * and does not touch the index directory.</p>
 *
 * <p>I/O failures are printed with {@code printStackTrace()} and not rethrown.</p>
 *
 * @param fmList records whose documents are deleted
 */
@Override
public void deteleLuceneLs(List<FileManage> fmList) {
    if (fmList == null || fmList.isEmpty()) {
        return;
    }
    IndexWriter indexWriter = null;
    try {
        File indexDir = new File(FileManage.searchCenterForDocPath);
        Directory dir = FSDirectory.open(indexDir);
        Analyzer luceneAnalyzer = new IKAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, luceneAnalyzer);
        indexWriter = new IndexWriter(dir, iwc);
        LogUtil.getAppLoger().debug("开始删除索引");
        long indexcount = 0;
        for (FileManage fm : fmList) {
            indexcount += this.deleteFileWithDocument(indexWriter, fm.getUuid());
        }
        LogUtil.getAppLoger().debug("删除索引结束,共处理数据行数" + indexcount + "条");
        indexWriter.commit();
    } catch (IOException ex) {
        ex.printStackTrace();
    } finally {
        // Close on every path so the index write lock is released.
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
    }
}
/**
 * Builds the Lucene document for one record (shared by create, insert and update).
 *
 * <p>The text of every attachment returned by {@code docService.queryFiles} is extracted with
 * {@code FileManageUtil.getContent} and concatenated into the {@code CONTENT} field.
 * Attachments for which no text is returned contribute nothing. Each attachment stream is
 * closed once it has been read. No {@code FILENAME} field is written by this method.</p>
 *
 * @param indexWriter unused; kept so existing callers compile unchanged
 * @param current unused; kept so existing callers compile unchanged
 * @param fm record to convert
 * @return document holding {@code UUID} plus every optional field that has a value
 */
private Document initLuceneDocument(IndexWriter indexWriter, long current, FileManage fm) {
    StringBuilder contentBuf = new StringBuilder();
    // TODO version number to be decided
    List<EdocFileObjectRelation> fileList = docService.queryFiles(DocConstant.NO_DIR, fm.getUuid(), fm.getClass()
            .getName(), null);
    if (fileList != null) {
        for (EdocFileObjectRelation file : fileList) {
            InputStream fs = docService.getFileInputStream(file.getFileId());
            try {
                String fileName = file.getEdocFile().getName();
                String fileType = fileName.substring(fileName.lastIndexOf(".") + 1);
                String text = FileManageUtil.getContent(fileType, fs);
                // getContent returns null for unsupported types and failed extractions;
                // appending that would index the literal text "null".
                if (text != null) {
                    contentBuf.append(text);
                }
            } finally {
                if (fs != null) {
                    try {
                        fs.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
    }
    String fileContent = contentBuf.toString();

    Document doc = new Document();
    // UUID is indexed un-analyzed so update/delete can address the document by exact term.
    doc.add(new Field("UUID", fm.getUuid(), Store.YES, Index.NOT_ANALYZED));
    if (!SzglCommonUtil.strIsNull(fileContent)) {
        doc.add(new Field("CONTENT", fileContent, Store.YES, Index.ANALYZED)); // attachment text
    }
    if (!SzglCommonUtil.strIsNull(fm.getTitle())) {
        doc.add(new Field("TITLE", fm.getTitle(), Store.YES, Index.ANALYZED)); // record title
    }
    if (!SzglCommonUtil.strIsNull(fm.getDocNum())) {
        doc.add(new Field("DOCNUM", fm.getDocNum(), Store.YES, Index.ANALYZED)); // document number
    }
    if (fm.getFromMan() != null) {
        doc.add(new Field("FROMMAN", fm.getFromMan().toString(), Store.YES, Index.ANALYZED)); // creator
    }
    if (fm.getType() != null) {
        doc.add(new Field("TYPE", fm.getType(), Store.YES, Index.ANALYZED)); // record category
    }
    if (fm.getStatus() != null) {
        doc.add(new Field("STATUS", fm.getStatus(), Store.YES, Index.ANALYZED)); // record status
    }
    if (!SzglCommonUtil.strIsNull(fm.getIsShare())) {
        doc.add(new Field("ISHARE", fm.getIsShare(), Store.YES, Index.ANALYZED)); // shared flag
    }
    if (fm.getFromTime() != null) {
        doc.add(new Field("FROMTIME", fm.getFromTime() + "", Store.YES, Index.ANALYZED)); // received time
    }
    if (!SzglCommonUtil.strIsNull(fm.getBoxUuid())) {
        FileBox filebox = JPAUtil.loadById(FileBox.class, fm.getBoxUuid());
        // A box id that no longer resolves must not prevent the record from being indexed.
        if (filebox != null && !SzglCommonUtil.strIsNull(filebox.getYearCode())) {
            doc.add(new Field("YEARCODE", filebox.getYearCode(), Store.YES, Index.ANALYZED)); // year of the owning box
        }
    }
    return doc;
}
/**
 * Deletes every regular file directly inside the given directory; sub-directories are left
 * untouched.
 *
 * @param file directory to empty; may be {@code null}
 * @return {@code true} when the directory could be listed and every regular file in it was
 *         deleted; {@code false} when {@code file} is {@code null}, cannot be listed, or at
 *         least one file could not be deleted
 */
private boolean delAllFile(File file) {
    if (file == null) {
        return false;
    }
    // listFiles() returns null when the path is not a directory or cannot be read.
    File[] children = file.listFiles();
    if (children == null) {
        return false;
    }
    boolean allDeleted = true;
    for (File child : children) {
        if (child.isFile() && !child.delete()) {
            allDeleted = false;
        }
    }
    return allDeleted;
}
/**
 * Logs timing information at debug level: the start timestamp on the first call, and the
 * milliseconds elapsed since the previous call on every later one. The {@code time} field is
 * then set to the moment of this call.
 */
private void showtime() {
    long now = System.currentTimeMillis();
    if (time <= 0) {
        LogUtil.getAppLoger().debug("Start time:" + (new Timestamp(System.currentTimeMillis())));
    } else {
        LogUtil.getAppLoger().debug("MilliSecond:" + (now - time));
    }
    time = now;
}
/**
 * Creates the given directory, including missing parents, when it does not exist yet.
 *
 * @param file directory to create
 */
private void creatFile(File file) {
    if (file.exists()) {
        return;
    }
    file.mkdirs();
}
/**
 * Indexes every attachment of every record as its own Lucene document (one document per
 * attachment rather than one per record).
 *
 * <p>Each attachment stream is closed after its text has been extracted. A failure while
 * processing one attachment is printed and does not stop the remaining attachments. A progress
 * line is logged every 10000 documents.</p>
 *
 * @param indexWriter writer the documents are added to
 * @return number of documents successfully added
 */
private long initFile(IndexWriter indexWriter) {
    long current = 0;
    String jpql = "select file from FileManage file where 1=1";
    List<FileManage> fmLs = (List) JPAUtil.find(jpql);
    for (FileManage fm : fmLs) {
        // TODO version number to be decided
        List<EdocFileObjectRelation> fileList = docService.queryFiles(DocConstant.NO_DIR, fm.getUuid(), fm
                .getClass().getName(), null);
        if (fileList == null) {
            continue;
        }
        for (EdocFileObjectRelation file : fileList) {
            InputStream fs = docService.getFileInputStream(file.getFileId());
            try {
                String fileName = file.getEdocFile().getName();
                String fileType = fileName.substring(fileName.lastIndexOf(".") + 1);
                String fileContent = FileManageUtil.getContent(fileType, fs);

                Document doc = new Document();
                // Indexed un-analyzed (not Index.NO) so that Term("UUID", ...) based
                // update/delete can find documents created here as well.
                doc.add(new Field("UUID", fm.getUuid(), Store.YES, Index.NOT_ANALYZED));
                doc.add(new Field("FILENAME", fileName, Store.YES, Index.NO)); // attachment name
                if (!SzglCommonUtil.strIsNull(fileContent)) {
                    doc.add(new Field("CONTENT", fileContent, Store.YES, Index.ANALYZED)); // attachment text
                }
                if (!SzglCommonUtil.strIsNull(fm.getTitle())) {
                    doc.add(new Field("TITLE", fm.getTitle(), Store.YES, Index.ANALYZED)); // record title
                }
                if (!SzglCommonUtil.strIsNull(fm.getDocNum())) {
                    doc.add(new Field("DOCNUM", fm.getDocNum(), Store.YES, Index.ANALYZED)); // document number
                }
                if (fm.getFromMan() != null) {
                    doc.add(new Field("FROMMAN", fm.getFromMan().toString(), Store.YES, Index.ANALYZED)); // creator
                }
                indexWriter.addDocument(doc);
                current++;
                if (current % 10000 == 0) {
                    LogUtil.getAppLoger().debug("current row num:" + current);
                }
            } catch (Exception ex) {
                ex.printStackTrace();
            } finally {
                if (fs != null) {
                    try {
                        fs.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
    }
    return current;
}
}
根据索引去查询,并将关键词标红
package com.haiyisoft.szgl.file.service.impl;
import java.io.File;
/**
* 通过Lucene查询
*
* @author haojiahong
*
* <p>Modification History:</p>
* <p>Date Author Description</p>
* <p>--------------------------------------------------------------</p>
* <p>20151102 haojiahong new</p>
* <p> </p>
*/
@Component("fileSchByLuceneService")
public class FileSchByLuceneServiceImpl implements FileSchByLuceneService {
/**
 * Searches the Lucene index and returns one page of matching records with the matched terms
 * highlighted.
 *
 * <p>Every non-empty criterion is added as a mandatory ({@code MUST}) clause. {@code keyword}
 * is parsed against {@code CONTENT}, {@code titleSch} against {@code TITLE}, {@code docNumSch}
 * against {@code DOCNUM} and {@code isShareSch} against {@code ISHARE}; {@code typeSch} is
 * matched as an exact {@code TYPE} term. When at least one year bound is given, a
 * {@code YEARCODE} range clause is added; a missing bound leaves that end of the range open.</p>
 *
 * <p>NOTE(review): the range bounds are the {@code Timestamp.toString()} form while
 * {@code YEARCODE} is written from the box's year code by the index builder — confirm that
 * both sides use a comparable text format.</p>
 *
 * @param keyword full-text query on the attachment content; may be empty
 * @param titleSch query on the title; may be empty
 * @param docNumSch query on the document number; may be empty
 * @param typeSch exact category value; may be empty
 * @param isShareSch query on the shared flag; may be empty
 * @param yearCodeBegin inclusive lower bound of the year range, or {@code null} for open
 * @param yearCodeEnd inclusive upper bound of the year range, or {@code null} for open
 * @param sortParamList not used by this implementation
 * @param pageInfo paging input; receives the total hit count. When {@code null}, all hits are
 *        returned
 * @return the requested page of hits; {@code null} when none of the five text criteria is
 *         given or the index directory does not exist; a possibly incomplete list when an
 *         exception occurred (it is printed, not rethrown)
 */
@Override
public List<FileManage> retrieveByLucene(String keyword, String titleSch, String docNumSch, String typeSch,
        String isShareSch, Timestamp yearCodeBegin, Timestamp yearCodeEnd, SortParamList sortParamList,
        PageInfo pageInfo) {
    List<FileManage> result = new ArrayList<FileManage>();
    String indexDir = FileManage.searchCenterForDocPath;
    boolean noCriteria = SzglCommonUtil.strIsNull(keyword) && SzglCommonUtil.strIsNull(docNumSch)
            && SzglCommonUtil.strIsNull(titleSch) && SzglCommonUtil.strIsNull(typeSch)
            && SzglCommonUtil.strIsNull(isShareSch);
    if (noCriteria || !new File(indexDir).exists()) {
        if (pageInfo != null) {
            pageInfo.setAllRowNum(0);
        }
        // Kept as null (not an empty list) so existing callers see the same result as before.
        return null;
    }

    IndexReader reader = null;
    IndexSearcher searcher = null;
    try {
        Directory dir = FSDirectory.open(new File(indexDir));
        reader = IndexReader.open(dir);
        searcher = new IndexSearcher(reader);
        Analyzer anal = new IKAnalyzer();
        BooleanQuery query = new BooleanQuery();

        // Parse each text criterion once; the parsed query is reused for highlighting below.
        Query contentQuery = null;
        Query titleQuery = null;
        Query docNumQuery = null;
        if (!SzglCommonUtil.strIsNull(keyword)) {
            contentQuery = new QueryParser(Version.LUCENE_36, "CONTENT", anal).parse(keyword);
            query.add(contentQuery, Occur.MUST);
        }
        if (!SzglCommonUtil.strIsNull(titleSch)) {
            titleQuery = new QueryParser(Version.LUCENE_36, "TITLE", anal).parse(titleSch);
            query.add(titleQuery, Occur.MUST);
        }
        if (!SzglCommonUtil.strIsNull(docNumSch)) {
            docNumQuery = new QueryParser(Version.LUCENE_36, "DOCNUM", anal).parse(docNumSch);
            query.add(docNumQuery, Occur.MUST);
        }
        if (!SzglCommonUtil.strIsNull(typeSch)) {
            query.add(new TermQuery(new Term("TYPE", typeSch)), Occur.MUST);
        }
        if (!SzglCommonUtil.strIsNull(isShareSch)) {
            query.add(new QueryParser(Version.LUCENE_36, "ISHARE", anal).parse(isShareSch), Occur.MUST);
        }
        if (yearCodeBegin != null || yearCodeEnd != null) {
            // A null bound makes that end of the range open instead of throwing an NPE.
            String lower = yearCodeBegin == null ? null : yearCodeBegin.toString();
            String upper = yearCodeEnd == null ? null : yearCodeEnd.toString();
            query.add(new TermRangeQuery("YEARCODE", lower, upper, true, true), Occur.MUST);
        }

        ScoreDoc[] hits = searcher.search(query, Integer.MAX_VALUE).scoreDocs;
        int total = hits.length;
        int from = 0;
        int to = total;
        if (pageInfo != null) {
            pageInfo.setAllRowNum(total);
            // Index of the first hit of the current page, clamped so page numbers < 1 are safe.
            from = Math.max(0, (pageInfo.getCurPageNum() - 1) * pageInfo.getRowOfPage());
            to = Math.min(total, from + pageInfo.getRowOfPage());
        }

        for (int i = from; i < to; i++) {
            Document doc = searcher.doc(hits[i].doc);
            FileManage fm = new FileManage();
            fm.setUuid(doc.get("UUID"));
            if (docNumQuery != null) {
                fm.setDocNum(lighterStr(anal, docNumQuery, doc.get("DOCNUM"), "DOCNUM"));
            } else {
                fm.setDocNum(doc.get("DOCNUM"));
            }
            if (titleQuery != null) {
                fm.setTitle(lighterStr(anal, titleQuery, doc.get("TITLE"), "TITLE"));
            } else {
                fm.setTitle(doc.get("TITLE"));
            }
            if (contentQuery != null) {
                fm.setFileContent(lighterStr(anal, contentQuery, doc.get("CONTENT"), "CONTENT"));
            } else {
                fm.setFileContent(doc.get("CONTENT"));
            }
            // FROMMAN and FROMTIME are optional in the index; parsing a missing value would
            // throw and cut the result list short at this hit.
            String fromMan = doc.get("FROMMAN");
            if (fromMan != null) {
                fm.setFromMan(Long.valueOf(fromMan));
            }
            fm.setFileName(doc.get("FILENAME"));
            fm.setType(doc.get("TYPE"));
            fm.setStatus(doc.get("STATUS"));
            String fromTime = doc.get("FROMTIME");
            if (fromTime != null) {
                fm.setFromTime(Timestamp.valueOf(fromTime));
            }
            result.add(fm);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        // A searcher built from an IndexReader does not close that reader; close it here.
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return result;
}
/**
 * Wraps the terms of {@code query} found in {@code txt} in a red {@code <span>} and returns
 * the best-scoring fragment.
 *
 * @param a analyzer used to tokenize {@code txt}
 * @param query query whose terms are highlighted
 * @param txt stored field text to highlight
 * @param fieldname name of the field {@code txt} came from
 * @return the highlighted fragment, or {@code txt} unchanged when the highlighter returns
 *         no fragment
 * @throws Exception when highlighting fails
 */
private String lighterStr(Analyzer a, Query query, String txt, String fieldname) throws Exception {
    QueryScorer scorer = new QueryScorer(query);
    Formatter redSpan = new SimpleHTMLFormatter("<span style='color:red'>", "</span>");
    Highlighter lighter = new Highlighter(redSpan, scorer);
    lighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
    // NOTE(review): no Encoder is passed to the Highlighter, so the surrounding text is
    // presumably returned without HTML escaping — confirm before rendering it as HTML.
    String best = lighter.getBestFragment(a, fieldname, txt);
    return best != null ? best : txt;
}
}
一些工具类,用于读文档内容。
* Copyright (C) 2014-2020 Yantai HaiYi Software Co.,Ltd
package com.haiyisoft.szgl.file.util;
import java.io.BufferedReader;
/**
 * Utility methods for the file-management module.
 *
 * @author haojiahong
 *
 * <p>Modification History:</p>
 * <p>Date Author Description</p>
 * <p>--------------------------------------------------------------</p>
 * <p>20151102 haojiahong new</p>
 * <p> </p>
 */
public class FileManageUtil {
    /**
     * Extracts the plain text of a document according to its file type.
     *
     * <p>Supported types are {@code doc}, {@code docx}, {@code txt} and {@code pdf}, matched
     * case-insensitively. The stream is read but not closed; closing it is left to the
     * caller.</p>
     *
     * @param type file extension without the dot; may be {@code null}
     * @param fs stream positioned at the start of the document
     * @return the extracted text; {@code null} when the type is not supported, extraction
     *         failed, or a text file contained no lines
     */
    public static String getContent(String type, InputStream fs) {
        String text = null;
        if ("doc".equalsIgnoreCase(type)) {
            try {
                POITextExtractor ex = new WordExtractor(fs);
                text = ex.getText();
            } catch (Exception e) {
                e.printStackTrace();
            }
        } else if ("docx".equalsIgnoreCase(type)) {
            try {
                OPCPackage opcPackage = OPCPackage.open(fs);
                POITextExtractor ex = new XWPFWordExtractor(opcPackage);
                text = ex.getText();
            } catch (Exception e) {
                e.printStackTrace();
            }
        } else if ("txt".equalsIgnoreCase(type)) {
            // Decodes with the platform default charset, as before.
            BufferedReader reader = new BufferedReader(new InputStreamReader(fs));
            // Start from an empty buffer: concatenating onto a null String prefixed the
            // result with the literal text "null".
            StringBuilder buf = new StringBuilder();
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    // Keep a separator so the last word of one line and the first word of
                    // the next are not glued together.
                    buf.append(line).append('\n');
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
            // Whatever was read before a failure is still returned.
            if (buf.length() > 0) {
                text = buf.toString();
            }
        } else if ("pdf".equalsIgnoreCase(type)) {
            PDDocument pdfDocument = null;
            try {
                pdfDocument = PDDocument.load(fs);
                text = new PDFTextStripper().getText(pdfDocument);
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                // Release the resources held by the parsed PDF.
                if (pdfDocument != null) {
                    try {
                        pdfDocument.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
        return text;
    }
}