1. Creating the index:
package com.prl.utils.lucene;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import org.wltea.analyzer.lucene.IKAnalyzer;
public class IndexUtils {
private final static String charset = "utf-8";
public void createIndex(String docPath, String indexPath) throws Exception {
IndexWriter indexWriter = null;
// Create the Directory object
Directory dir = new SimpleFSDirectory(new File(indexPath));
Analyzer analyzer = new IKAnalyzer();
// Create the IndexWriter: the first argument is the Directory, the second the analyzer,
// the third whether to create a new index (false means append to / update an existing one),
// and the fourth the maximum number of terms indexed per field; UNLIMITED means long documents are not truncated
indexWriter = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
File[] files = new File(docPath).listFiles();
for (int i = 0; i < files.length; i++) {
if (files[i].isFile()) {
// Read the raw HTML, then extract its plain text and page title
String fileContent = readTextFile(files[i].getAbsolutePath(), charset);
Map<String,String> parserResult = getPlainText(fileContent);
fileContent = parserResult.get("plainText");
String title = parserResult.get("title");
if (title == null) { title = files[i].getName(); } // fall back to the file name when no <title> was found
Document doc = new Document();
// Create the Field objects and add them to the Document
// (alternative: doc.add(new Field("contents", new FileReader(files[i]))) to index straight from a Reader)
doc.add(new Field("contents", fileContent, Field.Store.YES,
Field.Index.ANALYZED));
doc.add(new Field("filename", files[i].getName(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
doc.add(new Field("pagetitle", title, Field.Store.YES,
Field.Index.NOT_ANALYZED));
doc.add(new Field("indexDate", DateTools.dateToString(
new Date(), DateTools.Resolution.DAY), Field.Store.YES,
Field.Index.NOT_ANALYZED));
// Add the document to the index via the IndexWriter
indexWriter.addDocument(doc);
// showAnalyzerResult(analyzer,getFileContent(files[i].getCanonicalPath()));
}
}
System.out.println("indexWriter.numDocs():" + indexWriter.numDocs());
indexWriter.optimize();
indexWriter.close();
}
public void showAnalyzerResult(Analyzer analyzer, String s)
throws Exception {
System.out.println("分词结果:");
StringReader reader = new StringReader(s);
TokenStream ts = analyzer.tokenStream("contents", reader); // the first argument is a field name
TermAttribute termAtt = (TermAttribute) ts
.getAttribute(TermAttribute.class);
while (ts.incrementToken()) {
System.out.print(termAtt.term());
System.out.print(' ');
// System.out.println(typeAtt.type());
}
System.out.println("\n分析完毕.......................");
}
public static String readTextFile(String sFileName, String sEncode) {
StringBuffer sbStr = new StringBuffer();
try {
File ff = new File(sFileName);
InputStreamReader read = new InputStreamReader(new FileInputStream(
ff), sEncode);
BufferedReader ins = new BufferedReader(read);
String dataLine = "";
while (null != (dataLine = ins.readLine())) {
sbStr.append(dataLine);
// sbStr.append("\r\n");
}
ins.close();
} catch (Exception e) {
e.printStackTrace();
}
return sbStr.toString();
}
/**
 * Extract the plain text and the page title from an HTML string.
 *
 * @param str the raw HTML source
 * @return a map with the keys "plainText" and "title"
 */
public static Map<String,String> getPlainText(String str) {
Map<String,String> result = new HashMap<String,String>();
try {
Parser parser = new Parser();
parser.setInputHTML(str);
StringBean sb = new StringBean();
// Do not collect the links contained in the page
sb.setLinks(false);
// Replace non-breaking spaces with regular spaces
sb.setReplaceNonBreakingSpaces(true);
// Collapse sequences of whitespace into a single space
sb.setCollapse(true);
parser.visitAllNodesWith(sb);
String plainText = sb.getStrings();
Parser myParser = Parser.createParser(str, charset);
HtmlPage visitor = new HtmlPage(myParser);
myParser.visitAllNodesWith(visitor);
String title = visitor.getTitle();
//System.out.println("title="+title);
result.put("title",title);
result.put("plainText",plainText);
} catch (ParserException e) {
e.printStackTrace(); // don't swallow the parse error silently
}
return result;
}
}
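For reference, a minimal call that builds an index from a directory of HTML files could look like the sketch below (the two paths are placeholders, not values from the original code):
public class IndexDemo {
    public static void main(String[] args) throws Exception {
        // Hypothetical paths; point them at your own HTML directory and index directory
        String docPath = "/data/html-docs";
        String indexPath = "/data/lucene-index";
        new IndexUtils().createIndex(docPath, indexPath);
    }
}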
2. Searching, with highlighted hits:
package com.prl.utils.lucene;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.wltea.analyzer.lucene.IKAnalyzer;
public class SerachUtils {
public SearchResult search(String indexPath, String keyWords,
Integer maxSerach) throws IOException, ParseException, InvalidTokenOffsetsException {
SearchResult searchResult = new SearchResult();
Directory dir = new SimpleFSDirectory(new File(indexPath));
// Create the IndexSearcher; unlike the IndexWriter, it only needs the index directory
IndexSearcher indexSearch = new IndexSearcher(dir);
// Create the QueryParser: the first argument is the Lucene version, the second the field to search, the third the analyzer to use
Analyzer analyzer = new IKAnalyzer();
QueryParser queryParser = new QueryParser(Version.LUCENE_30, "contents", analyzer);
// Build the Query object
Query query = queryParser.parse(keyWords);
// Run the search; TopDocs contains a scoreDocs[] array holding the ids of the matching documents
TopDocs hits = indexSearch.search(query, maxSerach);
// hits.totalHits is the total number of matches
// Loop over hits.scoreDocs, use indexSearch.doc() to load each Document, then read its stored fields
searchResult.setMatchCount(hits.totalHits);
searchResult.setKeyWords(keyWords);
for (int i = 0; i < hits.scoreDocs.length; i++) {
ScoreDoc sdoc = hits.scoreDocs[i];
Document doc = indexSearch.doc(sdoc.doc);
String fileName = doc.get("filename");
String contents = doc.get("contents");
String filetitle = doc.get("pagetitle");
SimpleHTMLFormatter sHtmlF = new SimpleHTMLFormatter("<b><font color='red'>", "</font></b>");
Highlighter highlighter = new Highlighter(sHtmlF, new QueryScorer(query));
highlighter.setTextFragmenter(new SimpleFragmenter(50));
if (contents != null) {
TokenStream tokenStream = analyzer.tokenStream("contents",new StringReader(contents));
String matchText = highlighter.getBestFragment(tokenStream,contents);
searchResult.addMatchItem(filetitle,fileName,matchText);
}
}
indexSearch.close();
return searchResult;
}
}
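The SearchResult holder used above is not shown in this article; a minimal sketch of what it might look like, with assumed field and accessor names, is:
package com.prl.utils.lucene;

import java.util.ArrayList;
import java.util.List;

// Assumed minimal implementation of the SearchResult holder referenced by SerachUtils;
// the original article does not include this class, so all names here are guesses.
public class SearchResult {
    private int matchCount;
    private String keyWords;
    private final List<String[]> items = new ArrayList<String[]>();

    public void setMatchCount(int matchCount) { this.matchCount = matchCount; }
    public void setKeyWords(String keyWords) { this.keyWords = keyWords; }
    // Each match keeps the page title, the file name and the highlighted fragment
    public void addMatchItem(String title, String fileName, String matchText) {
        items.add(new String[] { title, fileName, matchText });
    }
    public int getMatchCount() { return matchCount; }
    public String getKeyWords() { return keyWords; }
    public List<String[]> getItems() { return items; }
}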
The comments above should make the code clear enough at a glance, so there is not much more to explain.
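A quick way to try the two classes together might be the following (the path and the result accessors come from the assumptions above, not from the original article):
public class SearchDemo {
    public static void main(String[] args) throws Exception {
        String indexPath = "/data/lucene-index"; // hypothetical path, same as in the indexing sketch
        SearchResult result = new SerachUtils().search(indexPath, "lucene", 10);
        System.out.println("total matches: " + result.getMatchCount());
        for (String[] item : result.getItems()) {
            // item[0] = page title, item[1] = file name, item[2] = highlighted fragment
            System.out.println(item[0] + " | " + item[1] + " | " + item[2]);
        }
    }
}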