Lucene主要就是一个用来进行信息检索的工具。
信息检索主要分为以下步骤:
1:构建文本库
2:建立索引
3:进行搜索
4:对结果进行过滤
初次接触Lucene,主要流程如下:
1:切割文档,将一份文档分解为多个小文档
2:创建索引文件
3:执行查询(在已建立的索引上进行搜索)
具体代码如下:
/**
 * Prepares one large text file for indexing.
 *
 * <p>The work is done in two passes: {@link #charactorProcess(File, String)}
 * copies the input while dropping empty lines and normalizing punctuation, and
 * {@link #splitToSmallFiles(File, String)} cuts the normalized copy into
 * {@code output<N>.txt} chunk files.
 *
 * <p>All file access uses the platform default character encoding.
 */
public class FilePreprocess {

    /** A chunk is flushed once its encoded size reaches this many bytes. */
    private static final int MAX_CHUNK_BYTES = 10240;

    /**
     * Punctuation substitutions applied by {@link #replace(String)}; every key
     * and every value is exactly one character long.
     *
     * <p>NOTE(review): the entries that map a character to itself are no-ops as
     * written. They look like full-width punctuation (for example U+FF0C) that
     * was converted to ASCII when this listing was transcribed -- confirm and
     * restore the intended keys.
     */
    private static final HashMap<String, String> PUNCTUATION = new HashMap<String, String>();

    static {
        PUNCTUATION.put(",", ",");
        PUNCTUATION.put("\u3002", ","); // ideographic full stop
        PUNCTUATION.put("\u300A", "<"); // left double angle bracket
        PUNCTUATION.put("\u300B", ">"); // right double angle bracket
        PUNCTUATION.put("\u3010", "["); // left black lenticular bracket
        PUNCTUATION.put("\u3011", "]"); // right black lenticular bracket
        PUNCTUATION.put("{", "{");
        PUNCTUATION.put("}", "}");
        PUNCTUATION.put(":", ":");
        PUNCTUATION.put("!", "!");
    }

    /**
     * Normalizes {@code file} into {@code outputDir + "output.all"} and then
     * splits that file into chunks inside {@code outputDir}.
     *
     * <p>Failures are reported on standard error and not propagated.
     *
     * @param file      the text file to preprocess
     * @param outputDir target directory prefix; it is concatenated directly with
     *                  file names, so it must end with a path separator
     */
    public static void preprocess(File file, String outputDir) {
        try {
            File normalized = charactorProcess(file, outputDir + "output.all");
            splitToSmallFiles(normalized, outputDir);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Copies {@code file} to {@code destFile} line by line, skipping empty lines
     * and replacing the punctuation listed in {@link #PUNCTUATION}.
     *
     * @param file     the source text file
     * @param destFile path of the file to create or overwrite
     * @return a {@link File} for {@code destFile}
     * @throws Exception if either file cannot be opened, read or written
     */
    public static File charactorProcess(File file, String destFile) throws Exception {
        // Open the input first so that a missing input does not create an empty output file.
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            BufferedWriter writer = new BufferedWriter(new FileWriter(destFile));
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    // readLine() strips the line terminator, so an empty source
                    // line arrives here as "" and never as "\r\n".
                    if (line.length() == 0) {
                        continue;
                    }
                    writer.write(replace(line));
                    writer.newLine();
                }
            } finally {
                writer.close();
            }
        } finally {
            reader.close();
        }
        return new File(destFile);
    }

    /**
     * Splits {@code file} into files named {@code output0.txt},
     * {@code output1.txt}, ... under {@code outputpath}.
     *
     * <p>Whole lines are accumulated, each terminated with {@code "\r\n"}, and a
     * chunk is written as soon as the accumulated text encodes to at least
     * {@link #MAX_CHUNK_BYTES} bytes. Any remaining text becomes the final
     * chunk; no file is written for an empty remainder.
     *
     * @param file       the file to split
     * @param outputpath target directory prefix, ending with a path separator
     * @throws IOException if the input cannot be read or a chunk cannot be written
     */
    public static void splitToSmallFiles(File file, String outputpath) throws IOException {
        int filePointer = 0;
        StringBuilder buffer = new StringBuilder();
        BufferedReader reader = new BufferedReader(new FileReader(file));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                buffer.append(line).append("\r\n");
                if (buffer.toString().getBytes().length >= MAX_CHUNK_BYTES) {
                    writeChunk(outputpath, filePointer, buffer.toString());
                    filePointer++;
                    // Start collecting the next chunk.
                    buffer.setLength(0);
                }
            }
        } finally {
            reader.close();
        }
        if (buffer.length() > 0) {
            writeChunk(outputpath, filePointer, buffer.toString());
        }
    }

    /** Writes {@code text} to {@code outputpath + "output" + index + ".txt"}. */
    private static void writeChunk(String outputpath, int index, String text) throws IOException {
        BufferedWriter writer = new BufferedWriter(new FileWriter(outputpath + "output" + index + ".txt"));
        try {
            writer.write(text);
        } finally {
            writer.close();
        }
    }

    /**
     * Returns {@code line} with every character that is a key of
     * {@link #PUNCTUATION} replaced by its mapped character.
     */
    private static String replace(String line) {
        StringBuilder result = new StringBuilder(line.length());
        for (int i = 0; i < line.length(); i++) {
            String original = line.substring(i, i + 1);
            String mapped = PUNCTUATION.get(original);
            result.append(mapped != null ? mapped : original);
        }
        return result.toString();
    }

    /** Preprocesses {@code d:\book.txt} into {@code d:\testfolder\}, creating the folder if needed. */
    public static void main(String[] args) {
        String inputFile = "d:\\book.txt";
        String outputDir = "d:\\testfolder\\";
        File targetDir = new File(outputDir);
        if (!targetDir.exists()) {
            targetDir.mkdirs();
        }
        preprocess(new File(inputFile), outputDir);
    }
}
建立索引:
/**
 * Builds a Lucene index from the {@code .txt} files of one directory.
 *
 * <p>Each file becomes one {@code Document} with two stored, tokenized fields:
 * {@code filename} and {@code content}.
 */
public class IndexProcesser {

    /** Location where the index files are written. */
    private static final String INDEX_STORE_PATH = "d:\\index";

    /**
     * Indexes every regular file in {@code inputDir} whose name ends with
     * {@code .txt}, using {@code MMAnalyzer} for tokenization.
     *
     * <p>The {@code IndexWriter} is opened with its third argument set to
     * {@code true}. Files that cannot be read are skipped. Other failures are
     * reported on standard error and not propagated.
     *
     * @param inputDir directory containing the text files to index
     */
    public void createIndex(String inputDir) {
        try {
            // Check the input before opening the writer, so a bad path never touches the index.
            File[] files = new File(inputDir).listFiles();
            if (files == null) {
                System.err.println("Cannot list directory: " + inputDir);
                return;
            }
            IndexWriter writer = new IndexWriter(INDEX_STORE_PATH, new MMAnalyzer(), true);
            try {
                for (File file : files) {
                    String fileName = file.getName();
                    // endsWith() also copes with names that contain no '.', which
                    // substring(lastIndexOf(".")) would reject with an exception.
                    if (!file.isFile() || !fileName.endsWith(".txt")) {
                        continue;
                    }
                    String content = loadFileToString(file);
                    if (content == null) {
                        // The read failure has already been reported by loadFileToString.
                        continue;
                    }
                    Document doc = new Document();
                    doc.add(new Field("filename", fileName, Field.Store.YES, Field.Index.TOKENIZED));
                    doc.add(new Field("content", content, Field.Store.YES, Field.Index.TOKENIZED));
                    writer.addDocument(doc);
                }
            } finally {
                // Always close the writer, even when adding a document fails.
                writer.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads {@code file} and returns its lines concatenated without any
     * separator between them.
     *
     * @param file the file to read, using the platform default encoding
     * @return the concatenated text, or {@code null} if the file could not be
     *         read (the error is printed to standard error)
     */
    public String loadFileToString(File file) {
        try {
            BufferedReader br = new BufferedReader(new FileReader(file));
            try {
                StringBuilder sb = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    sb.append(line);
                }
                return sb.toString();
            } finally {
                br.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /** Indexes the chunk files found in {@code d:\testfolder}. */
    public static void main(String[] args) {
        IndexProcesser indexProcesser = new IndexProcesser();
        indexProcesser.createIndex("d:\\testfolder");
    }
}
执行查询:
/**
 * Looks up a single term in the index stored at {@code d:\index} and prints
 * every document that contains it.
 */
public class Search {

    /** Location of the index files to search. */
    private static final String INDEX_STORE_PATH = "d:\\index";

    /**
     * Prints, for each document containing the term, the term frequency in that
     * document followed by the document itself, then the elapsed time in
     * milliseconds.
     *
     * <p>The postings are read directly through {@code TermDocs}; no
     * {@code Query} is executed. Failures are reported on standard error and
     * not propagated.
     *
     * @param searchType name of the field to look in
     * @param searchKey  text of the term to look for
     */
    public void indexSearch(String searchType, String searchKey) {
        try {
            IndexSearcher searcher = new IndexSearcher(INDEX_STORE_PATH);
            try {
                Term t = new Term(searchType, searchKey);
                // Start of the timed section.
                long beginTime = System.currentTimeMillis();
                // Enumerates <document, frequency> pairs for the term.
                TermDocs termDocs = searcher.getIndexReader().termDocs(t);
                try {
                    while (termDocs.next()) {
                        // Number of occurrences of the term in this document.
                        System.out.println(termDocs.freq());
                        // The matching document.
                        System.out.println(searcher.getIndexReader().document(termDocs.doc()));
                    }
                } finally {
                    termDocs.close();
                }
                long timeOfSearch = System.currentTimeMillis() - beginTime;
                System.out.println("The time For indexsearch is " + timeOfSearch + " ms");
            } finally {
                // Close the searcher whether or not the lookup succeeded.
                searcher.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
所用jar包请参考附件