Lucene学习入门之简单示例

本文详细介绍了使用Lucene进行文本处理、建立索引的全过程,包括文档切割、创建索引文件、执行索引操作,并通过代码示例展示了具体实现步骤。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

Lucene 是一个用于信息检索(全文检索)的工具库。

 

信息检索主要分为以下步骤:

1:构建文本库

2:建立索引

3:进行搜索

4:对结果进行过滤

 

初次接触 Lucene,主要流程如下:

1:切割文档,将一份文档分解为多个小文档

2:创建索引文件

3:执行查询(在已建立的索引上检索关键词)

具体代码如下:

/**
 * Prepares a text file for indexing in two passes: first the punctuation of every
 * line is normalized into an intermediate file, then that file is cut into small
 * numbered files named {@code output0.txt}, {@code output1.txt}, ...
 *
 * <p>All file reads and writes use the platform default charset.
 */
public class FilePreprocess {

	/** A chunk is flushed to its own file once it holds at least this many bytes. */
	private static final int MAX_SIZE = 10240;

	/** Line terminator appended after every line inside the chunk files. */
	private static final String LINE_END = "\r\n";

	/**
	 * Normalizes {@code file} into {@code outputDir + "output.all"} and then splits
	 * that intermediate file into small files inside {@code outputDir}.
	 *
	 * <p>{@code outputDir} is used as a plain string prefix, so it must end with a
	 * path separator. Failures are reported on standard error and not rethrown.
	 *
	 * @param file      the source text file
	 * @param outputDir prefix (directory path ending with a separator) for all output files
	 */
	public static void preprocess(File file, String outputDir) {
		try {
			File normalized = charactorProcess(file, outputDir + "output.all");
			splitToSmallFiles(normalized, outputDir);
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Copies {@code file} to {@code destFile} line by line, converting punctuation
	 * (see {@link #replace(String)}) and dropping empty lines.
	 *
	 * @param file     the source text file
	 * @param destFile path of the file to write
	 * @return a {@code File} pointing at {@code destFile}
	 * @throws Exception if the source cannot be read or the destination cannot be written
	 */
	public static File charactorProcess(File file, String destFile) throws Exception {
		BufferedReader reader = new BufferedReader(new FileReader(file));
		try {
			BufferedWriter writer = new BufferedWriter(new FileWriter(destFile));
			try {
				String line;
				while ((line = reader.readLine()) != null) {
					// readLine() strips the line terminator, so a blank line arrives as ""
					// (it can never equal "\r\n"); skip it.
					if (line.length() == 0) {
						continue;
					}
					writer.write(replace(line));
					writer.newLine();
				}
			} finally {
				writer.close();
			}
		} finally {
			reader.close();
		}
		return new File(destFile);
	}

	/**
	 * Splits {@code file} into files named {@code outputpath + "output" + n + ".txt"}.
	 * Lines are accumulated (each terminated with CRLF) and a file is written as soon
	 * as the accumulated text reaches {@link #MAX_SIZE} bytes; lines are never cut.
	 * Whatever remains after the last line is written to one final file, even when
	 * nothing remains.
	 *
	 * @param file       the file to split
	 * @param outputpath string prefix for the generated files (directory path ending with a separator)
	 * @throws IOException if reading the input or writing a chunk fails
	 */
	public static void splitToSmallFiles(File file, String outputpath) throws IOException {
		int filePointer = 0;
		// Byte length of the text currently held in 'pending', maintained incrementally
		// so the whole buffer does not have to be re-encoded after every line.
		int pendingBytes = 0;
		StringBuilder pending = new StringBuilder();
		BufferedReader reader = new BufferedReader(new FileReader(file));
		try {
			String line;
			while ((line = reader.readLine()) != null) {
				pending.append(line).append(LINE_END);
				pendingBytes += line.getBytes().length + LINE_END.getBytes().length;
				if (pendingBytes >= MAX_SIZE) {
					writeChunk(outputpath, filePointer, pending.toString());
					filePointer++;
					pending.setLength(0);
					pendingBytes = 0;
				}
			}
		} finally {
			reader.close();
		}
		writeChunk(outputpath, filePointer, pending.toString());
	}

	/** Writes {@code content} to the chunk file with the given index, always closing the file. */
	private static void writeChunk(String outputpath, int index, String content) throws IOException {
		BufferedWriter writer = new BufferedWriter(new FileWriter(outputpath + "output" + index + ".txt"));
		try {
			writer.write(content);
		} finally {
			writer.close();
		}
	}

	/**
	 * Returns {@code line} with CJK / full-width punctuation replaced by ASCII
	 * punctuation. Every replacement is one character for one character.
	 */
	private static String replace(String line) {
		char[] chars = line.toCharArray();
		for (int i = 0; i < chars.length; i++) {
			chars[i] = toHalfWidth(chars[i]);
		}
		return new String(chars);
	}

	/**
	 * Maps a single punctuation character to its ASCII counterpart; any other
	 * character is returned unchanged. Non-ASCII characters are written as Unicode
	 * escapes so the table does not depend on the source file encoding.
	 */
	private static char toHalfWidth(char c) {
		switch (c) {
			case '\uFF0C': // full-width comma
				return ',';
			case '\u3002': // ideographic full stop
				// NOTE(review): mapped to a comma, not a period; kept as-is, confirm this is intended.
				return ',';
			case '\u300A': // left double angle bracket
				return '<';
			case '\u300B': // right double angle bracket
				return '>';
			case '\u3010': // left black lenticular bracket
				return '[';
			case '\u3011': // right black lenticular bracket
				return ']';
			case '\uFF5B': // full-width left curly bracket
				return '{';
			case '\uFF5D': // full-width right curly bracket
				return '}';
			case '\uFF1A': // full-width colon
				return ':';
			case '\uFF01': // full-width exclamation mark
				return '!';
			default:
				return c;
		}
	}

	/** Demo entry point: preprocesses {@code d:\book.txt} into {@code d:\testfolder\}. */
	public static void main(String[] args) {
		String inputFile = "d:\\book.txt";
		String outputDir = "d:\\testfolder\\";
		File dir = new File(outputDir);
		if (!dir.exists()) {
			dir.mkdirs();
		}
		FilePreprocess.preprocess(new File(inputFile), outputDir);
	}

}

 建立索引:

/**
 * Builds a Lucene index from the {@code .txt} files of a directory. Each file
 * becomes one document with a {@code filename} field and a {@code content} field.
 */
public class IndexProcesser {

	// Location where the generated index files are stored.
	private String INDEX_STORE_PATH = "d:\\index";

	/**
	 * Creates the index at {@code INDEX_STORE_PATH} from every regular file in
	 * {@code inputDir} whose name ends with {@code .txt}. The writer is opened with
	 * the create flag set to {@code true} and uses {@code MMAnalyzer} for tokenizing.
	 *
	 * <p>Files that cannot be read are skipped. Failures are reported on standard
	 * error and not rethrown.
	 *
	 * @param inputDir directory containing the text files to index
	 */
	public void createIndex(String inputDir){
		// listFiles() returns null when the path is not a readable directory; check
		// before opening the writer so a bad path does not touch the existing index.
		File[] files = new File(inputDir).listFiles();
		if(files == null){
			System.err.println("Cannot list input directory: " + inputDir);
			return;
		}
		IndexWriter writer = null;
		try{
			writer = new IndexWriter(INDEX_STORE_PATH, new MMAnalyzer(), true);
			for(File candidate : files){
				String fileName = candidate.getName();
				// endsWith() also copes with names that contain no dot at all.
				if(!candidate.isFile() || !fileName.endsWith(".txt")){
					continue;
				}
				String content = loadFileToString(candidate);
				if(content == null){
					// The read failure has already been reported by loadFileToString.
					continue;
				}
				Document doc = new Document();
				doc.add(new Field("filename", fileName, Field.Store.YES, Field.Index.TOKENIZED));
				doc.add(new Field("content", content, Field.Store.YES, Field.Index.TOKENIZED));
				writer.addDocument(doc);
			}
		}catch(Exception e){
			e.printStackTrace();
		}finally{
			// Always close the writer, even when indexing failed part-way.
			if(writer != null){
				try{
					writer.close();
				}catch(Exception e){
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Reads {@code file} with the platform default charset and returns its lines
	 * concatenated into one string.
	 *
	 * <p>NOTE(review): lines are joined without any separator, so the last word of a
	 * line and the first word of the next become adjacent; confirm this is acceptable
	 * for the analyzer in use.
	 *
	 * @param file the file to read
	 * @return the concatenated lines, or {@code null} if the file could not be read
	 */
	public String loadFileToString(File file){
		BufferedReader br = null;
		try{
			br = new BufferedReader(new FileReader(file));
			StringBuilder sb = new StringBuilder();
			String line;
			while((line = br.readLine()) != null){
				sb.append(line);
			}
			return sb.toString();
		}catch(Exception e){
			e.printStackTrace();
			return null;
		}finally{
			if(br != null){
				try{
					br.close();
				}catch(Exception e){
					e.printStackTrace();
				}
			}
		}
	}

	/** Demo entry point: indexes the files in {@code d:\testfolder}. */
	public static void main(String[] args) {
		IndexProcesser indexProcesser = new IndexProcesser();
		indexProcesser.createIndex("d:\\testfolder");
	}

}

 执行查询:

/**
 * Looks up a single term in the Lucene index stored at {@code INDEX_STORE_PATH}
 * and prints the matching documents.
 */
public class Search {

	// Location of the index files to search.
	private String INDEX_STORE_PATH = "d:\\index";

	/**
	 * Prints, for every document containing the term, the term frequency in that
	 * document followed by the document itself, then prints the elapsed time in
	 * milliseconds.
	 *
	 * <p>The key is looked up as one exact term; it is not passed through an
	 * analyzer here. Failures are reported on standard error and not rethrown.
	 *
	 * @param searchType name of the field to search, e.g. {@code "content"}
	 * @param searchKey  the term text to look for in that field
	 */
	public void indexSearch(String searchType, String searchKey){
		IndexSearcher searcher = null;
		TermDocs termDocs = null;
		try{
			searcher = new IndexSearcher(INDEX_STORE_PATH);
			Term t = new Term(searchType, searchKey);

			// Start of the timed section.
			long beginMillis = System.currentTimeMillis();
			// Enumeration of <document, frequency> pairs for the term.
			termDocs = searcher.getIndexReader().termDocs(t);
			while(termDocs.next()){
				// Number of occurrences of the term in the current document.
				System.out.println(termDocs.freq());
				// The document that contains the term.
				System.out.println(searcher.getIndexReader().document(termDocs.doc()));
			}
			long timeOfSearch = System.currentTimeMillis() - beginMillis;
			System.out.println("The time For indexsearch is " + timeOfSearch + " ms");
		}catch(Exception e){
			e.printStackTrace();
		}finally{
			// Release the enumeration first, then the searcher.
			if(termDocs != null){
				try{
					termDocs.close();
				}catch(Exception e){
					e.printStackTrace();
				}
			}
			if(searcher != null){
				try{
					searcher.close();
				}catch(Exception e){
					e.printStackTrace();
				}
			}
		}
	}
}

 所用jar包请参考附件

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值