全文检索保存在文件中的数据——简单实例-CSDN博客

本文介绍了一个基于Lucene的索引管理系统，该系统能够为指定目录下的HTML文档创建索引，并提供搜索功能。通过使用StandardAnalyzer分析器，系统可以解析HTML文件内容并将其存储到索引中，方便后续进行全文检索。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

1、创建索引

package com.dream.index;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.demo.html.HTMLParser;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.htmlparser.Parser;


/**
 * Index manager: builds a Lucene index over the files found in a data directory.
 *
 * <p>Every regular file directly inside {@code dataDir} is read as text and added
 * to the index stored in {@code indexDir} as one document. The raw file text is
 * indexed as-is; no HTML parsing is performed by this class.
 */
public class IndexMgr {

	private String dataDir = "";  // directory holding the resources to index
	private String indexDir = ""; // directory where the index files are written

	/** Creates a manager with empty paths; configure it through the setters. */
	public IndexMgr(){}

	/**
	 * Creates a manager for the given directories.
	 *
	 * @param dataDir directory holding the resources to index
	 * @param indexDir directory where the index files are written
	 */
	public IndexMgr(String dataDir,String indexDir){
		this.dataDir = dataDir;
		this.indexDir = indexDir;
	}

	/**
	 * Builds a fresh index (any existing index in {@code indexDir} is overwritten)
	 * from the regular files directly inside {@code dataDir}.
	 *
	 * @return {@code true} when the index was written; {@code false} when
	 *         {@code dataDir} does not exist, is not a directory, or cannot be listed
	 * @throws IOException if the index cannot be created or written
	 * @throws InterruptedException declared for compatibility with existing callers
	 */
	public boolean createIndex() throws IOException, InterruptedException{
		File dir = new File(dataDir);
		if(!dir.isDirectory()){
			return false;
		}

		// listFiles() returns null when the directory cannot be read.
		File[] htmls = dir.listFiles();
		if(htmls == null){
			return false;
		}

		Analyzer analyzer = new StandardAnalyzer();
		// The third argument (true) makes the writer create a new index,
		// replacing whatever is already stored in indexDir.
		IndexWriter indexWriter = new IndexWriter(this.indexDir,analyzer,true);
		try {
			for (int i = 0; i < htmls.length; i++) {
				// Sub-directories cannot be read as text; skip them instead of
				// indexing an empty document for each one.
				if(!htmls[i].isFile()){
					continue;
				}
				addDocument(htmls[i].getAbsolutePath(), indexWriter);
			}
			indexWriter.optimize();
		} finally {
			// Always close the writer so it is released even when indexing fails.
			indexWriter.close();
		}
		return true;
	}

	/**
	 * Reads the file at {@code filePath} and adds it to the index as one document.
	 *
	 * @param filePath path of the file whose text is indexed
	 * @param indexWriter open writer the document is added to
	 * @throws IOException if the document cannot be written to the index
	 * @throws InterruptedException declared for compatibility with existing callers
	 */
	public void addDocument(String filePath,IndexWriter indexWriter) throws IOException, InterruptedException{
		String content = readFileByLines(filePath);
		Document document = new Document();
		// NOTE(review): the field is named "title" although it carries the whole
		// file text; the name is kept because searching code elsewhere presumably
		// queries this field - confirm before renaming.
		document.add(new Field("title",content,Field.Store.YES,Field.Index.TOKENIZED));
		indexWriter.addDocument(document);
	}

	/**
	 * Reads a text file line by line and returns its content with every line
	 * terminated by {@code "\n"}.
	 *
	 * <p>The file is decoded with the platform default charset (FileReader).
	 * Read errors are reported with {@code printStackTrace()} and not rethrown;
	 * whatever was read before the error (possibly an empty string) is returned.
	 *
	 * @param fileName path of the file to read
	 * @return the text read from the file, never {@code null}
	 */
	public String readFileByLines(String fileName){
		StringBuffer retString = new StringBuffer();
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new FileReader(new File(fileName)));
			String tempString;
			// Read one line at a time; readLine() returns null at end of file.
			while ((tempString = reader.readLine()) != null){
				retString.append(tempString);
				retString.append("\n");
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (reader != null){
				try {
					reader.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing a read-only stream fails.
				}
			}
		}
		return retString.toString();
	}

	/**
	 * Tells whether the index directory already contains at least one entry.
	 *
	 * @return {@code true} when {@code indexDir} is a readable directory with at
	 *         least one entry; {@code false} when it is empty, missing, or not a
	 *         directory
	 */
	public boolean isIndexExist(){
		// listFiles() returns null for a missing path or a non-directory.
		File[] entries = new File(indexDir).listFiles();
		return entries != null && entries.length > 0;
	}

	/** @return directory holding the resources to index */
	public String getDataDir() {
		return dataDir;
	}

	/** @param dataDir directory holding the resources to index */
	public void setDataDir(String dataDir) {
		this.dataDir = dataDir;
	}

	/** @return directory where the index files are written */
	public String getIndexDir() {
		return indexDir;
	}

	/** @param indexDir directory where the index files are written */
	public void setIndexDir(String indexDir) {
		this.indexDir = indexDir;
	}
}

2、执行搜索

package com.dream.index;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.demo.html.HTMLParser;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.htmlparser.Parser;


/**
 * Index manager: builds a Lucene index over the files found in a data directory.
 *
 * <p>Every regular file directly inside {@code dataDir} is read as text and added
 * to the index stored in {@code indexDir} as one document. The raw file text is
 * indexed as-is; no HTML parsing is performed by this class.
 */
public class IndexMgr {

	private String dataDir = "";  // directory holding the resources to index
	private String indexDir = ""; // directory where the index files are written

	/** Creates a manager with empty paths; configure it through the setters. */
	public IndexMgr(){}

	/**
	 * Creates a manager for the given directories.
	 *
	 * @param dataDir directory holding the resources to index
	 * @param indexDir directory where the index files are written
	 */
	public IndexMgr(String dataDir,String indexDir){
		this.dataDir = dataDir;
		this.indexDir = indexDir;
	}

	/**
	 * Builds a fresh index (any existing index in {@code indexDir} is overwritten)
	 * from the regular files directly inside {@code dataDir}.
	 *
	 * @return {@code true} when the index was written; {@code false} when
	 *         {@code dataDir} does not exist, is not a directory, or cannot be listed
	 * @throws IOException if the index cannot be created or written
	 * @throws InterruptedException declared for compatibility with existing callers
	 */
	public boolean createIndex() throws IOException, InterruptedException{
		File dir = new File(dataDir);
		if(!dir.isDirectory()){
			return false;
		}

		// listFiles() returns null when the directory cannot be read.
		File[] htmls = dir.listFiles();
		if(htmls == null){
			return false;
		}

		Analyzer analyzer = new StandardAnalyzer();
		// The third argument (true) makes the writer create a new index,
		// replacing whatever is already stored in indexDir.
		IndexWriter indexWriter = new IndexWriter(this.indexDir,analyzer,true);
		try {
			for (int i = 0; i < htmls.length; i++) {
				// Sub-directories cannot be read as text; skip them instead of
				// indexing an empty document for each one.
				if(!htmls[i].isFile()){
					continue;
				}
				addDocument(htmls[i].getAbsolutePath(), indexWriter);
			}
			indexWriter.optimize();
		} finally {
			// Always close the writer so it is released even when indexing fails.
			indexWriter.close();
		}
		return true;
	}

	/**
	 * Reads the file at {@code filePath} and adds it to the index as one document.
	 *
	 * @param filePath path of the file whose text is indexed
	 * @param indexWriter open writer the document is added to
	 * @throws IOException if the document cannot be written to the index
	 * @throws InterruptedException declared for compatibility with existing callers
	 */
	public void addDocument(String filePath,IndexWriter indexWriter) throws IOException, InterruptedException{
		String content = readFileByLines(filePath);
		Document document = new Document();
		// NOTE(review): the field is named "title" although it carries the whole
		// file text; the name is kept because searching code elsewhere presumably
		// queries this field - confirm before renaming.
		document.add(new Field("title",content,Field.Store.YES,Field.Index.TOKENIZED));
		indexWriter.addDocument(document);
	}

	/**
	 * Reads a text file line by line and returns its content with every line
	 * terminated by {@code "\n"}.
	 *
	 * <p>The file is decoded with the platform default charset (FileReader).
	 * Read errors are reported with {@code printStackTrace()} and not rethrown;
	 * whatever was read before the error (possibly an empty string) is returned.
	 *
	 * @param fileName path of the file to read
	 * @return the text read from the file, never {@code null}
	 */
	public String readFileByLines(String fileName){
		StringBuffer retString = new StringBuffer();
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new FileReader(new File(fileName)));
			String tempString;
			// Read one line at a time; readLine() returns null at end of file.
			while ((tempString = reader.readLine()) != null){
				retString.append(tempString);
				retString.append("\n");
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (reader != null){
				try {
					reader.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing a read-only stream fails.
				}
			}
		}
		return retString.toString();
	}

	/**
	 * Tells whether the index directory already contains at least one entry.
	 *
	 * @return {@code true} when {@code indexDir} is a readable directory with at
	 *         least one entry; {@code false} when it is empty, missing, or not a
	 *         directory
	 */
	public boolean isIndexExist(){
		// listFiles() returns null for a missing path or a non-directory.
		File[] entries = new File(indexDir).listFiles();
		return entries != null && entries.length > 0;
	}

	/** @return directory holding the resources to index */
	public String getDataDir() {
		return dataDir;
	}

	/** @param dataDir directory holding the resources to index */
	public void setDataDir(String dataDir) {
		this.dataDir = dataDir;
	}

	/** @return directory where the index files are written */
	public String getIndexDir() {
		return indexDir;
	}

	/** @param indexDir directory where the index files are written */
	public void setIndexDir(String indexDir) {
		this.indexDir = indexDir;
	}
}

3、控制器

package com.dream.servlet;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;

import javax.servlet.RequestDispatcher;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import sun.rmi.server.Dispatcher;

import com.dream.index.IndexMgr;
import com.dream.search.SearchMgr;

/**
 * Servlet that runs a keyword search and forwards the result list to
 * {@code index.jsp} under the request attribute {@code searchResult}.
 */
public class ToSearch extends HttpServlet {

	public ToSearch() {
		super();
	}

	public void init() throws ServletException {
	}

	public void destroy() {
		super.destroy();
	}

	/** GET requests are handled exactly like POST requests. */
	public void doGet(HttpServletRequest request, HttpServletResponse response)
			throws ServletException, IOException {
		doPost(request, response);
	}

	/**
	 * Reads the {@code keyword} request parameter, searches with it, stores the
	 * result in the request and forwards to the result page.
	 */
	public void doPost(HttpServletRequest request, HttpServletResponse response)
			throws ServletException, IOException {
		SearchMgr searcher = new SearchMgr(request.getParameter("keyword"));

		// First argument: directory of the resources to search;
		// second argument: directory where the index is stored.
		searcher.setIndexMgr(new IndexMgr("D:\\dataDir", "D:\\indexDir"));

		List hits = searcher.search();
		request.setAttribute("searchResult", hits);
		request.getRequestDispatcher("index.jsp").forward(request, response);
	}

}