Lucene 创建索引

最新推荐文章于 2021-08-18 12:19:08 发布
原创最新推荐文章于 2021-08-18 12:19:08 发布 · 429 阅读
0 ·
CC 4.0 BY-SA版权
文章标签：
#Lucene
JAVA 专栏收录该内容
12 篇文章
订阅专栏
package com.cjr.lucene;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Builds a Lucene index over the text files located directly inside a
 * directory (sub-directories are not descended into).
 * 
 * <p>
 * Each accepted file becomes one document with three stored, analyzed fields:
 * {@code contents}, {@code filename} and {@code fullpath}. The index is opened
 * in {@link OpenMode#CREATE_OR_APPEND} mode, so running the indexer again adds
 * documents to whatever the index already holds.
 * 
 * <p>
 * An instance owns an index directory handle and an {@link IndexWriter};
 * callers must invoke {@link #close()} when done.
 * 
 * @author chenjiarong
 * 
 */
public class Indexer {

	/**
	 * Directory in which the index is stored.
	 */
	private static final String INDEXDIR = "F:\\工作区\\test";

	/**
	 * Directory holding the files to be indexed.
	 */
	private static final String DATADIR = "F:\\其它\\文学";

	/**
	 * Lower-case file-name suffix accepted by {@link TextFilesFilter}.
	 */
	private static final String SUFFIX = ".txt";

	/**
	 * Index directory handle; kept so that {@link #close()} can release it,
	 * because closing the writer does not close the directory it writes to.
	 */
	private final Directory directory;

	/**
	 * Index writer through which documents are added.
	 */
	private final IndexWriter indexWriter;

	/**
	 * Indexes the {@code .txt} files of {@link #DATADIR} into
	 * {@link #INDEXDIR} and prints the resulting document count.
	 * 
	 * @param args
	 *            ignored
	 * @throws IOException
	 *             if the index cannot be opened, written or closed, or a data
	 *             file cannot be read
	 */
	public static void main(String[] args) throws IOException {
		Indexer indexer = new Indexer(INDEXDIR);
		try {
			int numIndexed = indexer.index(DATADIR, new TextFilesFilter());
			System.out.println("numIndexed: " + numIndexed);
		} finally {
			// Always release the writer (and its write lock), even when
			// indexing fails part-way through.
			indexer.close();
		}
	}

	/**
	 * Opens (or creates) the index in the given directory and instantiates the
	 * index writer.
	 * 
	 * @param inderDir
	 *            path of the directory that stores the index
	 * @throws IOException
	 *             if the directory or the index writer cannot be opened
	 */
	public Indexer(String inderDir) throws IOException {
		// Index storage directory
		Directory dir = FSDirectory.open(new File(inderDir));
		IndexWriter writer;
		try {
			// Analyzer (tokenizer)
			Analyzer analyzer = new StandardAnalyzer();
			// Index writer configuration
			IndexWriterConfig config = new IndexWriterConfig(
					Version.LUCENE_4_10_2, analyzer);
			config.setOpenMode(OpenMode.CREATE_OR_APPEND);
			writer = new IndexWriter(dir, config);
		} catch (IOException | RuntimeException e) {
			// Do not leak the directory handle when the writer cannot be
			// created (e.g. the index is locked by another writer).
			try {
				dir.close();
			} catch (IOException closeError) {
				e.addSuppressed(closeError);
			}
			throw e;
		}
		directory = dir;
		indexWriter = writer;
	}

	/**
	 * Indexes every regular, visible, readable file directly inside
	 * {@code dataDir} that is accepted by {@code fileFilter}.
	 * 
	 * @param dataDir
	 *            path of the directory whose files are indexed
	 * @param fileFilter
	 *            filter deciding which files are indexed; {@code null} accepts
	 *            every file
	 * @return the writer's document count after indexing
	 *         ({@code IndexWriter.numDocs()}), which also counts documents
	 *         that were already in the index before this call
	 * @throws IOException
	 *             if {@code dataDir} cannot be listed (it does not exist, is
	 *             not a directory, or an I/O error occurs), a file cannot be
	 *             read, or the index cannot be written
	 */
	public int index(String dataDir, FileFilter fileFilter) throws IOException {
		File[] files = new File(dataDir).listFiles();
		if (files == null) {
			// listFiles() signals "not a directory" and I/O errors with null.
			throw new IOException("Cannot list files of data directory: "
					+ dataDir);
		}
		for (File file : files) {
			if (!file.isDirectory() && !file.isHidden() && file.exists()
					&& file.canRead()
					&& (fileFilter == null || fileFilter.accept(file))) {
				indexFile(file);
			}
		}
		return indexWriter.numDocs();
	}

	/**
	 * Adds one file to the Lucene index as a document.
	 * 
	 * @param file
	 *            file to index
	 * @throws IOException
	 *             if the file cannot be read or the index cannot be written
	 */
	private void indexFile(File file) throws IOException {
		// Print the canonical path of the file being indexed
		System.out.println("Indexing " + file.getCanonicalPath());
		Document document = getDocument(file);
		indexWriter.addDocument(document);
	}

	/**
	 * Builds the document for a file: its contents, its name and its
	 * canonical path, each as a stored {@link TextField}.
	 * 
	 * @param file
	 *            file to convert
	 * @return the document describing {@code file}
	 * @throws IOException
	 *             if the file cannot be read or its canonical path cannot be
	 *             resolved
	 */
	private Document getDocument(File file) throws IOException {
		Document document = new Document();
		// File contents; a read failure propagates as IOException instead of
		// handing a null value to the field constructor.
		document.add(new TextField("contents", readContent(file),
				Field.Store.YES));
		// File name
		document.add(new TextField("filename", file.getName(), Field.Store.YES));
		// Canonical file path
		document.add(new TextField("fullpath", file.getCanonicalPath(),
				Field.Store.YES));
		return document;
	}

	/**
	 * Reads the whole file into memory and decodes it as UTF-8.
	 * 
	 * @param file
	 *            file to read
	 * @return the decoded file content
	 * @throws IOException
	 *             if the file cannot be read
	 */
	private static String readContent(File file) throws IOException {
		return new String(Files.readAllBytes(file.toPath()),
				StandardCharsets.UTF_8);
	}

	/**
	 * Returns the entire content of a file decoded as UTF-8.
	 * 
	 * <p>
	 * This method does not throw on I/O failure: the error is reported on
	 * {@code System.err} and {@code null} is returned, so callers must check
	 * the result.
	 * 
	 * @param file
	 *            file to read
	 * @return the file content, or {@code null} if the file could not be read
	 */
	public String getFileContent(File file) {
		try {
			return readContent(file);
		} catch (IOException e) {
			System.err.println("Failed to read " + file + ": " + e);
			return null;
		}
	}

	/**
	 * Closes the index writer and then the index directory.
	 * 
	 * @throws IOException
	 *             if closing the writer or the directory fails
	 */
	public void close() throws IOException {
		try {
			indexWriter.close();
		} finally {
			// The writer does not close the directory it was given.
			directory.close();
		}
	}

	/**
	 * File filter that accepts files whose name ends with {@link #SUFFIX},
	 * compared case-insensitively.
	 * 
	 * @author chenjiarong
	 * 
	 */
	private static class TextFilesFilter implements FileFilter {

		@Override
		public boolean accept(File pathname) {
			return pathname.getName().toLowerCase().endsWith(SUFFIX);
		}

	}
}