Lucene實戰開發手記（四）--- 為PDF/excel/doc格式的文檔創建索引 - CSDN博客

本文介绍了一个用于处理多种格式文档的框架，包括Word、Excel、PDF等。通过工厂模式创建不同的文档处理器，实现对不同文件类型的解析，并将内容抽取为统一的Document对象。

上篇有提到這段代碼：

// Pick the handler for this file name, then let it turn the attachment bytes into a Document.
DocHander docHander = DocHanderFactory.buildDocHander(fileName);   
           
        attachDocument = docHander.getDocument(attach);

下面我們看一看實現細節。

抽象類DocHander的代碼：

/**
 * Base class for handlers that turn the raw bytes of one file format into a
 * Lucene {@code Document}.
 */
public abstract class DocHander {
	/** Name of the index field that receives the extracted text. */
	public static final String FIELD_CONTENT = "contents";

	/**
	 * Builds a document from the complete content of a file.
	 *
	 * @param inputByte the raw bytes of the file
	 * @return the document built from those bytes
	 * @throws Exception if the content cannot be processed
	 */
	public abstract Document getDocument(byte[] inputByte) throws Exception;

	/**
	 * Adds the given text to the document as a stored, tokenized field named
	 * {@link #FIELD_CONTENT}.
	 *
	 * @param document the document to extend
	 * @param content the text to add
	 * @return the same document instance, for call chaining
	 */
	protected Document addContent(Document document, String content){
		document.add(new Field(FIELD_CONTENT, content, Field.Store.YES, Field.Index.TOKENIZED));
		return document;
	}

}

現在看看工廠類DocHanderFactory的代碼：

/**
 * Factory that selects a {@code DocHander} implementation from a file name.
 */
public abstract class DocHanderFactory {

	/**
	 * Returns the handler matching the extension of the given file name.
	 * The comparison ignores case. Names with an unrecognized extension, and
	 * a {@code null} name, fall back to the plain-text handler.
	 *
	 * @param fileName the name of the file to index; may be {@code null}
	 * @return a new handler instance, never {@code null}
	 */
	public static DocHander buildDocHander(String fileName){
		if (fileName == null){
			// Nothing to match on: treat it like any other unrecognized name.
			return new TxtDocHander();
		}
		// Lower-case once; the explicit locale keeps case folding independent
		// of the JVM's default locale.
		String lowerName = fileName.toLowerCase(java.util.Locale.ENGLISH);
		if (lowerName.endsWith(".doc")){
			return new WordDocHander();
		}
		if (lowerName.endsWith(".xls")){
			return new ExcelDocHander();
		}
		if (lowerName.endsWith(".pdf")){
			return new PdfDocHander();
		}
		if (lowerName.endsWith(".html") || lowerName.endsWith(".htm")){
			return new HtmlDocHander();
		}
		return new TxtDocHander();
	}
}

以下貼出WordDocHander、ExcelDocHander、PdfDocHander的代碼，因為POI、PDFBox等類庫已經幫我們把解析邏輯封裝好了，所以我們寫起來很簡單，非常感謝他們！

/**
 * Handler for Word files: reads the text through {@code WordExtractor}.
 */
public class WordDocHander extends DocHander {

	/**
	 * Extracts the text of a Word file and stores it in a new document.
	 *
	 * @param inputByte the raw bytes of the Word file
	 * @return a new document whose content field holds the extracted text
	 * @throws IOException if the extractor cannot read the bytes
	 */
	public Document getDocument(byte[] inputByte) throws IOException {
		WordExtractor wordText = new WordExtractor(new ByteArrayInputStream(inputByte));
		return addContent(new Document(), wordText.getText());
	}
}

/**
 * Handler for Excel files: loads the bytes as an {@code HSSFWorkbook} and
 * reads its text through {@code ExcelExtractor}.
 */
public class ExcelDocHander extends DocHander {

	/**
	 * Extracts the text of an Excel workbook and stores it in a new document.
	 *
	 * @param inputByte the raw bytes of the workbook
	 * @return a new document whose content field holds the extracted text
	 * @throws IOException if the workbook cannot be read
	 */
	public Document getDocument(byte[] inputByte) throws IOException {
		HSSFWorkbook workbook = new HSSFWorkbook(new ByteArrayInputStream(inputByte));
		ExcelExtractor sheetText = new ExcelExtractor(workbook);

		// Extractor options: leave sheet names out of the text.
		sheetText.setIncludeSheetNames(false);
		// NOTE(review): presumably makes the extractor emit formula text
		// instead of computed values -- confirm against the POI API.
		sheetText.setFormulasNotResults(true);

		return addContent(new Document(), sheetText.getText());
	}

}

/**
 * Handler for PDF files: extracts the text with PDFBox's
 * {@code PDFTextStripper}.
 */
public class PdfDocHander extends DocHander {

	/**
	 * Extracts the text of a PDF file and stores it in a new document.
	 * Encrypted files are opened with the empty password.
	 *
	 * @param inputByte the raw bytes of the PDF file
	 * @return a new document whose content field holds the extracted text
	 * @throws IOException if the file cannot be read or cannot be decrypted
	 *         with the empty password
	 */
	public Document getDocument(byte[] inputByte) throws IOException {
		// If search results do not need to show an excerpt of the matched
		// content, LucenePDFDocument.getDocument(inputStream) is a simpler
		// alternative to the code below.
		InputStream inputStream = new ByteArrayInputStream(inputByte);
		Document document = new Document();
		PDDocument pdfDocument = PDDocument.load(inputStream);
		try {
			if (pdfDocument.isEncrypted()) {
				// Just try using the default password and move on
				pdfDocument.decrypt("");
			}
			// Collect the text content in memory.
			StringWriter writer = new StringWriter();
			PDFTextStripper stripper = new PDFTextStripper();
			stripper.writeText(pdfDocument, writer);
			addContent(document, writer.getBuffer().toString());
		} catch (CryptographyException e) {
			throw decryptionFailure(e);
		} catch (InvalidPasswordException e) {
			throw decryptionFailure(e);
		} finally {
			// Always release the loaded PDF, on success and on failure alike.
			pdfDocument.close();
		}

		return document;
	}

	/** Wraps a decryption problem in an IOException that keeps the original cause. */
	private static IOException decryptionFailure(Exception cause) {
		IOException failure = new IOException("Error decrypting document: " + cause);
		failure.initCause(cause);
		return failure;
	}

}