最近在学习 Lucene,在别人代码的基础上做了一个小例子,以便大家共同学习!
import java.io.InputStream;

import lia.handlingtypes.framework.DocumentHandlerException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.textmining.text.extraction.WordExtractor;


public class DocDocumentHandler implements DocumentHandler ...{


public Document getDocument(InputStream is) throws Exception ...{
// TODO Auto-generated method stub
String bodyText = null;


try ...{
bodyText = new WordExtractor().extractText(is);
}

catch (Exception e) ...{
throw new DocumentHandlerException(
"Cannot extract text from a Word document", e);
}


if ((bodyText != null) && (bodyText.trim().length() > 0)) ...{
Document doc = new Document();
doc.add(Field.UnStored("body", bodyText));
return doc;
}
return null;
}

}


import java.io.InputStream;

import org.apache.lucene.document.Document;



public interface DocumentHandler ...{
Document getDocument(InputStream is)
throws Exception;
}
import java.io.InputStream;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.w3c.tidy.Tidy;


public class HtmlDocumentHandler implements DocumentHandler ...{


public Document getDocument(InputStream is) throws Exception ...{
// TODO Auto-generated method stub
Tidy tidy = new Tidy();
tidy.setQuiet(true);
tidy.setShowWarnings(false);
org.w3c.dom.Document root = tidy.parseDOM(is, null);
Element rawDoc = root.getDocumentElement();
Document doc = new Document();
String title = getTitle(rawDoc);
String body = getBody(rawDoc);

if ((title != null) && (!title.equals(""))) ...{
doc.add(Field.Text("title", title));
}

if ((body != null) && (!body.equals(""))) ...{
doc.add(Field.Text("body", body));
}

return doc;
}

private String getTitle(Element rawDoc) ...{

if (rawDoc == null) ...{
return null;
}

String title = "";

NodeList children = rawDoc.getElementsByTagName("title");

if (children.getLength() > 0) ...{
Element titleElement = ((Element) children.item(0));
Text text = (Text) titleElement.getFirstChild();

if (text != null) ...{
title = text.getData();
}
}
return title;
}


/** *//**
* Gets the body text of the HTML document.
*
* @rawDoc the DOM Element to extract body Node from
* @return the body text
*/

private String getBody(Element rawDoc) ...{

if (rawDoc == null) ...{
return null;
}

String body = "";
NodeList children = rawDoc.getElementsByTagName("body");

if (children.getLength() > 0) ...{
body = getText(children.item(0));
}
return body;
}


/** *//**
* Extracts text from the DOM node.
*
* @param node a DOM node
* @return the text value of the node
*/

private String getText(Node node) ...{
NodeList children = node.getChildNodes();
StringBuffer sb = new StringBuffer();

for (int i = 0; i < children.getLength(); i++) ...{
Node child = children.item(i);

switch (child.getNodeType()) ...{
case Node.ELEMENT_NODE:
sb.append(getText(child));
sb.append(" ");
break;
case Node.TEXT_NODE:
sb.append(((Text) child).getData());
break;
}
}
return sb.toString();
}
}


import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;

import lia.handlingtypes.framework.DocumentHandlerException;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.pdfbox.cos.COSDocument;
import org.pdfbox.encryption.DecryptDocument;
import org.pdfbox.exceptions.CryptographyException;
import org.pdfbox.exceptions.InvalidPasswordException;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.searchengine.lucene.LucenePDFDocument;
import org.pdfbox.util.PDFTextStripper;


public class PdfDocumentHandler implements DocumentHandler ...{

&nbs
























































































































































































