最近学习Lucene,在别人基础上,做了一个小例子

最新推荐文章于 2020-07-16 13:48:58 发布

黄瓜和土豆

最新推荐文章于 2020-07-16 13:48:58 发布

阅读量125

点赞数

文章标签： lucene Java Apache

本文介绍了一个基于Lucene的小型示例项目，演示如何从不同格式如Word、HTML及PDF中提取文本并构建索引文档。通过实现DocumentHandler接口，针对每种文件类型创建了专门的处理类。

最近学习 Lucene，在别人的基础上做了一个小例子，以便共同学习！

import java.io.InputStream;

import lia.handlingtypes.framework.DocumentHandlerException;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.textmining.text.extraction.WordExtractor;

public class DocDocumentHandler implements DocumentHandler ...{

public Document getDocument(InputStream is) throws Exception ...{

// TODO Auto-generated method stub

String bodyText = null;

try ...{

bodyText = new WordExtractor().extractText(is);

}

catch (Exception e) ...{

throw new DocumentHandlerException(

"Cannot extract text from a Word document", e);

}

if ((bodyText != null) && (bodyText.trim().length() > 0)) ...{

Document doc = new Document();

doc.add(Field.UnStored("body", bodyText));

return doc;

}

return null;

}

}

import java.io.InputStream;

import org.apache.lucene.document.Document;

public interface DocumentHandler ...{

Document getDocument(InputStream is)

throws Exception;

}

import java.io.InputStream;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.w3c.dom.Element;

import org.w3c.dom.Node;

import org.w3c.dom.NodeList;

import org.w3c.dom.Text;

import org.w3c.tidy.Tidy;

public class HtmlDocumentHandler implements DocumentHandler ...{

public Document getDocument(InputStream is) throws Exception ...{

// TODO Auto-generated method stub

Tidy tidy = new Tidy();

tidy.setQuiet(true);

tidy.setShowWarnings(false);

org.w3c.dom.Document root = tidy.parseDOM(is, null);

Element rawDoc = root.getDocumentElement();

Document doc = new Document();

String title = getTitle(rawDoc);

String body = getBody(rawDoc);

if ((title != null) && (!title.equals(""))) ...{

doc.add(Field.Text("title", title));

}

if ((body != null) && (!body.equals(""))) ...{

doc.add(Field.Text("body", body));

}

return doc;

}

private String getTitle(Element rawDoc) ...{

if (rawDoc == null) ...{

return null;

}

String title = "";

NodeList children = rawDoc.getElementsByTagName("title");

if (children.getLength() > 0) ...{

Element titleElement = ((Element) children.item(0));

Text text = (Text) titleElement.getFirstChild();

if (text != null) ...{

title = text.getData();

}

}

return title;

}

/** *//**

* Gets the body text of the HTML document.

*

* @rawDoc the DOM Element to extract body Node from

* @return the body text

*/

private String getBody(Element rawDoc) ...{

if (rawDoc == null) ...{

return null;

}

String body = "";

NodeList children = rawDoc.getElementsByTagName("body");

if (children.getLength() > 0) ...{

body = getText(children.item(0));

}

return body;

}

/** *//**

* Extracts text from the DOM node.

*

* @param node a DOM node

* @return the text value of the node

*/

private String getText(Node node) ...{

NodeList children = node.getChildNodes();

StringBuffer sb = new StringBuffer();

for (int i = 0; i < children.getLength(); i++) ...{

Node child = children.item(i);

switch (child.getNodeType()) ...{

case Node.ELEMENT_NODE:

sb.append(getText(child));

sb.append(" ");

break;

case Node.TEXT_NODE:

sb.append(((Text) child).getData());

break;

}

}

return sb.toString();

}

}

import java.io.File;

import java.io.FileInputStream;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import lia.handlingtypes.framework.DocumentHandlerException;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.pdfbox.cos.COSDocument;

import org.pdfbox.encryption.DecryptDocument;

import org.pdfbox.exceptions.CryptographyException;

import org.pdfbox.exceptions.InvalidPasswordException;

import org.pdfbox.pdfparser.PDFParser;

import org.pdfbox.pdmodel.PDDocument;

import org.pdfbox.pdmodel.PDDocumentInformation;

import org.pdfbox.searchengine.lucene.LucenePDFDocument;

import org.pdfbox.util.PDFTextStripper;

public class PdfDocumentHandler implements DocumentHandler ...{

&nbs

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。