最近学习Lucene，在别人基础上，做了一个小例子_keywords != null && !keywords.equals("")-CSDN博客

本文链接：https://blog.csdn.net/pengchua/article/details/1660848

本文介绍了一个基于Lucene的小型示例项目，展示了如何从不同类型的文件（如Word、PDF、RTF、HTML和纯文本）中提取文本内容，并将其转换为Lucene Document对象，以便进行索引和搜索。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

最近学习Lucene，在别人的基础上做了一个小例子，以便共同学习！

import java.io.InputStream;

import lia.handlingtypes.framework.DocumentHandlerException;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.textmining.text.extraction.WordExtractor;

public class DocDocumentHandler implements DocumentHandler {

public Document getDocument(InputStream is) throws Exception {

// TODO Auto-generated method stub

String bodyText = null;

try {

bodyText = new WordExtractor().extractText(is);

}

catch (Exception e) {

throw new DocumentHandlerException(

"Cannot extract text from a Word document", e);

}

if ((bodyText != null) && (bodyText.trim().length() > 0)) {

Document doc = new Document();

doc.add(Field.UnStored("body", bodyText));

return doc;

}

return null;

}

import java.io.InputStream;

import org.apache.lucene.document.Document;

/**
 * Strategy for turning the raw content of one file format into a Lucene
 * {@link Document}.
 */
public interface DocumentHandler {

    /**
     * Reads a document from the given stream and converts it into a Lucene
     * document.
     *
     * @param is stream to read the source document from
     * @return the Lucene document built from the stream; implementations may
     *         return {@code null} when there is nothing to index
     * @throws Exception if the content cannot be read or converted
     */
    Document getDocument(InputStream is) throws Exception;
}

import java.io.InputStream;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.w3c.dom.Element;

import org.w3c.dom.Node;

import org.w3c.dom.NodeList;

import org.w3c.dom.Text;

import org.w3c.tidy.Tidy;

public class HtmlDocumentHandler implements DocumentHandler {

public Document getDocument(InputStream is) throws Exception {

// TODO Auto-generated method stub

Tidy tidy = new Tidy();

tidy.setQuiet(true);

tidy.setShowWarnings(false);

org.w3c.dom.Document root = tidy.parseDOM(is, null);

Element rawDoc = root.getDocumentElement();

Document doc = new Document();

String title = getTitle(rawDoc);

String body = getBody(rawDoc);

if ((title != null) && (!title.equals(""))) {

doc.add(Field.Text("title", title));

}

if ((body != null) && (!body.equals(""))) {

doc.add(Field.Text("body", body));

}

return doc;

}

private String getTitle(Element rawDoc) {

if (rawDoc == null) {

return null;

}

String title = "";

NodeList children = rawDoc.getElementsByTagName("title");

if (children.getLength() > 0) {

Element titleElement = ((Element) children.item(0));

Text text = (Text) titleElement.getFirstChild();

if (text != null) {

title = text.getData();

}

return title;

}

/**

* Gets the body text of the HTML document.

* @rawDoc the DOM Element to extract body Node from

* @return the body text

private String getBody(Element rawDoc) {

if (rawDoc == null) {

return null;

}

String body = "";

NodeList children = rawDoc.getElementsByTagName("body");

if (children.getLength() > 0) {

body = getText(children.item(0));

}

return body;

}

/**

* Extracts text from the DOM node.

* @param node a DOM node

* @return the text value of the node

private String getText(Node node) {

NodeList children = node.getChildNodes();

StringBuffer sb = new StringBuffer();

for (int i = 0; i < children.getLength(); i++) {

Node child = children.item(i);

switch (child.getNodeType()) {

case Node.ELEMENT_NODE:

sb.append(getText(child));

sb.append(" ");

break;

case Node.TEXT_NODE:

sb.append(((Text) child).getData());

break;

}

return sb.toString();

}

import java.io.File;

import java.io.FileInputStream;

import java.io.FileOutputStream;

import java.io.IOException;

import java.io.InputStream;

import lia.handlingtypes.framework.DocumentHandlerException;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.pdfbox.cos.COSDocument;

import org.pdfbox.encryption.DecryptDocument;

import org.pdfbox.exceptions.CryptographyException;

import org.pdfbox.exceptions.InvalidPasswordException;

import org.pdfbox.pdfparser.PDFParser;

import org.pdfbox.pdmodel.PDDocument;

import org.pdfbox.pdmodel.PDDocumentInformation;

import org.pdfbox.searchengine.lucene.LucenePDFDocument;

import org.pdfbox.util.PDFTextStripper;

public class PdfDocumentHandler implements DocumentHandler {

public static String password = "-password";

public Document getDocument(InputStream is) throws Exception {

// TODO Auto-generated method stub

COSDocument cosDoc = null;

try {

cosDoc = parseDocument(is);

}

catch (IOException e) {

closeCOSDocument(cosDoc);

throw new DocumentHandlerException(

"Cannot parse PDF document", e);

}

// decrypt the PDF document, if it is encrypted

try {

if (cosDoc.isEncrypted()) {

DecryptDocument decryptor = new DecryptDocument(cosDoc);

decryptor.decryptDocument(password);

}

catch (CryptographyException e) {

closeCOSDocument(cosDoc);

throw new DocumentHandlerException(

"Cannot decrypt PDF document", e);

}

catch (InvalidPasswordException e) {

closeCOSDocument(cosDoc);

throw new DocumentHandlerException(

"Cannot decrypt PDF document", e);

}

catch (IOException e) {

closeCOSDocument(cosDoc);

throw new DocumentHandlerException(

"Cannot decrypt PDF document", e);

}

// extract PDF document's textual content

String docText = null;

try {

PDFTextStripper stripper = new PDFTextStripper();

docText = stripper.getText(new PDDocument(cosDoc));

}

catch (IOException e) {

closeCOSDocument(cosDoc);

throw new DocumentHandlerException(

"Cannot parse PDF document", e);

// String errS = e.toString();

// if (errS.toLowerCase().indexOf("font") != -1) {

// }

}

Document doc = new Document();

if (docText != null) {

doc.add(Field.UnStored("body", docText));

}

// extract PDF document's meta-data

PDDocument pdDoc = null;

try {

pdDoc = new PDDocument(cosDoc);

PDDocumentInformation docInfo =

pdDoc.getDocumentInformation();

String author = docInfo.getAuthor();

String title = docInfo.getTitle();

String keywords = docInfo.getKeywords();

String summary = docInfo.getSubject();

if ((author != null) && (!author.equals(""))) {

doc.add(Field.Text("author", author));

}

if ((title != null) && (!title.equals(""))) {

doc.add(Field.Text("title", title));

}

if ((keywords != null) && (!keywords.equals(""))) {

doc.add(Field.Text("keywords", keywords));

}

if ((summary != null) && (!summary.equals(""))) {

doc.add(Field.Text("summary", summary));

}

catch (Exception e) {

closeCOSDocument(cosDoc);

closePDDocument(pdDoc);

System.err.println("Cannot get PDF document meta-data: "

+ e.getMessage());

}

return doc;

}

private static COSDocument parseDocument(InputStream is)

throws IOException {

PDFParser parser = new PDFParser(is);

parser.parse();

return parser.getDocument();

}

private void closeCOSDocument(COSDocument cosDoc) {

if (cosDoc != null) {

try {

cosDoc.close();

}

catch (IOException e) {

// eat it, what else can we do?

}

private void closePDDocument(PDDocument pdDoc) {

if (pdDoc != null) {

try {

pdDoc.close();

}

catch (IOException e) {

// eat it, what else can we do?

}

import java.io.IOException;

import java.io.InputStream;

import javax.swing.text.BadLocationException;

import javax.swing.text.DefaultStyledDocument;

import javax.swing.text.rtf.RTFEditorKit;

import lia.handlingtypes.framework.DocumentHandlerException;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

public class RtfDocumentHandler implements DocumentHandler {

public Document getDocument(InputStream is) throws Exception {

// TODO Auto-generated method stub

String bodyText = null;

DefaultStyledDocument styledDoc = new DefaultStyledDocument();

try {

new RTFEditorKit().read(is, styledDoc, 0);

bodyText = styledDoc.getText(0, styledDoc.getLength());

}

catch (IOException e) {

throw new DocumentHandlerException(

"Cannot extract text from a RTF document", e);

}

catch (BadLocationException e) {

throw new DocumentHandlerException(

"Cannot extract text from a RTF document", e);

}

if (bodyText != null) {

Document doc = new Document();

doc.add(Field.UnStored("body", bodyText));

return doc;

}

return null;

}

import java.io.BufferedReader;

import java.io.IOException;

import java.io.InputStream;

import java.io.InputStreamReader;

import lia.handlingtypes.framework.DocumentHandlerException;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

public class TxtDocumentHandler implements DocumentHandler {

public Document getDocument(InputStream is) throws Exception {

// TODO Auto-generated method stub

String bodyText = "";

try {

BufferedReader br =

new BufferedReader(new InputStreamReader(is));

String line = null;

while ((line = br.readLine()) != null) {

bodyText += line;

}

br.close();

}

catch(IOException e) {

throw new DocumentHandlerException(

"Cannot read the text document", e);

}

if (!bodyText.equals("")) {

Document doc = new Document();

doc.add(Field.UnStored("body", bodyText));

return doc;

}

return null;

}

import java.io.File;

import java.util.Date;

import org.apache.lucene.document.Document;

import org.apache.lucene.queryParser.QueryParser;

import org.apache.lucene.search.BooleanQuery;

import org.apache.lucene.search.Hits;

import org.apache.lucene.search.IndexSearcher;

import org.apache.lucene.search.Query;

import org.apache.lucene.store.Directory;

import org.apache.lucene.store.FSDirectory;

import org.mira.lucene.analysis.IK_CAnalyzer;

/**

* This code was originally written for

* Erik's Lucene intro java.net article

public class Searcher {

public static void main(String[] args) throws Exception {

if (args.length != 1) {

throw new Exception("Usage: java " + Searcher.class.getName()

+ " <index dir> <query>");

}

// File indexDir = new File(args[0]);

// String q = args[1];

File indexDir = new File("E:/LUCENE/index");

String q=args[0];

if (!indexDir.exists() || !indexDir.isDirectory()) {

throw new Exception(indexDir +

" does not exist or is not a directory.");

}

search(indexDir, q);

}

public static void search(File indexDir, String q)

throws Exception {

Directory fsDir = FSDirectory.getDirectory(indexDir, false);

IndexSearcher is = new IndexSearcher(fsDir);

Query query = QueryParser.parse(q, "body",

new IK_CAnalyzer());

//在“body”中查找，必须要已经在create index中已经定义好

//QueryParser .parse(String query, String field, Analyzer analyzer)，例如：

//query为检索词, field为检索的字段名, analyzer为分析器

long start = new Date().getTime();

// BooleanQuery m_BooleanQuery = new BooleanQuery();

// m_BooleanQuery.add(query,true,false);

Hits hits = is.search(query); //search

long end = new Date().getTime();

System.err.println("Found " + hits.length() +

" document(s) (in " + (end - start) +

" milliseconds) that matched query '" +

q + "':");

for (int i = 0; i < hits.length(); i++) {

Document doc = hits.doc(i);

System.out.println(doc.get("filename"));

// System.out.println(doc.getField("contents"));

}

import java.io.File;

import java.io.FileInputStream;

import java.io.IOException;

import java.util.Date;

import org.apache.lucene.analysis.standard.StandardAnalyzer;

import org.apache.lucene.document.Document;

import org.apache.lucene.document.Field;

import org.apache.lucene.index.IndexWriter;

// 下面两个是网上下载的别人的中文分词器,对文中的中文进行分割

import org.mira.lucene.analysis.MIK_CAnalyzer; // (最大全切分)

import org.mira.lucene.analysis.IK_CAnalyzer; // (细粒度全切分)<------引用类

/**

* This code was originally written for

* Erik's Lucene intro java.net article

public class Indexer {

// private static Document doc = new Document();

public static void main(String[] args) throws Exception {

// if (args.length != 2) {

// throw new Exception("Usage: java " + Indexer.class.getName()

// + " <index dir> <data dir>");

// }

// File indexDir = new File(args[0]);

// File dataDir = new File(args[1]);

File indexDir = new File("E:/LUCENE/index");

File dataDir = new File("E:/LUCENE/test");

long start = new Date().getTime();

int numIndexed = index(indexDir, dataDir);

long end = new Date().getTime();

System.out.println("Indexing " + numIndexed + " files took "

+ (end - start) + " milliseconds");

// test 分词功能：

// System.out.println(new IK_CAnalyzer().tokenStream("用户本地系统中必须安装有Word的应用程序"," "));

}

public static int index(File indexDir, File dataDir)

throws IOException {

if (!dataDir.exists() || !dataDir.isDirectory()) {

throw new IOException(dataDir

+ " does not exist or is not a directory");

}

IndexWriter writer = new IndexWriter(indexDir,

new IK_CAnalyzer(), true);

writer.setUseCompoundFile(false);

indexDirectory(writer, dataDir);

int numIndexed = writer.docCount();

writer.optimize();

writer.close();

return numIndexed;

}

private static void indexDirectory(IndexWriter writer, File dir)

throws IOException {

Document doc = new Document();

File[] files = dir.listFiles();

for (int i = 0; i < files.length; i++) {

File f = files[i];

if (f.isDirectory()) {

indexDirectory(writer, f); // recurse

} else

{

try {

doc=Factory(f);

} catch (Exception e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

writer.addDocument(doc); //一定要将生成的Document加到Writer中去。

}

private static Document Factory(File f) throws Exception {

Document doc = new Document();

DocumentHandler handler=null;

if (f.getName().endsWith(".txt") || f.getName().endsWith(".java")) {

//doc = getTxtDocument(new FileInputStream(f));

handler=new TxtDocumentHandler();

} else if (f.getName().endsWith(".doc")) {

// doc = getDocument(new FileInputStream(f));

handler=new DocDocumentHandler();

} else if (f.getName().endsWith(".pdf")) {

// doc = LucenePDFDocument.getDocument(f);

handler=new PdfDocumentHandler();

} else if (f.getName().endsWith(".rtf")) {

// doc = getRtfDocument(new FileInputStream(f));

handler=new RtfDocumentHandler();

} else if (f.getName().endsWith(".html")

|| f.getName().endsWith(".htm")) {

// doc = getHtmlDocument(new FileInputStream(f));

handler=new HtmlDocumentHandler();

}

if(handler!=null){

doc=handler.getDocument(new FileInputStream(f));

doc.add(Field.Keyword("filename", f.getCanonicalPath()));

System.out.println("Indexing " + f.getCanonicalPath());

}

return doc;

}