java读取pdf(可分页读取)

最新推荐文章于 2023-10-24 19:10:33 发布

原创最新推荐文章于 2023-10-24 19:10:33 发布 · 2.4k 阅读

1 ·

CC 4.0 BY-SA版权

文章标签：

#java #移动开发

本文介绍了使用Java的pdfbox库解析PDF文件并提取文本的方法，包括导入必要的包、实例化PDFParser和PDFTextStripper类，以及如何从PDF文件中获取纯文本内容。

此文非原创,来源：http://www.2cto.com/kf/201109/104526.html

运行下面的第一个示例需要 pdfbox 和 log4j 的 jar 包；分隔线之后的第二个示例使用的是 Etymon Pj 库（com.etymon.pj）。
举个例子：
import org.pdfbox.pdfparser.*;
import org.pdfbox.util.PDFTextStripper;
import java.io.*;
/**
* 测试pdfbox
* @author kingfish
* @version 1.0
*/
public class TestPdf {
public static void main(String[] args) throws Exception{
    FileInputStream fis = new FileInputStream("c://intro.pdf");
    PDFParser p = new PDFParser(fis);
    p.parse();
    PDFTextStripper ts = new PDFTextStripper();
    String s = ts.getText(p.getPDDocument());
    System.out.println(s);
    fis.close();
}
}

--------------------------------------------------------------------------------

import java.io.*;
import java.util.*;
import com.etymon.pj.*;
import com.etymon.pj.object.*;
import com.etymon.pj.exception.*;
/**
* This is a wrapper for the Pj PDF parser
*/
public class PjWrapper {
Pdf pdf;
PjCatalog catalog;
PjPagesNode rootPage;
public PjWrapper(String PdfFileName,String TextFileName)throws
IOException, PjException {
pdf = new Pdf(PdfFileName);
// hopefully the catalog can never be a reference...
catalog = (PjCatalog) pdf.getObject(pdf.getCatalog());
// root node of pages tree is specified by a reference in the catalog
rootPage = (PjPagesNode) pdf.resolve(catalog.getPages());
}
public static void main (String [] args) throws IOException, PjException
{
/*PjWrapper testWrapper = new PjWrapper(args[0]);
LinkedList textList = testWrapper.getAllText();*/
}
/**
* Returns as much text as we can extract from the PDF.
* This currently includes:
*
* NOTE: Pj does not support LZW, so some text in some PDF's may not
* be indexable
*/
public LinkedList getAllText() throws PjException {
LinkedList stringList = new LinkedList();
Iterator streamIter = getAllContentsStreams().iterator();
PjStream stream;
String streamData;
String streamText;
boolean moreData;
int textStart, textEnd;
//System.out.println("Going through streams...");
while(streamIter.hasNext()) {
//System.out.println("Getting next stream");
stream = (PjStream) streamIter.next();
//System.out.println("Adding text from stream with filter: "
+getFilterString(stream);
stream = stream.flateDecompress();
//System.out.println("Adding text from stream with filter
afterdecompress: " + getFilterString(stream));
streamData = new String(stream.getBuffer());
streamText = new String();
moreData = true;
textStart = textEnd = 0;
while(moreData) {
if ((textStart = streamData.indexOf('(', textEnd + 1)) < 0) {
moreData = false;
break;
}
if ((textEnd = streamData.indexOf(')', textStart + 1)) < 0) {
moreData = false;
break;
}
try {
streamText +=
PjString.decodePdf(streamData.substring(textStart,textEnd + 1));
} catch (Exception e) {
System.out.println("malformed string: " +
streamData.substring(textStart, textEnd + 1));
}
}
//if(streamText.equals("inserted text"))
System.out.println(streamText);
if (streamText.length() > 0)
stringList.add(streamText);
}
return stringList;
}
public static String getFilterString(PjStream stream) throws PjException
{
String filterString = new String();
PjObject filter;
//System.out.println("getting filter from dictionary");
if ((filter = stream.getStreamDictionary().getFilter()) == null) {
//System.out.println("Got null filter");
return "";
}
//System.out.println("got it");
// filter should either be a name or an array of names
if (filter instanceof PjName) {
//System.out.println("getting filter string from simple name");
filterString = ((PjName) filter).getString();
} else {
//System.out.println("getting filter string from array of names");
Iterator nameIter;
Vector nameVector;
if ((nameVector = ((PjArray) filter).getVector()) == null) {
//System.out.println("got null vector for list of names");
return "";
}
nameIter = nameVector.iterator();
while (nameIter.hasNext()) {
filterString += ((PjName) nameIter.next()).getString();
if (nameIter.hasNext())
filterString += " ";
}
}
//System.out.println("got filter string");
return filterString;
}
/**
* Performs a post-order traversal of the pages tree
* from the root node and gets all of the contents streams
* @returns a list of all the contents of all the pages
*/
public LinkedList getAllContentsStreams() throws
InvalidPdfObjectException {
return getContentsStreams(getAllPages());
}
/**
* Get contents streams from the list of PjPage objects
* @returns a list of all the contents of the pages
*/
public LinkedList getContentsStreams(LinkedList pages) throws
InvalidPdfObjectException {
LinkedList streams = new LinkedList();
Iterator pageIter = pages.iterator();
PjObject contents;
while(pageIter.hasNext()) {
contents = pdf.resolve(((PjPage)pageIter.next()).getContents());
// should only be a stream or an array of streams (or refs to
streams)
if (contents instanceof PjStream)
streams.add(contents);
else{
Iterator streamsIter = ((PjArray)contents).getVector().iterator();
while(streamsIter.hasNext())
streams.add(pdf.resolve((PjObject)streamsIter.next()));
}
}
return streams ;
}
/**
* Performs a post-order traversal of the pages tree
* from the root node.
* @returns a list of all the PjPage objects
*/
public LinkedList getAllPages() throws InvalidPdfObjectException {
LinkedList pages = new LinkedList();
getPages(rootPage, pages);
return pages;
}
/**
* Performs a post-order traversal of the pages tree
* from the node passed to it.
* @returns a list of all the PjPage objects under node
*/
public void getPages(PjObject node, LinkedList pages) throws
InvalidPdfObjectException {
PjPagesNode pageNode = null;
// let's hope pdf's don't have pointers to pointers
if (node instanceof PjReference)
pageNode = (PjPagesNode) pdf.resolve(node);
else
pageNode = (PjPagesNode) node;
if (pageNode instanceof PjPage) {
pages.add(pageNode);
return;
}
// kids better be an array and not a reference to one
Iterator kidIterator = ((PjArray) ((PjPages)
pageNode).getKids()).getVector().iterator();
while(kidIterator.hasNext()) {
getPages((PjObject) kidIterator.next(), pages);
}
}
public Pdf getPdf() {
return pdf;
}
}