引入pdfbox依赖,其版本号为1.8.10
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>1.8.10</version>
</dependency>
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
public class PdfUtil {
    /**
     * Reads the PDF file at the given path and returns its extracted text.
     *
     * <p>Extraction uses a {@link PDFTextStripper} with sort-by-position enabled.
     * Both the file stream and the loaded document are released before this
     * method returns, whether it completes normally or throws.
     *
     * @param filePath path of the PDF file to read
     * @return the text that {@link PDFTextStripper#getText(PDDocument)} produces for the document
     * @throws IOException if the file cannot be opened, or if loading the document,
     *         extracting its text, or closing a resource fails
     */
    public static String readText(String filePath) throws IOException {
        File file = new File(filePath);
        FileInputStream inputStream = new FileInputStream(file);
        // Plain try/finally (rather than try-with-resources) so the cleanup does not
        // depend on PDDocument implementing Closeable in the PDFBox version in use.
        try {
            PDDocument document = PDDocument.load(inputStream);
            try {
                PDFTextStripper stripper = new PDFTextStripper();
                stripper.setSortByPosition(true);
                return stripper.getText(document);
            } finally {
                // Release the document even when text extraction throws.
                document.close();
            }
        } finally {
            // The stream was opened here, so it is closed here regardless of
            // whether PDFBox already closed it.
            inputStream.close();
        }
    }
}
测试文件 demo.pdf 的内容中包含形如 12-34-567890 的文本,测试代码如下:
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class Demo {
    /** Matches two word characters, a hyphen, two word characters, a hyphen, then six word characters. */
    private static final Pattern CODE_PATTERN = Pattern.compile("\\w{2}-\\w{2}-\\w{6}");

    /**
     * Reads the text of a fixed PDF file through {@code PdfUtil.readText} and prints
     * the first substring matching {@link #CODE_PATTERN}, or a "nothing found"
     * message when the text contains no match.
     *
     * @param args command-line arguments; not used
     * @throws Exception if reading the PDF fails
     */
    public static void main(String[] args) throws Exception {
        String content = PdfUtil.readText("D:\\demo.pdf");
        // Tip: print `content` here to inspect the full extracted text when tuning the pattern.

        // Use the regular expression to pull the wanted value out of the text.
        Matcher matcher = CODE_PATTERN.matcher(content);
        if (!matcher.find()) {
            System.out.println("未提取到数据");
            return;
        }
        System.out.println("提取到数据:" + matcher.group());
    }
}
执行上述代码,其输出结果为:
提取到数据:12-34-567890