关于使用itextpdf7提取pdf文件中指定关键字内容的坐标位置
参考文章和资料
老梁 - https://cloud.tencent.com/developer/article/1502408
微光•无单位 - https://blog.youkuaiyun.com/LvWeijie941/article/details/105248627/
官方api - https://api.itextpdf.com/iText7/java/7.0.5/
前言
具体版本为itextpdf 7.0.4,使用Maven管理项目依赖
主要使用jar包: kernel-7.0.4.jar
转载请注明出处
配置
在m2/settings.xml中为所有Maven项目配置远程仓库
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven user settings: local repository location, a mirror for Maven Central, and one extra repository profile. -->
<settings>
<localRepository>m2</localRepository><!-- Change this to the path of your own local Maven repository -->
<!-- NOTE(review): every URL in this file uses plain http; Maven 3.8.1 and later block http repositories by default, so https endpoints may be required. Verify before use. -->
<mirrors>
<!-- Requests addressed to the "central" repository are redirected to this Aliyun mirror (see mirrorOf). -->
<mirror>
<id>alimaven</id>
<name>aliyun maven</name>
<url>http://maven.aliyun.com/nexus/content/groups/public/</url>
<mirrorOf>central</mirrorOf>
</mirror>
</mirrors>
<profiles>
<!-- Declares an additional repository and plugin repository, releases enabled and snapshots disabled. -->
<!-- NOTE(review): no activeProfiles section is present in this file, so this profile presumably only applies when selected explicitly (for example with the -P nexus option). Confirm. -->
<profile>
<id>nexus</id>
<repositories>
<repository>
<id>nexus</id>
<name>local private nexus</name>
<url>http://maven.oschina.net/content/groups/public/</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
</repositories>
<pluginRepositories>
<pluginRepository>
<id>nexus</id>
<name>local private nexus</name>
<url>http://maven.oschina.net/content/groups/public/</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
</pluginRepository>
</pluginRepositories>
</profile></profiles>
</settings>
具体项目通过pom.xml文件配置依赖(通过远程仓库自动在线下载依赖,即可无须手动下载jar包)
<dependencies>
<!-- iText 7 kernel module; the example code imports its com.itextpdf.kernel.* classes. -->
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>kernel</artifactId>
<version>7.0.4</version>
</dependency>
</dependencies>
具体运行代码
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.canvas.parser.PdfDocumentContentParser;
import com.itextpdf.kernel.pdf.canvas.parser.listener.IPdfTextLocation;
import com.itextpdf.kernel.pdf.canvas.parser.listener.RegexBasedLocationExtractionStrategy;
import java.util.Collection;
/**
 * Demo that searches one page of a PDF for a keyword with iText 7 and prints
 * the matched text together with the x/y origin of its bounding rectangle.
 */
public class test {

    /**
     * Runs the demo: searches page 46 of {@code target.pdf} for "presented".
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String input = "target.pdf";
        // Arguments: PDF file name, keyword to look for, page number to process.
        getKeyWordsLocation(input, "presented", 46);
    }

    /**
     * Finds every match of {@code key} on one page of a PDF and prints each
     * match's text and the x/y of its bounding rectangle to standard output.
     * Prints "the result is null" when nothing matches. Any failure while
     * reading or parsing the file is reported on standard error, not rethrown.
     *
     * @param input   path of the PDF file to read
     * @param key     pattern to search for; it is handed to
     *                {@link RegexBasedLocationExtractionStrategy}, so it is
     *                interpreted as a regular expression, not a literal string
     * @param pageNum number of the page to process
     */
    public static void getKeyWordsLocation(String input, String key, int pageNum) {
        RegexBasedLocationExtractionStrategy strategy = new RegexBasedLocationExtractionStrategy(key);

        // try-with-resources guarantees both objects are closed even when parsing
        // throws; resources close in reverse order, i.e. the document before its reader.
        try (PdfReader pr = new PdfReader(input);
             PdfDocument pd = new PdfDocument(pr)) {

            // Core idea: apply a listener strategy to the page content.
            // PdfDocumentContentParser feeds the page's content to the strategy,
            // which accumulates the locations of all matches.
            PdfDocumentContentParser pdcp = new PdfDocumentContentParser(pd);
            pdcp.processContent(pageNum, strategy);

            // Collect what the strategy found.
            Collection<IPdfTextLocation> resultantLocations = strategy.getResultantLocations();
            if (resultantLocations.isEmpty()) {
                System.out.println("the result is null");
                return;
            }

            // Report every match.
            for (IPdfTextLocation item : resultantLocations) {
                Rectangle boundRectangle = item.getRectangle();
                System.out.println(item.getText());
                System.out.println("[" + key + "] location of x: " + boundRectangle.getX()
                        + " ,y: " + boundRectangle.getY());
            }
        } catch (Exception e) {
            // Demo-level boundary: report and continue instead of propagating.
            System.err.println("read file failed!");
            e.printStackTrace();
        }
    }
}