参考 GitHub 项目 mymonstercat(rapidocr)。识别精度有限,如需更高精度请按需购买!
1.MAVEN依赖
<dependency>
<groupId>io.github.mymonstercat</groupId>
<artifactId>rapidocr</artifactId>
<version>0.0.7</version>
</dependency>
<dependency>
<groupId>io.github.mymonstercat</groupId>
<artifactId>rapidocr-onnx-platform</artifactId>
<version>0.0.7</version>
</dependency>
<!-- 本地测试不引入, 服务器部署linux x86架构 下引入-->
<dependency>
<groupId>io.github.mymonstercat</groupId>
<artifactId>rapidocr-onnx-linux-x86_64</artifactId>
<version>1.2.2</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>3.0.3</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>3.0.3</version>
</dependency>
2.工具类
/**
 * Resolves a resource path to a local file path and runs image OCR on it.
 *
 * <p>The local path is built by concatenating {@code SheBaoConfig.getProfile()} with the
 * result of {@code StringUtils.substringAfter(filePath, Constants.RESOURCE_PREFIX)}; that
 * path is then handed to {@code handlePic}.
 *
 * <p>NOTE(review): despite the method name, no text-versus-image detection is performed
 * here; every input goes through the OCR path.
 *
 * @param filePath resource path; presumably contains {@code Constants.RESOURCE_PREFIX} -- confirm with callers
 * @return the map returned by {@code handlePic} for the resolved path
 */
public static Map<String, String> isTextBasedPdf(String filePath) {
    String profileRoot = SheBaoConfig.getProfile();
    String relativePart = StringUtils.substringAfter(filePath, Constants.RESOURCE_PREFIX);
    String localPath = profileRoot + relativePart;
    return handlePic(localPath);
}
/**
 * Extracts invoice fields from a text-based PDF (the fast path, no OCR).
 *
 * <p>Loads the PDF, extracts its text with position sorting enabled, closes the document,
 * and passes the text to {@code pdfStr}.
 *
 * @param filePath path of the PDF file on the local file system
 * @return the map produced by {@code pdfStr}, or {@code null} if the PDF could not be
 *         loaded or read
 */
public static Map<String, String> handleInvoice(String filePath) {
    String text;
    // try-with-resources guarantees the document is closed even when text extraction
    // throws; previously close() was only reached on the success path.
    try (PDDocument document = Loader.loadPDF(new File(filePath))) {
        PDFTextStripper pdfStripper = new PDFTextStripper();
        pdfStripper.setSortByPosition(true);
        text = pdfStripper.getText(document);
    } catch (IOException e) {
        e.printStackTrace();
        // Callers receive null on I/O failure, as before.
        return null;
    }
    return pdfStr(text);
}
/**
 * Extracts named fields from invoice text using regular expressions.
 *
 * <p>Each entry of the local {@code patterns} map is a field name mapped to a regex. The
 * regex is compiled with {@link Pattern#DOTALL} and the trimmed content of its first
 * capture group becomes the field value. Fields whose pattern does not match (or has no
 * first group) are stored with a {@code null} value.
 *
 * @param invoiceInfo raw invoice text; {@code null} is treated as empty text
 * @return a map with one entry per configured pattern; empty when no patterns are configured
 */
public static Map<String, String> pdfStr(String invoiceInfo) {
    String text = invoiceInfo == null ? "" : invoiceInfo;
    // Normalise full-width parentheses (U+FF08 / U+FF09) to ASCII ones. This uses the
    // literal String.replace: the previous replaceAll("(", ...) treated "(" as a regex
    // (an unclosed group) and threw PatternSyntaxException on every call.
    text = text.replace('\uFF08', '(').replace('\uFF09', ')');

    // Field name -> regex whose first capture group holds the value.
    Map<String, String> patterns = new HashMap<>();
    // Example: patterns.put("billNumber", billNum);

    Map<String, String> result = new HashMap<>();
    for (Map.Entry<String, String> entry : patterns.entrySet()) {
        Matcher matcher = Pattern.compile(entry.getValue(), Pattern.DOTALL).matcher(text);
        String value = null;
        // Guard against patterns without a capture group and groups that did not participate.
        if (matcher.find() && matcher.groupCount() >= 1 && matcher.group(1) != null) {
            value = matcher.group(1).trim();
        }
        result.put(entry.getKey(), value);
    }
    return result;
}
/**
 * Runs OCR on an image file (the slow path) and logs the recognised text.
 *
 * <p>The text blocks returned by {@code runOcr} are joined with single spaces, full-width
 * parentheses are normalised to ASCII ones, and the text plus the elapsed time are logged.
 *
 * <p>NOTE(review): the returned map is never populated in this method; the recognised
 * text is only logged. Field extraction presumably still needs to be wired in.
 *
 * @param filePath path of the image file to recognise
 * @return a new, empty map
 */
public static Map<String, String> handlePic(String filePath) {
    long start = System.currentTimeMillis();
    List<String> ocrLines = runOcr(filePath);
    Map<String, String> result = new HashMap<>();
    if (StringUtils.isNotEmpty(ocrLines)) {
        // Literal replace of full-width parentheses (U+FF08 / U+FF09); the previous
        // replaceAll("(", ...) was an invalid regex and threw PatternSyntaxException.
        String text = String.join(" ", ocrLines)
                .replace('\uFF08', '(')
                .replace('\uFF09', ')');
        log.info(text);
    }
    long finish = System.currentTimeMillis();
    log.info("图片识别执行耗费时间:" + (finish - start));
    return result;
}
/**
 * Manual smoke test: runs image OCR on a single file.
 *
 * @param args optional; {@code args[0]} is the image path. When absent, the original
 *             hard-coded sample path is used.
 */
public static void main(String[] args) {
    String imagePath = (args != null && args.length > 0)
            ? args[0]
            : "D:\\微信图片_2025-09-24_153045_537.jpg";
    handlePic(imagePath);
}
/**
 * Renders a PDF to page images and runs OCR on every page.
 *
 * <p>Images are written to a fresh directory named {@code outputDirs} plus a random UUID
 * (dashes removed); that directory is deleted again before the method returns.
 *
 * @param pdfFilePath path of the PDF file to recognise
 * @return the recognised text blocks of all pages, in page order
 * @throws IOException if deleting the temporary directory fails
 */
public static List<String> getWords(String pdfFilePath) throws IOException {
    String outputDir = outputDirs + UUID.randomUUID().toString().replace("-", "");
    List<String> wordsList = new ArrayList<>();
    try {
        for (String fileName : convertPdfToImage(pdfFilePath, outputDir)) {
            if (StringUtils.isEmpty(fileName)) {
                // Skip a blank entry instead of abandoning all remaining pages.
                continue;
            }
            wordsList.addAll(runOcr(fileName));
        }
    } finally {
        // Clean up the rendered images even when OCR fails part-way through.
        deleteDirectory(outputDir);
    }
    return wordsList;
}
/**
 * Runs OCR on one image file and returns the text of each recognised block.
 *
 * <p>Uses the engine obtained from {@code InferenceEngine.getInstance(Model.ONNX_PPOCR_V3)}.
 *
 * @param path path of the image file handed to the OCR engine
 * @return the text of every block in the OCR result, in the order the engine reports them
 */
public static List<String> runOcr(String path) {
    InferenceEngine ocrEngine = InferenceEngine.getInstance(Model.ONNX_PPOCR_V3);
    OcrResult recognised = ocrEngine.runOcr(path);
    return recognised.getTextBlocks().stream()
            .map(block -> block.getText())
            .collect(Collectors.toCollection(ArrayList::new));
}
//将PDF文件转换为多张PNG图片 设置DPI(越高图片越清晰,但文件也会更大)
public static List<String> convertPdfToImage(String pdfFilePath, String outputDir) {
/
int dpi = 300;
List<String> fileNameList = new ArrayList<>();
File file = new File(pdfFilePath);
try (PDDocument document = Loader.loadPDF(file)) {
PDFRenderer pdfRenderer = new PDFRenderer(document);
String pdfFileName = file.getName().replace(".pdf", "");
String name = pdfFileName;
for (int page = 0; page < document.getNumberOfPages(); page++) {
BufferedImage bim = pdfRenderer.renderImageWithDPI(page, dpi);
String folder = createFolder(outputDir + "/" + name);
String fileName = folder + "/" + pdfFileName + "_page_" + (page + 1) + ".png";
ImageIO.write(bim, "png", new File(fileName));
fileNameList.add(fileName);
}
} catch (IOException e) {
e.printStackTrace();
}
return fileNameList;
}
/**
 * Recursively deletes a directory and everything below it.
 *
 * @param path path of the directory to delete
 * @throws ServiceException if {@code path} is not an existing directory
 * @throws IOException      if a file or directory cannot be visited or deleted
 */
public static void deleteDirectory(String path) throws IOException {
    Path directory = Paths.get(path);
    if (!Files.isDirectory(directory)) {
        throw new ServiceException("文件不存在.");
    }
    // visitFileFailed is not overridden: SimpleFileVisitor already rethrows the exception.
    Files.walkFileTree(directory, new SimpleFileVisitor<Path>() {
        @Override
        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
            Files.delete(file);
            return FileVisitResult.CONTINUE;
        }

        @Override
        public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
            if (exc != null) {
                // Iterating the directory failed; surface the real cause instead of
                // attempting a delete that would fail with a less useful error.
                throw exc;
            }
            Files.delete(dir);
            return FileVisitResult.CONTINUE;
        }
    });
}
/**
 * Ensures that a directory exists, creating it and any missing parents when necessary.
 *
 * <p>Any exception raised while checking or creating the directory is printed and
 * otherwise ignored; the input path is returned in every case.
 *
 * @param folderPath path of the directory to create
 * @return {@code folderPath}, unchanged
 */
public static String createFolder(String folderPath) {
    try {
        File directory = new File(folderPath);
        boolean missing = !directory.exists();
        if (missing) {
            directory.mkdirs();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return folderPath;
}
3619

被折叠的 条评论
为什么被折叠?



