JAVA OCR识别图片(开源版)

参考 GitHub 项目 mymonstercat(rapidocr)。开源版识别精度有限,如需高精度识别请按需购买。

1.MAVEN依赖

            <!-- OCR library from the mymonstercat project; presumably supplies the
                 InferenceEngine / OcrResult types used by the utility methods (confirm). -->
            <dependency>
                <groupId>io.github.mymonstercat</groupId>
                <artifactId>rapidocr</artifactId>
                <version>0.0.7</version>
            </dependency>

            <!-- ONNX platform artifact for the OCR library, same version as the core artifact. -->
            <dependency>
                <groupId>io.github.mymonstercat</groupId>
                <artifactId>rapidocr-onnx-platform</artifactId>
                <version>0.0.7</version>
            </dependency>
            
            <!-- Not needed for local testing; add it when deploying to a Linux x86 server. -->
			<dependency>
                <groupId>io.github.mymonstercat</groupId>
                <artifactId>rapidocr-onnx-linux-x86_64</artifactId>
                <version>1.2.2</version>
            </dependency>

            <!-- Apache PDFBox font library; version kept in step with pdfbox below. -->
            <dependency>
                <groupId>org.apache.pdfbox</groupId>
                <artifactId>fontbox</artifactId>
                <version>3.0.3</version>
            </dependency>
            <!-- Apache PDFBox: used below to load PDFs, extract text and render pages to images. -->
            <dependency>
                <groupId>org.apache.pdfbox</groupId>
                <artifactId>pdfbox</artifactId>
                <version>3.0.3</version>
            </dependency>

2.工具类

/**
     * Resolves a resource path to a file under the configured profile directory
     * and runs image OCR on that file.
     *
     * <p>NOTE(review): despite the method name, nothing here inspects whether the
     * file is a text-based PDF; every input is handed to {@link #handlePic(String)}.
     *
     * @param filePath resource path; the part after {@code Constants.RESOURCE_PREFIX}
     *                 is appended to {@code SheBaoConfig.getProfile()}
     * @return the map returned by {@link #handlePic(String)} for the resolved path
     */
    public static Map<String, String> isTextBasedPdf(String filePath) {
        String profileRoot = SheBaoConfig.getProfile();
        String relativePart = StringUtils.substringAfter(filePath, Constants.RESOURCE_PREFIX);
        return handlePic(profileRoot + relativePart);
    }

    /**
     * Extracts fields from a text-based PDF invoice (the fast path: no OCR involved).
     *
     * <p>The PDF text is read with position sorting enabled and then passed to
     * {@link #pdfStr(String)} for keyword extraction.
     *
     * @param filePath path of the PDF file to read
     * @return the map produced by {@link #pdfStr(String)}, or {@code null} when the
     *         file cannot be loaded or read (the stack trace is printed in that case)
     */
    public static Map<String, String> handleInvoice(String filePath) {
        String text;
        // try-with-resources guarantees the document is closed even when text
        // extraction fails part-way through.
        try (PDDocument document = Loader.loadPDF(new File(filePath))) {
            PDFTextStripper pdfStripper = new PDFTextStripper();
            pdfStripper.setSortByPosition(true);
            text = pdfStripper.getText(document);
        } catch (IOException e) {
            e.printStackTrace();
            // Callers have always received null on I/O failure; keep that contract.
            return null;
        }
        return pdfStr(text);
    }

    /**
     * Extracts named fields from invoice text using keyword regular expressions.
     *
     * <p>Each registered rule maps a field name to a regex; the trimmed content of
     * capturing group 1 of the first match becomes the field value. Fields whose
     * regex does not match are present in the result with a {@code null} value.
     *
     * @param invoiceInfo raw invoice text; {@code null} is treated as empty text
     * @return a new mutable map from field name to extracted value (possibly {@code null})
     */
    public static Map<String, String> pdfStr(String invoiceInfo) {
        // Normalise full-width parentheses (U+FF08 / U+FF09) to ASCII. This must be a
        // literal replacement: a lone "(" or ")" is not a valid regular expression, so
        // replaceAll with those arguments throws PatternSyntaxException.
        // NOTE(review): the full-width source characters are presumed; confirm intent.
        String text = invoiceInfo == null
                ? ""
                : invoiceInfo.replace('\uFF08', '(').replace('\uFF09', ')');

        // Field name -> regex whose capturing group 1 holds the field value.
        // Register extraction rules here, e.g. a "billNumber" key with its regex.
        Map<String, String> patterns = new HashMap<>();

        Map<String, String> result = new HashMap<>();
        for (Map.Entry<String, String> entry : patterns.entrySet()) {
            Matcher matcher = Pattern.compile(entry.getValue(), Pattern.DOTALL).matcher(text);
            String value = null;
            if (matcher.find()) {
                // Group 1 is null when it is optional and did not take part in the match.
                String captured = matcher.group(1);
                value = captured == null ? null : captured.trim();
            }
            result.put(entry.getKey(), value);
        }
        return result;
    }

    /**
     * Runs OCR on an image file (the slow path) and logs the recognised text
     * together with the elapsed time.
     *
     * <p>NOTE(review): the returned map is created here but never populated, so
     * callers currently always receive an empty map; confirm whether the joined
     * text was meant to be parsed into it.
     *
     * @param filePath path of the image file passed to {@link #runOcr(String)}
     * @return a new, empty mutable map
     */
    public static Map<String, String> handlePic(String filePath) {
        long start = System.currentTimeMillis();
        List<String> textLines = runOcr(filePath);
        Map<String, String> result = new HashMap<>();
        if (StringUtils.isNotEmpty(textLines)) {
            // Literal replacement of full-width parentheses (U+FF08 / U+FF09): a lone
            // "(" or ")" is not a valid regex, so replaceAll would throw here.
            // NOTE(review): the full-width source characters are presumed; confirm intent.
            String text = String.join(" ", textLines)
                    .replace('\uFF08', '(')
                    .replace('\uFF09', ')');
            log.info(text);
        }
        long finish = System.currentTimeMillis();
        log.info("图片识别执行耗费时间:" + (finish - start));
        return result;
    }

    /**
     * Demo entry point: runs {@link #handlePic(String)} on a single image.
     *
     * @param args optional; {@code args[0]} is the image path. When no argument is
     *             given, the original hard-coded sample path is used.
     */
    public static void main(String[] args) {
        String imagePath = (args != null && args.length > 0)
                ? args[0]
                : "D:\\微信图片_2025-09-24_153045_537.jpg";
        handlePic(imagePath);
    }

    /**
     * Converts a PDF to page images, runs OCR on each image and returns all
     * recognised text fragments in page order.
     *
     * <p>Page images are written to a uniquely named directory under
     * {@code outputDirs}; that directory is deleted before the method returns,
     * including when OCR fails.
     *
     * @param pdfFilePath path of the PDF file to process
     * @return recognised text fragments, in the order OCR produced them
     * @throws IOException if the temporary directory cannot be deleted
     */
    public static List<String> getWords(String pdfFilePath) throws IOException {
        String outputDir = outputDirs + UUID.randomUUID().toString().replace("-", "");
        List<String> wordsList = new ArrayList<>();
        try {
            for (String fileName : convertPdfToImage(pdfFilePath, outputDir)) {
                // NOTE(review): an empty name stops processing of all remaining
                // pages; confirm whether skipping just this entry was intended.
                if (StringUtils.isEmpty(fileName)) {
                    break;
                }
                wordsList.addAll(runOcr(fileName));
            }
        } catch (RuntimeException | Error failure) {
            // Best-effort cleanup so rendered pages are not left behind; the
            // original failure stays the one reported to the caller.
            try {
                deleteDirectory(outputDir);
            } catch (IOException | RuntimeException cleanupFailure) {
                failure.addSuppressed(cleanupFailure);
            }
            throw failure;
        }
        deleteDirectory(outputDir);
        return wordsList;
    }


    /**
     * Runs OCR on the file at the given path and collects the text of every
     * text block in the result.
     *
     * @param path path of the file handed to the OCR engine
     * @return a new mutable list with one entry per text block, in result order
     */
    public static List<String> runOcr(String path) {
        OcrResult recognised = InferenceEngine.getInstance(Model.ONNX_PPOCR_V3).runOcr(path);
        List<String> texts = new ArrayList<>();
        for (TextBlock block : recognised.getTextBlocks()) {
            String blockText = block.getText();
            texts.add(blockText);
        }
        return texts;
    }

 
    //将PDF文件转换为多张PNG图片 设置DPI(越高图片越清晰,但文件也会更大)
    public static List<String> convertPdfToImage(String pdfFilePath, String outputDir) {
        /
        int dpi = 300;
        List<String> fileNameList = new ArrayList<>();
        File file = new File(pdfFilePath);
        try (PDDocument document = Loader.loadPDF(file)) {
            PDFRenderer pdfRenderer = new PDFRenderer(document);
            String pdfFileName = file.getName().replace(".pdf", "");
            String name = pdfFileName;
            for (int page = 0; page < document.getNumberOfPages(); page++) {
                BufferedImage bim = pdfRenderer.renderImageWithDPI(page, dpi);
                String folder = createFolder(outputDir + "/" + name);
                String fileName = folder + "/" + pdfFileName + "_page_" + (page + 1) + ".png";
                ImageIO.write(bim, "png", new File(fileName));
                fileNameList.add(fileName);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return fileNameList;
    }

    /**
     * Recursively deletes a directory together with all files and sub-directories in it.
     *
     * @param path path of the directory to delete
     * @throws ServiceException if {@code path} is not an existing directory
     * @throws IOException      if an entry cannot be visited or deleted
     */
    public static void deleteDirectory(String path) throws IOException {
        Path directory = Paths.get(path);
        if (!Files.isDirectory(directory)) {
            throw new ServiceException("文件不存在.");
        }
        // SimpleFileVisitor already rethrows from visitFileFailed, so only the two
        // deleting callbacks need to be overridden.
        Files.walkFileTree(directory, new SimpleFileVisitor<Path>() {
            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                Files.delete(file);
                return FileVisitResult.CONTINUE;
            }

            @Override
            public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
                // A non-null exc means listing this directory failed; report that
                // cause instead of attempting a delete that would hide it.
                if (exc != null) {
                    throw exc;
                }
                Files.delete(dir);
                return FileVisitResult.CONTINUE;
            }
        });
    }

    /**
     * Makes sure a folder exists at the given path, creating missing parent
     * folders as needed. This is best-effort: any exception is printed and
     * otherwise ignored.
     *
     * @param folderPath path of the folder to create
     * @return {@code folderPath}, unchanged, whether or not creation succeeded
     */
    public static String createFolder(String folderPath) {
        try {
            File directory = new File(folderPath);
            boolean alreadyPresent = directory.exists();
            if (alreadyPresent) {
                return folderPath;
            }
            directory.mkdirs();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return folderPath;
    }

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值