前言
Tesseract 是世界上最知名、应用最广泛的开源 OCR 引擎。它由 Google 积极维护,功能强大,支持多种语言和平台。虽然它在处理理想条件下的印刷文本时表现出色,但其精度会受到图像质量和复杂性的影响。它通常作为核心引擎被集成到各种应用程序、脚本和更大型的系统中,是许多需要文本提取功能的项目的首选开源解决方案。
一、安装 tesseract (OCR)
安装链接:https://digi.bib.uni-mannheim.de/tesseract/ (Index of /tesseract,uni-mannheim.de)
二、下载训练数据
通过网盘分享的文件:tessdata各语言集合包.zip
链接: https://pan.baidu.com/s/13oPR2r7qOE6lt6SgbpWOQA 提取码: uaaw
三、创建springboot项目
1、导入依赖
<dependency> <groupId>net.sourceforge.tess4j</groupId> <artifactId>tess4j</artifactId> <version>5.3.0</version> </dependency>
2、编写配置类
package com.songwp.config; import net.sourceforge.tess4j.Tesseract; import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; /** * @ClassName:TesseractOcrConfig * @Description: ocr配置类 * @Author: songwp * @Date: 2025/6/29 19:24 */ @Configuration public class TesseractOcrConfig { @Value("${tess4j.data-path}") private String dataPath; @Value("${tess4j.language}") private String language; @Bean public Tesseract tesseract() { Tesseract tesseract = new Tesseract(); // 设置训练数据文件夹路径 tesseract.setDatapath(dataPath); // 设置为中文简体 tesseract.setLanguage(language); return tesseract; } }
3、编写controller
package com.songwp.controller; import com.songwp.service.OcrService; import lombok.extern.slf4j.Slf4j; import net.sourceforge.tess4j.TesseractException; import org.springframework.http.MediaType; import org.springframework.web.bind.annotation.PostMapping; import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RequestParam; import org.springframework.web.bind.annotation.RestController; import org.springframework.web.multipart.MultipartFile; import java.io.IOException; /** * @ClassName:OcrController * @Description: ocr识别controller * @Author: songwp * @Date: 2025/6/29 18:40 */ @RestController @RequestMapping("/ocr") @Slf4j public class OcrController { private final OcrService ocrService; public OcrController(OcrService ocrService) { this.ocrService = ocrService; } @PostMapping(value = "/recognize", consumes = MediaType.MULTIPART_FORM_DATA_VALUE) public String recognizeImage(@RequestParam("file") MultipartFile file) throws TesseractException, IOException { log.info(ocrService.recognizeText(file)); // 调用OcrService中的方法进行文字识别 return ocrService.recognizeText(file); } }
4、编写service接口
package com.songwp.service;

import net.sourceforge.tess4j.TesseractException;
import org.springframework.web.multipart.MultipartFile;

import java.io.IOException;

/**
 * Contract for extracting text from an uploaded image.
 *
 * @ClassName:OcrService
 * @Description: OCR recognition interface
 * @Author: songwp
 * @Date: 2025/6/29 19:27
 */
public interface OcrService {

    /**
     * Recognizes the text contained in the given uploaded image.
     *
     * @param imageFile the uploaded image to recognize
     * @return the recognized text
     * @throws IOException if the upload cannot be read
     * @throws TesseractException if recognition fails
     */
    String recognizeText(MultipartFile imageFile) throws IOException, TesseractException;
}
5、编写service实现类
package com.songwp.service.impl; import com.songwp.service.OcrService; import net.sourceforge.tess4j.Tesseract; import net.sourceforge.tess4j.TesseractException; import org.springframework.stereotype.Service; import org.springframework.web.multipart.MultipartFile; import javax.imageio.ImageIO; import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; /** * @ClassName:OcrServiceImpl * @Description: ocr识别实现类 * @Author: songwp * @Date: 2025/6/29 19:28 */ @Service public class OcrServiceImpl implements OcrService { private final Tesseract tesseract; public OcrServiceImpl(Tesseract tesseract) { this.tesseract = tesseract; } /** * * @param imageFile 要识别的图片 * @return */ @Override public String recognizeText(MultipartFile imageFile) throws IOException, TesseractException { // 转换 InputStream sbs = new ByteArrayInputStream(imageFile.getBytes()); BufferedImage bufferedImage = ImageIO.read(sbs); // 对图片进行文字识别 return tesseract.doOCR(bufferedImage); } }
6、运行调试
注:当图片颜色较多,或文本使用了特殊字体时,识别结果会不太清楚。