记录:读取pdf中的文字内容-SpringBoot
1、引入依赖
// 百度 OCR 的 Java SDK 需自行到官网查看 API 文档并下载 jar 包,然后安装到本地 Maven 仓库
<!-- 百度 ocr-->
<dependency>
<groupId>com.baidu.api</groupId>
<artifactId>java-sdk</artifactId>
<version>4.15.8</version>
</dependency>
<!-- fontbox 开源免费 读取PDF-->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.24</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>2.0.24</version>
</dependency>
<!--pdf分割-->
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.3.2</version>
</dependency>
<dependency>
<groupId>org.bouncycastle</groupId>
<artifactId>bcprov-jdk15on</artifactId>
<version>1.47</version>
</dependency>
<dependency>
<groupId>org.bouncycastle</groupId>
<artifactId>bcmail-jdk15on</artifactId>
<version>1.47</version>
</dependency>
新建一个测试类 ReadPdf(ReadPdf.java),并在类中定义百度 OCR 的密钥常量:
// Baidu application credentials; ocrReadPdf passes them to getAuth to obtain an access token.
// The values below are placeholders: replace them with your own keys and never commit real secrets.
public static final String API_KEY = "申请的百度 API_KEY";
public static final String SECRET_KEY = "申请的百度 SECRET_KEY";
新建方法 ocrReadPdf
/**
 * Sends the PDF file at {@code path} to the Baidu "general_basic" OCR endpoint and
 * returns the recognized text.
 *
 * <p>The file is read fully, Base64-encoded, URL-encoded and posted as the
 * {@code pdf_file} form parameter. The {@code words} value of every entry in the
 * response's {@code words_result} array is concatenated in order, without separators.
 *
 * @param path path of the PDF file to recognize
 * @return the concatenated {@code words} values (empty if the array is empty)
 * @throws IllegalStateException if no access token could be obtained, or the response
 *         contains no {@code words_result} array (for example an API error response)
 * @throws Exception whatever the file, encoding, HTTP or JSON helpers throw
 */
private static String ocrReadPdf(String path) throws Exception {
    String url = "https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic";

    // getAuth returns null on failure; fail fast instead of sending "null" as the token.
    String auth = getAuth(API_KEY, SECRET_KEY);
    if (auth == null) {
        throw new IllegalStateException("Failed to obtain Baidu access token");
    }

    byte[] imgData = FileUtil.readFileByBytes(path);
    String imgStr = Base64Util.encode(imgData);
    String imgParam = URLEncoder.encode(imgStr, "UTF-8");
    String param = "pdf_file=" + imgParam;
    String result = HttpUtil.post(url, auth, param);

    // words_result is a JSON array, so read it as one. Reading it with getString and
    // re-parsing it fails on JSON library versions that reject non-string values.
    JSONObject response = new JSONObject(result);
    JSONArray wordsResult = response.optJSONArray("words_result");
    if (wordsResult == null) {
        throw new IllegalStateException("OCR response has no words_result array: " + result);
    }

    StringBuilder text = new StringBuilder();
    for (int i = 0; i < wordsResult.length(); i++) {
        // Skip entries that are not JSON objects rather than aborting the whole result.
        JSONObject entry = wordsResult.optJSONObject(i);
        if (entry != null) {
            text.append(entry.optString("words"));
        }
    }
    return text.toString();
}
新建方法 getAuth
/**
 * Requests an OAuth access token from Baidu using the client-credentials grant.
 *
 * @param ak the API Key obtained from the Baidu console
 * @param sk the Secret Key obtained from the Baidu console
 * @return the {@code access_token} field of the JSON response, or {@code null} if the
 *         request or the response parsing failed (the error is printed to stderr)
 */
private static String getAuth(String ak, String sk) {
    // Token endpoint.
    String authHost = "https://aip.baidubce.com/oauth/2.0/token?";
    HttpURLConnection connection = null;
    try {
        String getAccessTokenUrl = authHost
                // 1. grant_type is a fixed parameter
                + "grant_type=client_credentials"
                // 2. API Key; URL-encoded so special characters cannot break the query string
                + "&client_id=" + URLEncoder.encode(ak, "UTF-8")
                // 3. Secret Key
                + "&client_secret=" + URLEncoder.encode(sk, "UTF-8");

        connection = (HttpURLConnection) new URL(getAccessTokenUrl).openConnection();
        connection.setRequestMethod("GET");
        // Bound the wait so an unreachable server cannot hang the caller indefinitely.
        connection.setConnectTimeout(10000);
        connection.setReadTimeout(30000);
        connection.connect();

        // Read the whole response body; the reader is closed even if reading fails,
        // and the charset is explicit instead of the platform default.
        StringBuilder result = new StringBuilder();
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "UTF-8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        }

        JSONObject jsonObject = new JSONObject(result.toString());
        return jsonObject.getString("access_token");
    } catch (Exception e) {
        System.err.println("获取token失败!");
        e.printStackTrace(System.err);
    } finally {
        if (connection != null) {
            connection.disconnect();
        }
    }
    return null;
}
新建方法 splitExistFile
/**
 * Splits the PDF {@code filePath + fileName} into files of at most one page each and
 * runs OCR on every split file.
 *
 * <p>Split files are written next to the source as {@code <baseName>_<n>.pdf}, where
 * {@code n} starts at 1. Each output path and its recognized text are printed to
 * standard output. Failures are printed via {@code printStackTrace}; a failure while
 * writing one split file skips OCR for that file.
 *
 * @param filePath directory prefix of the source PDF; it is concatenated directly with
 *                 {@code fileName}, so it must end with a path separator
 * @param fileName file name of the source PDF, including its extension
 */
private static void splitExistFile(String filePath, String fileName) {
    // Maximum number of pages per split file.
    int filePageSize = 1;
    PdfReader reader = null;
    try {
        // Strip only the last extension so names such as "report.v2.pdf" keep "report.v2".
        int dot = fileName.lastIndexOf('.');
        String orignName = dot > 0 ? fileName.substring(0, dot) : fileName;

        // Set before the reader is created and used.
        PdfReader.unethicalreading = true;
        reader = new PdfReader(filePath + fileName);

        int totalPage = reader.getNumberOfPages();
        // Number of split files, rounding up.
        int splitFileNum = (totalPage + filePageSize - 1) / filePageSize;

        for (int i = 0; i < splitFileNum; i++) {
            String newFileName = filePath + orignName + "_" + (i + 1) + ".pdf";
            System.out.println(newFileName);

            // Derive the page range from the chunk index so that a failure in one chunk
            // cannot shift the pages written into the following chunks.
            int firstPage = i * filePageSize + 1;
            int lastPage = Math.min(totalPage, firstPage + filePageSize - 1);

            Document document = null;
            PdfWriter writer = null;
            boolean written = false;
            try {
                document = new Document();
                writer = PdfWriter.getInstance(document, new FileOutputStream(newFileName));
                document.open();
                PdfContentByte pdfContentByte = writer.getDirectContent();
                for (int page = firstPage; page <= lastPage; page++) {
                    document.newPage();
                    pdfContentByte.addTemplate(writer.getImportedPage(reader, page), 0, 0);
                }
                written = true;
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                // Close order matters: the document first, then the writer.
                if (document != null) {
                    document.close();
                }
                if (writer != null) {
                    writer.close();
                }
            }

            // Split first, then read: OCR the file just written (not the unsplit source),
            // and print the text so the result is not silently discarded.
            if (written) {
                System.out.println(ocrReadPdf(newFileName));
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (reader != null) {
            reader.close();
        }
    }
}
写个 main 方法测试:
/**
 * Demo entry point: hands the configured directory and file name to
 * {@code splitExistFile}. Replace both placeholder values before running.
 *
 * @param args command-line arguments (unused)
 * @throws Exception declared for convenience in this demo
 */
public static void main(String[] args) throws Exception {
    final String fileName = "文件名称";
    final String path = "文件路径";
    splitExistFile(path, fileName);
}
参考文章:
链接: 百度API OCR.
如有侵权,请联系删除!!!