方案一:
pdf2htmlEX
package com.realize.controller;
import cn.hutool.http.HttpUtil;
import com.alibaba.fastjson2.JSONObject;
import com.realize.util.MsgUtil;
import com.realize.util.OssUtil;
import com.realize.util.PdfConvertUtil;
import com.realize.util.StreamGobbler;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.ModelAttribute;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RestController;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
@RestController
@Slf4j
public class ParserController {
/**
 * Handles GET requests to {@code /test}.
 *
 * @return the fixed string {@code "test"}
 */
@GetMapping("/test")
public String test() {
    final String reply = "test";
    return reply;
}
// @PostMapping("/parseHtml")
// public JSONObject parseHtml(@ModelAttribute("htmlUrl") String htmlUrl) {
// try (Playwright playwright = Playwright.create()) {
// Browser browser = playwright.chromium().launch(new BrowserType.LaunchOptions().setHeadless(true));
// Page page = browser.newPage();
// String filePath = "/mnt/temp/html/" + RandomUtil.randomString(10) + ".html";
// NOTE(review): developer-local variant of the filePath declared on the previous line. It was the only
// live statement inside the disabled parseHtml endpoint, which made it an instance field of the class
// and referenced RandomUtil, a name that is not among this file's imports. Commented out to match.
// String filePath = "/Users/sunyechen/IdeaProjects/realize-nacos/bin/" + RandomUtil.randomString(10) + ".html";
// HttpUtil.downloadFile(htmlUrl, filePath);
// page.navigate("file:" + filePath);
// page.evaluate("var imgList=document.getElementsByTagName('img');" +
// "for(var i=0;i<imgList.length;i++){" +
// "var src=imgList[i].getAttribute('src');" +
// "imgList[i].setAttribute('src','https://realizedongmi.oss-cn-shanghai.aliyuncs.com/a-filings/test/'+src);" +
// "}");
// JSONObject result = new JSONObject();
// result.put("html", page.innerHTML("css=body"));
// result.put("css", page.innerHTML("css=style"));
// result.put("txt", page.innerText("css=body").trim().replaceAll("\n", ""));
// page.close();
// browser.close();
// return result;
// } catch (Exception e) {
// e.printStackTrace();
// }
// return null;
// }
@GetMapping("/batchConvertPdf")
public String batchConvertPdf() {
String folderName = "/root/pdf";
File folder = new File(folderName);
File[] files = folder.listFiles();
for (int i = 0; i < files.length; i++) {
String fileName