# 下面的代码可以直接使用:已经写好了处理单个文件的方法(OCR)和处理一个文件夹中所有文件的方法(OCR_all_file),直接调用即可。OCR_all_file 内部做了并行处理,在处理多个文件时速度比较快。
# 有问题的话请联系 2200012186@stu.pku.edu.cn
import requests
import json
import time
import zipfile
import os
import traceback
import concurrent.futures
API_KEY = "Bearer sk-YOUR_API_KEY"
project_root = YOUR_PROJECT_ROOT
# Replace YOUR_API_KEY with your own API key.
# Replace YOUR_PROJECT_ROOT with the absolute path of your project's root directory.
# Your project needs an "output" folder; exported files are stored there by default.
# To process a single file, call OCR; to process files in bulk, call OCR_all_file.
def get_all_file_paths(folder_path):
    """Recursively collect the paths of all non-hidden files under a folder.

    Only the file name is checked for a leading dot; files inside
    dot-prefixed directories are still returned.

    Args:
        folder_path: Directory to traverse with ``os.walk``.

    Returns:
        A non-empty list of paths (``os.path.join(root, filename)``), or the
        integer ``0`` (after printing a warning) when the folder does not
        exist or contains no eligible files. The ``0`` sentinel is kept for
        callers that compare the result against it.
    """
    file_paths = []
    for root, _directories, files in os.walk(folder_path):
        file_paths.extend(
            os.path.join(root, filename)
            for filename in files
            if not filename.startswith(".")
        )
    if not file_paths:
        print("warning : 文件夹路径不存在或文件夹内没有文件")
        return 0
    return file_paths
def _poll_doc2x_task(url, headers, max_polls, polling_interval, timeout):
    """Poll a doc2x status endpoint until it reports success or failure.

    Returns a ``(state, payload)`` tuple where ``state`` is ``"success"``,
    ``"failed"`` or ``"timeout"`` and ``payload`` is the last decoded JSON
    body (``None`` on timeout).
    """
    for _ in range(max_polls):
        payload = requests.get(url, headers=headers, timeout=timeout).json()
        status = payload["data"]["status"]
        if status in ("success", "failed"):
            return status, payload
        # Still processing: wait before asking again.
        time.sleep(polling_interval)
    return "timeout", None


def OCR(file_pathway):
    """Run doc2x OCR on one file and unpack the Markdown export.

    The file is uploaded to the doc2x v2 API, the parse task is polled until
    it finishes, a Markdown export is requested and polled, and the resulting
    ZIP is downloaded to ``{project_root}/output/{name}.zip``, extracted to
    ``{project_root}/output/{name}/`` and then deleted. ``name`` is the input
    file's base name without its extension.

    Args:
        file_pathway: Path of the file to upload.

    Returns:
        ``None`` if the upload response carries no task uid, ``1`` if the
        parse/export task times out or reports ``failed``, otherwise ``0``
        (including when an unexpected exception was caught and printed).
    """
    # splitext handles any extension length (the old [:-4] slice assumed 3 chars).
    file_name = os.path.splitext(os.path.basename(file_pathway))[0]
    polling_interval = 1  # seconds between status polls
    max_polls = 50  # upper bound on polls, avoids an endless loop
    request_timeout = 120  # seconds; keeps a stalled connection from hanging the worker
    base_url = "https://v2.doc2x.noedgeai.com/api/v2"
    auth_headers = {"Authorization": API_KEY}
    try:
        # POST /api/v2/parse/pdf -- direct upload; the file is only needed here.
        with open(file_pathway, "rb") as file:
            response = requests.post(
                f"{base_url}/parse/pdf",
                headers=auth_headers,
                data=file,
                timeout=request_timeout,
            )
        try:
            uid = response.json()["data"]["uid"]
        except (ValueError, KeyError, TypeError):
            error_info = traceback.format_exc()
            print(f"{file_name}OCR识别失败:\n{error_info}")
            # Print the raw body: decoding it as JSON again could raise a second time.
            print(response.text)
            return

        # GET /api/v2/parse/status -- wait for the asynchronous parse to finish.
        state, payload = _poll_doc2x_task(
            f"{base_url}/parse/status?uid={uid}",
            auth_headers,
            max_polls,
            polling_interval,
            request_timeout,
        )
        if state == "timeout":
            print("文件上传超时")
            return 1
        if state == "failed":
            # Stop here: a failed parse cannot be exported.
            print(f"{file_name}OCR识别失败:\n{payload.get('code')}")
            return 1

        # POST /api/v2/convert/parse -- request the Markdown export (asynchronous).
        export_request = {
            "uid": str(uid),
            "to": "md",
            "formula_mode": "normal",
        }
        requests.post(
            f"{base_url}/convert/parse",
            headers={**auth_headers, "Content-Type": "application/json"},
            data=json.dumps(export_request),
            timeout=request_timeout,
        )

        # GET /api/v2/convert/parse/result -- wait for the export to be ready.
        state, payload = _poll_doc2x_task(
            f"{base_url}/convert/parse/result?uid={uid}",
            auth_headers,
            max_polls,
            polling_interval,
            request_timeout,
        )
        if state == "timeout":
            print("文件导出超时")
            return 1
        if state == "failed":
            print(f"{file_name}OCR识别失败:\n{payload.get('code')}")
            return 1

        download = requests.get(payload["data"]["url"], timeout=request_timeout)
        # Fail loudly on an HTTP error instead of saving an error page as a ZIP.
        download.raise_for_status()

        output_dir = os.path.join(project_root, "output")
        os.makedirs(output_dir, exist_ok=True)
        zip_file_path = os.path.join(output_dir, f"{file_name}.zip")
        extract_dir = os.path.join(output_dir, file_name)
        with open(zip_file_path, "wb") as f:
            f.write(download.content)
        with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
            zip_ref.extractall(extract_dir)
        # The archive is only an intermediate artifact; keep the extracted folder.
        os.remove(zip_file_path)
        print(f"{file_name}OCR识别已成功")
    except Exception:
        # Worker boundary: report the full traceback rather than killing the batch.
        error_info = traceback.format_exc()
        print(f"{file_name}OCR识别失败:\n{error_info}")
    return 0
def OCR_all_file(document_pathway):
    """Run OCR on every file found under a folder, using a thread pool.

    WARNING: outputs are named after each file's base name, so files that
    share a name (even in different sub-folders) will overwrite each other
    in an unpredictable order.

    Args:
        document_pathway: Folder whose files are collected with
            ``get_all_file_paths`` and passed to ``OCR``.

    Returns:
        ``None`` when no files were found, otherwise ``0`` once all submitted
        tasks have finished.
    """
    file_paths = get_all_file_paths(f"{document_pathway}")
    if not file_paths:
        return
    # Do not use more than 5 worker threads: more would trigger the API's call limit.
    # The with-block waits for all tasks and shuts the pool down even if
    # submitting a task raises.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as OCR_executor:
        for file_path in file_paths:
            OCR_executor.submit(OCR, file_path)
    return 0