doc 转 docx(0.3.1 版本不支持上传 doc 文件)
import win32com.client
import os
def convert_doc_to_docx(doc_path, output_path):
    """Convert a legacy Word ``.doc`` file to ``.docx`` via Word COM automation.

    Requires Windows with Microsoft Word installed, because the conversion is
    performed by a hidden ``Word.Application`` instance.

    Args:
        doc_path: Path of the source ``.doc`` file. Relative paths are
            resolved against the current working directory of this process.
        output_path: Destination path. ``.docx`` is appended when the path
            does not already end with it (compared case-insensitively).

    Returns:
        None. The outcome is reported on stdout; conversion errors are
        printed instead of being raised.
    """
    # Make sure the output path carries a .docx suffix ("X.DOCX" counts too).
    if not output_path.lower().endswith(".docx"):
        output_path += ".docx"

    # Bail out early when there is nothing to convert.
    if not os.path.exists(doc_path):
        print(f"文件未找到: {doc_path}")
        return

    # Word runs in its own process, so relative paths would not be resolved
    # against this script's working directory. Hand it absolute paths.
    source = os.path.abspath(doc_path)
    target = os.path.abspath(output_path)

    try:
        # Start Word without showing its window.
        word = win32com.client.Dispatch("Word.Application")
        try:
            word.Visible = False
            doc = word.Documents.Open(source)
            try:
                # FileFormat=16 selects the DOCX format.
                doc.SaveAs(target, FileFormat=16)
            finally:
                # Close the document even when saving failed.
                doc.Close()
        finally:
            # Always shut Word down; otherwise a failed conversion leaves a
            # hidden Word process running in the background.
            word.Quit()
        print(f"文件已成功转换并保存为: {output_path}")
    except Exception as e:
        # Best-effort script: report the problem instead of propagating it.
        print(f"发生错误: {e}")
# Convert the sample file winding.doc to winding.docx.
convert_doc_to_docx(doc_path="winding.doc", output_path="winding.docx")
docx、pdf 转 md 格式(pdf 文件分别以 pdf、csv、md 三种格式进行了测试,md 格式的效果远好于另外两种)
import os
import fitz
import cv2
import numpy as np
import tqdm
from PIL import Image
from typing import List
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from chatchat.server.file_rag.document_loaders.ocr import get_ocr
from docx import Document
import win32com.client # 用于处理 doc 文件
from io import BytesIO
class RapidOCRDocxDocLoader(UnstructuredFileLoader):
    """Loader that converts a PDF or DOCX file into a Markdown file.

    Text is extracted directly from the document; embedded images are run
    through the OCR engine returned by ``get_ocr()`` and their recognized
    text is added as Markdown block quotes. The Markdown is written next to
    the source file, using the same path with a ``.md`` extension.
    """

    def _get_elements(self) -> List:
        """Convert ``self.file_path`` to Markdown and write it to disk.

        ``.pdf`` and ``.docx`` files are supported (the extension is matched
        case-insensitively); any other extension only prints a message.

        Returns:
            An empty list. The conversion result is the Markdown file, not a
            list of elements; returning a list keeps the annotated return
            type honest for callers that iterate over the result.
            NOTE(review): presumably the base loader's ``load()`` iterates
            this value -- confirm against the installed langchain version.
        """

        def rotate_img(img, angle):
            """Rotate ``img`` by ``angle`` degrees around its center.

            The output canvas is enlarged to the rotated bounding box so no
            part of the image is cropped.
            """
            h, w = img.shape[:2]
            rotate_center = (w / 2, h / 2)
            M = cv2.getRotationMatrix2D(rotate_center, angle, 1.0)
            # Size of the bounding box of the rotated image.
            new_w = int(h * abs(M[0, 1]) + w * abs(M[0, 0]))
            new_h = int(h * abs(M[0, 0]) + w * abs(M[0, 1]))
            # Shift the image so it stays centered on the larger canvas.
            M[0, 2] += (new_w - w) / 2
            M[1, 2] += (new_h - h) / 2
            return cv2.warpAffine(img, M, (new_w, new_h))

        def ocr_from_image(image_bytes, ocr=None):
            """Run OCR on encoded image bytes and return the engine's result.

            ``ocr`` lets the caller reuse one engine for many images; when it
            is omitted a new engine is obtained from ``get_ocr()``.
            """
            img_array = np.array(Image.open(BytesIO(image_bytes)))
            if ocr is None:
                ocr = get_ocr()
            result, _ = ocr(img_array)
            return result

        def docx2md(filepath, output_md):
            """Convert a DOCX file to Markdown, appending OCR text of images."""
            doc = Document(filepath)
            # Paragraph text first, one Markdown line per paragraph.
            md_content = [para.text.strip() for para in doc.paragraphs]

            # OCR every image part referenced by the document. The engine is
            # created lazily and shared, so documents without images never
            # pay for it and documents with many images build it only once.
            ocr = None
            for rel in doc.part.rels.values():
                if "image" not in rel.target_ref.lower():
                    continue
                if ocr is None:
                    ocr = get_ocr()
                ocr_result = ocr_from_image(rel.target_part.blob, ocr)
                if ocr_result:
                    # Index 1 of each OCR entry is used as the recognized text.
                    md_content.append(
                        "\n> OCR: " + "\n> ".join(line[1] for line in ocr_result)
                    )

            with open(output_md, "w", encoding="utf-8") as f:
                f.write("\n".join(md_content))
            print(f"Markdown 文件已保存至: {output_md}")

        def pdf2md(filepath, output_md):
            """Convert a PDF file to Markdown, one section per page."""
            ocr = get_ocr()
            md_content = []

            # Context managers make sure the PDF handle and the progress bar
            # are released even when a page fails to process.
            with fitz.open(filepath) as doc, tqdm.tqdm(
                total=doc.page_count, desc="Processing PDF pages"
            ) as b_unit:
                for i, page in enumerate(doc):
                    b_unit.set_description(f"Processing page {i + 1}")
                    b_unit.refresh()

                    # Page heading.
                    md_content.append(f"## Page {i + 1}\n")

                    # Plain text of the page.
                    text = page.get_text("text")
                    if text.strip():
                        md_content.append(text.strip())

                    # OCR text of every image placed on the page.
                    for img in page.get_image_info(xrefs=True):
                        if xref := img.get("xref"):
                            pix = fitz.Pixmap(doc, xref)
                            img_array = np.frombuffer(
                                pix.samples, dtype=np.uint8
                            ).reshape(pix.height, pix.width, -1)
                            if int(page.rotation) != 0:
                                # Undo the page rotation before running OCR.
                                ori_img = cv2.cvtColor(
                                    np.array(Image.fromarray(img_array)),
                                    cv2.COLOR_RGB2BGR,
                                )
                                img_array = rotate_img(
                                    ori_img, angle=360 - page.rotation
                                )
                            result, _ = ocr(img_array)
                            if result:
                                md_content.append(
                                    "\n".join(f"> OCR: {line[1]}" for line in result)
                                )

                    # Separator between pages.
                    md_content.append("\n---\n")
                    b_unit.update(1)

            with open(output_md, "w", encoding="utf-8") as f:
                f.write("\n".join(md_content))
            print(f"Markdown 文件已保存至: {output_md}")

        # Derive the output path from the path's stem. A textual
        # str.replace() of the lower-cased extension would leave "X.PDF"
        # unchanged (so the source file would be overwritten with Markdown)
        # and would also rewrite any earlier occurrence of the extension text.
        file_path = os.fspath(self.file_path)
        stem, raw_extension = os.path.splitext(file_path)
        file_extension = raw_extension.lower()
        output_md = stem + ".md"

        if file_extension == ".pdf":
            pdf2md(file_path, output_md)
        elif file_extension == ".docx":
            docx2md(file_path, output_md)
        else:
            print(f"不支持的文件类型: {file_extension}")

        return []
if __name__ == "__main__":
    # Script entry point: build a loader for the sample document and run the
    # conversion directly by calling _get_elements().
    loader = RapidOCRDocxDocLoader(file_path="winding.docx")
    loader._get_elements()