langchain 0.3.1 版本中，将知识库的 doc、docx、pdf 文件转换为 md 格式（实测 md 格式效果最佳）

本文链接：https://blog.youkuaiyun.com/m0_70954884/article/details/146175892

doc 转 docx：0.3.1 版本不支持上传 doc 文件，需先转换为 docx

import win32com.client
import os


def convert_doc_to_docx(doc_path, output_path):
    """Convert a legacy ``.doc`` file to ``.docx`` through Word COM automation.

    Args:
        doc_path: Path of the source ``.doc`` file.
        output_path: Destination path. ``.docx`` is appended when the path
            does not already end with it (compared case-insensitively).

    Returns:
        None. Progress and failures are reported with ``print``; a missing
        source file or any error raised during conversion is reported and
        the function returns normally.
    """
    # Case-insensitive so that "X.DOCX" is not turned into "X.DOCX.docx".
    if not output_path.lower().endswith(".docx"):
        output_path += ".docx"

    if not os.path.exists(doc_path):
        print(f"文件未找到: {doc_path}")
        return

    # Word runs as a separate process, so a relative path would not be
    # resolved against this script's working directory. Hand it absolute
    # paths, but keep the caller's spelling for the success message.
    abs_doc_path = os.path.abspath(doc_path)
    abs_output_path = os.path.abspath(output_path)

    try:
        word = win32com.client.Dispatch("Word.Application")
        try:
            word.Visible = False  # keep the Word window hidden
            doc = word.Documents.Open(abs_doc_path)
            try:
                # 16 is the FileFormat value the original code used for DOCX.
                doc.SaveAs(abs_output_path, FileFormat=16)
            finally:
                # Always release the document, without prompting to save.
                doc.Close(False)
        finally:
            # Always shut Word down, otherwise a failed conversion leaves a
            # hidden Word process running.
            word.Quit()

        print(f"文件已成功转换并保存为: {output_path}")
    except Exception as e:
        print(f"发生错误: {e}")

# Example run: convert the sample "winding.doc" into "winding.docx".
convert_doc_to_docx(doc_path="winding.doc", output_path="winding.docx")

docx、pdf 转 md 格式（同一份 pdf 文档分别以 pdf、csv、md 三种格式测试，md 格式效果远好于另外两种）

import os
import fitz
import cv2
import numpy as np
import tqdm
from PIL import Image
from typing import List
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
from chatchat.server.file_rag.document_loaders.ocr import get_ocr
from docx import Document
import win32com.client  # 用于处理 doc 文件
from io import BytesIO


class RapidOCRDocxDocLoader(UnstructuredFileLoader):
    """Convert a PDF or DOCX file to Markdown, adding OCR text for embedded images."""

    def _get_elements(self) -> List:
        """Write a Markdown rendering of ``self.file_path`` next to the source file.

        ``.pdf`` and ``.docx`` inputs are converted; any other extension is
        reported with ``print`` and skipped. The output path is the input
        path with its extension swapped for ``.md``.

        Returns:
            An empty list, so the annotated ``List`` return type holds. The
            Markdown file written to disk is the actual product of the call.
        """

        def rotate_img(img, angle):
            """Rotate ``img`` by ``angle`` degrees on a canvas enlarged to fit the result."""
            h, w = img.shape[:2]
            rotate_center = (w / 2, h / 2)
            M = cv2.getRotationMatrix2D(rotate_center, angle, 1.0)
            # Size of the bounding box that contains the rotated image.
            new_w = int(h * abs(M[0, 1]) + w * abs(M[0, 0]))
            new_h = int(h * abs(M[0, 0]) + w * abs(M[0, 1]))
            # Shift the translation so the content stays centred on the new canvas.
            M[0, 2] += (new_w - w) / 2
            M[1, 2] += (new_h - h) / 2
            return cv2.warpAffine(img, M, (new_w, new_h))

        def ocr_from_image(image_bytes, ocr=None):
            """Run OCR on encoded image bytes.

            Returns the OCR result, or ``None`` when Pillow cannot decode the
            bytes. ``ocr`` lets the caller reuse one engine across images.
            """
            try:
                with Image.open(BytesIO(image_bytes)) as img:
                    # Normalise palette/alpha/CMYK modes to plain RGB pixels so
                    # the OCR engine never receives palette indices.
                    img_array = np.array(img.convert("RGB"))
            except OSError:
                # Pillow signals images it cannot identify or decode with
                # OSError subclasses; skip such an image instead of aborting
                # the whole document.
                return None
            if ocr is None:
                ocr = get_ocr()
            result, _ = ocr(img_array)
            return result

        def docx2md(filepath, output_md):
            """Convert a DOCX file to Markdown, appending OCR text of embedded images."""
            doc = Document(filepath)
            # One entry per paragraph (empty paragraphs are kept as blank lines).
            md_content = [para.text.strip() for para in doc.paragraphs]

            ocr = None  # created on first use, then shared by every image
            for rel in doc.part.rels.values():
                # Linked (external) images have no embedded part to read from.
                if rel.is_external or "image" not in rel.target_ref.lower():
                    continue
                if ocr is None:
                    ocr = get_ocr()
                ocr_result = ocr_from_image(rel.target_part.blob, ocr)
                if ocr_result:
                    md_content.append("\n> OCR: " + "\n> ".join([line[1] for line in ocr_result]))

            with open(output_md, "w", encoding="utf-8") as f:
                f.write("\n".join(md_content))

            print(f"Markdown 文件已保存至: {output_md}")

        def pdf2md(filepath, output_md):
            """Convert a PDF file to Markdown: page text plus OCR text of page images."""
            ocr = get_ocr()
            md_content = []  # Markdown fragments, joined with newlines at the end

            # Context managers close the PDF handle and the progress bar even
            # when a page fails to process.
            with fitz.open(filepath) as doc, tqdm.tqdm(
                total=doc.page_count, desc="Processing PDF pages"
            ) as b_unit:
                for i, page in enumerate(doc):
                    b_unit.set_description(f"Processing page {i + 1}")
                    b_unit.refresh()

                    # Page heading.
                    md_content.append(f"## Page {i + 1}\n")

                    # Plain text layer of the page.
                    text = page.get_text("text")
                    if text.strip():
                        md_content.append(text.strip())

                    # OCR every image referenced by the page.
                    img_list = page.get_image_info(xrefs=True)
                    for img in img_list:
                        if xref := img.get("xref"):
                            pix = fitz.Pixmap(doc, xref)
                            img_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                                pix.height, pix.width, -1
                            )

                            if int(page.rotation) != 0:
                                # Undo the page rotation before OCR.
                                ori_img = cv2.cvtColor(
                                    np.array(Image.fromarray(img_array)), cv2.COLOR_RGB2BGR
                                )
                                img_array = rotate_img(ori_img, angle=360 - page.rotation)

                            result, _ = ocr(img_array)
                            if result:
                                ocr_text = "\n".join([f"> OCR: {line[1]}" for line in result])
                                md_content.append(ocr_text)

                    # Separator between pages.
                    md_content.append("\n---\n")

                    b_unit.update(1)

            with open(output_md, "w", encoding="utf-8") as f:
                f.write("\n".join(md_content))

            print(f"Markdown 文件已保存至: {output_md}")

        # Pick the converter from the (case-insensitive) file extension.
        base_path, raw_extension = os.path.splitext(self.file_path)
        file_extension = raw_extension.lower()
        # Swap only the trailing extension. A str.replace with the lowercased
        # extension leaves "A.PDF" unchanged, which would make the Markdown
        # overwrite the source file, and it also rewrites any earlier
        # occurrence of the extension text inside the path.
        output_md = base_path + ".md"

        if file_extension == ".pdf":
            pdf2md(self.file_path, output_md)
        elif file_extension == ".docx":
            docx2md(self.file_path, output_md)
        else:
            print(f"不支持的文件类型: {file_extension}")

        return []

if __name__ == "__main__":
    # Script entry point: build the loader for the sample file and run the
    # conversion directly.
    RapidOCRDocxDocLoader(file_path="winding.docx")._get_elements()