检查PDF中是不是有表格，如果存在表格需要做特殊处理

最新推荐文章于 2025-12-09 14:03:55 发布

原创最新推荐文章于 2025-12-09 14:03:55 发布 · 951 阅读

CC 4.0 BY-SA版权

文章标签：

import cv2
import numpy as np
from pdf2image import convert_from_path
import os
import sys


def _visual_table_detection(img):
    """Detect table-like regions in one page image from its ruling lines.

    The page is binarized, long horizontal and vertical strokes are isolated
    with morphological opening, and the outer contours of their union are
    filtered by area and bounding-box size.

    Args:
        img: Page image as a BGR ``numpy`` array.

    Returns:
        A ``(has_table, regions)`` tuple. ``regions`` is a list of
        ``(x, y, w, h)`` bounding boxes; ``has_table`` is True when the list
        is non-empty. Any failure is printed and reported as ``(False, [])``.
    """
    try:
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Inverted threshold: dark ink becomes white foreground, which is
        # what the morphological opening below operates on.
        _, binary = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY_INV)

        # Opening with a 40x1 / 1x40 kernel keeps only strokes at least
        # 40 px long in that direction, i.e. ruling lines rather than text.
        horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
        horizontal_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, horizontal_kernel)

        vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
        vertical_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, vertical_kernel)

        # Union of both line masks; bitwise_or keeps the mask strictly
        # binary (0/255) instead of the 0/128/255 blend addWeighted gives.
        table_mask = cv2.bitwise_or(horizontal_lines, vertical_lines)

        # findContours returns (contours, hierarchy) on OpenCV 2.x/4.x but
        # (image, contours, hierarchy) on 3.x; [-2] is the contours in both.
        contours = cv2.findContours(
            table_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )[-2]

        table_regions = []
        for contour in contours:
            # Skip small blobs such as header rules or page-number boxes.
            if cv2.contourArea(contour) <= 5000:
                continue
            x, y, w, h = cv2.boundingRect(contour)
            if w > 100 and h > 100:
                table_regions.append((x, y, w, h))

        return len(table_regions) > 0, table_regions

    except Exception as e:
        # Deliberate best effort: one bad page must not abort the document.
        print(f"视觉检测失败: {e}")
        return False, []


def detect_table_in_image_pdf(pdf_path):
    """Detect tables on every page of an image-based (scanned) PDF.

    Each page is rasterized at 200 DPI with pdf2image/Poppler and passed to
    the visual ruling-line detector.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A list with one dict per page, in page order, with keys ``'page'``
        (1-based), ``'visual_detection'`` (bool), ``'table_regions'`` (count)
        and ``'regions_coords'`` (list of ``(x, y, w, h)``). A page whose
        processing raised also carries an ``'error'`` message. An empty list
        is returned when the file is missing or cannot be converted.
    """
    # isfile (not exists) so that a directory path is rejected up front
    # instead of failing later inside the PDF conversion.
    if not os.path.isfile(pdf_path):
        print(f"文件不存在: {pdf_path}")
        return []

    # Local import: only needed to tell "Poppler missing" apart from other
    # conversion failures.
    from pdf2image.exceptions import PDFInfoNotInstalledError

    try:
        print("正在转换PDF为图像...")
        pages = convert_from_path(pdf_path, dpi=200)
        print(f"成功转换 {len(pages)} 页")
    except PDFInfoNotInstalledError as e:
        # Raised by pdf2image when the Poppler binaries cannot be found;
        # only then are the installation hints relevant.
        print(f"Poppler检查失败: {e}")
        print("错误: poppler未正确安装或不在PATH中")
        print("请运行以下命令安装:")
        print("macOS: brew install poppler")
        print("或者: conda install -c conda-forge poppler")
        return []
    except Exception as e:
        # Corrupt, encrypted or otherwise unreadable PDF.
        print(f"PDF转图像失败: {e}")
        return []

    results = []

    for page_num, page in enumerate(pages, start=1):
        try:
            print(f"正在检测第 {page_num} 页...")

            # PIL image (RGB) -> OpenCV array (BGR).
            img = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)

            has_visual_table, regions = _visual_table_detection(img)

            # regions is already empty when nothing was detected.
            results.append({
                'page': page_num,
                'visual_detection': has_visual_table,
                'table_regions': len(regions),
                'regions_coords': regions
            })

            if has_visual_table:
                print(f"页面 {page_num}: 检测到 {len(regions)} 个表格区域")
            else:
                print(f"页面 {page_num}: 未检测到表格")

        except Exception as e:
            print(f"处理第 {page_num} 页时出错: {e}")
            # Still record the page so the result list stays aligned with
            # the page numbering.
            results.append({
                'page': page_num,
                'visual_detection': False,
                'table_regions': 0,
                'regions_coords': [],
                'error': str(e)
            })

    return results


def simple_table_detection(pdf_path):
    """Roughly check whether the first page of a PDF may contain a table.

    The first page is rasterized at 150 DPI, edges are extracted with Canny,
    and line segments found by the probabilistic Hough transform are counted
    by orientation.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        True when more than 5 near-horizontal and more than 3 near-vertical
        segments are found; False otherwise, including on any failure
        (which is printed).
    """
    try:
        # Only the first page is inspected.
        pages = convert_from_path(pdf_path, first_page=1, last_page=1, dpi=150)

        if not pages:
            return False

        gray = cv2.cvtColor(np.array(pages[0]), cv2.COLOR_RGB2GRAY)
        edges = cv2.Canny(gray, 50, 150, apertureSize=3)

        # theta is the angular resolution of the accumulator, not a direction
        # filter, so run the transform once at 1 degree and classify the
        # returned segments by their own angle.
        lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold=100,
                                minLineLength=100, maxLineGap=10)

        h_count = 0
        v_count = 0
        if lines is not None:
            segments = lines.reshape(-1, 4)
            dx = np.abs(segments[:, 2] - segments[:, 0])
            dy = np.abs(segments[:, 3] - segments[:, 1])
            # With absolute deltas the angle lies in [0, 90] degrees:
            # 0 is horizontal, 90 is vertical.
            angles = np.degrees(np.arctan2(dy, dx))
            h_count = int(np.count_nonzero(angles < 10))
            v_count = int(np.count_nonzero(angles > 80))

        print(f"检测到水平线: {h_count}, 垂直线: {v_count}")

        # Enough lines in both directions suggests a table grid.
        return h_count > 5 and v_count > 3

    except Exception as e:
        # Deliberate best effort: this is the fallback path, so report and
        # answer "no table" rather than raise.
        print(f"简化检测失败: {e}")
        return False


# 主程序
if __name__ == "__main__":
    # The PDF path may be given as the first command-line argument; without
    # one, the original sample file name is used.
    pdf_file = sys.argv[1] if len(sys.argv) > 1 else "wanzheng-15-17.pdf"

    print("开始检测图像型PDF中的表格...")

    # Method 1: full per-page detection.
    try:
        results = detect_table_in_image_pdf(pdf_file)

        if results:
            # Number of pages (not regions) on which a table was detected.
            total_tables = sum(1 for r in results if r.get('visual_detection', False))
            print("\n=== 检测结果 ===")
            print(f"总共检测到 {total_tables} 页包含表格")

            for result in results:
                page_num = result['page']
                has_table = result.get('visual_detection', False)
                region_count = result.get('table_regions', 0)

                if has_table:
                    print(f"页面 {page_num}: ✅ 包含表格 ({region_count} 个区域)")
                else:
                    print(f"页面 {page_num}: ❌ 未发现表格")
        else:
            # An empty list means the file was missing or conversion failed.
            print("检测失败，尝试简化方法...")

            # Method 2: simplified first-page check as a fallback.
            has_table = simple_table_detection(pdf_file)
            if has_table:
                print("简化检测: ✅ 可能包含表格")
            else:
                print("简化检测: ❌ 未发现表格")

    except Exception as e:
        # Top-level boundary: report the error with a full traceback.
        print(f"程序执行出错: {e}")
        import traceback

        traceback.print_exc()

下面把这段脚本按“做什么—怎么做—为什么这样做—可能踩坑与改进”的顺序解释清楚。你可以直接把它当成“图像型 PDF 表格探测”的入门范例来看。

⸻

这段代码总体在做什么
• 把 PDF 页转换为位图（用 pdf2image + Poppler）。
• 对每页位图做纯视觉表格检测（OpenCV）：
• 二值化 → 形态学提取水平线与垂直线 → 合并 → 找外接轮廓区域，判定是否存在表格。
• 返回每一页是否检测到表格，以及表格的候选矩形区域坐标。
• 如果完整流程失败或无结果，退而求其次用一个“简化版”的方法：Canny 边缘 + Hough 直线统计。

⸻

逐段拆解

1）入口函数：detect_table_in_image_pdf(pdf_path)

职责：整本 PDF 分页检测，每页给出结果字典。

关键步骤：
1. 文件存在性检查

if not os.path.exists(pdf_path): …

2.	内嵌函数 visual_table_detection(img)：纯视觉检测
•	转灰度：

gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

•	全局阈值二值化（反白）：

_, binary = cv2.threshold(gray, 128, 255, cv2.THRESH_BINARY_INV)

反白的目的是把“线条/文字”变成白色（255），背景变黑（0），利于后续形态学。

•	形态学“开运算”提线：

horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
horizontal_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, horizontal_kernel)

vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))
vertical_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, vertical_kernel)

(40,1)/(1,40) 让横/竖的长细结构被保留，杂点被去掉。

•	合并横竖线得到“表格掩膜”：

table_mask = cv2.addWeighted(horizontal_lines, 0.5, vertical_lines, 0.5, 0.0)

这一步把横线和竖线叠加到同一张掩膜上：只属于横线或竖线的像素值约为 128，交叉点为 255。由于 findContours 把所有非零像素都当作前景，效果上等价于“横 OR 竖”；更直接、语义更清晰的写法是 cv2.bitwise_or。

•	找外部轮廓并筛选：

contours, _ = cv2.findContours(table_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
for contour in contours:
area = cv2.contourArea(contour)
if area > 5000:
x, y, w, h = cv2.boundingRect(contour)
if w > 100 and h > 100:
table_regions.append((x, y, w, h))

通过面积与宽高阈值过滤掉噪声小块，保留“像表格”的大块网格结构。
返回 (has_table, table_regions)。

3.	内嵌函数 check_poppler()：Poppler 可用性检查
•	试图仅转换第 1 页（72 DPI）：

convert_from_path(pdf_path, first_page=1, last_page=1, dpi=72)

能成功则认为 Poppler 环境 OK，否则提示安装（mac brew install poppler 或 conda）。

4.	将 PDF 全部页转成图像

pages = convert_from_path(pdf_path, dpi=200)

•	200 DPI：质量和速度的折中。

5.	逐页检测
•	page（PIL.Image）→ np.array(page) → 转 BGR（给 OpenCV）：

img = cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)

•	调用 visual_table_detection(img) 得到是否有表格与候选区域：

has_visual_table, regions = visual_table_detection(img)

•	组装结果字典追加到 results。出现异常也会给该页写入一个“失败结果”，保证输出页数与输入一致（便于上层流水线不出错）。

6.	返回每页结果的列表

⸻

2）退路函数：simple_table_detection(pdf_path)

职责：只看第一页，用更粗糙的方式判断“可能有表格”。

流程：
• 150 DPI 转第一页 → 灰度 → Canny 边缘。
• 两次 HoughLinesP：
• “水平线”：

horizontal_lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=100,
minLineLength=100, maxLineGap=10)

•	“垂直线”（代码里把 theta 设置为 np.pi/2）：

vertical_lines = cv2.HoughLinesP(edges, 1, np.pi/2, threshold=100,
minLineLength=100, maxLineGap=10)

•	分别统计数量，经验阈值 h_count > 5 and v_count > 3 则认为“可能有表格”。

注意：HoughLinesP 的 theta 参数（第三个位置参数）不是“只检测某个方向的线”，它是累加器的角度分辨率（通常都用 np.pi/180），真正区分水平/垂直应在拿到线段后按斜率/角度筛选。这个我在“改进建议”里给出修正写法。

⸻

3）main 主程序
• 指定 pdf_file → 先跑“完整检测”。
• 如果 results 为空（失败），再跑“简化检测”作为兜底。

⸻

这些参数为什么这样选
• DPI=200：表格线在 200 DPI 下宽度与长度都足够明显，计算量适中。
• 阈值二值化 + 反白：在扫描件里，线一般更深，反白后线为白色，形态学“开运算”更容易抓到线性结构。
• 形态学核大小 (40)：经验值，适合 A4 在 150–300 DPI 的常见表格；核越长，越偏向“细长结构”，越能抑制噪点。
• 轮廓面积 5000 / 宽高>100：避免把页眉短线、页码边框等小块误识别为表格。

⸻

常见坑与如何改得更稳

1）合并横竖线后最好“再二值化”

当前是：

table_mask = cv2.addWeighted(horizontal_lines, 0.5, vertical_lines, 0.5, 0.0)

这会得到灰阶结果（只有 0、128、255 三个层级：单独的横线或竖线约为 128，交叉点为 255）。findContours 把非零像素都视为前景，所以通常能用，但更稳妥的做法是直接 OR 得到纯二值掩膜，并阈值一下作为保险：

table_mask = cv2.bitwise_or(horizontal_lines, vertical_lines)
_, table_mask = cv2.threshold(table_mask, 0, 255, cv2.THRESH_BINARY)

2）阈值选择：固定 128 容易翻车

光照、底色、扫描强度不同会导致失败。两种替代：

Otsu 自动阈值（全局）

_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

或自适应

binary = cv2.adaptiveThreshold(gray, 255,
cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY_INV, 31, 5)

3）纵横核大小自适应图像宽高

固定 40 在超高/超低 DPI 时不合适。可按宽高的比例设：

h_len = max(20, img.shape[1] // 50) # 宽度的 2%
v_len = max(20, img.shape[0] // 50) # 高度的 2%
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (h_len, 1))
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, v_len))

4）RETR_EXTERNAL 可能漏掉嵌套表/子表

如果你想保留所有层级，用：

contours, hierarchy = cv2.findContours(table_mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

然后按层级/面积再筛。

5）简化版里的 HoughLinesP 用法不对

theta=np.pi/2 会把角分辨率设成 90°，累加器里只剩 0° 和 90° 两个方向：它仍会同时返回水平和垂直线段，并不能只挑出垂直线；而第一次调用使用 np.pi/180，返回的是所有方向的线段，也不是“只有水平线”。因此两个计数都不可靠。应该统一用 np.pi/180，再按角度分类：

lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=100,
minLineLength=100, maxLineGap=10)
h_count = v_count = 0
if lines is not None:
for (x1,y1,x2,y2) in lines[:,0,:]:
ang = abs(np.degrees(np.arctan2(y2-y1, x2-x1)))
if ang < 10 or ang > 170: # 近水平
h_count += 1
elif 80 < ang < 100: # 近垂直
v_count += 1

6）Poppler 环境在 Windows 上的特殊性

Windows 不走 brew/conda，通常要：
• 安装 Poppler（e.g. 下载 poppler-xx\bin），
• 把 bin 加进 PATH，或在 convert_from_path(…, poppler_path=r"C:\path\to\poppler\bin") 显式指定（注意使用原始字符串，避免反斜杠被当作转义）。

7）速度与内存
• 大 PDF：可以先缩放到较低分辨率（比如 150 DPI）“探测”，检测到表格再对该页用 300 DPI 精抽。
• 多进程：分页独立，易于 multiprocessing 并行。

⸻

你可以立刻抄用的小修正片段

（A）更稳的视觉检测（替换 visual_table_detection 的中间几行）

gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

更鲁棒的阈值方法（二选一）

_, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
cv2.THRESH_BINARY_INV, 31, 5)

自适应核

h_len = max(20, img.shape[1] // 50)
v_len = max(20, img.shape[0] // 50)
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (h_len, 1))
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, v_len))

horizontal_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, horizontal_kernel, iterations=1)
vertical_lines = cv2.morphologyEx(binary, cv2.MORPH_OPEN, vertical_kernel, iterations=1)

逻辑合并并二值化

table_mask = cv2.bitwise_or(horizontal_lines, vertical_lines)

可选：膨胀一点把网格连成块，便于框出整张表

table_mask = cv2.dilate(table_mask, cv2.getStructuringElement(cv2.MORPH_RECT, (3,3)), iterations=1)

（B）修正简化版 Hough 方向统计

lines = cv2.HoughLinesP(edges, 1, np.pi/180, threshold=100,
minLineLength=100, maxLineGap=10)
h_count = v_count = 0
if lines is not None:
for x1, y1, x2, y2 in lines[:,0,:]:
ang = abs(np.degrees(np.arctan2(y2 - y1, x2 - x1)))
if ang < 10 or ang > 170:
h_count += 1
elif 80 < ang < 100:
v_count += 1

⸻

返回结果怎么用

detect_table_in_image_pdf 的单页结果形如：

{
'page': 3,
'visual_detection': True,
'table_regions': 2,
'regions_coords': [(x,y,w,h), (x,y,w,h)]
}

•	你可以把 regions_coords 用 cv2.rectangle 画出来做可视化，
•	或在这些区域内做更精细的 OCR/结构化（如表格结构还原）。

⸻

小结
• 这段代码采用“形态学提线 → 轮廓筛选”的经典套路，简单、可解释、易调参。
• 两个最关键的点：二值化策略和核大小/阈值的自适配。把这两个做好，鲁棒性会明显提升。
• “简化版”里的 Hough 使用要按角度再分类，不要把 theta 改成 np.pi/2。