import pdfplumber
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.platypus import Paragraph
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
import os
def register_chinese_font():
"""自动处理不同系统的中文字体路径"""
font_name = 'SimSun'
# 尝试不同系统的默认字体路径
font_paths = [
# Windows
r'C:\Windows\Fonts\simsun.ttc',
# macOS
'/Library/Fonts/SimSun.ttf',
# Linux
'/usr/share/fonts/truetype/simsun.ttf',
# 当前目录
os.path.join(os.path.dirname(__file__), 'fonts', 'SimSun.ttf')
]
for path in font_paths:
if os.path.exists(path):
try:
# 处理 TTC 格式
if path.endswith('.ttc'):
pdfmetrics.registerFont(TTFont(font_name, path, index=0))
else:
pdfmetrics.registerFont(TTFont(font_name, path))
print(f"成功注册字体: {path}")
return True
except Exception as e:
print(f"字体注册失败: {e}")
print("错误: 未找到SimSun字体文件")
# 回退到默认字体
pdfmetrics.registerFont(TTFont('Fallback', 'Helvetica'))
return False
def save_text_to_pdf(text, output_path):
# 注册中文字体并获取字体名称
font_name = register_chinese_font()
# 创建PDF文档
c = canvas.Canvas(output_path, pagesize=letter)
width, height = letter
# 设置样式
styles = getSampleStyleSheet()
chinese_style = ParagraphStyle(
'ChineseStyle',
parent=styles['Normal'],
fontName=font_name,
fontSize=10,
leading=14,
spaceAfter=6
)
# 将文本拆分为段落
paragraphs = text.split('\n\n')
# 添加内容到PDF
y_position = height - 40
for para in paragraphs:
if not para.strip():
continue
p = Paragraph(para, chinese_style)
w, h = p.wrap(width - 80, height)
if y_position - h < 40:
c.showPage()
y_position = height - 40
p.drawOn(c, 40, y_position - h)
y_position -= (h + 10)
c.save()
def extract_text_exclude_tables(pdf_path):
"""
提取PDF中的纯文本内容,自动排除表格区域
"""
full_text = []
with pdfplumber.open(pdf_path) as pdf:
for page in pdf.pages:
# 识别表格区域
table_bboxes = [table.bbox for table in page.find_tables()]
# 定义表格区域过滤函数
def not_in_table(obj):
if not (obj["object_type"] == "char" or "text" in obj["object_type"]):
return True
# 检查字符中心点是否在表格区域内
x_center = (obj["x0"] + obj["x1"]) / 2
y_center = (obj["top"] + obj["bottom"]) / 2
for bbox in table_bboxes:
if (bbox[0] <= x_center <= bbox[2] and
bbox[1] <= y_center <= bbox[3]):
return False
return True
# 过滤表格区域后提取文本
filtered_page = page.filter(not_in_table)
text = filtered_page.extract_text(layout=True, keep_blank_chars=False)
if text:
full_text.append(text)
return "\n\n".join(full_text)
def save_text_to_pdf(text, output_path):
# 注册中文字体
pdfmetrics.registerFont(TTFont('SimSun', 'SimSun.ttf')) # 确保字体文件路径正确
pdfmetrics.addMapping('SimSun', 0, 0, 'SimSun')
# 创建PDF文档
c = canvas.Canvas(output_path, pagesize=letter)
width, height = letter
# 设置样式
styles = getSampleStyleSheet()
chinese_style = ParagraphStyle(
'ChineseStyle',
parent=styles['Normal'],
fontName='SimSun',
fontSize=10,
leading=14,
spaceAfter=6
)
# 将文本拆分为段落
paragraphs = text.split('\n\n')
# 添加内容到PDF
y_position = height - 40 # 起始位置(距顶部40点)
for para in paragraphs:
if not para.strip():
continue
p = Paragraph(para, chinese_style)
w, h = p.wrap(width - 80, height) # 左右各留40点边距
# 检查是否需要换页
if y_position - h < 40: # 底部留40点边距
c.showPage()
y_position = height - 40
p.drawOn(c, 40, y_position - h)
y_position -= (h + 10) # 段落间距
c.save()
# 主程序
if __name__ == "__main__":
input_pdf = "/Users/bravesword/Desktop/2024/八亿时空:八亿时空2024年年度报告摘要.pdf"
output_pdf = "/Users/bravesword/Desktop/text_processed.pdf"
register_chinese_font()
extracted_text = extract_text_exclude_tables(input_pdf)
save_text_to_pdf(extracted_text, output_pdf)
print(f"PDF文件已生成: {output_pdf}")
运行上述代码遇到报错:
错误: 未找到SimSun字体文件
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/reportlab/lib/utils.py", line 523, in open_for_read
return open_for_read_by_name(name,mode)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/reportlab/lib/utils.py", line 463, in open_for_read_by_name
return open(name,mode)
FileNotFoundError: [Errno 2] No such file or directory: 'Helvetica'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/reportlab/lib/utils.py", line 530, in open_for_read
return BytesIO((datareader if name[:5].lower()=='data:' else rlUrlRead)(name))
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/reportlab/lib/utils.py", line 476, in rlUrlRead
return urlopen(name).read()
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/urllib/request.py", line 216, in urlopen
return opener.open(url, data, timeout)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/urllib/request.py", line 503, in open
req = Request(fullurl, data)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/urllib/request.py", line 322, in __init__
self.full_url = url
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/urllib/request.py", line 348, in full_url
self._parse()
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/urllib/request.py", line 377, in _parse
raise ValueError("unknown url type: %r" % self.full_url)
ValueError: unknown url type: 'Helvetica'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/reportlab/pdfbase/ttfonts.py", line 159, in TTFOpenFile
f = open_for_read(fn,'rb')
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/reportlab/lib/utils.py", line 534, in open_for_read
return open_for_read(name,mode)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/reportlab/lib/utils.py", line 532, in open_for_read
raise IOError('Cannot open resource "%s"' % name)
OSError: Cannot open resource "Helvetica"
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/Users/bravesword/Desktop/legal soup/test1.py", line 165, in <module>
register_chinese_font()
File "/Users/bravesword/Desktop/legal soup/test1.py", line 41, in register_chinese_font
pdfmetrics.registerFont(TTFont('Fallback', 'Helvetica'))
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/reportlab/pdfbase/ttfonts.py", line 1207, in __init__
self.face = TTFontFace(filename, validate=validate, subfontIndex=subfontIndex)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/reportlab/pdfbase/ttfonts.py", line 1088, in __init__
TTFontFile.__init__(self, filename, validate=validate, subfontIndex=subfontIndex)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/reportlab/pdfbase/ttfonts.py", line 447, in __init__
TTFontParser.__init__(self, file, validate=validate,subfontIndex=subfontIndex)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/reportlab/pdfbase/ttfonts.py", line 183, in __init__
self.readFile(file)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/reportlab/pdfbase/ttfonts.py", line 259, in readFile
self.filename, f = TTFOpenFile(f)
File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/reportlab/pdfbase/ttfonts.py", line 169, in TTFOpenFile
raise TTFError('Can\'t open file "%s"' % fn)
reportlab.pdfbase.ttfonts.TTFError: Can't open file "Helvetica"