# 关于 Python 处理 PDF 的第三方库有很多，其中最基础的是 ReportLab。
# 参考：ReportLab User Guide。
# Py2pdf 实际上是 ReportLab 的 Demo。ReportLab 中没有直接合并 PDF 和添加 bookmarks 的工具，Py2pdf 中有经过二次开发得到的这样的工具。
from __future__ import print_function
from sys import argv
from PyPDF2 import PdfFileMerger, PdfFileReader
import os
class bookmark_class:
    """Plain record for one bookmark: a title, a page number and a parent key.

    The three constructor arguments are stored unchanged on the instance as
    ``title``, ``page_num`` and ``parent`` (``parent`` defaults to 0).
    """

    def __init__(self, title, page_num, parent=0):
        self.page_num, self.parent, self.title = page_num, parent, title

    def print_content(self):
        """Write the three fields to stdout, one labelled line per field."""
        rows = (
            ("page_num: ", self.page_num),
            ("parent: ", self.parent),
            ("title: ", self.title),
        )
        for label, value in rows:
            print(label, value)
# get bookmark info from the pdf outlines
def _collect_bookmarks(outlines, parent):
    """Recursive worker behind ``bookmark_list``.

    It lives under its own name because the public function's first
    parameter is also called ``bookmark_list``; inside that function the
    name refers to the argument, so it cannot be used to recurse.
    """
    print("bookmark:", outlines)
    result = []
    # Parent key handed to the next nested list; starts as the caller's
    # parent and follows the most recent plain entry.
    nested_parent = parent
    for item in outlines:
        if isinstance(item, list):
            # A nested list holds the children of the entry just before it.
            result += _collect_bookmarks(item, nested_parent)
        else:
            # NOTE(review): the "/Page" value is stored as-is; merge_pdf adds
            # it to an integer offset, so confirm it is a page index and not
            # a page reference object.
            bookmark = bookmark_class(item.get("/Title"), item.get("/Page"),
                                      parent)
            result.append(bookmark)
            print("bookmark_content:")
            bookmark.print_content()
            nested_parent = bookmark.page_num
    return result


def bookmark_list(bookmark_list, parent=0):
    """Flatten a (possibly nested) outline into a list of bookmark_class.

    Args:
        bookmark_list: Iterable of outline entries. An entry that is a
            ``list`` is treated as a nested level and flattened recursively;
            any other entry must offer ``.get("/Title")`` and
            ``.get("/Page")``.
        parent: Parent key recorded on every entry of this level.

    Returns:
        A flat list of ``bookmark_class`` objects in traversal order.
        Entries of a nested list get as parent the ``page_num`` of the entry
        preceding that list, or ``parent`` when no entry precedes it.

    Each level and each entry is also echoed to stdout.
    """
    return _collect_bookmarks(bookmark_list, parent)
# merge pdf, keep origin bookmarks and add a higher bookmark for each pdf
# if there is marks, one mark corresponds to one pdf
def merge_pdf(pdfs, target_file="res.pdf", marks=None):
    """Merge PDF files into ``target_file`` with one bookmark per input.

    Every readable input is appended to the output and receives a bookmark
    on its first merged page. The entries that ``bookmark_list`` returns for
    that input are then added again with their page numbers shifted by the
    input's offset in the merged document.

    Args:
        pdfs: Paths of the PDF files to merge, in output order. Paths that
            are not regular files are skipped. When this is empty or None
            nothing is read or written.
        target_file: Path of the merged PDF to write.
        marks: Optional bookmark titles matched to ``pdfs`` by index. An
            input without a matching title uses its own path as the title.

    An input that raises while being read or bookmarked is reported on
    stdout and does not abort the merge.
    """
    if not pdfs:
        return

    merger = PdfFileMerger()
    opened = []  # input streams, kept open until the output is written
    page_num = 0  # number of pages merged so far
    # bookmark_r[i] is the first bookmark created on merged page i, used
    # later to look up parents by page index.
    bookmark_r = [None]
    try:
        for index, pdf_file in enumerate(pdfs):
            if not os.path.isfile(pdf_file):
                continue
            try:
                stream = open(pdf_file, 'rb')
                opened.append(stream)
                file_suanz = PdfFileReader(stream)
                suanz_num = file_suanz.getNumPages()
                print("suanz_num, page_num: ", suanz_num, page_num)
                bookmarks = bookmark_list(file_suanz.getOutlines(),
                                          page_num)
                # NOTE(review): append() runs with its defaults; if those
                # already import the source outline, the entries re-added
                # below will show up twice -- confirm for the PyPDF2
                # version in use.
                merger.append(file_suanz)

                # Advance the offset as soon as the pages are in, so a
                # bookmark failure below cannot shift later inputs.
                base = page_num
                page_num += suanz_num
                bookmark_r += [None] * suanz_num

                print("give bookmark pdffile, ", base)
                mark = pdf_file
                if marks and len(marks) > index:
                    mark = marks[index]
                bookmark_r[base] = merger.addBookmark(mark, base)
                for bookmark in bookmarks:
                    target = base + bookmark.page_num
                    print("give back bookmark, ", target)
                    print("use back bookmark, ", bookmark.parent)
                    # NOTE(review): bookmark.parent is used as a raw index
                    # into bookmark_r; confirm nested entries carry a merged
                    # page index rather than a file-local one.
                    added = merger.addBookmark(bookmark.title, target,
                                               bookmark_r[bookmark.parent])
                    if bookmark_r[target] is None:
                        bookmark_r[target] = added
            except Exception as exc:
                # Best effort: report the failing input and keep merging.
                print("merge pdf, but {} read failed".format(pdf_file))
                print("reason: ", repr(exc))
        merger.write(target_file)
    finally:
        merger.close()
        for stream in opened:
            stream.close()