首先需要安装PyPDF2(下面的代码使用PdfFileReader/PdfFileWriter旧接口,该接口在PyPDF2 3.0中已被移除,因此需要安装3.0之前的版本):
pip install "PyPDF2<3.0"
批量切割
然后利用下面的代码:
from PyPDF2 import PdfFileReader, PdfFileWriter
# PDF文件分割
def split_pdf(read_file, out_detail):
    """Split one PDF into several files according to a page-range list.

    Args:
        read_file: Path of the PDF to split.
        out_detail: Path of a UTF-8 text file with one ``start-end`` range
            per line. The start is decremented by one before being used as
            a page index and the end is used as the exclusive bound of
            ``range``, so a line such as ``1-4`` copies the first four
            pages. Blank lines are ignored.

    Each range is written to ``<range>.pdf`` (e.g. ``1-4.pdf``), a relative
    path. The function returns ``None``; any error is printed instead of
    being raised, so a failure never propagates to the caller.
    """
    try:
        # ``with`` closes the source even when opening or parsing fails.
        # The previous ``finally: fp_read_file.close()`` raised
        # UnboundLocalError whenever ``open`` itself had failed.
        with open(read_file, 'rb') as fp_read_file:
            pdf_input = PdfFileReader(fp_read_file)
            page_count = pdf_input.getNumPages()
            print(page_count)  # report the total page count

            with open(out_detail, 'r', encoding='utf-8') as fp:
                ranges = [line.strip() for line in fp]

            for pages in ranges:
                if not pages:
                    # A blank or trailing empty line is not a range; without
                    # this guard int('') would abort the whole run.
                    continue
                pdf_file = f'{pages}.pdf'
                start_page, end_page = map(int, pages.split('-'))
                start_page -= 1  # turn the listed start into a page index
                try:
                    print(f'开始分割{start_page}页-{end_page}页,保存为{pdf_file}......')
                    pdf_output = PdfFileWriter()
                    for i in range(start_page, end_page):
                        pdf_output.addPage(pdf_input.getPage(i))
                    with open(pdf_file, 'wb') as sub_fp:
                        pdf_output.write(sub_fp)
                    print(f'完成分割{start_page}页-{end_page}页,保存为{pdf_file}!')
                except IndexError:
                    # The requested range runs past the last page; report it
                    # and carry on with the next range.
                    print('分割页数超过了PDF的页数')
    except Exception as e:
        # Best-effort tool: report the problem rather than raising.
        print(e)
# Split the exam paper into the page ranges listed in config.txt.
split_pdf(
    read_file='./高考模拟卷(6套)/试卷/2020模拟卷语文·试卷.pdf',
    out_detail='config.txt',
)
config.txt为页码范围文件,每行一个"起始页-结束页"(页码从1开始,包含结束页),每个范围会保存为"起始页-结束页.pdf",我的config.txt文件为:
1-4
5-8
9-12
13-16
17-20
21-24
切割指定页面的PDF(注意:这里的start_page从0开始计数,end_page不包含在内)
from PyPDF2 import PdfFileReader, PdfFileWriter
def split_single_pdf(read_file,start_page,end_page,pdf_file):
    """Copy one page range of a PDF into a new PDF file.

    Args:
        read_file: Path of the source PDF.
        start_page: First page index to copy, passed unchanged to
            ``getPage``.
        end_page: Exclusive upper bound of the copied page indices (the
            bounds are used as ``range(start_page, end_page)``).
        pdf_file: Path of the PDF to create.

    Errors from opening the files or from the PDF library propagate to the
    caller.
    """
    # ``with`` closes the source handle, which was previously left open.
    # It stays open until the output has been written, as in the original
    # order of operations.
    with open(read_file, 'rb') as fp_read_file:
        pdf_input = PdfFileReader(fp_read_file)
        pdf_output = PdfFileWriter()
        for i in range(start_page, end_page):
            pdf_output.addPage(pdf_input.getPage(i))
        with open(pdf_file, 'wb') as sub_fp:
            pdf_output.write(sub_fp)
    print(f'完成分割{start_page}页-{end_page}页,保存为{pdf_file}!')
# Name the output after the paper being extracted, then copy page indices
# 569 up to (not including) 579 out of the proceedings volume.
pdf_name = 'Automatic Chinese Spelling Checking and Correction Based on Character-Based Pre-trained Contextual Representations.pdf'
split_single_pdf(
    read_file='2019_Book_NaturalLanguageProcessingAndCh.pdf',
    start_page=569,
    end_page=579,
    pdf_file=pdf_name,
)
参考文献
[1].python分割PDF. https://www.cnblogs.com/sunmoon1993/p/11021758.html