目录
一、PyMuPDF
1.安装PyMuPDF
pip install pymupdf
2.pdf转txt样例
import os
import datetime
import fitz # fitz就是pip install PyMuPDF
def pyMuPDF_fitz(pdfPath, txtPath="/home/bingxing2/ailab/group/ai4agr/wzf/LLM/txt/test.txt"):
    """Extract the text of every page of a PDF and append it to a text file.

    Args:
        pdfPath: Path of the PDF file to read.
        txtPath: Text file the extracted text is appended to. Defaults to
            the path that was previously hard-coded in this function.

    The page texts are joined with newlines and appended in one write.
    An ``IOError`` raised while writing is reported on stdout and not
    re-raised. The elapsed time is printed when the function finishes.
    """
    startTime_pdf2img = datetime.datetime.now()  # start time

    pdfDoc = fitz.open(pdfPath)
    try:
        page_texts = [page.get_text() for page in pdfDoc]
    finally:
        # Release the document handle even if text extraction fails.
        pdfDoc.close()
    full_text = "\n".join(page_texts)

    try:
        # Explicit UTF-8: with the locale default, non-ASCII text can raise
        # UnicodeEncodeError, which the IOError handler below does not catch.
        with open(txtPath, 'a+', encoding="utf-8") as neirong:
            neirong.write(full_text)
    except IOError as e:
        print("An error occurred while writing the file:", e)

    endTime_pdf2img = datetime.datetime.now()  # end time
    print('pdf2img时间=', (endTime_pdf2img - startTime_pdf2img).seconds)
def process_all_pdfs_in_directory(directory):
    """Run ``pyMuPDF_fitz`` on every PDF entry found directly in *directory*.

    Args:
        directory: Folder to scan (not recursive).

    Entries are visited in sorted name order because ``os.listdir`` returns
    them in arbitrary order; sorting keeps the processing sequence
    reproducible between runs. The extension check is case-insensitive so
    files named ``*.PDF`` are not skipped.
    """
    for filename in sorted(os.listdir(directory)):
        if filename.lower().endswith('.pdf'):
            pyMuPDF_fitz(os.path.join(directory, filename))
if __name__ == "__main__":
    # Directory that holds the PDFs to convert.
    pdf_directory = r'/home/bingxing2/ailab/group/ai4agr/wzf/LLM/pdf/'
    process_all_pdfs_in_directory(directory=pdf_directory)
注:
pymupdf不能直接提取表格,要使用pdfplumber来实现
提取图片使用 img = page.get_images()(旧版本 PyMuPDF 中该方法名为 page.getImageList())
提取后发现,文字可以正常提取但是数字不能正常提取
原因:数字在PDF文件中以图像形式呈现,而不是文本形式。这种情况下,提取数字就需要进行OCR(光学字符识别)处理
因此先将pdf转为图片,再对图片提取文字(采用cnocr、paddleocr、tesseract)
pdf转图片:
import os
import datetime
import fitz # fitz就是pip install PyMuPDF
def pdf_to_images(directory, filename, output_folder, zoom=4):
    """Render each page of one PDF to a PNG file.

    Args:
        directory: Folder containing the PDF.
        filename: Name of the PDF file inside *directory*.
        output_folder: Folder the PNG files are written to; it is created
            if it does not exist yet.
        zoom: Scale factor passed to ``fitz.Matrix`` for both axes when a
            page is rasterised. Defaults to 4, the value previously
            hard-coded.

    Output files are named ``<stem>_page_<n>.png`` where ``<stem>`` is
    *filename* without its extension and ``<n>`` starts at 1.
    """
    pdf_path = os.path.join(directory, filename)
    # splitext instead of slicing off 4 characters, so the stem is right
    # for any extension length.
    stem = os.path.splitext(filename)[0]
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    pdf_doc = fitz.open(pdf_path)
    try:
        for page_number, page in enumerate(pdf_doc, start=1):
            image = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
            image_path = os.path.join(output_folder, f"{stem}_page_{page_number}.png")
            image.save(image_path)
    finally:
        # Close the document even if rendering or saving a page fails.
        pdf_doc.close()
def process_all_pdfs_in_directory(directory, output_folder):
    """Call ``pdf_to_images`` for every ``.pdf`` entry found in *directory*.

    Args:
        directory: Folder to scan (not recursive).
        output_folder: Passed through unchanged to ``pdf_to_images``.
    """
    pdf_names = [entry for entry in os.listdir(directory) if entry.endswith('.pdf')]
    for pdf_name in pdf_names:
        pdf_to_images(directory, pdf_name, output_folder)
if __name__ == "__main__":
    # Folder containing the source PDFs.
    pdf_directory = r'/home/bingxing2/ailab/group/ai4agr/wzf/LLM/pdf/books/'
    # Folder that receives the rendered page images.
    output_folder = r'/home/bingxing2/ailab/group/ai4agr/wzf/LLM/images/books/'
    process_all_pdfs_in_directory(directory=pdf_directory, output_folder=output_folder)
二、CNOCR
1.安装cnocr
pip install cnocr
2.图片转文字,存入同一个txt文件
import cnocr
import os
import datetime
def recognize_text(txt_directory, image_directory):
# 初始化 cnocr
ocr = cnocr.CnOcr()
text = []
for filename in os.listdir(image_directory):
if filename.endswith('.png'):