import re
import nltk
import pdfplumber
from PyPDF2 import PdfFileReader  # legacy API; requires PyPDF2 < 3.0 (PdfFileReader was removed in 3.0)
from txtai.pipeline import Textractor
class PDFanalysis:
    def __init__(self, path):
        self.pdf_path = path
        # run the PyPDF2-based pipeline immediately and keep the parsed sentences
        self.pdf_data = self.pypdf2_pdf_run(self.pdf_path)
    def ppr_analysis_pdf(self, path):
        # collect every table on every page with pdfplumber
        tables = []
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                # extract_tables() returns a (possibly empty) list of tables per page
                for table in page.extract_tables():
                    tables.append(table)
        return tables
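
    # The tables returned by ppr_analysis_pdf follow pdfplumber's layout: a list
    # of tables, each table a list of rows, each row a list of cell strings
    # (or None for empty cells), e.g. (hypothetical values):
    # [[["Gene", "p-value"], ["EGFR", "0.01"]]]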
    def txtai_analysis_pdf(self, path):
        textractor = Textractor(sentences=True)
        data = textractor(path)
        return data
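
    # With sentences=True, Textractor returns the extracted text of the given
    # document as a list of sentence strings.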
    def pypdf2_analysis_pdf_show(self, path, number, output):
        # inspect a single page in one of four output modes
        datas = []
        sents = []
        reader = PdfFileReader(path)
        page = reader.getPage(number)
        page_text = page.extractText()
        if output == 0:
            print("Fully processing PDF .....", "\n")
            page_split = page_text.split("\n")
            new_list = []
            for index, sent1 in enumerate(page_split[:-1]):
                sent2 = page_split[index + 1]
                # never join across figure/table/page markers or correspondence lines
                FTPwords = re.findall(r"^Fig\.|^Table|^Page ", sent2, re.IGNORECASE)
                Cwords = re.findall(r"[* ]Correspondence:", sent1, re.IGNORECASE)
                if len(sent1) >= 45 and len(sent1) + len(sent2) >= 100 and not FTPwords and not Cwords:
                    new_list.append(sent1)
                else:
                    new_list.append(sent1 + "\n")
            new_list.append(page_split[-1])
            text = "".join(new_list)
            text = self.pre_split_sign_text(text)
            print(text)
        elif output == 1:
            print("Preprocessing PDF: line lengths + parsing ......", "\n")
            sign_text = self.pre_split_sign_text(page_text)
            page_split = sign_text.split("\n")
            for sent in page_split:
                sents.append((len(sent), sent))
            for ln, sent in sents:
                print("len:", ln)
                print("sent:", sent)
        elif output == 2:
            datas.append(page_text)
            print("Parsing PDF ......", "\n")
            # print(page_text)
            print(datas)
        elif output == 3:
            print("Preprocessing PDF ...", "\n")
            sign_text = self.pre_split_sign_text(page_text)
            print(sign_text)
    def pre_split_sign_text(self, text):
        # rejoin tokens that a line break split inside parentheses
        text = self.pre_split_str_text(text, r"[(][\w]+\n", "\n", " ")
        # special tokens (subscripts/superscripts split across lines)
        text = self.pre_split_str_text(text, r"[( ][\w]+\n\d\w\d", "\n", "")
        text = self.pre_split_str_text(text, r"[( ][\w]+\n\d[+-]", "\n", "")
        text = self.pre_split_str_text(text, r"[( ]C\nmax[ ]|[( ]C\nmin[ ]", "\n", "")
        text = self.pre_split_str_text(text, r"[( ]DL\nCO[ ]|[( ]CL\nCR[ ]", "\n", "")
        sents = self.pre_split_str_text(text, r"\xa0 cm\n[−+]\d\s", "\n", "")
        # ...BMC Cancer, 13, 326. [PubMed: 23819905] \nAndres SA &...
        sents = sents.replace("] \n", "]\n")
        sents = sents.replace(" \x0b", " ")
        # drop soft hyphens (\xad) at line ends so the word halves rejoin
        sents = sents.replace("\xad\n", "")
        # whitespace before a line break (the two spaces here may have been
        # different whitespace characters in the original source)
        sents = sents.replace(" \n", "\n")
        sents = sents.replace(" \n", " ")
        # parentheses split from their content
        sents = sents.replace("(\n", "(")
        sents = sents.replace(")\n", ")")
        # special symbols
        sents = sents.replace("\n™", "™")
        sents = sents.replace("\n®", "®")
        # shield double dashes from the dash-removal rule below (they remain as "~~")
        sents = sents.replace("--\n", "~~\n")
        sents = re.sub(r"\n[-]\n|[-]\n| [-]\n", "", sents)
        # rejoin line breaks around punctuation
        sents = re.sub(r"[,]\n|\n[,]", ",", sents)
        sents = re.sub(r"[;]\n|\n[;]", ";", sents)
        sents = re.sub(r"[:]\n|\n[:]", ":", sents)
        sents = re.sub(r"[/]\n|\n[/]", "/", sents)
        sents = sents.replace(".\n", ".\n ")
        sents = sents.replace("\n", " \n")
        # keywords that mark section boundaries
        sents = sents.replace(" NSCLC\npatient", " NSCLC patient")
        sents = sents.replace("AbstractBackground: ", "Abstract\nBackground: ")
        sents = sents.replace("© The Author(s).", "\n© The Author(s).")
        sents = sents.replace("* Correspondence:", "\n* Correspondence:")
        sents = sents.replace("*Correspondence", "\n*Correspondence")
        sents = sents.replace("INTRODUCTION", "\nINTRODUCTION\n")
        sents = sents.replace(".DATA SHARING", ".\nDATA SHARING\n")
        sents = sents.replace(".REFERENCES", ".\nREFERENCES\n")
        return sents
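
    # Example (hypothetical input): a hyphenated line break such as
    # "alpha-\nbeta" is rejoined by the dash rule into "alphabeta".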
    def pre_split_str_text(self, text, rerelu, invalue, outvalue):
        # find every match of the pattern, rewrite it (invalue -> outvalue),
        # and stitch the text back together
        relu = re.compile(rerelu)
        words = relu.findall(text)
        if not words:
            return text
        sents = relu.split(text)
        text_list = []
        for index, sent in enumerate(sents[:-1]):
            text_list.append(sent + words[index].replace(invalue, outvalue))
        text_list.append(sents[-1])  # keep the text after the last match
        return "".join(text_list)
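
    # A worked example (hypothetical input): with text = "(word\nnext", the
    # pattern r"[(][\w]+\n" matches "(word\n", the match is rewritten to
    # "(word " (invalue "\n" -> outvalue " "), and the pieces are stitched
    # back into "(word next".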
    def split_paras(self, text):
        # paragraphs are delimited by newlines after preprocessing
        return text.split("\n")
    def split_long_sents_re(self, text):
        # split on sentence-ending punctuation (ASCII and full-width)
        relu = re.compile(r"[.。??!!]")
        words = relu.findall(text)
        sents = relu.split(text)
        text_list = []
        for index, sent in enumerate(sents[:-1]):
            text_list.append(sent + words[index])
        if sents and sents[-1]:
            text_list.append(sents[-1])  # keep a trailing fragment with no end punctuation
        return text_list
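
    # Example (hypothetical input):
    # split_long_sents_re("A b. C d? E") -> ["A b.", " C d?", " E"]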
    def split_long_sents_nltk(self, text):
        # requires the NLTK Punkt tokenizer models (see setup note below)
        sent = nltk.tokenize.sent_tokenize(text)
        return sent
    def clear_read_data(self, data):
        # split the text into paragraphs, then each paragraph into sentences
        data_list = self.split_paras(data)
        sents_list = []
        for i in data_list:
            # a paragraph with at most one period is kept as a single sentence
            if len(re.findall(r"\.", i)) <= 1:
                sents_list.append(i)
            else:
                # cache_list = self.split_long_sents_re(i)
                cache_list = self.split_long_sents_nltk(i)
                sents_list.extend(cache_list)
        return sents_list
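
    # Example (hypothetical input):
    # clear_read_data("Title\nFirst sentence. Second sentence.")
    # -> ["Title", "First sentence.", "Second sentence."]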
    def pypdf2_pdf_run(self, path, is_join=False):
        # full pipeline: read every page, optionally rejoin wrapped lines,
        # normalize the text, and split it into sentences
        reader = PdfFileReader(path)
        page_count = reader.getNumPages()
        texts = []
        for i in range(page_count):
            page = reader.getPage(i)
            page_text = page.extractText()
            page_split = page_text.split("\n")
            new_list = []
            for index, sent1 in enumerate(page_split[:-1]):
                if is_join:
                    sent2 = page_split[index + 1]
                    # never join across figure/table/page markers or correspondence lines
                    FTPwords = re.findall(r"^Fig\.|^Table|^Page ", sent2, re.IGNORECASE)
                    Cwords = re.findall(r"[* ]Correspondence:", sent1, re.IGNORECASE)
                    if len(sent1) >= 45 and len(sent1) + len(sent2) >= 100 and not FTPwords and not Cwords:
                        new_list.append(sent1)
                    else:
                        new_list.append(sent1 + "\n")
                else:
                    # keep the original line break when not joining
                    new_list.append(sent1 + "\n")
            new_list.append(page_split[-1] + "\n")
            text = "".join(new_list)
            text = self.pre_split_sign_text(text)
            texts.append(text)
        datas = "".join(texts)
        pdf_data = self.clear_read_data(datas)
        return pdf_data
if __name__ == '__main__':
    path = "../../datasets/pdf/A001.pdf"
    model = PDFanalysis(path)
    print(model.pdf_data)
"""
测试36篇 8篇长句问题,即"\n"问题
24篇调式完毕,格式原因,小部分标题与段落组合,解析顺序不一致
"""
PDF Parsing

This code implements a class named PDFanalysis for processing PDF files. It contains several methods: table and text extraction with the pdfplumber and PyPDF2 libraries, sentence extraction with txtai, and text preprocessing, including handling of special symbols split across lines and adjustment of line breaks. It also provides helper methods for splitting long sentences and cleaning the parsed data.
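To make the entry point more concrete, here is a minimal usage sketch; the sample path and the page number 0 are assumptions:

path = "../../datasets/pdf/A001.pdf"  # assumed sample file
model = PDFanalysis(path)

# sentences produced by the full PyPDF2 pipeline (run in __init__)
for sent in model.pdf_data[:10]:
    print(sent)

# tables via pdfplumber and sentences via txtai for the same file
tables = model.ppr_analysis_pdf(path)
sentences = model.txtai_analysis_pdf(path)

# inspect a single page in "parse only" mode (output=2)
model.pypdf2_analysis_pdf_show(path, 0, 2)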