import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
import json
import re
import hashlib
import time
import torch
import numpy as np
import pdfplumber
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# Initialize models and tools
nlp = spacy.load("zh_core_web_sm")
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
class PDFProcessor:
    def __init__(self, pdf_path, output_dir="output"):
        self.pdf_path = pdf_path
        self.output_dir = output_dir
        self.document_id = os.path.splitext(os.path.basename(pdf_path))[0]
        os.makedirs(output_dir, exist_ok=True)
        # Initialize the generation model
        model_name = "uer/gpt2-distil-chinese-cluecorpussmall"
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.summarizer = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if torch.cuda.is_available() else -1
        )
        # Document structure storage
        self.full_text = ""
        self.structure = []
        self.heading_tree = self.HeadingTree()
        self.font_stats = defaultdict(int)

    class HeadingTree:
        """Tree structure that tracks heading hierarchy."""
        def __init__(self):
            self.root = {"level": 0, "children": [], "text": "ROOT"}
            self.current = self.root
            self.level_path = "0"

        def add_heading(self, level, text, page):
            while self.current["level"] >= level:
                self.current = self.current["parent"]
            new_node = {
                "level": level,
                "text": text,
                "page": page,
                "parent": self.current,
                "children": [],
                "local_index": len(self.current["children"]) + 1
            }
            self.current["children"].append(new_node)
            self.current = new_node
            if self.current["parent"] == self.root:
                self.level_path = str(new_node["local_index"])
            else:
                self.level_path = f"{self.current['parent']['path']}.{new_node['local_index']}"
            new_node["path"] = self.level_path
            return self.level_path
    def parse_pdf(self):
        """Parse the PDF document."""
        with pdfplumber.open(self.pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                # Extract text elements
                words = page.extract_words(extra_attrs=["fontname", "size"])
                self._analyze_font_features(words)
                # Extract structured text
                text_blocks = page.extract_text_lines()
                for block in text_blocks:
                    self._process_text_block(block, page_num)
                # Process tables
                tables = page.extract_tables()
                for table in tables:
                    self._process_table(table, page_num)
                # Accumulate the raw page text
                self.full_text += page.extract_text() + "\n"
        # Save the raw text
        with open(os.path.join(self.output_dir, f"{self.document_id}_full.txt"), "w", encoding="utf-8") as f:
            f.write(self.full_text)
    def _analyze_font_features(self, words):
        """Analyze font features to build the heading-recognition model."""
        for word in words:
            font_key = (word["fontname"], round(word["size"], 1))
            self.font_stats[font_key] += 1
    def _process_text_block(self, block, page_num):
        """Process a text block and detect headings."""
        font_key = (block["fontname"], round(block["size"], 1))
        font_freq = self.font_stats[font_key]
        # Heading detection heuristic
        is_heading = (
            block["size"] > 12 and
            font_freq < 100 and
            len(block["text"].strip()) < 50
        )
        if is_heading:
            # Infer the heading level automatically
            heading_level = min(int(block["size"] // 2), 6)
            self.structure.append({
                "type": "heading",
                "level": heading_level,
                "text": block["text"].strip(),
                "page": page_num,
                "start_pos": len(self.full_text)
            })
            self.heading_tree.add_heading(heading_level, block["text"].strip(), page_num)
            self.full_text += block["text"].strip() + "\n"
        else:
            self.structure.append({
                "type": "paragraph",
                "text": block["text"].strip(),
                "page": page_num,
                "start_pos": len(self.full_text)
            })
            self.full_text += block["text"].strip() + "\n"
    def _process_table(self, table, page_num):
        """Convert a table to Markdown format."""
        markdown_table = []
        for row in table:
            markdown_row = "| " + " | ".join(str(cell).strip() for cell in row) + " |"
            markdown_table.append(markdown_row)
            if len(markdown_table) == 1:  # Add the header separator line
                markdown_table.append("| " + " | ".join(["---"] * len(row)) + " |")
        table_text = "\n".join(markdown_table)
        self.structure.append({
            "type": "table",
            "text": table_text,
            "page": page_num,
            "start_pos": len(self.full_text)
        })
        self.full_text += table_text + "\n"
    def dynamic_chunking(self, max_chunk_length=1500, min_chunk_length=200):
        """Dynamic semantic chunking algorithm."""
        chunks = []
        current_chunk = ""
        current_start = 0
        chunk_id = 0
        # Preliminary chunking based on document structure
        for i, item in enumerate(self.structure):
            # A heading starts a new chunk
            if item["type"] == "heading" and current_chunk:
                chunks.append({
                    "start": current_start,
                    "end": len(self.full_text[:current_start]) + len(current_chunk),
                    "text": current_chunk,
                    "page": item["page"],
                    "id": f"{self.document_id}_chunk{chunk_id:03d}"
                })
                chunk_id += 1
                current_chunk = ""
                current_start = item["start_pos"]
            current_chunk += item["text"] + "\n"
            # Length guard: prevent overly long chunks
            if len(current_chunk) > max_chunk_length:
                chunks.append({
                    "start": current_start,
                    "end": current_start + len(current_chunk),
                    "text": current_chunk,
                    "page": item["page"],
                    "id": f"{self.document_id}_chunk{chunk_id:03d}"
                })
                chunk_id += 1
                current_chunk = ""
                current_start = self.structure[i+1]["start_pos"] if i+1 < len(self.structure) else current_start + len(current_chunk)
        # Append the last chunk
        if current_chunk:
            chunks.append({
                "start": current_start,
                "end": current_start + len(current_chunk),
                "text": current_chunk,
                "page": self.structure[-1]["page"],
                "id": f"{self.document_id}_chunk{chunk_id:03d}"
            })
        # Refine chunk boundaries semantically
        refined_chunks = []
        for chunk in chunks:
            sentences = [sent.text for sent in nlp(chunk["text"]).sents]
            if len(sentences) < 2:  # Nothing to split
                refined_chunks.append(chunk)
                continue
            # Compute sentence embeddings
            sentence_embeddings = embedding_model.encode(sentences)
            # Find the best split points
            split_points = []
            for i in range(1, len(sentences)):
                sim = cosine_similarity(
                    [sentence_embeddings[i-1]],
                    [sentence_embeddings[i]]
                )[0][0]
                if sim < 0.65:  # Semantic-similarity threshold
                    split_points.append(i)
            # Keep the original chunk if there are no split points
            if not split_points:
                refined_chunks.append(chunk)
                continue
            # Create new chunks
            start_idx = 0
            for split_idx in split_points:
                new_chunk_text = " ".join(sentences[start_idx:split_idx])
                if len(new_chunk_text) > min_chunk_length:  # Minimum-length guard
                    refined_chunks.append({
                        "start": chunk["start"] + sum(len(s) for s in sentences[:start_idx]),
                        "end": chunk["start"] + sum(len(s) for s in sentences[:split_idx]),
                        "text": new_chunk_text,
                        "page": chunk["page"],
                        "id": f"{self.document_id}_chunk{chunk_id:03d}"
                    })
                    chunk_id += 1
                start_idx = split_idx
            # Append the final segment
            if start_idx < len(sentences):
                new_chunk_text = " ".join(sentences[start_idx:])
                if len(new_chunk_text) > min_chunk_length:
                    refined_chunks.append({
                        "start": chunk["start"] + sum(len(s) for s in sentences[:start_idx]),
                        "end": chunk["start"] + len(chunk["text"]),
                        "text": new_chunk_text,
                        "page": chunk["page"],
                        "id": f"{self.document_id}_chunk{chunk_id:03d}"
                    })
                    chunk_id += 1
        return refined_chunks
    def extract_metadata(self, chunk):
        """Extract metadata for a chunk."""
        metadata = {
            "hierarchy": "0.0",
            "keywords": [],
            "entities": [],
            "has_table": False,
            "has_formula": False
        }
        # 1. Extract hierarchy information
        for item in reversed(self.structure):
            if item["start_pos"] <= chunk["start"] and item["type"] == "heading":
                metadata["hierarchy"] = self._find_heading_path(item)
                break
        # 2. Extract keywords (TF-IDF)
        vectorizer = TfidfVectorizer(stop_words="english", max_features=10)
        try:
            tfidf_matrix = vectorizer.fit_transform([chunk["text"]])
            feature_names = vectorizer.get_feature_names_out()
            tfidf_scores = tfidf_matrix.toarray()[0]
            top_indices = np.argsort(tfidf_scores)[-5:]  # Keep the top 5 keywords
            metadata["keywords"] = [feature_names[i] for i in top_indices if tfidf_scores[i] > 0.1]
        except Exception:
            metadata["keywords"] = []
        # 3. Named-entity recognition
        doc = nlp(chunk["text"])
        for ent in doc.ents:
            if ent.label_ in ["PERSON", "ORG", "GPE", "PRODUCT", "DATE"]:  # Filter entity types
                metadata["entities"].append({
                    "text": ent.text,
                    "type": ent.label_,
                    "start_pos": ent.start_char,
                    "end_pos": ent.end_char
                })
        # 4. Detect tables and formulas
        metadata["has_table"] = bool(re.search(r'\+[-]+\+', chunk["text"]))  # Simple table detection
        metadata["has_formula"] = bool(re.search(r'\$(.*?)\$|\\[a-zA-Z]+{', chunk["text"]))  # LaTeX / math formulas
        return metadata
    def _find_heading_path(self, heading_item):
        """Find the full hierarchy path for a heading item."""
        for node in self.heading_tree.root["children"]:
            path = self._find_node_path(node, heading_item["text"])
            if path:
                return path
        return "0.0"

    def _find_node_path(self, node, text):
        """Recursively look up a heading node's path."""
        if node["text"] == text:
            return node["path"]
        for child in node["children"]:
            path = self._find_node_path(child, text)
            if path:
                return path
        return None
    def generate_summary(self, text):
        """Generate a lightweight summary."""
        prompt = f"请为以下文本生成一句简洁的摘要(20-30字),严格基于内容不要添加新信息:\n{text[:2000]}"
        try:
            summary = self.summarizer(
                prompt,
                max_new_tokens=50,
                temperature=0.3,
                do_sample=True,
                pad_token_id=self.tokenizer.pad_token_id,  # Use the pad_token_id set in __init__
                eos_token_id=self.tokenizer.eos_token_id   # Explicitly set the eos token
            )[0]['generated_text']
            # Strip the prompt, keeping only the generated summary
            return summary.replace(prompt, "").strip()
        except Exception as e:
            print(f"摘要生成失败: {str(e)}")
            # On failure, fall back to the first three sentences
            sents = [sent.text for sent in nlp(text).sents][:3]
            return " ".join(sents)
    def process_to_json(self, chunks):
        """Convert chunks to the final JSON format."""
        results = []
        summary_cache = {}
        for chunk in chunks:
            # Generate a summary (cache identical texts)
            text_hash = hashlib.md5(chunk["text"].encode()).hexdigest()
            if text_hash in summary_cache:
                summary = summary_cache[text_hash]
            else:
                summary = self.generate_summary(chunk["text"])
                summary_cache[text_hash] = summary
            # Extract metadata
            metadata = self.extract_metadata(chunk)
            # Build the final JSON object
            result = {
                "chunk_id": chunk["id"],
                "text": chunk["text"],
                "summary": summary,
                "metadata": metadata
            }
            results.append(result)
        return results
    def process_document(self):
        """Run the full document-processing pipeline."""
        print(f"开始处理文档: {self.docx_path}")
        total_start = time.time()
        try:
            # Time each stage
            parse_start = time.time()
            self.parse_docx()
            parse_time = time.time() - parse_start
            chunk_start = time.time()
            chunks = self.dynamic_chunking()
            chunk_time = time.time() - chunk_start
            json_start = time.time()
            json_data = self.process_to_json(chunks)
            json_time = time.time() - json_start
            # Save the results
            output_path = os.path.join(self.output_dir, f"{self.document_id}_chunks.json")
            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(json_data, f, ensure_ascii=False, indent=2)
            total_time = time.time() - total_start
            print(f"\n处理完成! 结果已保存至: {output_path}")
            print("="*40)
            print(f"总耗时: {total_time:.2f}秒")
            print(f"文档解析: {parse_time:.2f}秒")
            print(f"语义分块: {chunk_time:.2f}秒")
            print(f"元数据处理: {json_time:.2f}秒")
            print("="*40)
            return json_data
        except Exception as e:
            print(f"处理过程中发生错误: {str(e)}")
            return None
if __name__ == "__main__":
    processor = PDFProcessor(
        pdf_path="test1.pdf",
        output_dir="processed_pdfs"
    )
    processor.process_document()

The code above fails with the error below. How should I fix it?
(venv) C:\Users\Semi-YuLJ\Desktop\learning>python C:\Users\Semi-YuLJ\Desktop\learning\chunk_pdf.py
Traceback (most recent call last):
File "C:\Users\Semi-YuLJ\Desktop\learning\chunk_pdf.py", line 419, in <module>
processor.process_document()
File "C:\Users\Semi-YuLJ\Desktop\learning\chunk_pdf.py", line 376, in process_document
print(f"开始处理文档: {self.docx_path}")
AttributeError: 'PDFProcessor' object has no attribute 'docx_path'
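The traceback points at a name mismatch inside `process_document`: `PDFProcessor.__init__` stores the input path as `self.pdf_path` and the parser defined above is `parse_pdf`, but `process_document` reads `self.docx_path` (the failing line) and then calls `self.parse_docx()`, neither of which exists on the class, so the next statement would raise a similar AttributeError even after the first is fixed. A minimal sketch of the fix, assuming the rest of the class stays exactly as listed above, is to rename those two references; only the opening of the method changes:

    def process_document(self):
        """Run the full document-processing pipeline."""
        # Use the attribute that __init__ actually sets: pdf_path, not docx_path
        print(f"开始处理文档: {self.pdf_path}")
        total_start = time.time()
        try:
            parse_start = time.time()
            # Call the PDF parser defined in this class: parse_pdf, not parse_docx
            self.parse_pdf()
            parse_time = time.time() - parse_start
            # ... the remainder of the method is unchanged ...

The method looks like it was carried over from a DOCX-processing variant of this class, so it is worth searching the file for any remaining `docx` references before rerunning.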