A semantic-structure-based parsing approach: chunks are organized along the document's semantic structure (section and paragraph hierarchy) rather than simply split at a fixed length.
from docling.document_converter import DocumentConverter
from sentence_transformers import SentenceTransformer
import chromadb
import os

# 1. Convert the PDF with Docling
converter = DocumentConverter()
pdf_path = "/mnt/f/Projects/test-doc-parser/2408.09869v5.pdf"
try:
    result = converter.convert(pdf_path)
except Exception as e:
    print(f"Conversion failed: {e}")
    raise

# 2. Grab the structured document
doc = result.document
# 3. Extract chunks along the document's semantic structure
chunks = []
metadatas = []
class SemanticChunkBuilder:
    """Builds chunks that follow the document's semantic structure."""

    def __init__(self):
        self.current_section = []   # heading path of the current section
        self.current_chunk = []     # paragraphs accumulated for the current chunk
        # Defaults cover any text that appears before the first heading
        self.current_metadata = {
            "source": os.path.basename(pdf_path),
            "page": -1,
            "section_path": "",
            "level": 0,
            "type": "body",
        }

    def process_element(self, element, level):
        """Process one document element; yields any chunks it completes."""
        raw_label = getattr(element, 'label', 'unknown')
        element_type = getattr(raw_label, 'value', raw_label)  # DocItemLabel is a str enum
        page_num = element.prov[0].page_no if (hasattr(element, 'prov') and element.prov) else None
        content = element.text.strip() if hasattr(element, 'text') else ''
        if not content:
            return

        # Is this element a heading?
        is_heading = element_type in ['section_header', 'title', 'heading', 'subtitle']

        if is_heading:
            # A new heading closes out the previous chunk first
            if self.current_chunk:
                yield self._build_chunk()
            # Update the heading path
            self._update_section_path(content, level)
            # Start a new chunk that opens with the heading text
            self.current_chunk = [content]
            self.current_metadata = {
                "source": os.path.basename(pdf_path),
                # Chroma metadata values must be str/int/float/bool, so map None to -1
                "page": page_num if page_num is not None else -1,
                "section_path": " > ".join(self.current_section),
                "level": level,
                "type": element_type,
            }
        else:
            # Ordinary content is appended to the current chunk
            self.current_chunk.append(content)
            # If the chunk has grown too long (over 1500 characters), flush it
            total_len = sum(len(c) for c in self.current_chunk)
            if total_len > 1500:
                yield self._build_chunk()
                # Keep the section metadata but start empty, so the flushed
                # paragraphs are not duplicated in the next chunk
                self.current_chunk = []

    def _update_section_path(self, title, level):
        """Update the heading path according to the element's level."""
        if level == 0:
            self.current_section = [title]
        elif level < len(self.current_section):
            self.current_section = self.current_section[:level] + [title]
        else:
            self.current_section.append(title)

    def _build_chunk(self):
        """Assemble one complete semantic chunk with its metadata."""
        chunk_content = "\n".join(self.current_chunk)
        chunk_meta = self.current_metadata.copy()
        chunk_meta["char_count"] = len(chunk_content)
        chunk_meta["paragraph_count"] = len(self.current_chunk)
        return chunk_content, chunk_meta

    def finalize(self):
        """Flush whatever remains in the last chunk."""
        if self.current_chunk:
            return self._build_chunk()
        return None
# Run the semantic chunk builder over the document tree
builder = SemanticChunkBuilder()
for element, level in doc.iterate_items():
    # process_element is a generator: it yields zero or more finished chunks
    for content, metadata in builder.process_element(element, level):
        chunks.append(content)
        metadatas.append(metadata)

# Flush the final pending chunk
final_chunk = builder.finalize()
if final_chunk:
    content, metadata = final_chunk
    chunks.append(content)
    metadatas.append(metadata)
print(f"提取了 {len(chunks)} 个语义chunk")
print(f"示例章节路径: {metadatas[0].get('section_path', 'N/A')}")
# 4. Embed the chunks
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(chunks)
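# Optional: encode() also takes batching and normalization switches, e.g.
# embeddings = model.encode(chunks, batch_size=64, normalize_embeddings=True)
# (normalized vectors make cosine similarity and dot product rank identically;
# the defaults above are fine for this demo)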
# 5. Store in Chroma
client = chromadb.Client()
collection = client.create_collection(name="pdf_semantic_chunks")
collection.add(
    documents=chunks,
    embeddings=embeddings.tolist(),
    metadatas=metadatas,
    ids=[f"semantic_chunk_{i}" for i in range(len(chunks))]
)
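# Note: chromadb.Client() is in-memory only. To keep the index across runs,
# swap in a persistent client (the path here is an arbitrary choice):
# client = chromadb.PersistentClient(path="./chroma_db")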
# 6. Retrieval example over the enriched index
query = "Both models are also powering the open-access deepsearch-experience"
query_embedding = model.encode([query])[0]
results = collection.query(
    query_embeddings=[query_embedding.tolist()],
    n_results=5
)
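# The stored metadata also enables filtered retrieval via Chroma's `where`
# argument (the filter below is illustrative -- any metadata field works):
# results_filtered = collection.query(
#     query_embeddings=[query_embedding.tolist()],
#     n_results=5,
#     where={"type": "section_header"},
# )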
print("\n" + "="*80)
print("语义检索结果:")
print("="*80)
for i, (doc_text, meta) in enumerate(zip(results["documents"][0], results["metadatas"][0]), 1):
print(f"\n【结果 {i}】")
print(f"📄 来源: {meta['source']}")
print(f"📖 页码: {meta['page']}")
print(f"📑 章节: {meta.get('section_path', 'N/A')}")
print(f"🏷️ 类型: {meta['type']} (层级 {meta.get('level', 'N/A')})")
print(f"📊 统计: {meta.get('char_count', 0)} 字符, {meta.get('paragraph_count', 0)} 段落")
print(f"\n💬 内容预览:")
# print(doc_text[:300] + "..." if len(doc_text) > 300 else doc_text)
print(doc_text)
print("-" * 80)
# 7. Per-section statistics
section_stats = {}
for meta in metadatas:
    section = meta.get('section_path', 'Unknown')
    section_stats[section] = section_stats.get(section, 0) + 1

print("\n" + "=" * 80)
print("Chunk distribution by section:")
print("=" * 80)
for section, count in sorted(section_stats.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(f"{section}: {count} chunks")
