Four parsing strategies are available; choose one based on the characteristics of your document:

Comparison of the four strategies
| Strategy | Best for | Pros | Cons |
|---|---|---|---|
| SEMANTIC | Structured papers, books | Preserves complete section semantics | Can produce oversized chunks |
| HYBRID ⭐ | General documents | Balances semantics and size | — (recommended default) |
| FIXED_SIZE | Documents without clear structure | Uniform sizes, easy to compare | May cut across semantic boundaries |
| ELEMENT_BASED | Table/list-heavy documents | Groups elements of the same kind | May lose surrounding context |
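To make the table's decision rule concrete, here is a hypothetical helper (not part of the original script; it uses the `ChunkStrategy` enum defined in the code below):

```python
# Hypothetical helper mapping coarse document traits to a strategy,
# following the comparison table above. ChunkStrategy is defined below.
def pick_strategy(has_headings: bool, table_heavy: bool,
                  wants_full_sections: bool = False) -> "ChunkStrategy":
    if table_heavy:
        return ChunkStrategy.ELEMENT_BASED   # aggregate tables/lists
    if wants_full_sections and has_headings:
        return ChunkStrategy.SEMANTIC        # whole sections, may be large
    if has_headings:
        return ChunkStrategy.HYBRID          # recommended default
    return ChunkStrategy.FIXED_SIZE          # no structure to exploit
```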
Key design points

- Handling Docling's output (see the sketch after this list):
  - `element.label` → element type (`paragraph`, `title`, `table`, etc.)
  - `element.text` → text content
  - `element.prov` → source page number
  - `level` → heading level
- Metadata design:
  - keeps the section path, page number, and element type
  - adds a strategy tag so runs can be compared later
- Flexible switching: just change the `strategy` parameter to test a different chunking behavior.
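As a minimal sketch of how those fields are read, assuming the same Docling `DocumentConverter` API used in the full script below (the PDF path is a placeholder):

```python
from docling.document_converter import DocumentConverter

# Placeholder path; substitute your own PDF.
doc = DocumentConverter().convert("sample.pdf").document

for element, level in doc.iterate_items():
    label = getattr(element, "label", "unknown")   # element type
    text = getattr(element, "text", "") or ""      # text content
    prov = getattr(element, "prov", None)
    page = prov[0].page_no if prov else None       # source page number
    print(f"level={level} label={label} page={page} text={text[:40]!r}")
```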
Start with the HYBRID strategy: it preserves section boundaries while still capping chunk size.
```python
from docling.document_converter import DocumentConverter
from sentence_transformers import SentenceTransformer
import chromadb
import os
from typing import List, Dict, Tuple
from enum import Enum

class ChunkStrategy(Enum):
    """Chunking strategies."""
    SEMANTIC = "semantic"            # by semantic structure (sections)
    HYBRID = "hybrid"                # hybrid: section boundaries + size limit
    FIXED_SIZE = "fixed_size"        # fixed-size sliding window
    ELEMENT_BASED = "element_based"  # by element type

class DocumentChunker:
    """Document chunker."""

    def __init__(self, pdf_path: str, strategy: ChunkStrategy = ChunkStrategy.HYBRID):
        self.pdf_path = pdf_path
        self.strategy = strategy
        self.chunks = []
        self.metadatas = []

    def parse_document(self) -> Tuple[List[str], List[Dict]]:
        """Parse the document and return chunks plus metadata."""
        converter = DocumentConverter()
        result = converter.convert(self.pdf_path)
        doc = result.document

        if self.strategy == ChunkStrategy.SEMANTIC:
            return self._semantic_chunking(doc)
        elif self.strategy == ChunkStrategy.HYBRID:
            return self._hybrid_chunking(doc)
        elif self.strategy == ChunkStrategy.FIXED_SIZE:
            return self._fixed_size_chunking(doc)
        elif self.strategy == ChunkStrategy.ELEMENT_BASED:
            return self._element_based_chunking(doc)
        raise ValueError(f"Unknown strategy: {self.strategy}")

    def _semantic_chunking(self, doc) -> Tuple[List[str], List[Dict]]:
        """Strategy 1: chunk by semantic section structure."""
        chunks, metadatas = [], []
        section_stack = []  # stack of section headings
        current_chunk = []
        current_page = None

        for element, level in doc.iterate_items():
            element_type = self._get_element_type(element)
            content = self._get_content(element)
            page_num = self._get_page(element)
            if not content:
                continue

            # Is this element a heading?
            if self._is_heading(element_type):
                # Flush the previous chunk
                if current_chunk:
                    chunks.append("\n".join(current_chunk))
                    metadatas.append(self._build_metadata(
                        page_num=current_page,
                        section=" > ".join(section_stack),
                        element_type="section_content",
                        level=len(section_stack)
                    ))
                # Update the section stack
                while len(section_stack) > level:
                    section_stack.pop()
                section_stack.append(content)
                # Start a new chunk
                current_chunk = [f"## {content}"]
                current_page = page_num
            else:
                # Append content to the current chunk
                current_chunk.append(content)
                if current_page is None:
                    current_page = page_num

        # Flush the final chunk
        if current_chunk:
            chunks.append("\n".join(current_chunk))
            metadatas.append(self._build_metadata(
                page_num=current_page,
                section=" > ".join(section_stack),
                element_type="section_content",
                level=len(section_stack)
            ))
        return chunks, metadatas

    def _hybrid_chunking(self, doc) -> Tuple[List[str], List[Dict]]:
        """Strategy 2: hybrid - section boundaries plus a size limit."""
        chunks, metadatas = [], []
        section_stack = []
        current_chunk = []
        current_page = None
        max_chunk_size = 1000  # characters

        for element, level in doc.iterate_items():
            element_type = self._get_element_type(element)
            content = self._get_content(element)
            page_num = self._get_page(element)
            if not content:
                continue

            # A heading triggers a chunk split
            if self._is_heading(element_type):
                if current_chunk:
                    chunks.append("\n".join(current_chunk))
                    metadatas.append(self._build_metadata(
                        page_num=current_page,
                        section=" > ".join(section_stack),
                        element_type="mixed_content"
                    ))
                # Update the section stack
                while len(section_stack) > level:
                    section_stack.pop()
                section_stack.append(content)
                current_chunk = [f"## {content}"]
                current_page = page_num
            else:
                # Check the chunk size limit
                chunk_text = "\n".join(current_chunk)
                if len(chunk_text) + len(content) > max_chunk_size and current_chunk:
                    # Flush the current chunk
                    chunks.append(chunk_text)
                    metadatas.append(self._build_metadata(
                        page_num=current_page,
                        section=" > ".join(section_stack),
                        element_type="mixed_content"
                    ))
                    # Start a new chunk but keep the section context
                    current_chunk = [f"[cont.] {section_stack[-1] if section_stack else ''}"]
                    current_page = page_num  # the continuation starts on this page
                current_chunk.append(content)
                if current_page is None:
                    current_page = page_num

        if current_chunk:
            chunks.append("\n".join(current_chunk))
            metadatas.append(self._build_metadata(
                page_num=current_page,
                section=" > ".join(section_stack),
                element_type="mixed_content"
            ))
        return chunks, metadatas

    def _fixed_size_chunking(self, doc) -> Tuple[List[str], List[Dict]]:
        """Strategy 3: fixed-size sliding window."""
        all_text = []
        all_elements = []
        for element, level in doc.iterate_items():
            content = self._get_content(element)
            if content:
                all_text.append(content)
                all_elements.append({
                    'content': content,
                    'page': self._get_page(element),
                    'type': self._get_element_type(element)
                })

        # Sliding-window split
        chunks, metadatas = [], []
        chunk_size = 500  # characters
        overlap = 100     # overlap between adjacent windows
        full_text = " ".join(all_text)

        for i in range(0, len(full_text), chunk_size - overlap):
            chunk = full_text[i:i + chunk_size]
            chunks.append(chunk)
            # Estimate the page number (rough approximation)
            char_pos = i + chunk_size // 2
            estimated_page = min(len(all_elements) - 1,
                                 int(char_pos / (len(full_text) / len(all_elements))))
            metadatas.append(self._build_metadata(
                page_num=all_elements[estimated_page]['page'],
                section="",
                element_type="sliding_window"
            ))
        return chunks, metadatas

    def _element_based_chunking(self, doc) -> Tuple[List[str], List[Dict]]:
        """Strategy 4: group by element type."""
        chunks, metadatas = [], []
        current_group = []
        current_type = None
        current_page = None

        for element, level in doc.iterate_items():
            element_type = self._get_element_type(element)
            content = self._get_content(element)
            page_num = self._get_page(element)
            if not content:
                continue

            # Split when the element type changes or the group grows too large
            if (current_type and current_type != element_type) or \
               (len("\n".join(current_group)) > 800):
                if current_group:
                    chunks.append("\n".join(current_group))
                    metadatas.append(self._build_metadata(
                        page_num=current_page,
                        section="",
                        element_type=current_type or "unknown"
                    ))
                current_group = []
                current_page = page_num

            current_group.append(content)
            current_type = element_type
            if current_page is None:
                current_page = page_num

        if current_group:
            chunks.append("\n".join(current_group))
            metadatas.append(self._build_metadata(
                page_num=current_page,
                section="",
                element_type=current_type or "unknown"
            ))
        return chunks, metadatas

    # --- Helper methods ---
    def _get_element_type(self, element) -> str:
        return element.label if hasattr(element, 'label') else 'unknown'

    def _get_content(self, element) -> str:
        return element.text.strip() if hasattr(element, 'text') else ''

    def _get_page(self, element):
        return element.prov[0].page_no if (hasattr(element, 'prov') and element.prov) else None

    def _is_heading(self, element_type: str) -> bool:
        heading_types = ['section_header', 'title', 'heading', 'subtitle']
        return element_type in heading_types

    def _build_metadata(self, page_num, section, element_type, level=None) -> Dict:
        meta = {
            "source": os.path.basename(self.pdf_path),
            # ChromaDB metadata values must be str/int/float/bool,
            # so map a missing page number to -1 instead of None
            "page": page_num if page_num is not None else -1,
            "section": section,
            "type": element_type,
            "strategy": self.strategy.value
        }
        if level is not None:
            meta["level"] = level
        return meta

# === Main program ===
def main():
    pdf_path = "/mnt/f/Projects/test-doc-parser/2408.09869v5.pdf"

    # Pick a strategy (try different ones to compare results)
    strategy = ChunkStrategy.HYBRID  # recommended default
    # strategy = ChunkStrategy.SEMANTIC
    # strategy = ChunkStrategy.FIXED_SIZE
    # strategy = ChunkStrategy.ELEMENT_BASED

    print(f"Strategy: {strategy.value}")
    print("=" * 80)

    # 1. Parse the document
    chunker = DocumentChunker(pdf_path, strategy)
    chunks, metadatas = chunker.parse_document()
    print(f"✓ Extracted {len(chunks)} chunks")
    print(f"✓ Average chunk size: {sum(len(c) for c in chunks) / len(chunks):.0f} characters")

    # 2. Embed the chunks
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(chunks, show_progress_bar=True)

    # 3. Store in ChromaDB
    client = chromadb.Client()
    collection = client.create_collection(name=f"pdf_{strategy.value}")
    collection.add(
        documents=chunks,
        embeddings=embeddings.tolist(),
        metadatas=metadatas,
        ids=[f"chunk_{i}" for i in range(len(chunks))]
    )
    print("✓ Stored in the vector database")

    # 4. Test retrieval
    query = "Both models are also powering the open-access deepsearch-experience"
    query_embedding = model.encode([query])[0]
    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=3
    )

    print("\n" + "=" * 80)
    print("Retrieval results:")
    print("=" * 80)
    for i, (doc, meta) in enumerate(zip(results["documents"][0], results["metadatas"][0]), 1):
        print(f"\n[Top {i}]")
        print(f"📄 {meta['source']} | page: {meta['page']}")
        if meta.get('section'):
            print(f"📑 section: {meta['section']}")
        print(f"🏷️ type: {meta['type']}")
        print(f"\nContent:\n{doc[:400]}...")
        print("-" * 80)


if __name__ == "__main__":
    main()
```
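To see how the strategies differ on a given document, a small comparison harness like the sketch below (not in the original post; it reuses `ChunkStrategy` and `DocumentChunker` from above) can print per-strategy chunk statistics:

```python
# Sketch: run all four strategies on one PDF and compare chunk statistics.
# Note this re-parses the document on every iteration; for large PDFs you
# would convert once and pass the parsed document around instead.
def compare_strategies(pdf_path: str) -> None:
    for strategy in ChunkStrategy:
        chunks, _ = DocumentChunker(pdf_path, strategy).parse_document()
        if not chunks:
            print(f"{strategy.value:<15} no chunks")
            continue
        sizes = [len(c) for c in chunks]
        print(f"{strategy.value:<15} chunks={len(chunks):>4} "
              f"avg={sum(sizes) / len(sizes):>6.0f} max={max(sizes)}")
```

You would expect SEMANTIC to show the largest maximum chunk size and FIXED_SIZE the most uniform sizes, matching the comparison table above.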