A detailed walkthrough and analysis of this code (note that the class name shadows LangChain's built-in MarkdownTextSplitter):

from typing import Callable, Dict, Iterable, List, Optional, Tuple, TypedDict

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from markdown_it import MarkdownIt


# Shapes inferred from how the class uses them; LangChain defines similar
# TypedDicts for its MarkdownHeaderTextSplitter.
class LineType(TypedDict):
    content: str
    metadata: Dict[str, str]


class HeaderType(TypedDict):
    level: int
    name: str
    data: str


class MarkdownTextSplitter(RecursiveCharacterTextSplitter):
    def __init__(
        self,
        headers_to_split_on: List[Tuple[str, str]],
        return_each_line: bool = False,
        strip_headers: bool = False,
        chunk_size: int = 2000,
        chunk_overlap: int = 0,
        length_function: Callable[[str], int] = len,
        keep_separator: bool = True,
        separators: Optional[List[str]] = None,
    ):
        super().__init__(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=length_function,
            keep_separator=keep_separator,
            separators=separators,
        )
        # Longest separators first, so "###" matches before "#".
        self._headers_to_split_on = sorted(
            headers_to_split_on, key=lambda h: len(h[0]), reverse=True
        )
        self._return_each_line = return_each_line
        self._strip_headers = strip_headers
        # Lookup tables derived from the configuration:
        # separator -> level, metadata key order, level -> metadata key.
        self._header_levels = {sep: sep.count("#") for sep, _ in headers_to_split_on}
        self._header_names = [name for _, name in headers_to_split_on]
        self._level_to_name = {sep.count("#"): name for sep, name in headers_to_split_on}
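    # Illustration (not executed): with headers_to_split_on =
    # [("#", "h1"), ("##", "h2")] (key names are illustrative), this yields
    #   _headers_to_split_on == [("##", "h2"), ("#", "h1")]
    #   _header_levels       == {"#": 1, "##": 2}
    #   _level_to_name       == {1: "h1", 2: "h2"}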
    # Return the deepest header level recorded in a segment's metadata.
    def _get_header_level_from_metadata(self, metadata: Dict[str, str]) -> int:
        max_level = 0
        for sep, name in self._headers_to_split_on:
            if name in metadata:
                level = sep.count("#")
                if level > max_level:
                    max_level = level
        return max_level
    # Parse Markdown into logical lines, each paired with the metadata of the
    # headers active at that point in the document.
    def _parse_markdown_to_lines_with_metadata(self, text: str) -> List[LineType]:
        md = MarkdownIt()
        tokens = md.parse(text)
        lines_with_metadata: List[LineType] = []
        active_metadata: Dict[str, str] = {}
        header_stack: List[HeaderType] = []
        i = 0
        buffer: List[str] = []

        # Emit the buffered lines as one entry under the current metadata.
        def flush_buffer():
            if buffer:
                content = "\n".join(buffer).strip("\n")
                if content:
                    lines_with_metadata.append({
                        "content": content,
                        "metadata": active_metadata.copy(),
                    })
            buffer.clear()
        while i < len(tokens):
            token = tokens[i]
            if token.type == "heading_open":
                flush_buffer()
                level = int(token.tag[1])  # "h2" -> 2
                heading_text = ""
                if i + 1 < len(tokens) and tokens[i + 1].type == "inline":
                    heading_text = tokens[i + 1].content.strip()
                # Pop headers at this level or deeper; their metadata no
                # longer applies to what follows.
                while header_stack and header_stack[-1]["level"] >= level:
                    popped = header_stack.pop()
                    if popped["name"] in active_metadata:
                        del active_metadata[popped["name"]]
                name = self._level_to_name.get(level, f"header{level}")
                active_metadata[name] = heading_text
                header_stack.append({"level": level, "name": name, "data": heading_text})
                if not self._strip_headers:
                    buffer.append(f"{'#' * level} {heading_text}")
                i += 2  # skip the inline token; heading_close hits the else branch
            elif token.type == "fence" or token.type == "code_block":
                flush_buffer()
                if token.type == "fence":
                    # Rebuild the fenced block, keeping the info string
                    # (language tag) and the newline after the opening fence.
                    code = f"{token.markup}{token.info}\n{token.content}{token.markup}"
                else:
                    # Indented code blocks have no fence markup.
                    code = token.content
                lines_with_metadata.append({
                    "content": code.strip("\n"),
                    "metadata": active_metadata.copy(),
                })
                i += 1
            elif token.type == "paragraph_open":
                para = ""
                if i + 1 < len(tokens) and tokens[i + 1].type == "inline":
                    para = tokens[i + 1].content
                buffer.append(para)
                i += 3  # paragraph_open, inline, paragraph_close
            else:
                # Other block tokens (lists, blockquotes, tables, ...) get no
                # special handling; their inner paragraphs are caught above.
                i += 1
        flush_buffer()
        return lines_with_metadata
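    # Illustration (not executed): with headers_to_split_on =
    # [("#", "h1"), ("##", "h2")] (key names illustrative), parsing
    #   "# A\n\ntext\n\n## B\n\nmore"
    # yields
    #   [{"content": "# A\ntext",  "metadata": {"h1": "A"}},
    #    {"content": "## B\nmore", "metadata": {"h1": "A", "h2": "B"}}]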
    # Extract every heading found inside a chunk's content (used to extend
    # heading_path with sub-headings).
    def _extract_subheadings_from_content(self, content: str) -> List[str]:
        md = MarkdownIt()
        tokens = md.parse(content)
        subheadings = []
        for i in range(len(tokens)):
            if tokens[i].type == "heading_open" and i + 1 < len(tokens) and tokens[i + 1].type == "inline":
                heading_text = tokens[i + 1].content.strip()
                if heading_text:
                    subheadings.append(heading_text)
        return subheadings
    # Merge undersized chunks so the output is not dominated by fragments.
    def merge_small_chunks(self, docs: List[Document], chunk_size: int,
                           length_function: Callable[[str], int] = len) -> List[Document]:
        merged_docs = []
        current_content = []
        current_metadata = None
        current_length = 0

        # Combine heading_path lists, de-duplicated, preserving order.
        def merge_metadata(meta1, meta2):
            hp1 = meta1.get("heading_path", [])
            hp2 = meta2.get("heading_path", [])
            combined_hp = []
            seen = set()
            for h in hp1 + hp2:
                if h not in seen:
                    combined_hp.append(h)
                    seen.add(h)
            # Keys other than heading_path are taken from the first document.
            merged_meta = meta1.copy()
            merged_meta["heading_path"] = combined_hp
            return merged_meta

        for doc in docs:
            doc_len = length_function(doc.page_content)
            if current_length + doc_len <= chunk_size:
                current_content.append(doc.page_content)
                if current_metadata is None:
                    current_metadata = doc.metadata.copy()
                else:
                    current_metadata = merge_metadata(current_metadata, doc.metadata)
                current_length += doc_len
            else:
                if current_content:
                    merged_docs.append(
                        Document(
                            page_content="\n".join(current_content),
                            metadata=current_metadata,
                            vector=None,  # presumably a slot on a custom Document variant
                        )
                    )
                current_content = [doc.page_content]
                current_metadata = doc.metadata.copy()
                current_length = doc_len
        if current_content:
            merged_docs.append(
                Document(
                    page_content="\n".join(current_content),
                    metadata=current_metadata,
                    vector=None,
                )
            )
        return merged_docs
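    # Illustration (not executed): merging two docs whose metadata carries
    # heading_path ["A"] and ["A", "B"] produces one doc whose heading_path
    # is ["A", "B"]; other metadata keys come from the first doc.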
    # Split the given documents and return the chunked result.
    def split_documents(self, documents: Iterable[Document]) -> List[Document]:
        results = []
        for doc in documents:
            metadata = doc.metadata.copy()
            if self._return_each_line:
                # One output Document per parsed logical line.
                lines = self._parse_markdown_to_lines_with_metadata(doc.page_content)
                for i, line in enumerate(lines):
                    line_metadata = metadata.copy()
                    heading_path = [line["metadata"][h] for h in self._header_names if h in line["metadata"]]
                    line_metadata["heading_path"] = heading_path
                    line_metadata["position"] = i + 1
                    line_metadata["content_type"] = self._get_content_type(line["content"])
                    results.append(Document(page_content=line["content"], metadata=line_metadata))
                continue
            lines = self._parse_markdown_to_lines_with_metadata(doc.page_content)
            has_any_header = any(
                any(h in line["metadata"] for h in self._header_names)
                for line in lines
            )
            if has_any_header:
                chunks_data = self._recursive_chunk_and_split(lines, {}, 1)
                for chunk in chunks_data:
                    chunk_metadata = {**metadata, **chunk["metadata"]}
                    heading_path = [chunk_metadata[h] for h in self._header_names if h in chunk_metadata]
                    # Extend the path with sub-headings found inside the chunk,
                    # de-duplicated while preserving order.
                    subheadings = self._extract_subheadings_from_content(chunk["content"])
                    full_heading_path = heading_path + subheadings
                    seen = set()
                    deduped_heading_path = []
                    for title in full_heading_path:
                        if title not in seen:
                            deduped_heading_path.append(title)
                            seen.add(title)
                    chunk_metadata["heading_path"] = deduped_heading_path
                    chunk_metadata["content_type"] = self._get_content_type(chunk["content"])
                    # The per-header keys are folded into heading_path above.
                    for h in self._header_names:
                        chunk_metadata.pop(h, None)
                    results.append(Document(page_content=chunk["content"], metadata=chunk_metadata))
            else:
                # No recognized headers: fall back to plain recursive splitting.
                text_length = self._length_function(doc.page_content)
                if text_length <= self._chunk_size:
                    metadata["heading_path"] = []
                    metadata["content_type"] = self._get_content_type(doc.page_content)
                    results.append(Document(page_content=doc.page_content, metadata=metadata))
                else:
                    sub_texts = super().split_text(doc.page_content)
                    for chunk_text in sub_texts:
                        new_meta = metadata.copy()
                        new_meta["heading_path"] = []
                        new_meta["content_type"] = self._get_content_type(chunk_text)
                        results.append(Document(page_content=chunk_text, metadata=new_meta))
        if not self._return_each_line:
            results = self.merge_small_chunks(results, self._chunk_size, length_function=self._length_function)
        for i, doc in enumerate(results):
            new_metadata = dict(doc.metadata) if doc.metadata else {}
            new_metadata["heading_path"] = " >>> ".join(new_metadata.get("heading_path", []))
            new_metadata["position"] = i + 1
            doc.metadata = new_metadata
        return results
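    # Illustration (not executed): every returned Document carries metadata of
    # the form
    #   {"heading_path": "Guide >>> Setup", "position": 3,
    #    "content_type": ["text"], ...original doc metadata...}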
    # Recursively split parsed lines into segments at the given header level.
    def _recursive_chunk_and_split(
        self,
        lines: List[LineType],
        current_base_metadata: Dict[str, str],
        target_level: int = 1,
    ) -> List[LineType]:
        processed_segments: List[LineType] = []
        # Deepest header level present in the configuration (most "#"s).
        max_configured_header_level = 0
        if self._headers_to_split_on:
            max_configured_header_level = max(sep.count("#") for sep, _ in self._headers_to_split_on)
        # Base case: no deeper configured level to split on, or nothing left.
        if target_level > max_configured_header_level or not lines:
            if lines:
                aggregated_content = "\n".join([line["content"] for line in lines])
                segment_metadata = {**current_base_metadata, **lines[0]["metadata"]}
                chunks_from_segment = self._handle_section_splitting(
                    aggregated_content,
                    segment_metadata,
                    current_base_metadata,
                    target_level - 1,
                )
                processed_segments.extend(chunks_from_segment)
            return processed_segments
        # Buffer for the segment being accumulated. The first line always opens
        # the first segment, whether it is a target-level header or preamble
        # content that precedes one.
        current_segment_lines: List[LineType] = [lines[0]]
        start_index = 1
        # Walk the remaining lines; a header at the target level (or shallower)
        # starts a new segment.
        for i in range(start_index, len(lines)):
            line_data = lines[i]
            stripped_line_content = line_data["content"].strip()
            header_level_on_this_line = 0
            for sep, _ in self._headers_to_split_on:
                if stripped_line_content.startswith(sep) and (
                    len(stripped_line_content) == len(sep)
                    or stripped_line_content[len(sep)] == " "
                ):
                    header_level_on_this_line = sep.count("#")
                    break
            if 0 < header_level_on_this_line <= target_level:
                if current_segment_lines:
                    aggregated_content = "\n".join([l["content"] for l in current_segment_lines])
                    segment_metadata = {**current_base_metadata, **current_segment_lines[0]["metadata"]}
                    processed_segments.append({
                        "content": aggregated_content,
                        "metadata": segment_metadata,
                    })
                    current_segment_lines.clear()
                current_segment_lines.append(line_data)
            else:
                current_segment_lines.append(line_data)
        # Flush the final segment.
        if current_segment_lines:
            aggregated_content = "\n".join([l["content"] for l in current_segment_lines])
            segment_metadata = {**current_base_metadata, **current_segment_lines[0]["metadata"]}
            processed_segments.append({
                "content": aggregated_content,
                "metadata": segment_metadata,
            })
        # Recurse into each segment, or split it further if it is too large.
        final_chunks: List[LineType] = []
        for segment in processed_segments:
            segment_content = segment["content"]
            segment_metadata = segment["metadata"]
            # Deepest header level recorded for this segment; segments that do
            # not reach target_level (including header-less preamble) are
            # handled at the parent level.
            defined_level_for_handling = self._get_header_level_from_metadata(segment_metadata)
            if defined_level_for_handling < target_level:
                defined_level_for_handling = target_level - 1
            chunks_from_segment = self._handle_section_splitting(
                segment_content,
                segment_metadata,
                current_base_metadata,
                defined_level_for_handling,
            )
            final_chunks.extend(chunks_from_segment)
        return final_chunks
    # Final splitting for one logical section.
    # parent_base_metadata is accepted but currently unused.
    def _handle_section_splitting(
        self,
        content: str,
        section_metadata: Dict[str, str],
        parent_base_metadata: Dict[str, str],
        actual_defined_level: int,
    ) -> List[LineType]:
        section_length = self._length_function(content)
        result_chunks: List[LineType] = []
        if section_length <= self._chunk_size:
            # Small enough: keep the section as a single chunk.
            result_chunks.append({
                "content": content,
                "metadata": {
                    **section_metadata,
                    "content_type": self._get_content_type(content),
                },
            })
        else:
            # Too large: try to split at the next deeper header level.
            next_level_to_split_by = actual_defined_level + 1
            max_configured_header_level = max(sep.count("#") for sep, _ in self._headers_to_split_on)
            can_recurse_deeper = False
            lines_for_sub_splitting = []
            if next_level_to_split_by <= max_configured_header_level:
                # Re-parse this section with the same configuration.
                lines_for_sub_splitting = self._parse_markdown_to_lines_with_metadata(content)
                for line_data in lines_for_sub_splitting:
                    for sep_in_config, name_in_config in self._headers_to_split_on:
                        if sep_in_config.count("#") == next_level_to_split_by and \
                                name_in_config in line_data["metadata"]:
                            can_recurse_deeper = True
                            break
                    if can_recurse_deeper:
                        break
            if can_recurse_deeper:
                sub_segments = self._recursive_chunk_and_split(
                    lines_for_sub_splitting,
                    current_base_metadata=section_metadata,
                    target_level=next_level_to_split_by,
                )
                for sub_segment in sub_segments:
                    sub_segment_content = sub_segment["content"]
                    sub_segment_metadata = sub_segment["metadata"]
                    sub_segment_defined_level = self._get_header_level_from_metadata(sub_segment_metadata)
                    if sub_segment_defined_level < next_level_to_split_by:
                        sub_segment_defined_level = next_level_to_split_by
                    chunks_from_sub_segment = self._handle_section_splitting(
                        sub_segment_content,
                        sub_segment_metadata,
                        section_metadata,
                        sub_segment_defined_level,
                    )
                    result_chunks.extend(chunks_from_sub_segment)
            else:
                # No deeper headers available: fall back to plain recursive
                # character splitting.
                plain_text_sub_chunks = super().split_text(content)
                if not plain_text_sub_chunks and content.strip():
                    plain_text_sub_chunks = [content]
                # Re-attach the deepest known header so each plain-text chunk
                # stays self-describing.
                header_to_prepend_text = ""
                if not self._strip_headers:
                    deepest_level_in_metadata = 0
                    deepest_header_name_key = None
                    for header_sep_in_config, header_name_in_config in self._headers_to_split_on:
                        if header_name_in_config in section_metadata:
                            current_level = header_sep_in_config.count("#")
                            if current_level > deepest_level_in_metadata:
                                deepest_level_in_metadata = current_level
                                deepest_header_name_key = header_name_in_config
                    if deepest_header_name_key:
                        separator_to_use = ""
                        for sep, name in self._headers_to_split_on:
                            if name == deepest_header_name_key:
                                separator_to_use = sep
                                break
                        header_value = section_metadata.get(deepest_header_name_key)
                        if separator_to_use and header_value:
                            header_to_prepend_text = f"{separator_to_use} {header_value}\n\n"
                for pt_chunk in plain_text_sub_chunks:
                    pt_chunk_stripped = pt_chunk.lstrip()
                    # Avoid duplicating the header when the chunk already starts with it.
                    if header_to_prepend_text.strip() and pt_chunk_stripped.startswith(header_to_prepend_text.strip()):
                        final_content = pt_chunk
                    else:
                        final_content = header_to_prepend_text + pt_chunk if header_to_prepend_text else pt_chunk
                    result_chunks.append({
                        "content": final_content,
                        "metadata": {
                            **section_metadata,
                            "content_type": self._get_content_type(final_content),
                        },
                    })
        return result_chunks
    # Split text into plain string chunks, without metadata.
    def split_text(self, text: str) -> List[str]:
        lines_with_metadata = self._parse_markdown_to_lines_with_metadata(text)
        final_chunks_data = self._recursive_chunk_and_split(
            lines_with_metadata,
            current_base_metadata={},
            target_level=1,
        )
        return [lt["content"] for lt in final_chunks_data]
    # Classify a chunk's content as text, image, or both.
    def _get_content_type(self, content: str) -> List[str]:
        md = MarkdownIt()
        tokens = md.parse(content)
        has_text = False
        has_image = False
        for token in tokens:
            if token.type == "inline":
                for child in token.children or []:
                    if child.type == "image":
                        has_image = True
                    elif child.type == "text" and child.content.strip():
                        has_text = True
            elif token.type in ("fence", "code_block"):
                # Code counts as text. paragraph_open is deliberately not
                # treated as text here: an image sits inside a paragraph, so
                # counting paragraphs would make image-only content never
                # classify as ["image"].
                has_text = True
        if has_text and has_image:
            return ["text", "image"]
        elif has_image:
            return ["image"]
        else:
            return ["text"]
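A minimal usage sketch (assuming the langchain-core, langchain-text-splitters, and markdown-it-py packages are installed; the header key names h1/h2/h3 and the sample document are illustrative, not part of the original code):

if __name__ == "__main__":
    splitter = MarkdownTextSplitter(
        headers_to_split_on=[("#", "h1"), ("##", "h2"), ("###", "h3")],
        chunk_size=200,  # small, to make the merging behavior visible
    )
    sample = (
        "# Guide\n\n"
        "Intro paragraph.\n\n"
        "## Setup\n\n"
        "Install the package.\n\n"
        "## Usage\n\n"
        "Call split_documents on your docs.\n"
    )
    for doc in splitter.split_documents([Document(page_content=sample)]):
        print(doc.metadata["position"], "|", doc.metadata["heading_path"])
        print(doc.page_content)
        print("-" * 40)

With this small sample everything fits within chunk_size, so merge_small_chunks returns a single Document whose heading_path reads "Guide >>> Setup >>> Usage".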