- 直接上代码和注释。
- 有兴趣尝试的读者欢迎交流。
- 效果仍在验证中。
### 1. 短文本处理 (<500 tokens)
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2') # small model producing 384-dimensional embeddings
def process_short(text):
    """Embed a short text in a single pass, without splitting it first.

    Args:
        text: Input handed straight to the module-level ``model``.

    Returns:
        The result of ``model.encode`` called with ``convert_to_tensor=True``.
    """
    # NOTE(review): the encoder has its own maximum input length and
    # presumably truncates anything beyond it -- confirm that the
    # "<500 tokens" tier actually fits the configured model.
    embedding = model.encode(text, convert_to_tensor=True)
    return embedding
# Example
short_text = "自然语言处理的基础概念" # about 15 tokens long
vector = process_short(short_text)
### 2. 中长文本处理 (500-2000 tokens)
from langchain_text_splitters import RecursiveCharacterTextSplitter
def process_medium(text):
    """Split *text* into overlapping chunks and embed every chunk.

    The splitter is configured with ``chunk_size=500`` and
    ``chunk_overlap=50``.

    NOTE(review): no ``length_function`` is passed, so the splitter
    presumably measures length in characters rather than model tokens --
    confirm this matches the intended 500-2000 token tier.

    Args:
        text: The document to split and embed.

    Returns:
        A list with one embedding per chunk, in chunk order. The list is
        empty when the splitter produces no chunks.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        # Tried in order: paragraph break, line break, then sentence-ending
        # punctuation. Full-width "!" / "?" sit next to the ASCII forms
        # because Chinese text normally uses the full-width marks. The
        # trailing "" is the last-resort separator, so a long run without
        # any of the marks above can still be cut down to chunk_size
        # instead of being emitted as one oversized chunk.
        separators=["\n\n", "\n", "。", "!", "?", "!", "?", ""],
    )
    chunks = splitter.split_text(text)
    if not chunks:
        # Nothing to embed; skip the model call entirely.
        return []
    # Encode all chunks in one batched call rather than one call per chunk;
    # list() unpacks the batched result back into per-chunk embeddings.
    return list(model.encode(chunks))
# 示例
medi