混合检索源码解析
RetrievalService.retrieve
代码路径:`api/core/rag/datasource/retrieval_service.py`
该函数根据指定的检索方法(语义检索 semantic_search、全文检索 full_text_search、关键词检索 keyword_search 或混合检索 hybrid_search)在指定数据集上执行检索,并返回检索结果列表。
def retrieve(cls, retrival_method: str, dataset_id: str, query: str,
top_k: int, score_threshold: Optional[float] = .0,
reranking_model: Optional[dict] = None, reranking_mode: Optional[str] = None,
weights: Optional[dict] = None):
_"""_
_ 根据指定的检索方法执行检索操作。_
_ :param retrival_method: 检索方法类型。 semantic_search full_text_search keyword_search,hybrid_search_
_ :param dataset_id: 数据集ID。_
_ :param query: 检索查询字符串。_
_ :param top_k: 返回结果的最大数量。_
_ :param score_threshold: 分数阈值,用于过滤结果。_
_ :param reranking_model: 重排序模型配置,用于对结果进行二次排序。_
_ :return: 检索结果列表。_
_ """_
_ _dataset = db.session.query(Dataset).filter(
Dataset.id == dataset_id
).first()
if not dataset or dataset.available_document_count == 0 or dataset.available_segment_count == 0:
return []
all_documents = [] # 存储所有检索结果
keyword_search_documents = []
embedding_search_documents = []
full_text_search_documents = []
hybrid_search_documents = []
threads = [] # 存储执行检索的线程
exceptions = [] # 存储执行过程中遇到的异常
# retrieval_model source with keyword
# 关键词检索
if retrival_method == 'keyword_search':
keyword_thread = threading.Thread(target=RetrievalService.keyword_search, kwargs={
'flask_app': current_app._get_current_object(),
'dataset_id': dataset_id,
'query': query,
'top_k': top_k,
'all_documents': all_documents,
'exceptions': exceptions,
})
threads.append(keyword_thread)
keyword_thread.start()
# 向量检索(混合检索中也会调用)
if RetrievalMethod.is_support_semantic_search(retrival_method):
embedding_thread = threading.Thread(target=RetrievalService.embedding_search, kwargs={
'flask_app': current_app._get_current_object(),
'dataset_id': dataset_id,
'query': query,
'top_k': top_k,
'score_threshold': score_threshold,
'reranking_model': reranking_model,
'all_documents': all_documents,
'retrival_method': retrival_method,
'exceptions': exceptions,
})
threads.append(embedding_thread)
embedding_thread.start()
# 文本检索(混合检索中也会调用)
if RetrievalMethod.is_support_fulltext_search(retrival_method):
full_text_index_thread = threading.Thread(target=RetrievalService.full_text_index_search, kwargs={
'flask_app': current_app._get_current_object(),
'dataset_id': dataset_id,
'query': query,
'retrival_method': retrival_method,
'scor