Function | Explanation | Import |
---|---|---|
doc=[Document(page_content="text", metadata={"source": "..."})] | LangChain's type for handling text | from langchain_core.documents import Document |
vectorstore=Chroma.from_documents(documents=doc, embedding=MistralAIEmbeddings()) | Turn documents into a vector store | from langchain_chroma import Chroma, from langchain_mistralai import MistralAIEmbeddings |
vectorstore.similarity_search("text") | Similarity search over the vector store by string | - |
await vectorstore.asimilarity_search("text") | Asynchronous similarity search | - |
vectorstore.similarity_search_with_score("text") | Similarity search that also returns scores | - |
embedding=MistralAIEmbeddings().embed_query("text") | Embed text into vector form | from langchain_mistralai import MistralAIEmbeddings |
retriever=RunnableLambda(vectorstore.similarity_search).bind(k=1) | Create a retriever from the vector store's similarity search | from langchain_core.runnables import RunnableLambda |
retriever.invoke("string") | Retrieve for a single string | - |
retriever.batch([list of strings]) | Batch retrieval | - |
retriever=vectorstore.as_retriever(search_type="similarity", search_kwargs={"k":1}) | Create a retriever with as_retriever | - |
"question": RunnablePassthrough() | Pass the input through unchanged | from langchain_core.runnables import RunnablePassthrough |
Vector stores and retrievers
- What are they?
    - Vector store: converts text into vectors and stores them in a database
    - Retriever: fetches the entries most similar to a query from that database
    - Retrieval-augmented generation (RAG): uses the retrieved content to ground the model's answer
- What are they for?
    - They let a large model answer against your own local data, giving more accurate, source-grounded results
Turning documents into a vector store (vectorstore)
- A vector store is not a Runnable, so it cannot be piped into a chain directly; see the sketch just below for the two ways to wrap it
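Both wrapping approaches are worked through in full later in this note; a minimal sketch, assuming a vectorstore already exists:
from langchain_core.runnables import RunnableLambda
retriever = RunnableLambda(vectorstore.similarity_search).bind(k=1)  # wrap the search method
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})         # or use as_retriever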
# Load environment variables
from dotenv import load_dotenv
load_dotenv()
True
# Enable LangSmith tracing; add LANGCHAIN_API_KEY to the .env file
import os
os.environ["LANGCHAIN_TRACING_V2"]="true"
# Create some documents from different sources
from langchain_core.documents import Document # Document turns unstructured text into structured data
doc=[
Document(page_content="狗是人类很棒的朋友,它们忠诚、友好、聪明。",
metadata={"source":"mamal_pets-doc"}),
Document(page_content="猫通常喜欢自己的空间,它们独立、优雅、聪明。",
metadata={"source":"mamal_pets-doc"}),
Document(page_content="鸟类通常是杂食性的,它们灵活、好奇、聪明。",
metadata={"source":"brid_pets-doc"}),
Document(page_content="金鱼是冷血动物,它们适应性强、好奇、聪明。",
metadata={"source":"fish_pets-doc"}),
Document(page_content="乌龟是爬行动物,它们适应性强、好奇、聪明。",
metadata={"source":"reptile_pets-doc"}),
]
# Turn the documents into a vector store
from langchain_chroma import Chroma
from langchain_mistralai import MistralAIEmbeddings
vectorstore=Chroma.from_documents( # create the vector store
documents=doc,
embedding=MistralAIEmbeddings()
)
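By default this store lives in memory; Chroma can also persist the index to disk via the persist_directory argument (a sketch; the path is illustrative):
vectorstore = Chroma.from_documents(
    documents=doc,
    embedding=MistralAIEmbeddings(),
    persist_directory="./chroma_db",  # illustrative path; writes the index to disk
)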
# Similarity search over the vector store by string
vectorstore.similarity_search("猫")
[Document(metadata={'source': 'mammal_pets-doc'}, page_content='猫通常喜欢自己的空间,它们独立、优雅、聪明。'),
 Document(metadata={'source': 'fish_pets-doc'}, page_content='金鱼是冷血动物,它们适应性强、好奇、聪明。'),
 Document(metadata={'source': 'reptile_pets-doc'}, page_content='乌龟是爬行动物,它们适应性强、好奇、聪明。'),
 Document(metadata={'source': 'bird_pets-doc'}, page_content='鸟类通常是杂食性的,它们灵活、好奇、聪明。')]
# Asynchronous search
await vectorstore.asimilarity_search("猫")
[Document(metadata={'source': 'mammal_pets-doc'}, page_content='猫通常喜欢自己的空间,它们独立、优雅、聪明。'),
 Document(metadata={'source': 'fish_pets-doc'}, page_content='金鱼是冷血动物,它们适应性强、好奇、聪明。'),
 Document(metadata={'source': 'reptile_pets-doc'}, page_content='乌龟是爬行动物,它们适应性强、好奇、聪明。'),
 Document(metadata={'source': 'bird_pets-doc'}, page_content='鸟类通常是杂食性的,它们灵活、好奇、聪明。')]
# Similarity search by string, also returning a score (for Chroma this is a distance: lower means more similar)
vectorstore.similarity_search_with_score("猫")
[(Document(metadata={'source': 'mammal_pets-doc'}, page_content='猫通常喜欢自己的空间,它们独立、优雅、聪明。'),
  0.3135683536529541),
 (Document(metadata={'source': 'fish_pets-doc'}, page_content='金鱼是冷血动物,它们适应性强、好奇、聪明。'),
  0.5208978056907654),
 (Document(metadata={'source': 'reptile_pets-doc'}, page_content='乌龟是爬行动物,它们适应性强、好奇、聪明。'),
  0.5280909538269043),
 (Document(metadata={'source': 'bird_pets-doc'}, page_content='鸟类通常是杂食性的,它们灵活、好奇、聪明。'),
  0.5349432826042175)]
# Embed the query into a vector first, then search by vector
embedding=MistralAIEmbeddings().embed_query("猫")
vectorstore.similarity_search_by_vector(embedding)
[Document(metadata={'source': 'mammal_pets-doc'}, page_content='猫通常喜欢自己的空间,它们独立、优雅、聪明。'),
 Document(metadata={'source': 'fish_pets-doc'}, page_content='金鱼是冷血动物,它们适应性强、好奇、聪明。'),
 Document(metadata={'source': 'reptile_pets-doc'}, page_content='乌龟是爬行动物,它们适应性强、好奇、聪明。'),
 Document(metadata={'source': 'bird_pets-doc'}, page_content='鸟类通常是杂食性的,它们灵活、好奇、聪明。')]
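embed_query returns a plain list of floats; a quick way to inspect it (a sketch; the exact dimension depends on the Mistral embedding model):
vec = MistralAIEmbeddings().embed_query("猫")
print(len(vec), vec[:3])  # vector dimension and the first few components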
Using a retriever (Retriever)
- A retriever is a Runnable, so it can be piped into a chain directly
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda
retriever=RunnableLambda(vectorstore.similarity_search).bind(k=1) # create the retriever
# RunnableLambda turns a plain function into a Runnable
# bind(k=1): return only the top 1 result per search
# Use the retriever
retriever.batch(["猫","狗"]) # batch search
[[Document(metadata={'source': 'mammal_pets-doc'}, page_content='猫通常喜欢自己的空间,它们独立、优雅、聪明。')],
 [Document(metadata={'source': 'mammal_pets-doc'}, page_content='狗是人类很棒的朋友,它们忠诚、友好、聪明。')]]
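The table at the top also lists invoke for a single query; with k=1 bound, it returns a one-element list:
retriever.invoke("猫")  # -> [Document(page_content="猫通常喜欢自己的空间,它们独立、优雅、聪明。", ...)]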
# Create a retriever with as_retriever (more flexible)
retriever=vectorstore.as_retriever(
search_type="similarity", #设置搜索类型为相似度搜索,还有"mmr"最大边际相关性搜索,“similarity_score_threshold”相似度阈值搜索
search_kwargs={"k":1}
)
retriever.batch(["猫","狗"])
[[Document(metadata={'source': 'mammal_pets-doc'}, page_content='猫通常喜欢自己的空间,它们独立、优雅、聪明。')],
 [Document(metadata={'source': 'mammal_pets-doc'}, page_content='狗是人类很棒的朋友,它们忠诚、友好、聪明。')]]
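A sketch of the other search_type options mentioned above (the parameter values are illustrative):
mmr_retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 1, "fetch_k": 4},  # fetch 4 candidates, keep the 1 best after diversity re-ranking
)
threshold_retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.5},  # only return results with relevance score above 0.5
)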
Usage example
from langchain_mistralai import ChatMistralAI
chat_model=ChatMistralAI(model="mistral-large-latest")
chat_model.invoke("你好")
AIMessage(content='你好!有什么我可以帮忙的吗?', additional_kwargs={}, response_metadata={'token_usage': {'prompt_tokens': 6, 'total_tokens': 23, 'completion_tokens': 17}, 'model': 'mistral-large-latest', 'finish_reason': 'stop'}, id='run-e17c914e-4aff-427a-a674-df93eb3896e4-0', usage_metadata={'input_tokens': 6, 'output_tokens': 17, 'total_tokens': 23})
# Create the prompt
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
message="""
你只能使用提供的内容来回答以下问题:
问题:
{question}
内容:
{context}
"""
prompt=ChatPromptTemplate.from_messages([
    ("human", message)
])
rag_chain={
    "context":retriever, # what the retriever finds for the question becomes the context the model answers from
    "question":RunnablePassthrough() # pass the question straight through to the prompt
} | prompt | chat_model
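The dict on the left of the chain is coerced into a RunnableParallel automatically; written out explicitly, the same chain reads (a sketch):
from langchain_core.runnables import RunnableParallel
rag_chain = RunnableParallel(
    context=retriever,              # run the retriever on the input string
    question=RunnablePassthrough(), # and pass the same string through unchanged
) | prompt | chat_model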
# Invoke the RAG chain
response=rag_chain.invoke("猫有哪些特点?")
# What actually happens:
# 1. {"context": [Document(page_content="猫通常喜欢自己的空间,它们独立、优雅、聪明。", ...)],
#    "question": "猫有哪些特点?"}
# 2. The values are filled into the prompt template
# 3. The filled prompt is sent to the language model
# 4. The model generates the answer
print(response.content)
(this call errored in the recorded run)
Example (English)
documents = [
Document(
page_content="Dogs are great companions, known for their loyalty and friendliness.",
metadata={"source": "mammal-pets-doc"},
),
Document(
page_content="Cats are independent pets that often enjoy their own space.",
metadata={"source": "mammal-pets-doc"},
),
Document(
page_content="Goldfish are popular pets for beginners, requiring relatively simple care.",
metadata={"source": "fish-pets-doc"},
),
Document(
page_content="Parrots are intelligent birds capable of mimicking human speech.",
metadata={"source": "bird-pets-doc"},
),
Document(
page_content="Rabbits are social animals that need plenty of space to hop around.",
metadata={"source": "mammal-pets-doc"},
),
]
vectorstore = Chroma.from_documents(
documents,
embedding=MistralAIEmbeddings(),
)
retriever = vectorstore.as_retriever(
search_type="similarity",
search_kwargs={"k": 1},
)
message = """
Answer this question using the provided context only.
{question}
Context:
{context}
"""
prompt = ChatPromptTemplate.from_messages([("human", message)])
rag_chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | chat_model
response = rag_chain.invoke("tell me about cats")
print(response.content)
Cats are independent pets that often enjoy their own space.