Building on LangChain 学习(二), this post switches to an adjusted, optimized embedding model and then reruns the question answering.
import os
import nltk
from langchain_community.document_loaders import PyPDFLoader
from sentence_transformers import SentenceTransformer
from pymilvus import connections, utility, FieldSchema, CollectionSchema, DataType, Collection
# Disable the Hugging Face symlinks warning
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"
# Step 1: Load the PDF file from the document directory
loader = PyPDFLoader("pdf/AI大模型面试题.pdf")
documents = loader.load()
# Inspect the loaded documents
# print(f"Loaded {len(documents)} pages.")
# total_text_length = sum(len(doc.page_content) for doc in documents)
# print(f"Total text length: {total_text_length} characters.")
# Step 2: Create the Sentence-Transformers model
model_name = "sentence-transformers/all-mpnet-base-v2"  # loaded with SentenceTransformer
model = SentenceTransformer(model_name)
# Step 3: Split the documents
# Download NLTK's punkt sentence tokenizer
try:
    nltk.download("punkt", quiet=True)
except Exception as e:
    print(f"Error downloading the NLTK punkt model: {e}")
# Use nltk to split each document into sentences
all_sentences = []
texts = []
ids = []  # list of sentence IDs
sentence_id = 0  # initialize the ID counter
for document in documents:
    # print(document.__dict__)
    # Make sure the document has a page_content attribute
    if hasattr(document, "page_content"):
        document_text = document.page_content  # get the text content
        sentences = nltk.sent_tokenize(document_text)  # split this text into sentences
        all_sentences.extend(sentences)  # add the sentences to the overall list
        texts.extend(sentences)  # store the flattened text
        # Generate a unique ID for each sentence
        ids.extend([sentence_id + i for i in range(len(sentences))])
        sentence_id += len(sentences)
    else:
        print("This document is missing the 'page_content' attribute:", document)
# Step 4: Generate an embedding vector for each sentence
try:
    sentence_embeddings = model.encode(all_sentences, convert_to_tensor=True)
    # Print the embedding results
    print("Embedding shape:", sentence_embeddings.shape)
    print("Embeddings:", sentence_embeddings)
except Exception as e:
    print(f"Error generating embeddings: {e}")
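# Note: the collection below is searched with the IP (inner product) metric, and
# all-mpnet-base-v2 embeddings are not L2-normalized by default, so IP scores are not
# plain cosine similarities. A minimal optional tweak (an assumption about the intended
# similarity, not part of the original flow) is to normalize at encode time:
# sentence_embeddings = model.encode(all_sentences, convert_to_tensor=True, normalize_embeddings=True)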
# Step 5: Connect to the Milvus database
connections.connect(host="127.99.199.88", port="19530", alias="default", user="root", password="Milvus2020")
# Step 6: Define the schema of the Milvus collection
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=False),  # primary key field
    FieldSchema(name="sentence_embedding", dtype=DataType.FLOAT_VECTOR, dim=768),
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=5000)
]
schema = CollectionSchema(fields=fields, description="sentence embeddings")
# Step 7: Create the collection (if it does not exist)
collection_name = "sentence_embeddings"
if not utility.has_collection(collection_name):
    collection = Collection(name=collection_name, schema=schema)
else:
    collection = Collection(name=collection_name)
# Step 8: Prepare the data
data = [
    ids,                           # list of sentence IDs
    sentence_embeddings.tolist(),  # embeddings converted to list format
    texts                          # text content
]
# Step 9: Insert the data into Milvus
collection.insert(data)
# Step 10: Flush the collection to make sure the data is persisted
collection.flush()
print("Data inserted into Milvus successfully!")
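# Optional sanity check (a small addition, not in the original flow): confirm how many
# rows the collection holds after the flush.
print("Rows in collection:", collection.num_entities)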
from langchain_community.chat_models.tongyi import ChatTongyi
from pymilvus import connections, Collection
from sentence_transformers import SentenceTransformer
# Step 1: Create the Sentence-Transformers embedding model
model_name = "sentence-transformers/all-mpnet-base-v2"  # loaded with SentenceTransformer
model = SentenceTransformer(model_name)
# Step 2: Connect to the Milvus database
connections.connect(host="127.99.199.88", port="19530", alias="default", user="root", password="Milvus2020")
# Step 3: Load the collection
collection_name = "sentence_embeddings"
collection = Collection(name=collection_name)
# Step 4: Create an index
indexes = collection.indexes
if not any(index.field_name == "sentence_embedding" for index in indexes):
    index_params = {
        "index_type": "IVF_SQ8",  # or choose another index type
        "metric_type": "IP",      # L2, IP, etc. are available
        "params": {"nlist": 32}   # nlist should be tuned to your data
    }
    collection.create_index(field_name="sentence_embedding", index_params=index_params)
else:
    print("An index on the embedding field already exists; no need to recreate it.")
collection.load()  # load the collection so it can be searched
# Query the first 10 rows in the collection
# all_data = collection.query(expr="id >= 0", limit=10, output_fields=["id", "sentence_embedding", "text"])
# print("Data in the collection:", all_data)
# Step 5: Build the embedding vector for the query
query_text = "目前主流的大模型体系有哪些?"
query_embedding = model.encode([query_text])
# Convert the query embedding to a list so that Milvus accepts it
query_embedding = query_embedding.tolist()
# Print the query vector
print(f"Query vector dimension: {len(query_embedding[0])}")
print(f"Query vector: {query_embedding}")
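# If the stored vectors were normalized (see the note in Step 4 above), the query vector
# should be normalized the same way so the IP scores stay comparable; the mirrored,
# optional tweak would be:
# query_embedding = model.encode([query_text], normalize_embeddings=True).tolist()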
# Step 6: Retrieve similar documents from Milvus
search_params = {"metric_type": "IP", "params": {"nprobe": 16}}  # similarity metric and search parameters
results = collection.search(
    data=query_embedding,  # the query data must be a 2-D list
    anns_field="sentence_embedding",
    param=search_params,
    limit=5,  # return the top 5 most similar results
    output_fields=["id", "sentence_embedding", "text"]  # fields to return
)
print(results)
# Step 7: Collect the retrieval results
retrieved_texts = [result.entity.get("text") for result in results[0]]  # extract the text content
print(retrieved_texts)
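# Each hit also carries a similarity score in `distance` (with IP, higher means more similar).
# A small optional sketch for inspecting scores alongside the retrieved text:
for hit in results[0]:
    print(f"score={hit.distance:.4f}  text={hit.entity.get('text')[:80]}")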
# Step 8: Build the prompt
import os
import json  # for parsing the model response
from openai import OpenAI
from dotenv import load_dotenv
# Load environment variables
load_dotenv(dotenv_path="config/config.env")
app_key = os.getenv("DASHSCOPE_API_KEY")
client = OpenAI(
    # If the environment variable is not configured, replace the line below with your Model Studio (Bailian) API key: api_key="sk-xxx",
    api_key=app_key,
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)
completion = client.chat.completions.create(
    model="qwen-plus",  # model list: https://help.aliyun.com/zh/model-studio/getting-started/models
    messages=[
        {"role": "system", "content": "你是一个问答机器人。你的任务是根据下述给定的已知信息回答用户问题。确保你的回复完全依据下述已知信息,不要编造答案。如果下述已知信息不足以回答用户的问题,请直接回复'我无法回答您的问题'。"},
        {"role": "user", "content": f"已知信息:\n{retrieved_texts}\n\n用户问题:\n{query_text}\n\n请用中文回答用户问题。"}
    ]
)
# print(completion.model_dump_json())
try:
    response_json = json.loads(completion.model_dump_json())  # parse the JSON string into a dict
    print("Model answer:", response_json['choices'][0]['message']['content'])
except json.JSONDecodeError as e:
    print(f"Error parsing JSON: {e}")
except Exception as e:
    print(f"Error getting the model's answer: {e}")
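# Note: with the OpenAI-compatible SDK the reply can also be read directly from the
# response object, without a JSON round trip (shown as an alternative, not a replacement):
# print("Model answer:", completion.choices[0].message.content)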