# ElasticSearch-RAG-LangChain — from basics to practice: a retrieval example

import os
import glob

from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_community.document_loaders import (
    TextLoader,
    UnstructuredPDFLoader,
    UnstructuredMarkdownLoader,
)
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Updated import path: ElasticsearchStore moved out of langchain_community
# into the dedicated langchain-elasticsearch package.
from langchain_elasticsearch import ElasticsearchStore
# OpenAIEmbeddings also moved out of langchain_community into langchain-openai.
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain.agents import create_tool_calling_agent
from langchain.agents.agent import AgentExecutor
from langchain.tools.retriever import create_retriever_tool
from langgraph.graph import StateGraph, END

# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file.
load_dotenv()

# ========== Configuration ========== #
DOCS_PATH = "docs/"
INDEX_NAME = "rag_agent_index"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ES_URL = "https://localhost:9200/"
# ========== 步骤1:文档加载 ========== #
def load_documents(path: str) -> list:
    """Recursively load all .txt, .pdf and .md files under *path*.

    Each file is read with the loader matching its extension; files of any
    other type are skipped. Returns the combined list of loaded Document
    objects (empty when no matching files exist).
    """
    documents = []
    for file in glob.glob(os.path.join(path, "**/*"), recursive=True):
        # Pick a loader by extension; skip unsupported file types.
        if file.endswith(".txt"):
            loader = TextLoader(file)
        elif file.endswith(".pdf"):
            loader = UnstructuredPDFLoader(file)
        elif file.endswith(".md"):
            loader = UnstructuredMarkdownLoader(file)
        else:
            continue
        docs = loader.load()
        documents.extend(docs)
        # Original f-strings were garbled across several lines by the blog
        # export; rejoined into valid single-line f-strings here.
        print(f"加载了 {len(docs)} 个文档,从 {file}, 显示前2个:", docs[:2])
    print(f"已加载文档总数: {len(documents)}。")
    return documents
# ========== 步骤2:文档切片 ========== #
def split_documents(documents):
    """Split Documents into overlapping chunks suitable for embedding.

    Uses a 500-character chunk size with a 100-character overlap so context
    is preserved across chunk boundaries. Returns the list of chunk
    Documents.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    chunks = splitter.split_documents(documents)
    # Rejoined the f-string that the blog export had split across lines.
    print(f"拆分成 {len(chunks)} 个块,显示前2个:", chunks[:2])
    return chunks
# ========== 步骤3:向量存储(Elasticsearch) ========== #
from elasticsearch import Elasticsearch
import urllib3
import ssl  # NOTE(review): imported but never used below — candidate for removal

# Suppress InsecureRequestWarning: the Elasticsearch client below connects
# with verify_certs=False (local dev cluster with a self-signed certificate).
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

def store_embeddings(chunks):
    """Embed *chunks* with OpenAI and store them in Elasticsearch.

    Documents are indexed into INDEX_NAME on the cluster at ES_URL.
    Returns the ElasticsearchStore so callers can build a retriever from it.
    """
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

    # Local dev cluster uses a self-signed certificate, so verification is
    # disabled here — do NOT ship verify_certs=False to production.
    # Credentials are read from the environment when available instead of
    # being hard-coded in source; the fallback preserves the original
    # behavior for local use.
    es_client = Elasticsearch(
        hosts=[ES_URL],
        basic_auth=(
            os.getenv("ES_USERNAME", "elastic"),
            os.getenv("ES_PASSWORD", "aBq2tOp_scT-crZnaKJa"),
        ),
        verify_certs=False,
        ssl_show_warn=False,
        request_timeout=30,
        max_retries=3,
    )

    # Hand the pre-configured client to the vector store.
    db = ElasticsearchStore(
        index_name=INDEX_NAME,
        embedding=embeddings,
        es_connection=es_client,
    )

    # Index the chunks exactly once. (The original called add_documents
    # twice in a row, storing every chunk in duplicate.)
    added_ids = db.add_documents(chunks)

    print(f"在 Elasticsearch 中存储了 {len(added_ids)} 个文档块")
    return db