[AI] Qwen + RAG System

A RAG system built by combining qwen2.5-7b + bge-reranker-base + stella-base-zh-v2.

The test data and the models have to be downloaded yourself; ModelScope is sufficient for that.

Test data link: Streamlit
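
If the models are not yet on disk, ModelScope's `snapshot_download` can fetch them. This is a minimal sketch only: the repo IDs and cache layout are assumptions, and the directories it returns may not match the paths hard-coded below exactly, so adjust one or the other.

```python
# Hedged download sketch -- verify the exact repo IDs on ModelScope before running.
from modelscope import snapshot_download

cache_dir = '/home/sky/model_data'  # matches the base path used in the service below
qwen_dir = snapshot_download('Qwen/Qwen2.5-7B-Instruct', cache_dir=cache_dir)
reranker_dir = snapshot_download('BAAI/bge-reranker-base', cache_dir=cache_dir)
embed_dir = snapshot_download('infgrad/stella-base-zh-v2', cache_dir=cache_dir)  # repo ID is an assumption

print(qwen_dir, reranker_dir, embed_dir)  # actual local paths to point the loaders at
```

The complete Flask service follows.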

```python
# -*- coding: utf-8 -*-

from flask import Flask, request, jsonify
import jieba
import json
import pdfplumber
from rank_bm25 import BM25Okapi
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
import numpy as np


app = Flask(__name__)

# Load models and initialize runtime state
device = "cuda" if torch.cuda.is_available() else "cpu"

# Cross-encoder used to rerank retrieved chunks
rerank_tokenizer = AutoTokenizer.from_pretrained('/home/sky/model_data/bge-reranker-base')
rerank_model = AutoModelForSequenceClassification.from_pretrained('/home/sky/model_data/bge-reranker-base').to(device)

# Generator model (Qwen2.5-7B-Instruct); device_map="auto" already places the weights,
# so no additional .to(device) call is needed
model_path = '/home/sky/model_data/Qwen/Qwen2.5-7B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_path, torch_dtype="auto",
                                             device_map="auto")

# Sentence embedding model for dense retrieval
sent_model = SentenceTransformer('/home/sky/model_data/stella-base-zh-v2')

pdf_content = []


def load_pdf(knowledge_data_path):
    global pdf_content
    pdf_content = []
    with pdfplumber.open(knowledge_data_path) as pdf:
        for page_idx in range(len(pdf.pages)):
            # extract_text() may return None on empty pages
            text = pdf.pages[page_idx].extract_text() or ''
            new_text = split_text_fixed_size(text, chunk_size=100, overlap_size=5)
            for chunk_text in new_text:
                pdf_content.append({
                    'page': 'page_' + str(page_idx + 1),
                    'content': chunk_text
                })

    # Build the sparse (BM25) and dense (sentence-embedding) indexes
    global pdf_content_words, bm25, pdf_embeddings
    pdf_content_words = [jieba.lcut(x['content']) for x in pdf_content]
    bm25 = BM25Okapi(pdf_content_words)
    pdf_content_sentences = [x['content'] for x in pdf_content]
    pdf_embeddings = sent_model.encode(pdf_content_sentences, normalize_embeddings=True)


def split_text_fixed_size(text, chunk_size, overlap_size):
    # Split text into fixed-size chunks; each chunk extends overlap_size characters
    # back from its nominal start so that neighbouring chunks share some context
    new_text = []
    for i in range(0, len(text), chunk_size - overlap_size):
        end = min(i + chunk_size, len(text))
        start = max(0, i - overlap_size)
        new_text.append(text[start:end])
    return new_text


def get_rank_index(max_score_page_idxs_, question_, pdf_content_):
    # Rerank the candidate chunks with the bge cross-encoder and return the best score and index
    pairs = [[question_, pdf_content_[idx]['content']] for idx in max_score_page_idxs_]

    inputs = rerank_tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512).to(device)
    with torch.no_grad():
        logits = rerank_model(**inputs, return_dict=True).logits.view(-1).float().cpu().numpy()
        # Apply sigmoid to convert logits to probabilities
        scores = 1 / (1 + np.exp(-logits))  # Sigmoid function

    max_score_idx = scores.argmax()
    max_score = scores[max_score_idx]
    index = max_score_page_idxs_[max_score_idx]
    return max_score, index

def qwen_preprocess(tokenizer_, ziliao, question):
    # Build the chat messages: with reference material (ziliao) the model is asked to answer
    # strictly from it; without material it falls back to a plain assistant prompt
    if ziliao:
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"帮我结合给定的资料,回答问题。如果问题答案无法从资料中获得,"
                                        f"输出结合给定的资料,无法回答问题. 如果找到答案, 就输出找到的答案, 资料:{ziliao}, 问题:{question}"},
        ]
    else:
        messages = [
            {"role": "system", "content": "你是由智子引擎研发的AI小助手,可以帮助解答各种领域的问题."},
            {"role": "user", "content": f"问题:{question}"},
        ]

    # Render the messages with Qwen's chat template and tokenize for generation
    text = tokenizer_.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs_ = tokenizer_([text], return_tensors="pt").to(device)

    return model_inputs_


@app.route('/qa', methods=['POST'])
def qa():
    data = request.json
    question = data.get('question', '')

    if not question:
        return jsonify({'error': 'No question provided'}), 400

    # Sparse retrieval: BM25 over the jieba-tokenized chunks, keep the top 10
    doc_scores = bm25.get_scores(jieba.lcut(question))
    bm25_score_page_idxs = doc_scores.argsort()[-10:][::-1]

    # Dense retrieval: cosine similarity against the chunk embeddings, keep the top 10
    question_embedding = sent_model.encode([question], normalize_embeddings=True)
    score = question_embedding @ pdf_embeddings.T
    ste_score_page_idxs = score.argsort()[0][-10:][::-1]

    bm25_score, bm25_index = get_rank_index(bm25_score_page_idxs, question, pdf_content)
    ste_score, ste_index = get_rank_index(ste_score_page_idxs, question, pdf_content)

    # Keep whichever candidate the reranker scores higher
    max_score_page_idx = ste_index if ste_score >= bm25_score else bm25_index
    final_score = ste_score if ste_score >= bm25_score else bm25_score

    # Threshold that decides whether the retrieved chunk is a usable answer source
    threshold = 0.5
    if final_score < threshold:
        # Score too low: answer with the Qwen model alone, without reference material
        model_inputs = qwen_preprocess(tokenizer, "", question)
        generated_ids = model.generate(
            model_inputs.input_ids,
            max_new_tokens=128,
            attention_mask=model_inputs.attention_mask,
            pad_token_id=tokenizer.eos_token_id
        )
        # Strip the prompt tokens so only the newly generated answer is decoded
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return jsonify({
            'answer': response,
            'score': float(final_score),
            'reference_page': "N/A"  # no reference page when answering without retrieval
        })


    # Score above threshold: generate an answer grounded in the best-matching chunk
    model_inputs = qwen_preprocess(tokenizer, pdf_content[max_score_page_idx]['content'], question)

    generated_ids = model.generate(
        model_inputs.input_ids,
        max_new_tokens=128,
        attention_mask=model_inputs.attention_mask,
        pad_token_id=tokenizer.eos_token_id
    )
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return jsonify({
        'answer': response,
        'score': float(final_score),
        'reference_page': pdf_content[max_score_page_idx]['page']
    })


if __name__ == '__main__':
    # Build the document index once at startup, then serve the QA endpoint
    knowledge_data_path = './初赛训练数据集.pdf'
    load_pdf(knowledge_data_path)
    app.run(host='0.0.0.0', port=5201, debug=False)
```
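
Once the service is up, the `/qa` endpoint can be exercised with a plain HTTP POST. A minimal client sketch, assuming the server runs on localhost with the port configured above (the example question is a placeholder):

```python
# Minimal client for the /qa endpoint; host and port follow app.run() above.
import requests

resp = requests.post(
    'http://localhost:5201/qa',
    json={'question': '这份资料的主要内容是什么?'}  # placeholder question
)
print(resp.json())  # -> {'answer': ..., 'score': ..., 'reference_page': ...}
```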

### Ollama and RAG

#### Ollama overview

Ollama is a framework for running large language models locally. It ships with an efficient inference runtime and works well even in resource-constrained environments[^2].

#### How RAG works

RAG (Retrieval-Augmented Generation) combines information retrieval with generative models. For each user query it first finds the best-matching passages in a large document collection and only then builds the response, which is why the answers stay highly relevant to the question[^3].

### Putting the pieces together

To build a RAG application on top of Ollama, an architecture along the following lines can be used (a generation-step sketch follows the list):

1. **Data preprocessing**
   Split the source text into small passages or sentences suitable for indexing and store them in a vector database for fast access.

2. **Query parsing and similarity search**
   When a new question arrives, convert it into an embedding and compare it against the stored material to find the best matches.

3. **Context-augmented answer generation**
   Use a large language model such as Qwen to produce a more precise and coherent answer on top of the retrieved material[^4].

The snippet below sketches only the generation step through the official `ollama` Python client; the retrieval step is assumed to have already produced `retrieved_chunks`, and the model name must be one that has been pulled into the local Ollama instance.

```python
import ollama


def answer_with_context(question, retrieved_chunks):
    # Context-augmented generation: put the retrieved passages into the prompt
    context = "\n".join(retrieved_chunks)
    response = ollama.chat(
        model='qwen2',  # assumes `ollama pull qwen2` has been run locally
        messages=[
            {'role': 'system', 'content': 'Answer strictly based on the provided material.'},
            {'role': 'user', 'content': f'资料:{context}\n问题:{question}'},
        ],
    )
    return response['message']['content']
```
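
The retrieval side that would feed `answer_with_context` can reuse the same dense-retrieval approach as the Flask service above; the embedding-model path below is the local stella-base-zh-v2 directory assumed earlier.

```python
# Hedged retrieval sketch reusing the sentence-embedding approach from the service above.
import numpy as np
from sentence_transformers import SentenceTransformer

sent_model = SentenceTransformer('/home/sky/model_data/stella-base-zh-v2')


def retrieve(question, passages, top_k=3):
    # Encode passages and the question, then rank passages by cosine similarity
    passage_emb = sent_model.encode(passages, normalize_embeddings=True)
    question_emb = sent_model.encode([question], normalize_embeddings=True)
    scores = (question_emb @ passage_emb.T)[0]
    top_idx = np.argsort(scores)[-top_k:][::-1]
    return [passages[i] for i in top_idx]
```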