Question2Answer插件:qa Connect

qaConnect是一款用于Question2Answer系统的插件,支持新浪微博和QQ社交账号登录,并改进了头像处理方式,避免数据库臃肿。此外,该插件将所有链接的相对地址改为绝对地址,以解决扩展链接时的问题。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

http://www.smyx.net/qa-connect.html

qa Connect 是Question2Answer问答系统的一个插件,可以使用社交帐号登录,目前支持使用新浪微博和QQ帐号登录。

去年底搭建了 微问答,在试用tomheng开发的Socail Login插件时发现连接慢等一些问题,因为我有开发WordPress连接微博的经验,对社交帐号登录有一些研究,决定自己搞一个,在独享了几个月后,决定整理出来分享给大家使用。

打开演示地址“>打开演示地址

提示:使用新浪微博或者QQ时,要在插件页面填写您申请的开放平台key。否则登录图标不会显示。

qa Connect 插件除了能用社交帐号登录外,还增加了一些功能:

1、支持使用社交帐号的URL头像作为用户头像:

默认情况下,QA上传头像或者保存社交帐号头像时,是把图片的整个数据流保存在数据库,有点吓人。我做了一些改动,直接把社交头像的URL保存在数据库,避免数据库越来越臃肿。

2、将所有链接的相对地址设置为绝对地址

在使用中发现qa默认调用的是相对地址,在扩展一些链接时发现不少问题,所以改为绝对地址,就是加上http://你的qa地址

下载地址“>下载地址

如果你想把个人资料弄得跟我问答系统一样,即“使用社交帐号登录时可以选择社交帐号头像,其他方式注册的用户可以上传头像”,如下图所示:

可以修改 qa-include/qa-page-account.php 文件,

找到以下内容,293行(Question2Answe V1.5.4)

if (isset($useraccount['avatarblobid']))

$avataroptions['uploaded']='<SPAN STYLE="margin:2px 0; display:inline-block;">'.

qa_get_avatar_blob_html($useraccount['avatarblobid'], $useraccount['avatarwidth'], $useraccount['avatarheight'], 32).

'</SPAN>'.$avataroptions['uploaded'];


改成

if (isset($useraccount['avatarblobid'])) {

$avataroptions['uploaded'] = '<SPAN STYLE="margin:2px 0; display:inline-block;">' . qa_get_avatar_blob_html($useraccount['avatarblobid'], $useraccount['avatarwidth'], $useraccount['avatarheight'], 32) . '</SPAN> ' . $avataroptions['uploaded'];

} else {

if (!empty($userprofile['social_avatar'])) {

$avataroptions['uploaded'] = '<SPAN STYLE="margin:2px 0; display:inline-block;"><img src="' . $userprofile['social_avatar'] . '" width="32" height="32" class="qa-avatar-image" /> 社交帐号头像</SPAN>';

} 

}



因为含有中文,请把修改后的文件保存为utf-8的格式。或者自定义语言,


import os import re import time import torch import torch.nn as nn import pandas as pd import numpy as np from datetime import datetime from torch.utils.data import Dataset, DataLoader, random_split from tqdm import tqdm import pickle import mysql.connector from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity from gensim.models import Word2Vec # SpaCy 用来取代 nltk 的低效文本处理 import spacy nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"]) # 使用 gensim 的 Word2Vec 和 KeyedVectors from gensim.models import Word2Vec, TfidfModel from gensim.corpora import Dictionary # 自定义 Preprocess(快速版本) STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS def clean_text(text): return re.sub(r'[^a-zA-Z0-9\s]', '', str(text)).strip().lower() def tokenize(text): doc = nlp(clean_text(text)) return [token.text for token in doc if token.text not in STOPWORDS and token.text.isalnum()] def preprocess(text): tokens = tokenize(text) return " ".join(tokens) class SemanticMatchModel(nn.Module): def __init__(self, input_dim): super().__init__() self.fc1 = nn.Linear(input_dim, 256) self.bn1 = nn.BatchNorm1d(256) self.fc2 = nn.Linear(256, 128) self.bn2 = nn.BatchNorm1d(128) self.fc3 = nn.Linear(128, 64) self.bn3 = nn.BatchNorm1d(64) self.fc4 = nn.Linear(64, 1) self.dropout = nn.Dropout(0.3) self.relu = nn.ReLU() self.sigmoid = nn.Sigmoid() def forward(self, x): x = self.relu(self.bn1(self.fc1(x))) x = self.dropout(x) x = self.relu(self.bn2(self.fc2(x))) x = self.dropout(x) x = self.relu(self.bn3(self.fc3(x))) x = self.dropout(x) x = self.sigmoid(self.fc4(x)) return x class QADataset(Dataset): """ 数据集:将正样本 (question, answer) 与随机负样本 (question, random_answer) 拼接在一起, 其中正样本 label=1,负样本 label=0。 """ def __init__(self, qa_pairs, tfidf_vectorizer, negative_ratio=1.0): """ :param qa_pairs: [(question_text, answer_text), ...] :param tfidf_vectorizer: 已经fit好的 TfidfVectorizer :param negative_ratio: 每个正样本对应的负样本倍数 """ self.qa_pairs = qa_pairs self.vectorizer = tfidf_vectorizer self.samples = [] # 构造正样本 for i, (q, a) in enumerate(self.qa_pairs): self.samples.append((q, a, 1)) # label=1 # 构建负样本:random替换answer if negative_ratio > 0: negative_samples = [] total_pairs = len(self.qa_pairs) for i, (q, a) in enumerate(self.qa_pairs): for _ in range(int(negative_ratio)): rand_idx = np.random.randint(total_pairs) # 若随机到同一个qa对,就重新随机 while rand_idx == i: rand_idx = np.random.randint(total_pairs) neg_q, neg_a = self.qa_pairs[rand_idx] # 保持question不变,随机替换答案 negative_samples.append((q, neg_a, 0)) self.samples.extend(negative_samples) def __len__(self): return len(self.samples) def __getitem__(self, idx): q, a, label = self.samples[idx] q_vec = self.vectorizer.transform([preprocess(q)]).toarray()[0] a_vec = self.vectorizer.transform([preprocess(a)]).toarray()[0] pair_vec = np.concatenate((q_vec, a_vec)) return torch.tensor(pair_vec, dtype=torch.float32), torch.tensor(label, dtype=torch.float32) class KnowledgeBase: def __init__(self, host='localhost', user='root', password='hy188747', database='ubuntu_qa', table='qa_pair', model_dir=r"D:\NLP-PT\PT4\model", negative_ratio=1.0): print("🔄 初始化知识库...") self.host = host self.user = user self.password = password self.database = database self.table = table self.model_dir = model_dir self.negative_ratio = negative_ratio # 确保模型目录存在 os.makedirs(self.model_dir, exist_ok=True) self.qa_pairs = [] self.q_texts = [] self.a_texts = [] self.semantic_model = None self.word2vec_model = None self.tfidf_vectorizer = None self.tfidf_matrix = None # 第一步:从数据库载入数据 self.load_data_from_mysql() # 第二步:加载或缓存预处理后的文本 self.load_or_cache_processed_questions() # 第三步:加载 TF-IDF + 向量化 self.load_cached_tfidf() # 第四步:加载 Word2Vec 或使用缓存 self.load_cached_word2vec_model() # 第五步:加载 PyTorch 模型 model_path = os.path.join(self.model_dir, 'semantic_match_model.pth') if os.path.exists(model_path): self.load_model() def load_data_from_mysql(self): print("🔄 正在连接 MySQL,加载问答数据...") conn = mysql.connector.connect( host=self.host, user=self.user, password=self.password, database=self.database ) cursor = conn.cursor() query = f"SELECT question_text, answer_text FROM {self.table}" cursor.execute(query) rows = cursor.fetchall() conn.close() self.qa_pairs = [(row[0], row[1]) for row in rows] self.q_texts = [pair[0] for pair in self.qa_pairs] self.a_texts = [pair[1] for pair in self.qa_pairs] print(f"✅ 成功从 MySQL 加载 {len(self.qa_pairs)} 条问答数据。") def load_or_cache_processed_questions(self): """使用本地缓存避免每次都预处理大量数据""" cache_path = os.path.join(self.model_dir, 'processed_questions.pkl') if os.path.exists(cache_path): print("🔄 使用缓存预处理后的分词文本。") with open(cache_path, 'rb') as f: self.processed_q_list = pickle.load(f) else: print("🔄 正在预处理问题文本(首次较慢)...") self.processed_q_list = [preprocess(q) for q in self.q_texts] with open(cache_path, 'wb') as f: pickle.dump(self.processed_q_list, f) print("✅ 预处理缓存已保存。") def load_cached_tfidf(self): """加载已存在的 TfidfVectorizer 或构建""" cache_tfidf_matrix = os.path.join(self.model_dir, 'tfidf_matrix.npz') cache_qa_list = os.path.join(self.model_dir, 'tfidf_qa.pkl') tfidf_path = os.path.join(self.model_dir, 'tfidf_vectorizer.pkl') if os.path.exists(tfidf_path) and os.path.exists(cache_tfidf_matrix) and os.path.exists(cache_qa_list): print("🔄 加载 TF-IDF 缓存版本。") import joblib self.tfidf_vectorizer = joblib.load(tfidf_path) self.tfidf_matrix = np.load(cache_tfidf_matrix)['tfidf'] with open(cache_qa_list, 'rb') as f: self.tfidf_qa = pickle.load(f) else: print("🔄 创建并构建 TF-IDF(首次较慢)...") self.tfidf_vectorizer = TfidfVectorizer( tokenizer=lambda x: x.split(), lowercase=False, max_features=10000 ) self.tfidf_qa = self.processed_q_list self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.tfidf_qa).toarray() print("✅ TF-IDF 构建完成。") import joblib joblib.dump(self.tfidf_vectorizer, tfidf_path) np.savez_compressed(cache_tfidf_matrix, tfidf=self.tfidf_matrix) with open(cache_qa_list, 'wb') as f: pickle.dump(self.tfidf_qa, f) def load_cached_word2vec_model(self): """加载已训练好的 Word2Vec 模型,没有就训练""" word2vec_path = os.path.join(self.model_dir, 'word2vec.model') if os.path.exists(word2vec_path): print("🔄 加载缓存中的 Word2Vec 模型...") self.word2vec_model = Word2Vec.load(word2vec_path) else: print("🔄 训练 Word2Vec 模型(首次较慢)...") tokenized_questions = [preprocess(q).split() for q in self.q_texts] self.word2vec_model = Word2Vec( sentences=tokenized_questions, vector_size=100, window=5, min_count=1, workers=4 ) self.word2vec_model.save(word2vec_path) print("✅ Word2Vec 模型训练完成并保存。") def sentence_to_vec(self, sentence): """将句子转换为向量表示""" tokens = preprocess(sentence).split() if self.word2vec_model: vecs = [self.word2vec_model.wv[w] for w in tokens if w in self.word2vec_model.wv] return np.mean(vecs, axis=0) if vecs else np.zeros(self.word2vec_model.vector_size) else: # 没有 Word2Vec 模型时,使用 TF-IDF 向量 return self.tfidf_vectorizer.transform([preprocess(sentence)]).toarray()[0] def build_model(self, epochs=10, batch_size=128, lr=1e-3): """ 构建并训练语义匹配模型,包含训练集/验证集拆分与性能监控。 """ # 创建数据集 full_dataset = QADataset(self.qa_pairs, self.tfidf_vectorizer, negative_ratio=self.negative_ratio) # 划分训练集/验证集 train_size = int(len(full_dataset) * 0.8) val_size = len(full_dataset) - train_size train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size]) # 创建数据加载器 train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2) val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2) # 初始化模型 sample_input, _ = full_dataset[0] input_dim = sample_input.shape[0] self.semantic_model = SemanticMatchModel(input_dim) criterion = nn.BCELoss() optimizer = optim.Adam(self.semantic_model.parameters(), lr=lr) scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9) # 训练模型 best_val_acc = 0.0 print("\n开始模型训练...") start_time = time.time() for epoch in range(epochs): self.semantic_model.train() total_loss, total_correct, total_samples = 0.0, 0, 0 for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs} - 训练中"): optimizer.zero_grad() outputs = self.semantic_model(X_batch).squeeze() loss = criterion(outputs, y_batch) loss.backward() optimizer.step() total_loss += loss.item() * len(y_batch) preds = (outputs >= 0.5).float() total_correct += (preds == y_batch).sum().item() total_samples += len(y_batch) train_loss = total_loss / total_samples train_acc = total_correct / total_samples # 验证阶段 self.semantic_model.eval() val_loss, val_correct, val_samples = 0.0, 0, 0 with torch.no_grad(): for X_val, y_val in val_loader: outputs_val = self.semantic_model(X_val).squeeze() loss_val = criterion(outputs_val, y_val) val_loss += loss_val.item() * len(y_val) preds_val = (outputs_val >= 0.5).float() val_correct += (preds_val == y_val).sum().item() val_samples += len(y_val) val_loss /= val_samples val_acc = val_correct / val_samples # 更新学习率 scheduler.step() print(f"Epoch [{epoch + 1}/{epochs}] | " f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | " f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}") # 保存最优模型 if val_acc > best_val_acc: best_val_acc = val_acc model_path = os.path.join(self.model_dir, 'semantic_match_model.pth') torch.save(self.semantic_model.state_dict(), model_path) print(f"✅ 新的最优模型已保存 (Val Acc: {best_val_acc:.4f})") end_time = time.time() print(f"\n训练完成,共耗时 {end_time - start_time:.2f} 秒。") # 加载最优模型权重 model_path = os.path.join(self.model_dir, 'semantic_match_model.pth') self.semantic_model.load_state_dict(torch.load(model_path)) self.semantic_model.eval() def load_model(self): """加载训练好的语义匹配 PyTorch 模型""" input_dim = self.tfidf_matrix.shape[1] * 2 model_path = os.path.join(self.model_dir, 'semantic_match_model.pth') self.semantic_model = SemanticMatchModel(input_dim) self.semantic_model.load_state_dict(torch.load(model_path, map_location='cpu')) self.semantic_model.eval() print("✅ 语义匹配模型加载完成。") def retrieve(self, query, semantic_topk=100): """ 检索接口:先通过 TF-IDF + 句向量评分做粗检,再对Top-K结果用语义模型做精检,返回最匹配的 QA。 """ # 粗检 query_tfidf = self.tfidf_vectorizer.transform([preprocess(query)]).toarray()[0] tfidf_scores = cosine_similarity([query_tfidf], self.tfidf_matrix).flatten() query_sent_vec = self.sentence_to_vec(query) sent_vecs = np.array([self.sentence_to_vec(q) for q in self.q_texts]) sent_scores = cosine_similarity([query_sent_vec], sent_vecs).flatten() sim_scores = tfidf_scores + sent_scores topk_indices = np.argpartition(sim_scores, -semantic_topk)[-semantic_topk:] topk_indices = topk_indices[np.argsort(sim_scores[topk_indices])[::-1]] # 精检 if self.semantic_model: with torch.no_grad(): batch_inputs = [] for i in topk_indices: q = preprocess(self.q_texts[i]) a = preprocess(self.a_texts[i]) q_vec = self.tfidf_vectorizer.transform([q]).toarray()[0] a_vec = self.tfidf_vectorizer.transform([a]).toarray()[0] pair_input = np.concatenate((q_vec, a_vec)) batch_inputs.append(pair_input) batch_inputs = torch.tensor(np.stack(batch_inputs), dtype=torch.float32) batch_scores = self.semantic_model(batch_inputs).squeeze().cpu().numpy() semantic_scores = batch_scores # 综合得分 final_scores = sim_scores[topk_indices] + semantic_scores best_idx = topk_indices[np.argmax(final_scores)] return self.qa_pairs[best_idx], final_scores.max() else: # 没有语义模型时,只使用粗检结果 best_idx = topk_indices[0] return self.qa_pairs[best_idx], sim_scores[best_idx] def recommend_similar(self, query, topk=3): """针对未命中答案的情况,推荐相似问题""" query_tfidf = self.tfidf_vectorizer.transform([preprocess(query)]).toarray()[0] scores = cosine_similarity([query_tfidf], self.tfidf_matrix).flatten() topk_idx = scores.argsort()[0][-topk:][::-1] return [(self.qa_pairs[i][0], self.qa_pairs[i][1]) for i in topk_idx] class FeedbackRecorder: """记录未回答问题""" def __init__(self, file_path='unanswered_questions.csv'): self.file_path = file_path if not os.path.exists(self.file_path): with open(self.file_path, 'w', newline='', encoding='utf-8') as f: import csv csv.writer(f).writerow(['time', 'question']) def record_question(self, question): with open(self.file_path, 'a', newline='', encoding='utf-8') as f: import csv writer = csv.writer(f) writer.writerow([datetime.now().isoformat(), question]) def main(): kb = KnowledgeBase( host='localhost', user='root', password='hy188747', database='ubuntu_qa', table='qa_pair', model_dir=r"D:\NLP-PT\PT4\model", negative_ratio=1.0 ) # 是否重新训练语义匹配模型 if input("是否重新训练语义匹配模型?(y/n): ").strip().lower() == 'y': kb.build_model( epochs=5, # 训练轮数 batch_size=128, # 批大小 lr=1e-3 # 学习率 ) recorder = FeedbackRecorder() print("\n🎯 智能知识问答系统已启动(输入'q'退出聊天)\n") while True: query = input("🧐 问题:") if query.strip().lower() == 'q': break result, score = kb.retrieve(query) if result: print("💡 回答:", result[1]) print(f"📊 匹配信心分数: {score:.4f}\n") else: print("⚠ 没有找到合适的答案,已将你的问题记录下来。") recorder.record_question(query) print("🔥 相似问题推荐:") for q, a in kb.recommend_similar(query): print(f"Q: {q}\nA: {a}\n") if __name__ == "__main__": main()
07-05
import os import re import time import torch import torch.nn as nn import torch.optim as optim import pandas as pd import numpy as np import joblib # 添加缺失的导入 from datetime import datetime from torch.utils.data import Dataset, DataLoader, random_split from tqdm import tqdm import pickle import mysql.connector from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity from gensim.models import Word2Vec import spacy nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"]) STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS def clean_text(text): return re.sub(r'[^a-zA-Z0-9\s]', '', str(text)).strip().lower() def tokenize(text): """优化tokenize函数,移除冗余检查""" doc = nlp(clean_text(text)) return [token.text for token in doc if token.text not in STOPWORDS] # 移除isalnum检查 def preprocess(text): tokens = tokenize(text) return " ".join(tokens) class SemanticMatchModel(nn.Module): def __init__(self, input_dim): super().__init__() self.fc1 = nn.Linear(input_dim, 256) self.bn1 = nn.BatchNorm1d(256) self.fc2 = nn.Linear(256, 128) self.bn2 = nn.BatchNorm1d(128) self.fc3 = nn.Linear(128, 64) self.bn3 = nn.BatchNorm1d(64) self.fc4 = nn.Linear(64, 1) self.dropout = nn.Dropout(0.3) self.relu = nn.ReLU() self.sigmoid = nn.Sigmoid() def forward(self, x): x = self.relu(self.bn1(self.fc1(x))) x = self.dropout(x) x = self.relu(self.bn2(self.fc2(x))) x = self.dropout(x) x = self.relu(self.bn3(self.fc3(x))) x = self.dropout(x) x = self.sigmoid(self.fc4(x)) return x class QADataset(Dataset): def __init__(self, qa_pairs, tfidf_vectorizer, negative_ratio=1.0): self.qa_pairs = qa_pairs self.vectorizer = tfidf_vectorizer self.samples = [] # 构建正样本 for i, (q, a) in enumerate(self.qa_pairs): self.samples.append((q, a, 1)) # 优化负样本构建逻辑 if negative_ratio > 0: total_pairs = len(self.qa_pairs) all_answers = [a for _, a in self.qa_pairs] # 预先生成负样本索引 neg_indices = np.random.choice( len(all_answers), size=int(total_pairs * negative_ratio), replace=True ) for idx, (q, a) in enumerate(self.qa_pairs): sample_count = int(negative_ratio) start = idx * sample_count end = start + sample_count for j in range(start, end): if j < len(neg_indices): neg_a = all_answers[neg_indices[j]] # 确保不是当前答案 if neg_a != a: self.samples.append((q, neg_a, 0)) def __len__(self): return len(self.samples) def __getitem__(self, idx): q, a, label = self.samples[idx] q_vec = self.vectorizer.transform([preprocess(q)]).toarray()[0] a_vec = self.vectorizer.transform([preprocess(a)]).toarray()[0] pair_vec = np.concatenate((q_vec, a_vec)) return torch.tensor(pair_vec, dtype=torch.float32), torch.tensor(label, dtype=torch.float32) class KnowledgeBase: def __init__(self, host='localhost', user='root', password='hy188747', database='ubuntu_qa', table='qa_pair', model_dir=r"D:\NLP-PT\PT4\model", negative_ratio=1.0): print("🔄 初始化知识库...") self.host = host self.user = user self.password = password self.database = database self.table = table self.model_dir = model_dir self.negative_ratio = negative_ratio os.makedirs(self.model_dir, exist_ok=True) self.qa_pairs = [] self.q_texts = [] self.a_texts = [] self.semantic_model = None self.word2vec_model = None self.tfidf_vectorizer = None self.tfidf_matrix = None # 调整初始化顺序 self.load_data_from_mysql() self.load_or_cache_processed_questions() self.load_cached_tfidf() self.load_cached_word2vec_model() # 最后加载模型(确保依赖项已初始化) model_path = os.path.join(self.model_dir, 'semantic_match_model.pth') if os.path.exists(model_path): self.load_model() else: print("⚠ 语义匹配模型未训练,请先训练模型。") def load_data_from_mysql(self): print("🔄 正在连接 MySQL,加载问答数据...") try: conn = mysql.connector.connect( host=self.host, user=self.user, password=self.password, database=self.database ) cursor = conn.cursor() query = f"SELECT question_text, answer_text FROM {self.table}" cursor.execute(query) rows = cursor.fetchall() self.qa_pairs = [(row[0], row[1]) for row in rows] self.q_texts = [pair[0] for pair in self.qa_pairs] self.a_texts = [pair[1] for pair in self.qa_pairs] print(f"✅ 成功从 MySQL 加载 {len(self.qa_pairs)} 条问答数据。") except Exception as e: print(f"❌ 数据库连接失败: {e}") self.qa_pairs = [] finally: if conn.is_connected(): conn.close() def load_or_cache_processed_questions(self): cache_path = os.path.join(self.model_dir, 'processed_questions.pkl') if os.path.exists(cache_path): print("🔄 使用缓存预处理后的分词文本。") with open(cache_path, 'rb') as f: self.processed_q_list = pickle.load(f) else: print("🔄 正在预处理问题文本(首次较慢)...") self.processed_q_list = [preprocess(q) for q in tqdm(self.q_texts)] with open(cache_path, 'wb') as f: pickle.dump(self.processed_q_list, f) print("✅ 预处理缓存已保存。") def load_cached_tfidf(self): cache_tfidf_matrix = os.path.join(self.model_dir, 'tfidf_matrix.npz') cache_qa_list = os.path.join(self.model_dir, 'tfidf_qa.pkl') tfidf_path = os.path.join(self.model_dir, 'tfidf_vectorizer.pkl') if os.path.exists(tfidf_path) and os.path.exists(cache_tfidf_matrix) and os.path.exists(cache_qa_list): print("🔄 加载 TF-IDF 缓存版本。") self.tfidf_vectorizer = joblib.load(tfidf_path) self.tfidf_matrix = np.load(cache_tfidf_matrix)['tfidf'] with open(cache_qa_list, 'rb') as f: self.tfidf_qa = pickle.load(f) else: print("🔄 创建并构建 TF-IDF(首次较慢)...") self.tfidf_vectorizer = TfidfVectorizer( tokenizer=lambda x: x.split(), lowercase=False, max_features=10000 ) self.tfidf_qa = self.processed_q_list self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.tfidf_qa).toarray() print("✅ TF-IDF 构建完成。") joblib.dump(self.tfidf_vectorizer, tfidf_path) np.savez_compressed(cache_tfidf_matrix, tfidf=self.tfidf_matrix) with open(cache_qa_list, 'wb') as f: pickle.dump(self.tfidf_qa, f) def load_cached_word2vec_model(self): word2vec_path = os.path.join(self.model_dir, 'word2vec.model') if os.path.exists(word2vec_path): print("🔄 加载缓存中的 Word2Vec 模型...") self.word2vec_model = Word2Vec.load(word2vec_path) else: print("🔄 训练 Word2Vec 模型(首次较慢)...") tokenized_questions = [preprocess(q).split() for q in self.q_texts] self.word2vec_model = Word2Vec( sentences=tokenized_questions, vector_size=100, window=5, min_count=1, workers=4, epochs=10 ) self.word2vec_model.save(word2vec_path) print("✅ Word2Vec 模型训练完成并保存。") def sentence_to_vec(self, sentence): """修复空向量问题""" tokens = preprocess(sentence).split() if not tokens: return np.zeros(100) # 默认向量大小 if self.word2vec_model: vecs = [self.word2vec_model.wv[w] for w in tokens if w in self.word2vec_model.wv] return np.mean(vecs, axis=0) if vecs else np.zeros(self.word2vec_model.vector_size) else: vec = self.tfidf_vectorizer.transform([preprocess(sentence)]).toarray()[0] return vec def build_model(self, epochs=10, batch_size=128, lr=1e-3): # 创建数据集 full_dataset = QADataset(self.qa_pairs, self.tfidf_vectorizer, negative_ratio=self.negative_ratio) # 划分训练集/验证集 train_size = int(len(full_dataset) * 0.8) val_size = len(full_dataset) - train_size train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size]) # 创建数据加载器 train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2) val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2) # 初始化模型 sample_input, _ = full_dataset[0] input_dim = sample_input.shape[0] self.semantic_model = SemanticMatchModel(input_dim) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.semantic_model.to(device) criterion = nn.BCELoss() optimizer = optim.Adam(self.semantic_model.parameters(), lr=lr) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=2, factor=0.5) # 训练模型 best_val_acc = 0.0 print("\n开始模型训练...") start_time = time.time() for epoch in range(epochs): self.semantic_model.train() total_loss, total_correct, total_samples = 0.0, 0, 0 for X_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs} - 训练中"): X_batch, y_batch = X_batch.to(device), y_batch.to(device) optimizer.zero_grad() outputs = self.semantic_model(X_batch).squeeze() loss = criterion(outputs, y_batch) loss.backward() optimizer.step() total_loss += loss.item() * len(y_batch) preds = (outputs >= 0.5).float() total_correct += (preds == y_batch).sum().item() total_samples += len(y_batch) train_loss = total_loss / total_samples train_acc = total_correct / total_samples # 验证阶段 self.semantic_model.eval() val_loss, val_correct, val_samples = 0.0, 0, 0 with torch.no_grad(): for X_val, y_val in val_loader: X_val, y_val = X_val.to(device), y_val.to(device) outputs_val = self.semantic_model(X_val).squeeze() loss_val = criterion(outputs_val, y_val) val_loss += loss_val.item() * len(y_val) preds_val = (outputs_val >= 0.5).float() val_correct += (preds_val == y_val).sum().item() val_samples += len(y_val) val_loss /= val_samples val_acc = val_correct / val_samples # 更新学习率 scheduler.step(val_acc) print(f"Epoch [{epoch + 1}/{epochs}] | " f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | " f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}") # 保存最优模型 if val_acc > best_val_acc: best_val_acc = val_acc model_path = os.path.join(self.model_dir, 'semantic_match_model.pth') torch.save(self.semantic_model.state_dict(), model_path) print(f"✅ 新的最优模型已保存 (Val Acc: {best_val_acc:.4f})") end_time = time.time() print(f"\n训练完成,共耗时 {end_time - start_time:.2f} 秒。") # 加载最优模型权重 model_path = os.path.join(self.model_dir, 'semantic_match_model.pth') self.semantic_model.load_state_dict(torch.load(model_path, map_location=device)) self.semantic_model.eval() def load_model(self): """加载训练好的语义匹配模型""" input_dim = self.tfidf_matrix.shape[1] * 2 model_path = os.path.join(self.model_dir, 'semantic_match_model.pth') self.semantic_model = SemanticMatchModel(input_dim) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.semantic_model.load_state_dict(torch.load(model_path, map_location=device)) self.semantic_model.to(device) self.semantic_model.eval() print("✅ 语义匹配模型加载完成。") def retrieve(self, query, semantic_topk=100): # 粗检 query_tfidf = self.tfidf_vectorizer.transform([preprocess(query)]).toarray()[0] tfidf_scores = cosine_similarity([query_tfidf], self.tfidf_matrix).flatten() query_sent_vec = self.sentence_to_vec(query) sent_vecs = np.array([self.sentence_to_vec(q) for q in self.q_texts]) sent_scores = cosine_similarity([query_sent_vec], sent_vecs).flatten() # 归一化句子向量相似度到[0,1] sent_scores = (sent_scores + 1) / 2 sim_scores = tfidf_scores + sent_scores # 确保有足够的数据 if len(sim_scores) == 0: return ("抱歉,知识库中没有找到相关信息。", 0.0) topk_indices = np.argpartition(sim_scores, -semantic_topk)[-semantic_topk:] topk_indices = topk_indices[np.argsort(sim_scores[topk_indices])[::-1]] # 精检 if self.semantic_model: device = next(self.semantic_model.parameters()).device with torch.no_grad(): batch_inputs = [] for i in topk_indices: q = preprocess(self.q_texts[i]) a = preprocess(self.a_texts[i]) q_vec = self.tfidf_vectorizer.transform([q]).toarray()[0] a_vec = self.tfidf_vectorizer.transform([a]).toarray()[0] pair_input = np.concatenate((q_vec, a_vec)) batch_inputs.append(pair_input) if batch_inputs: batch_inputs = torch.tensor(np.stack(batch_inputs), dtype=torch.float32).to(device) batch_scores = self.semantic_model(batch_inputs).squeeze().cpu().numpy() semantic_scores = batch_scores else: semantic_scores = np.zeros(len(topk_indices)) # 综合得分(添加归一化权重) coarse_scores = sim_scores[topk_indices] / 2.0 # 归一化到[0,1] final_scores = 0.3 * coarse_scores + 0.7 * semantic_scores best_idx_in_topk = np.argmax(final_scores) best_idx = topk_indices[best_idx_in_topk] return self.qa_pairs[best_idx], final_scores[best_idx_in_topk] else: best_idx = topk_indices[0] if topk_indices.size > 0 else 0 return self.qa_pairs[best_idx], sim_scores[best_idx] def recommend_similar(self, query, topk=3): """修复索引越界问题""" query_tfidf = self.tfidf_vectorizer.transform([preprocess(query)]).toarray()[0] scores = cosine_similarity([query_tfidf], self.tfidf_matrix).flatten() # 安全获取topk索引 if len(scores) == 0: return [] if len(scores) < topk: topk = len(scores) topk_idx = np.argpartition(scores, -topk)[-topk:] topk_idx = topk_idx[np.argsort(scores[topk_idx])[::-1]] return [(self.q_texts[i], self.a_texts[i]) for i in topk_idx] class FeedbackRecorder: def __init__(self, file_path='unanswered_questions.csv'): self.file_path = file_path if not os.path.exists(self.file_path): with open(self.file_path, 'w', newline='', encoding='utf-8') as f: import csv csv.writer(f).writerow(['time', 'question']) def record_question(self, question): with open(self.file_path, 'a', newline='', encoding='utf-8') as f: import csv writer = csv.writer(f) writer.writerow([datetime.now().isoformat(), question]) def main(): kb = KnowledgeBase( host='localhost', user='root', password='hy188747', database='ubuntu_qa', table='qa_pair', model_dir=r"D:\NLP-PT\PT4\model", negative_ratio=1.0 ) if input("是否重新训练语义匹配模型?(y/n): ").strip().lower() == 'y': kb.build_model( epochs=5, batch_size=128, lr=1e-3 ) recorder = FeedbackRecorder() print("\n🎯 智能知识问答系统已启动(输入'q'退出聊天)\n") while True: query = input("🧐 问题:") if query.strip().lower() == 'q': break try: result, score = kb.retrieve(query) if result: print(f"💡 回答:{result[1]}") print(f"📊 匹配信心分数: {score:.4f}\n") else: print("⚠ 没有找到合适的答案,已将你的问题记录下来。") recorder.record_question(query) print("🔥 相似问题推荐:") similar_questions = kb.recommend_similar(query, topk=3) for q, a in similar_questions: print(f"Q: {q}\nA: {a}\n") except Exception as e: print(f"❌ 检索过程中发生错误: {e}") if __name__ == "__main__": main()
07-05
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值