#!pip install xgboost scikit-learn matplotlib pandas scipy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from scipy.stats import uniform, randint  # distributions for random-search parameter sampling

# --- 1. Data loading and preprocessing ---
# Load the dataset
file_path = '/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
df_train = pd.read_csv(file_path)

# Fill NaN in 'Misconception' with 'No_Misconception'
df_train['Misconception'] = df_train['Misconception'].fillna('No_Misconception')

# Combine QuestionText, MC_Answer, and StudentExplanation
df_train['CombinedText'] = df_train['QuestionText'] + " " + df_train['MC_Answer'] + " " + df_train['StudentExplanation']

# --- 1.1. Load GloVe word vectors ---
print("--- Loading GloVe word vectors ---")
# Adjust this path if your GloVe file is located elsewhere
# glove_file_path = '/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt'
glove_file_path = '/kaggle/input/dataword/glove.6B.100d.txt'
word_embeddings = {}
embedding_dim = 100  # Using 100-dimensional GloVe vectors

try:
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            word_embeddings[word] = vector
print(f"已加载 {len(word_embeddings)} 个词的 GloVe 词向量 (维度: {embedding_dim})。")except FileNotFoundError:print(f"错误: GloVe 文件 '{glove_file_path}' 未找到。请确保文件已上传或路径正确。")print("将跳过词嵌入,使用一个简化的特征提取器进行演示。")# Fallback: create a dummy word_embeddings if file not found for demonstration
word_embeddings ={"dummy": np.zeros(embedding_dim)}# Placeholder
    embedding_dim = 100  # Still define the dimension for consistency

# --- 1.2. Word-embedding feature extractor ---
def get_embedding_features(texts, word_embeddings, embedding_dim):
    """
    Convert a list of texts into a matrix of word-embedding features.
    Each text is represented by the mean of its word vectors.
    """
    features_matrix = np.zeros((len(texts), embedding_dim))
    for i, text in enumerate(texts):
        words = text.lower().split()  # Lowercase and split into words
        word_vectors = []
        for word in words:
            if word in word_embeddings:
                word_vectors.append(word_embeddings[word])
        if word_vectors:  # At least one in-vocabulary word
            features_matrix[i] = np.mean(word_vectors, axis=0)
        # else: features_matrix[i] stays all zeros (empty or fully-OOV texts)
    return features_matrix
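
# Illustrative sanity check of the mean-pooling behaviour (toy vectors, not GloVe):
# the first row is the mean of the "cat" and "dog" vectors, and a fully
# out-of-vocabulary text stays all zeros.
_toy_embeddings = {"cat": np.array([1.0, 3.0]), "dog": np.array([3.0, 1.0])}
_toy_features = get_embedding_features(["cat dog", "zebra"], _toy_embeddings, 2)
assert np.allclose(_toy_features, [[2.0, 2.0], [0.0, 0.0]])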

# --- 1.3. Extract TF-IDF and word-embedding features, then concatenate ---
print("--- Extracting text features with TF-IDF and word embeddings ---")

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(df_train['CombinedText'])

# Word-embedding features
X_embeddings = get_embedding_features(df_train['CombinedText'].astype(str).tolist(), word_embeddings, embedding_dim)

# Concatenate TF-IDF and word-embedding features (the sparse TF-IDF matrix is densified first)
X_text = np.hstack((X_tfidf.toarray().astype(np.float32), X_embeddings))
print(f"Combined text feature matrix shape: {X_text.shape}")
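
# Note: densifying a 5000-column TF-IDF matrix can be memory-hungry on large
# datasets. Since XGBoost also accepts CSR input, a sparse alternative would be:
#   from scipy.sparse import hstack, csr_matrix
#   X_text = hstack([X_tfidf, csr_matrix(X_embeddings)]).tocsr()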

# Label-encode 'Category'
le_category = LabelEncoder()
le_category.fit(df_train['Category'])
y_category = le_category.transform(df_train['Category'])

# Label-encode 'Misconception'
le_misconception = LabelEncoder()
le_misconception.fit(df_train['Misconception'])
y_misconception = le_misconception.transform(df_train['Misconception'])

# --- 2. MAP@K metric ---
def mean_average_precision_at_k(y_true, y_pred_proba, k=3):
    """
    Compute Mean Average Precision @ K (MAP@K).

    Args:
        y_true (np.array): true labels (integer-encoded).
        y_pred_proba (np.array): predicted probabilities, shape (n_samples, n_classes).
        k (int): number of top predictions to consider.

    Returns:
        float: the MAP@K value.
    """
    average_precisions = []
    for i in range(len(y_true)):
        true_label = y_true[i]
        probas_for_sample = y_pred_proba[i]
        top_k_indices = np.argsort(probas_for_sample)[::-1][:k]
        precision_at_k = 0
        num_correct = 0
        for j, predicted_index in enumerate(top_k_indices):
            if predicted_index == true_label:
                num_correct += 1
                precision_at_k += num_correct / (j + 1)
        if true_label in top_k_indices:
            average_precisions.append(precision_at_k / num_correct if num_correct > 0 else 0)
        else:
            average_precisions.append(0)
    return np.mean(average_precisions)
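
# Quick illustrative check: with a single sample whose true label has the
# second-highest probability, AP@3 is 1/2, so MAP@3 over that one sample is 0.5.
assert abs(mean_average_precision_at_k(np.array([2]), np.array([[0.2, 0.5, 0.3]]), k=3) - 0.5) < 1e-9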

# --- 3. XGBoost parameter distributions for the random search ---
# A shared parameter distribution; adjust the ranges to your compute budget.
# Note: scipy's uniform(loc, scale) samples from [loc, loc + scale].
param_dist = {
    'n_estimators': randint(100, 500),       # number of trees
    'learning_rate': uniform(0.01, 0.2),     # learning rate, sampled from [0.01, 0.21]
    'max_depth': randint(3, 10),             # maximum tree depth
    'subsample': uniform(0.6, 0.4),          # fraction of training samples per tree, in [0.6, 1.0]
    'colsample_bytree': uniform(0.6, 0.4),   # fraction of features per tree, in [0.6, 1.0]
    'gamma': uniform(0, 0.5),                # minimum loss reduction required to split
    'reg_alpha': uniform(0, 1),              # L1 regularization
    'reg_lambda': uniform(0, 1)              # L2 regularization
}

# --- 4. Category model: training and evaluation (XGBoost with random search) ---
print("--- Running random search for the Category model ---")
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
X_text, y_category, test_size=0.2, random_state=42, stratify=y_category
)
xgb_cat = xgb.XGBClassifier(
    objective='multi:softprob',  # softprob so predict_proba returns per-class probabilities
    num_class=len(le_category.classes_),
    eval_metric='mlogloss',
    # use_label_encoder is deprecated/removed in recent XGBoost versions and is omitted
    random_state=42,
    n_jobs=-1,
    device='cuda'  # GPU acceleration (replaces tree_method='gpu_hist' / predictor='gpu_predictor'); use 'cpu' if no GPU is available
)

# RandomizedSearchCV for the Category model
random_search_cat = RandomizedSearchCV(
    estimator=xgb_cat,
    param_distributions=param_dist,
    n_iter=50,              # number of sampled candidates; adjust to your compute budget
    cv=3,                   # cross-validation folds
    scoring='f1_weighted',  # weighted F1 to account for class imbalance
    verbose=1,
    random_state=42,
    n_jobs=-1
)
random_search_cat.fit(X_train_cat, y_train_cat)

print("\nBest parameters for the Category model:", random_search_cat.best_params_)
print("Best cross-validated F1 score for the Category model:", random_search_cat.best_score_)
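
# Note: with the default refit=True, RandomizedSearchCV refits best_estimator_
# on the whole training split, so it can be used directly for evaluation.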
# Use the best estimator found by the random search for evaluation
model_category_eval = random_search_cat.best_estimator_
# Predict and evaluate Category model on the TEST SET
y_pred_cat_test = model_category_eval.predict(X_test_cat)
y_pred_proba_cat_test = model_category_eval.predict_proba(X_test_cat)

print(f"\nCategory model test accuracy: {accuracy_score(y_test_cat, y_pred_cat_test):.4f}")
print("\nCategory model test classification report:")
print(classification_report(y_test_cat, y_pred_cat_test, target_names=le_category.classes_))

# Category model's MAP@3 on the test set
map_at_3_cat_test = mean_average_precision_at_k(y_test_cat, y_pred_proba_cat_test, k=3)
print(f"\nCategory model test Mean Average Precision @3 (MAP@3): {map_at_3_cat_test:.4f}")

# Confusion matrix for the Category model on the test set
print("\nCategory model test confusion matrix:")
cm_category_test = confusion_matrix(y_test_cat, y_pred_cat_test)
disp_cat_test = ConfusionMatrixDisplay(confusion_matrix=cm_category_test, display_labels=le_category.classes_)
fig_cat_test, ax_cat_test = plt.subplots(figsize=(8,6))
disp_cat_test.plot(cmap=plt.cm.Blues, ax=ax_cat_test)
ax_cat_test.set_title('Category Model Test Confusion Matrix (XGBoost - Random Search)')
plt.show()

# --- 5. Misconception model: training and evaluation (XGBoost with random search) ---
print("\n--- Running random search for the Misconception model ---")
X_train_mis, X_test_mis, y_train_mis, y_test_mis = train_test_split(
X_text, y_misconception, test_size=0.2, random_state=42, stratify=y_misconception
)
xgb_mis = xgb.XGBClassifier(
    objective='multi:softprob',  # softprob so predict_proba returns per-class probabilities
    num_class=len(le_misconception.classes_),
    eval_metric='mlogloss',
    random_state=42,
    n_jobs=-1,
    device='cuda'  # GPU acceleration; use 'cpu' if no GPU is available
)

# RandomizedSearchCV for the Misconception model
random_search_mis = RandomizedSearchCV(
    estimator=xgb_mis,
    param_distributions=param_dist,
    n_iter=50,              # number of sampled candidates; adjust to your compute budget
    cv=3,                   # cross-validation folds
    scoring='f1_weighted',  # weighted F1 to account for class imbalance
    verbose=1,
    random_state=42,
    n_jobs=-1
)
random_search_mis.fit(X_train_mis, y_train_mis)

print("\nBest parameters for the Misconception model:", random_search_mis.best_params_)
print("Best cross-validated F1 score for the Misconception model:", random_search_mis.best_score_)

# Use the best estimator found by the random search for evaluation
model_misconception_eval = random_search_mis.best_estimator_
# Predict and evaluate Misconception model on the TEST SET
y_pred_mis_test = model_misconception_eval.predict(X_test_mis)
y_pred_proba_mis_test = model_misconception_eval.predict_proba(X_test_mis)

print(f"\nMisconception model test accuracy: {accuracy_score(y_test_mis, y_pred_mis_test):.4f}")
print("\nMisconception model test classification report:")
print(classification_report(y_test_mis, y_pred_mis_test, target_names=le_misconception.classes_))

# Misconception model's MAP@3 on the test set
map_at_3_mis_test = mean_average_precision_at_k(y_test_mis, y_pred_proba_mis_test, k=3)
print(f"\nMisconception model test Mean Average Precision @3 (MAP@3): {map_at_3_mis_test:.4f}")

# Confusion matrix for the Misconception model on the test set
print("\nMisconception model test confusion matrix:")
cm_misconception_test = confusion_matrix(y_test_mis, y_pred_mis_test)
disp_mis_test = ConfusionMatrixDisplay(confusion_matrix=cm_misconception_test, display_labels=le_misconception.classes_)
fig_mis_test, ax_mis_test = plt.subplots(figsize=(15,12))
disp_mis_test.plot(cmap=plt.cm.Blues, ax=ax_mis_test)
ax_mis_test.set_title('Misconception Model Test Confusion Matrix (XGBoost - Random Search)')
plt.show()

print("\nModel evaluation complete.")

# --- 6. Retrain final models on the full training set for predicting test.csv ---
print("\n--- Retraining final models on the full training set for test.csv predictions ---")

# Use the best parameters found by the random search for the final models
final_model_category = xgb.XGBClassifier(
    objective='multi:softprob', num_class=len(le_category.classes_),
    eval_metric='mlogloss',
    random_state=42, n_jobs=-1,
    device='cuda',  # GPU acceleration; use 'cpu' if no GPU is available
    **random_search_cat.best_params_  # best parameters from the random search
)
final_model_category.fit(X_text, y_category)  # Train on the FULL df_train data
final_model_misconception = xgb.XGBClassifier(
    objective='multi:softprob', num_class=len(le_misconception.classes_),
    eval_metric='mlogloss',
    random_state=42, n_jobs=-1,
    device='cuda',  # GPU acceleration; use 'cpu' if no GPU is available
    **random_search_mis.best_params_  # best parameters from the random search
)
final_model_misconception.fit(X_text, y_misconception)  # Train on the FULL df_train data

print("Final models retrained on the full training set.")

# --- 7. Top-K combined predictions (using the final models) ---
def get_top_k_combined_predictions(question_text, mc_answer, student_explanation, k=3):
    """
    Generate the top-K "Category:Misconception" prediction combinations for one sample.

    Args:
        question_text (str): the question text.
        mc_answer (str): the multiple-choice answer.
        student_explanation (str): the student's explanation.
        k (int): number of highest-probability combinations to return.

    Returns:
        list: the top-K ("Category:Misconception", probability) tuples.
    """
    # Prepare the new sample
    combined_text = question_text + " " + mc_answer + " " + student_explanation  # Combine all three parts

    # Transform the new sample with both TF-IDF and word embeddings
    X_new_tfidf = tfidf_vectorizer.transform([combined_text])
    X_new_embeddings = get_embedding_features([combined_text], word_embeddings, embedding_dim)
    X_new_transformed = np.hstack((X_new_tfidf.toarray(), X_new_embeddings))

    # Probability predictions from the final models
    proba_category = final_model_category.predict_proba(X_new_transformed)[0]
    proba_misconception = final_model_misconception.predict_proba(X_new_transformed)[0]

    # All class names
    category_names = le_category.classes_
    misconception_names = le_misconception.classes_

    # Score every (Category, Misconception) pair, treating the two model
    # outputs as independent probability distributions
    all_combinations = []
    for i, cat_name in enumerate(category_names):
        for j, mis_name in enumerate(misconception_names):
            combined_prob = proba_category[i] * proba_misconception[j]
            all_combinations.append((f"{cat_name}:{mis_name}", combined_prob))

    # Sort by probability, descending, and return the top K
    all_combinations.sort(key=lambda x: x[1], reverse=True)
    return all_combinations[:k]
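
# Illustrative call (hypothetical inputs; the label strings depend on the training data):
#   get_top_k_combined_predictions("What is 1/2 + 1/4?", "3/4",
#                                  "I added the numerators and the denominators.", k=3)
# returns a list like [("True_Correct:No_Misconception", 0.41), ...]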

# --- 8. Load test.csv and predict ---
print("\n--- Loading test.csv and predicting ---")

testfile_path = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'
try:
    df_test = pd.read_csv(testfile_path)
except FileNotFoundError:
    print("Error: 'test.csv' not found. Make sure the file is uploaded.")
    # Dummy test data for demonstration (MC_Answer included)
    df_test = pd.DataFrame({
        'row_id': [100000, 100001, 100002],
        'QuestionId': [99999, 88888, 77777],
        'QuestionText': ["What is 5 + 3?", "If x = 10, what is 2x?", "What is the capital of France?"],
        'MC_Answer': ["8", "20", "Paris"],
        'StudentExplanation': ["5 and 3 makes 8.", "2 times 10 is 20.", "It is the city of lights."]
    })
    print("Created sample test data for demonstration.")

# New column to store the combined top-3 predictions
df_test['Category:Misconception'] = None

# Iterate over the test DataFrame and predict row by row
for index, row in df_test.iterrows():
    question_text = row['QuestionText']
    student_explanation = row['StudentExplanation']
    mc_answer = row['MC_Answer']

    # Pass all three parts to the prediction function
    top_predictions = get_top_k_combined_predictions(question_text, mc_answer, student_explanation, k=3)
    # Keep only the "Category:Misconception" labels of the top-k predictions
    predictions_list_str = [label for label, _ in top_predictions]
    df_test.at[index, 'Category:Misconception'] = " ".join(predictions_list_str)

print("\nTest-set predictions complete.")

# Display the first few rows of the test DataFrame with predictions
print("\nSample predictions (first 5 rows):")
print(df_test[['row_id', 'Category:Misconception']].head().to_markdown(index=False, numalign="left", stralign="left"))
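
# Note: for large test sets, a single batched transform + predict_proba over all
# rows, followed by a vectorized top-k over the outer product of the two
# probability matrices, would be much faster than this per-row loop.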

# Save the results to a submission file
df_test.to_csv('submission.csv', index=False, columns=['row_id', 'Category:Misconception'])
print("\nPredictions saved to 'submission.csv'.")