【XGBoost】两个单任务的模型 MAP - Charting Student Math Misunderstandings-优快云博客

#!pip install xgboost scikit-learn matplotlib pandas scipy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV # 导入 RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from scipy.stats import uniform, randint # 导入用于参数分布的模块

# --- 1. 数据加载与预处理 ---
# Load the dataset
file_path = '/kaggle/input/map-charting-student-math-misunderstandings/train.csv'
df_train = pd.read_csv(file_path)

# Fill NaN in 'Misconception' with 'No_Misconception'
df_train['Misconception'] = df_train['Misconception'].fillna('No_Misconception')

# Combine QuestionText, MC_Answer, and StudentExplanation
df_train['CombinedText'] = df_train['QuestionText'] + " " + df_train['MC_Answer'] + " " + df_train['StudentExplanation']


# --- 1.1. 加载 GloVe 词向量 ---
print("--- 正在加载 GloVe 词向量 ---")
# Adjust this path if your GloVe file is located elsewhere
# glove_file_path = '/kaggle/input/glove-global-vectors-for-word-representation/glove.6B.100d.txt'
glove_file_path = '/kaggle/input/dataword/glove.6B.100d.txt'
word_embeddings = {}
embedding_dim = 100 # Using 100-dimensional GloVe vectors

try:
    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            word_embeddings[word] = vector
    print(f"已加载 {len(word_embeddings)} 个词的 GloVe 词向量 (维度: {embedding_dim})。")
except FileNotFoundError:
    print(f"错误: GloVe 文件 '{glove_file_path}' 未找到。请确保文件已上传或路径正确。")
    print("将跳过词嵌入，使用一个简化的特征提取器进行演示。")
    # Fallback: create a dummy word_embeddings if file not found for demonstration
    word_embeddings = {"dummy": np.zeros(embedding_dim)} # Placeholder
    embedding_dim = 100 # Still define dimension for consistency


# --- 1.2. 创建词嵌入特征提取器函数 ---
def get_embedding_features(texts, word_embeddings, embedding_dim):
    """
    将文本列表转换为词嵌入特征矩阵。
    每个文本的特征是其所有词向量的平均值。
    """
    features_matrix = np.zeros((len(texts), embedding_dim))
    for i, text in enumerate(texts):
        words = text.lower().split() # Convert to lowercase and split into words
        word_vectors = []
        for word in words:
            if word in word_embeddings:
                word_vectors.append(word_embeddings[word])
        
        if word_vectors: # If there are any valid word vectors
            features_matrix[i] = np.mean(word_vectors, axis=0)
        # else: features_matrix[i] remains all zeros (for empty texts or OOV texts)
            
    return features_matrix

# --- 1.3. 使用 TF-IDF 和词嵌入提取特征并拼接 ---
print("--- 正在使用 TF-IDF 和词嵌入提取文本特征并拼接 ---")
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_tfidf = tfidf_vectorizer.fit_transform(df_train['CombinedText'])

# Word Embedding features
X_embeddings = get_embedding_features(df_train['CombinedText'].astype(str).tolist(), word_embeddings, embedding_dim)

# Concatenate TF-IDF and Word Embedding features
X_text = np.hstack((X_tfidf.toarray().astype(np.float32), X_embeddings)) # Convert sparse TF-IDF to dense before hstack
print(f"组合后的文本特征矩阵形状: {X_text.shape}")


# Label Encoding for 'Category'
le_category = LabelEncoder()
le_category.fit(df_train['Category'])
y_category = le_category.transform(df_train['Category'])

# Label Encoding for 'Misconception'
le_misconception = LabelEncoder()
le_misconception.fit(df_train['Misconception'])
y_misconception = le_misconception.transform(df_train['Misconception'])


# --- 2. 定义计算 MAP@K 的函数 ---
def mean_average_precision_at_k(y_true, y_pred_proba, k=3):
    """
    计算 Mean Average Precision @ K (MAP@K)。

    Args:
        y_true (np.array): 真实标签的数组（整数编码）。
        y_pred_proba (np.array): 预测概率的数组，形状为 (n_samples, n_classes)。
        k (int): 考虑的前 K 个预测。

    Returns:
        float: MAP@K 值。
    """
    average_precisions = []
    for i in range(len(y_true)):
        true_label = y_true[i]
        probas_for_sample = y_pred_proba[i]
        top_k_indices = np.argsort(probas_for_sample)[::-1][:k]

        precision_at_k = 0
        num_correct = 0
        for j, predicted_index in enumerate(top_k_indices):
            if predicted_index == true_label:
                num_correct += 1
                precision_at_k += num_correct / (j + 1)

        if true_label in top_k_indices:
            average_precisions.append(precision_at_k / num_correct if num_correct > 0 else 0)
        else:
            average_precisions.append(0)

    return np.mean(average_precisions)


# --- 3. 定义 XGBoost 参数分布用于 Random Search ---
# 定义一个通用的参数分布，可以根据需要调整范围
param_dist = {
    'n_estimators': randint(100, 500), # 树的数量
    'learning_rate': uniform(0.01, 0.2), # 学习率
    'max_depth': randint(3, 10), # 树的最大深度
    'subsample': uniform(0.6, 0.4), # 每次迭代采样训练样本的比例
    'colsample_bytree': uniform(0.6, 0.4), # 每次迭代采样特征的比例
    'gamma': uniform(0, 0.5), # 剪枝的最小损失减少
    'reg_alpha': uniform(0, 1), # L1 正则化
    'reg_lambda': uniform(0, 1) # L2 正则化
}


# --- 4. Category 模型训练与评估 (XGBoost with Random Search) ---

print("--- 正在对 Category 模型执行 Random Search ---")
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
    X_text, y_category, test_size=0.2, random_state=42, stratify=y_category
)

xgb_cat = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(le_category.classes_),
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1,
    device='cuda' # 启用GPU加速，替代 tree_method='gpu_hist' 和 predictor='gpu_predictor'
)

# RandomizedSearchCV for Category Model
random_search_cat = RandomizedSearchCV(
    estimator=xgb_cat,
    param_distributions=param_dist,
    n_iter=50, # 迭代次数，可以根据计算资源调整
    cv=3, # 交叉验证折数
    scoring='f1_weighted', # 评估指标，考虑到类别不平衡，使用加权F1
    verbose=1, # 打印训练过程
    random_state=42,
    n_jobs=-1
)

random_search_cat.fit(X_train_cat, y_train_cat)

print("\nCategory 模型最佳参数:", random_search_cat.best_params_)
print("Category 模型最佳F1分数 (训练集):", random_search_cat.best_score_)

# Use the best estimator found by Random Search for evaluation
model_category_eval = random_search_cat.best_estimator_

# Predict and evaluate Category model on the TEST SET
y_pred_cat_test = model_category_eval.predict(X_test_cat)
y_pred_proba_cat_test = model_category_eval.predict_proba(X_test_cat)

print(f"\nCategory 模型测试集准确率: {accuracy_score(y_test_cat, y_pred_cat_test):.4f}")
print("\nCategory 模型测试集分类报告:")
print(classification_report(y_test_cat, y_pred_cat_test, target_names=le_category.classes_))

# Output Category model's MAP@3 on test set
map_at_3_cat_test = mean_average_precision_at_k(y_test_cat, y_pred_proba_cat_test, k=3)
print(f"\nCategory 模型测试集 Mean Average Precision @3 (MAP@3): {map_at_3_cat_test:.4f}")

# Calculate and display Confusion Matrix for Category model on test set
print("\nCategory 模型测试集混淆矩阵:")
cm_category_test = confusion_matrix(y_test_cat, y_pred_cat_test)
disp_cat_test = ConfusionMatrixDisplay(confusion_matrix=cm_category_test, display_labels=le_category.classes_)
fig_cat_test, ax_cat_test = plt.subplots(figsize=(8, 6))
disp_cat_test.plot(cmap=plt.cm.Blues, ax=ax_cat_test)
ax_cat_test.set_title('Category 模型测试集混淆矩阵 (XGBoost - Random Search)')
plt.show()


# --- 5. Misconception 模型训练与评估 (XGBoost with Random Search) ---

print("\n--- 正在对 Misconception 模型执行 Random Search ---")
X_train_mis, X_test_mis, y_train_mis, y_test_mis = train_test_split(
    X_text, y_misconception, test_size=0.2, random_state=42, stratify=y_misconception
)

xgb_mis = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(le_misconception.classes_),
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1,
    device='cuda' # 启用GPU加速，替代 tree_method='gpu_hist' 和 predictor='gpu_predictor'
)

# RandomizedSearchCV for Misconception Model
random_search_mis = RandomizedSearchCV(
    estimator=xgb_mis,
    param_distributions=param_dist,
    n_iter=50, # 迭代次数，可以根据计算资源调整
    cv=3, # 交叉验证折数
    scoring='f1_weighted', # 评估指标
    verbose=1, # 打印训练过程
    random_state=42,
    n_jobs=-1
)

random_search_mis.fit(X_train_mis, y_train_mis)

print("\nMisconception 模型最佳参数:", random_search_mis.best_params_)
print("Misconception 模型最佳F1分数 (训练集):", random_search_mis.best_score_)

# Use the best estimator found by Random Search for evaluation
model_misconception_eval = random_search_mis.best_estimator_

# Predict and evaluate Misconception model on the TEST SET
y_pred_mis_test = model_misconception_eval.predict(X_test_mis)
y_pred_proba_mis_test = model_misconception_eval.predict_proba(X_test_mis)

print(f"\nMisconception 模型测试集准确率: {accuracy_score(y_test_mis, y_pred_mis_test):.4f}")
print("\nMisconception 模型测试集分类报告:")
print(classification_report(y_test_mis, y_pred_mis_test, target_names=le_misconception.classes_))

# Output Misconception model's MAP@3 on test set
map_at_3_mis_test = mean_average_precision_at_k(y_test_mis, y_pred_proba_mis_test, k=3)
print(f"\nMisconception 模型测试集 Mean Average Precision @3 (MAP@3): {map_at_3_mis_test:.4f}")

# Calculate and display Confusion Matrix for Misconception model on test set
print("\nMisconception 模型测试集混淆矩阵:")
cm_misconception_test = confusion_matrix(y_test_mis, y_pred_mis_test)
disp_mis_test = ConfusionMatrixDisplay(confusion_matrix=cm_misconception_test, display_labels=le_misconception.classes_)
fig_mis_test, ax_mis_test = plt.subplots(figsize=(15, 12))
disp_mis_test.plot(cmap=plt.cm.Blues, ax=ax_mis_test)
ax_mis_test.set_title('Misconception 模型测试集混淆矩阵 (XGBoost - Random Search)')
plt.show()

print("\n模型评估完成。")


# --- 6. 重新训练最终模型 (在整个训练集上) 用于对 test.csv 的预测 ---
print("\n--- 正在整个训练集上重新训练最终模型，用于对 test.csv 的预测 ---")
# Use the best parameters found by Random Search for final models
final_model_category = xgb.XGBClassifier(
    objective='multi:softmax', num_class=len(le_category.classes_),
    eval_metric='mlogloss', use_label_encoder=False,
    random_state=42, n_jobs=-1,
    device='cuda', # 启用GPU加速，替代 tree_method='gpu_hist' 和 predictor='gpu_predictor'
    **random_search_cat.best_params_ # 使用最佳参数
)
final_model_category.fit(X_text, y_category) # Train on FULL df_train data

final_model_misconception = xgb.XGBClassifier(
    objective='multi:softmax', num_class=len(le_misconception.classes_),
    eval_metric='mlogloss', use_label_encoder=False,
    random_state=42, n_jobs=-1,
    device='cuda', # 启用GPU加速，替代 tree_method='gpu_hist' 和 predictor='gpu_predictor'
    **random_search_mis.best_params_ # 使用最佳参数
)
final_model_misconception.fit(X_text, y_misconception) # Train on FULL df_train data
print("最终模型已在整个训练集上重新训练完毕。")


# --- 7. 定义获取前 K 个预测组合的函数 (现在使用 final_models) ---
def get_top_k_combined_predictions(question_text, mc_answer, student_explanation, k=3): # Added mc_answer parameter
    """
    为给定数据生成前 K 个 Category : Misconception 预测组合。

    Args:
        question_text (str): 问题文本。
        mc_answer (str): 多项选择答案。
        student_explanation (str): 学生解释文本。
        k (int): 要返回的最高概率组合的数量。

    Returns:
        list: 包含前 K 个 (Category : Misconception, probability) 元组的列表。
    """
    # Prepare new data
    combined_text = question_text + " " + mc_answer + " " + student_explanation # Correctly combine all three parts
    
    # Transform new data using both TF-IDF and Word Embeddings
    X_new_tfidf = tfidf_vectorizer.transform([combined_text])
    X_new_embeddings = get_embedding_features([combined_text], word_embeddings, embedding_dim)
    X_new_transformed = np.hstack((X_new_tfidf.toarray(), X_new_embeddings))


    # Get probability predictions from final_models
    proba_category = final_model_category.predict_proba(X_new_transformed)[0]
    proba_misconception = final_model_misconception.predict_proba(X_new_transformed)[0]

    # Get all class names
    category_names = le_category.classes_
    misconception_names = le_misconception.classes_

    # Combine probabilities and store all possibilities
    all_combinations = []
    for i, cat_name in enumerate(category_names):
        for j, mis_name in enumerate(misconception_names):
            combined_prob = proba_category[i] * proba_misconception[j]
            all_combinations.append((f"{cat_name}:{mis_name}", combined_prob))

    # Sort by probability in descending order and get top K results
    all_combinations.sort(key=lambda x: x[1], reverse=True)

    return all_combinations[:k]


# --- 8. 加载 test.csv 并进行预测 ---
print("\n--- 正在加载 test.csv 并进行预测 ---")

testfile_path = '/kaggle/input/map-charting-student-math-misunderstandings/test.csv'

try:
    df_test = pd.read_csv(testfile_path)
except FileNotFoundError:
    print("错误: 'test.csv' 文件未找到。请确保文件已上传。")
    df_test = pd.DataFrame({
        'row_id': [100000, 100001, 100002],
        'QuestionId': [99999, 88888, 77777],
        'QuestionText': [
            "What is 5 + 3?",
            "If x = 10, what is 2x?",
            "What is the capital of France?"
        ],
        'MC_Answer': [ # Ensure MC_Answer is in dummy data
            "8",
            "20",
            "Paris"
        ],
        'StudentExplanation': [
            "5 and 3 makes 8.",
            "2 times 10 is 20.",
            "It is the city of lights."
        ]
    })
    print("已创建示例测试数据进行演示。")


# Prepare test data for prediction - now correctly includes MC_Answer
# df_test['CombinedText'] = df_test['QuestionText'] + " " + df_test['MC_Answer'] + " " + df_test['StudentExplanation'] # This line is no longer needed here

# Create a new column to store the combined top 3 predictions
df_test['Category:Misconception'] = None

# Iterate through each row of the test DataFrame and make predictions
for index, row in df_test.iterrows():
    question_text = row['QuestionText']
    student_explanation = row['StudentExplanation']
    mc_answer = row['MC_Answer'] # Get MC_Answer from the row

    # Pass all three parts to the prediction function
    top_predictions = get_top_k_combined_predictions(question_text, mc_answer, student_explanation, k=3)

    predictions_list_str = []
    if len(top_predictions) > 0:
        predictions_list_str.append(f"{top_predictions[0][0]}")
    if len(top_predictions) > 1:
        predictions_list_str.append(f"{top_predictions[1][0]}")
    if len(top_predictions) > 2:
        predictions_list_str.append(f"{top_predictions[2][0]}")

    df_test.at[index, 'Category:Misconception'] = " ".join(predictions_list_str)


print("\n测试数据预测完成。")

# Display the first few rows of the test DataFrame with predictions
print("\n预测结果示例 (前5行):")
print(df_test[['row_id', 'Category:Misconception']].head().to_markdown(index=False, numalign="left", stralign="left"))

# You can save the results to a new CSV file if needed
df_test.to_csv('submission.csv', index=False, columns=['row_id', 'Category:Misconception'])
print("\n预测结果已保存到 'submission.csv'。")