Experiment Requirements
Be able to preprocess the data; build a learning-failure early-warning model with the random forest algorithm; tune the random forest hyperparameters with grid search (a sketch of this step follows the task list below); and run comparison experiments against the support vector machine, logistic regression, and AdaBoost implementations in scikit-learn.
- Preprocess the data;
- Handle the class imbalance;
- Generate samples and standardize them;
- Build the model with the random forest algorithm;
- Analyze and visualize the results;
- Analyze feature importance;
- Compare with other algorithms and compute the accuracy, recall, F1 score, and AUC of each;
- Analyze the results of the different models.
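The main.py listing below stops short of the grid-search step named in the requirements. The following is a minimal sketch of how that step could be done with scikit-learn's GridSearchCV; the parameter grid and scoring choice are illustrative assumptions, not the values used in the original experiment.

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

def tune_random_forest(X_train, Y_train):
    # Illustrative grid; the ranges actually searched in the experiment may differ.
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
    }
    grid = GridSearchCV(RandomForestClassifier(random_state=27), param_grid,
                        scoring='f1', cv=5, n_jobs=-1)  # 5-fold cross-validation, optimize F1
    grid.fit(X_train, Y_train)
    print("Best parameters:", grid.best_params_)
    return grid.best_estimator_

The returned best_estimator_ can then be passed to the evaluate_model helper defined in main.py to compute the same metrics reported for the other classifiers.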
Code:
main.py
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from sklearn.svm import SVC
import tkinter as tk
from tkinter import Label, Button, Toplevel
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
import matplotlib.pyplot as plt
from pages import show_chart
# pandas display options
pd.set_option('display.max_columns', 100)
# Data loading and preprocessing
def load_and_preprocess_data():
    df = pd.read_csv('uwide.csv', encoding='utf-8')
    factor = pd.factorize(df['SEX'])  # factorize: encode the categorical column as integer codes
    df['SEX'] = factor[0]
    df = df.fillna(0)  # replace missing values with 0
    df['ssate'] = np.where(df['TOTALSCORE'] >= 60, 1, 0)  # new label column: 1 if the total score is at least 60, else 0
    # keep the selected features plus the label
    df = df[['BROWSER_COUNT', 'COURSE_COUNT', 'COURSE_AVG_SCORE', 'EXAM_AH_SCORE', 'EXAM_WRITEN_SCORE', 'EXAM_MIDDLE_SCORE',
             'EXAM_LAB', 'EXAM_PROGRESS', 'EXAM_GROUP_SCORE', 'EXAM_FACE_SCORE', 'EXAM_ONLINE_SCORE', 'NODEBB_CHANNEL_COUNT',
             'NODEBB_TOPIC_COUNT', 'COURSE_SUM_VIDEO_LEN', 'SEX', 'GRADE', 'EXAM_HOMEWORK', 'EXAM_LABSCORE', 'EXAM_OTHERSCORE',
             'NODEBB_PARTICIPATIONRATE', 'COURSE_WORKTIME', 'COURSE_WORKCOMPLETERATE', 'NODEBB_POSTSCOUNT',
             'NODEBB_NORMALBBSPOSTSCUONT', 'NODEBB_REALBBSARCHIVECOUNT', 'NORMALBBSARCHIVECOUNT', 'COURSE_WORKCOUNT',
             'HOMEWORKSCORE', 'WRITTENASSIGNMENTSCORE', 'MIDDLEASSIGNMENTSCORE', 'ssate']]
    return df
def balance_data(df):  # handle class imbalance by downsampling the majority class
    df_major = df[df.ssate == 1]
    df_minor = df[df.ssate == 0]
    df_major_down = df_major
    if len(df_major) > len(df_minor) * 8:
        # cap the majority class at eight times the size of the minority class
        new_major_count = len(df_minor) * 8
        df_major_down = resample(df_major, replace=False, n_samples=new_major_count, random_state=66)
    df_balanced = pd.concat([df_major_down, df_minor])
    return df_balanced
def split_and_scale_data(df):  # split into training and test sets, then resample and standardize
    X = df.iloc[:, :-1].values
    Y = df.iloc[:, -1].values
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=27)
    sm = SMOTE(random_state=27)  # SMOTE: synthetic minority over-sampling, applied to the training set only
    X_train_res, Y_train_res = sm.fit_resample(X_train, Y_train)
    scaler = StandardScaler()  # standardize features to zero mean and unit variance
    X_train_res = scaler.fit_transform(X_train_res)  # fit the scaler on the training data, then transform it
    X_test = scaler.transform(X_test)  # transform the test data with the training statistics
    return X_train_res, X_test, Y_train_res, Y_test
# Model training and evaluation
def evaluate_model(clf, X_train, Y_train, X_test, Y_test):
    clf.fit(X_train, Y_train)
    Y_pred = clf.predict(X_test)
    # use class probabilities for the AUC when the classifier supports them, otherwise fall back to the hard predictions
    Y_pred_proba = clf.predict_proba(X_test)[:, 1] if hasattr(clf, "predict_proba") else Y_pred
    return {
        "model": clf,  # the fitted model object
        "accuracy": accuracy_score(Y_test, Y_pred),
        "recall": recall_score(Y_test, Y_pred),  # share of actual positives the model predicts as positive
        "precision": precision_score(Y_test, Y_pred),  # share of predicted positives that are actually positive
        "f1_score": f1_score(Y_test, Y_pred),  # harmonic mean of precision and recall
        "roc_auc": roc_auc_score(Y_test, Y_pred_proba)  # area under the ROC curve
    }
# Feature importance
def calculate_feature_importance(clf, feature_names):
    importances = clf.feature_importances_  # importance scores from the fitted random forest
    indices = np.argsort(importances)[::-1]  # feature indices sorted by descending importance
    feature_importance = []
    for f in range(len(feature_names)):
        feature_importance.append((feature_names[indices[f]], importances[indices[f]]))
    return feature_importance
# Feature importance chart
def visualize_feature_importance(feature_importances, parent_window=None):
    new_window = Toplevel()  # top-level window for the chart
    new_window.title("Feature Importance Visualization")
    window_width = 1400
    window_height = 800
    screen_width = new_window.winfo_screenwidth()
    screen_height = new_window.winfo_screenheight()
    # compute the top-left corner so the window is centred on the screen
    window_x = (screen_width - window_width) // 2
    window_y = (screen_height - window_height) // 2 - 50
    new_window.geometry(f"{window_width}x{window_height}+{window_x}+{window_y}")
    new_window.configure(bg="AliceBlue")
    new_window.resizable(True, True)
    fig, ax = plt.subplots(figsize=(12, 8))  # enlarge the figure so all feature labels fit
    y_pos = range(len(feature_importances))
    ax.barh(y_pos, [imp[1] for imp in feature_importances], align="center")  # horizontal bar chart
    ax.set_yticks(y_pos)  # y-axis tick positions
    ax.set_yticklabels([imp[0] for imp in feature_importances], fontsize=10)  # feature names as tick labels
    ax.set_xlabel("Feature Importance")
    ax.set_ylabel("Feature")
    ax.set_title("Random Forest Feature Importance")
    plt.tight_layout()  # avoid overlapping labels
    canvas = FigureCanvasTkAgg(fig, master=new_window)  # embed the Matplotlib figure in the Tkinter window
    canvas.draw()
    canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1)  # let the canvas expand to fill its parent
    if parent_window:
        parent_window.attributes('-topmost', False)  # drop the main window's always-on-top flag
    new_window.mainloop()
# Random forest results chart
def visualize_rf_results(results, parent_window=None):
    new_window = Toplevel()
    new_window.title("Random Forest Model Results Visualization")
    window_width = 1400
    window_height = 900
    screen_width = new_window.winfo_screenwidth()
    screen_height = new_window.winfo_screenheight()
    # compute the top-left corner so the window is centred on the screen
    window_x = (screen_width - window_width) // 2
    window_y = (screen_height - window_height) // 2 - 50
    new_window.geometry(f"{window_width}x{window_height}+{window_x}+{window_y}")
    new_window.configure(bg="AliceBlue")
    new_window.resizable(False, False)
    fig, ax = plt.subplots(2, 2, figsize=(12, 10))  # 2 x 2 grid of subplots, one per metric
    metrics = ["accuracy", "recall", "precision", "f1_score"]
    for i, metric in enumerate(metrics):
        row, col = i // 2, i % 2  # subplot position
        ax[row, col].bar(["Random Forest"], [results["Random Forest"][metric]], width=0.4)  # one bar per metric
        ax[row, col].set_title(metric.capitalize(), fontsize=14)
        # annotate each bar with its value, offset in pixels above the bar
        for bar in ax[row, col].patches:
            ax[row, col].annotate(f"{bar.get_height():.8f}", (bar.get_x() + bar.get_width() / 2, bar.get_height()),
                                  ha='center', va='center', xytext=(0, 5), textcoords='offset points')
    plt.tight_layout()  # adjust subplot spacing automatically
    canvas = FigureCanvasTkAgg(fig, master=new_window)
    canvas.draw()
    canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1)
    if parent_window:
        parent_window.attributes('-topmost', False)  # drop the main window's always-on-top flag
    new_window.mainloop()
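The listing above ends with the visualization helpers and omits the program entry point. Below is a minimal sketch of a driver that chains the functions above and compares the four classifiers named in the requirements; the hyperparameters and the hidden Tk root are assumptions, and the original project evidently wraps this in a fuller Tkinter interface (splash page, main page) as the screenshots show.

def main():
    df = load_and_preprocess_data()
    df = balance_data(df)
    X_train, X_test, Y_train, Y_test = split_and_scale_data(df)
    classifiers = {
        "Random Forest": RandomForestClassifier(n_estimators=200, random_state=27),
        "SVM": SVC(probability=True, random_state=27),  # probability=True so ROC AUC uses real scores
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "AdaBoost": AdaBoostClassifier(random_state=27),
    }
    results = {name: evaluate_model(clf, X_train, Y_train, X_test, Y_test)
               for name, clf in classifiers.items()}
    feature_names = list(df.columns[:-1])
    importances = calculate_feature_importance(results["Random Forest"]["model"], feature_names)
    root = tk.Tk()
    root.withdraw()  # hidden root so the Toplevel chart windows have a parent
    visualize_rf_results(results)
    visualize_feature_importance(importances)

if __name__ == "__main__":
    main()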
pages.py
import tkinter as tk
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
import matplotlib.pyplot as plt
import numpy as np
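The pages.py listing is truncated to its imports, while main.py imports show_chart from it. The following is a guess at a minimal show_chart, assuming it embeds a bar chart of one metric across the compared models into a given Tkinter container; the real signature and layout in the project may differ.

def show_chart(parent, title, labels, values):
    # Bar chart of one metric across several models, embedded in the given Tkinter widget.
    fig, ax = plt.subplots(figsize=(8, 5))
    x_pos = np.arange(len(labels))
    ax.bar(x_pos, values, width=0.5)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(labels)
    ax.set_title(title)
    canvas = FigureCanvasTkAgg(fig, master=parent)
    canvas.draw()
    canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1)
    return canvas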
Full code package
Learning Failure Early Warning Based on Classification Algorithms [Visualization] – AGLUO@world
Screenshots of the Results
1. Splash screen
2. Main page
3. Random forest results
4. Feature importance analysis
5. Algorithm comparison – accuracy
6. Algorithm comparison – precision
7. Algorithm comparison – recall
8. Algorithm comparison – F1 score
9. Algorithm comparison – ROC AUC