# Data analysis 02 (数据分析02)
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, f1_score
import joblib
## Load the data
df = pd.read_csv(r'C:\Users\78731\Desktop\Artificial Intelligence Trainer Competition\01_data analysis\credit_with_missing_values.csv')

## Missing-value handling: mean() / median() / mode() depending on the column
# Numeric amounts: fill gaps with the column mean.
for column in ('income', 'limit'):
    df[column] = df[column].fillna(df[column].mean())

# Age: fill gaps with the median.
df['age'] = df['age'].fillna(df['age'].median())

# Remaining columns: fill gaps with the most frequent value
# (mode() returns a Series; [0] picks its first entry).
for column in ('default_fre', 'default', 'gender', 'province'):
    df[column] = df[column].fillna(df[column].mode()[0])
# Encode gender as 0/1. Series.map yields NaN for any value that is not a key
# of the mapping, so labels other than 'male'/'female' become NaN here.
gender_mapping = {'male': 0, 'female': 1}
df['gender'] = df['gender'].map(gender_mapping)

# One-hot encode the province column with OneHotEncoder.
# The encoder keeps its default sparse output and the result is densified with
# .toarray() instead of passing `sparse=False`: that keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and removed in 1.4, so this form works
# on both old and new releases.
onehot_encoder = OneHotEncoder(dtype=int)  # integer 0/1 indicators
province_encoded = onehot_encoder.fit_transform(df[['province']]).toarray()  # input must be 2-D

# Replace the original column with one 0/1 indicator column per category.
# (Joining each row into a single string such as '0100' would make the model
# read the encoding as one arbitrary number instead of independent flags.)
province_columns = [f'province_{category}' for category in onehot_encoder.categories_[0]]
province_dummies = pd.DataFrame(province_encoded, columns=province_columns, index=df.index)
df = pd.concat([df.drop(columns='province'), province_dummies], axis=1)
# Handle outliers in the income and limit fields: values below the 1st
# percentile are raised to it, values above the 99th percentile are lowered to it.
for column in ('income', 'limit'):
    # Both bounds are taken from the column before it is modified.
    q_low = df[column].quantile(0.01)
    q_high = df[column].quantile(0.99)
    floored = np.where(df[column] < q_low, q_low, df[column])
    df[column] = np.where(floored > q_high, q_high, floored)
# Separate the target variable ('default') from the feature columns
y = df['default']
X = df.drop(columns='default')

# Split into training and test sets: 80% train, 20% test, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the resulting sample counts
print("训练集样本数: ", len(X_train))
print("测试集样本数: ", len(X_test))
# Base estimator; the grid below supplies the n_estimators values that are searched
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Parameter grid: 3 * 4 * 3 * 3 = 108 candidate combinations
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Build the GridSearchCV object: 5-fold cross-validation scored by accuracy
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
)

# Run the grid search on the training data
grid_search.fit(X_train, y_train)

# Take the best model found by the search
best_model = grid_search.best_estimator_

# Predict the test set with the best model
y_pred = best_model.predict(X_test)

# Persist the model to disk
joblib.dump(best_model, r'C:\Users\78731\Desktop\Artificial Intelligence Trainer Competition\01_data analysis\final_model.pkl')
# Evaluate the model on the held-out test set using the hard class predictions
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred))

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Threshold-based curves need continuous scores, not hard 0/1 labels: with
# labels the PR and ROC curves collapse to a single operating point and the
# AUC is misleading. Use the predicted probability of the positive class,
# i.e. the second column of predict_proba (best_model.classes_[1]).
y_score = best_model.predict_proba(X_test)[:, 1]

# PR curve and F1 score (F1 is computed from the hard predictions)
precision, recall, _ = precision_recall_curve(y_test, y_score)
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)
plt.figure()
plt.plot(recall, precision, marker='.')
plt.title('Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.show()

# ROC curve and AUC
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
print("AUC Score:", roc_auc)
plt.figure()
plt.plot(fpr, tpr, marker='.')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()