import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score
import shap
def convert_value(value):
    """Convert a raw cell value to an ``int`` or, failing that, a ``float``.

    Cells may arrive as real ``bytes`` objects or as the *text* of a bytes
    literal (for example ``"b'22'"``).  The wrapper characters are removed
    before the numeric conversion is attempted.

    Args:
        value: A cell value: ``bytes``, ``str``, or anything whose ``str()``
            form is numeric.

    Returns:
        ``int`` when the cleaned text is an integer literal, otherwise
        ``float``.

    Raises:
        ValueError: If the cleaned text is neither an int nor a float.
    """
    if isinstance(value, bytes):
        value = value.decode('utf-8')
    # Trim outer whitespace first so a padded field such as " b'4' " can be
    # unwrapped, then drop the b / quote wrapper characters (both quote
    # styles), then trim whitespace that was inside the quotes.
    text = str(value).strip().strip("b'\"").strip()
    try:
        return int(text)
    except ValueError:
        pass
    try:
        return float(text)
    except ValueError:
        # Same exception type as before, but name the offending cell.
        raise ValueError(f'cannot convert {value!r} to a number') from None
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv('caesarian.csv')
# Convert every cell with convert_value.  DataFrame.applymap is deprecated
# since pandas 2.1 in favour of DataFrame.map, so use the new name when the
# installed pandas provides it and fall back to applymap otherwise.
_elementwise = df.map if hasattr(df, 'map') else df.applymap
df = _elementwise(convert_value)
# Print the column/dtype summary.
print('数据基本信息:')
df.info()
# Number of rows and columns decides how much of the table is printed.
rows, columns = df.shape
if rows < 100 and columns < 20:
    # Small table (fewer than 100 rows and fewer than 20 columns): print it all.
    print('数据全部内容信息:')
    print(df.to_csv(sep='\t', na_rep='nan'))
else:
    # Larger table: print only the first rows.
    print('数据前几行内容信息:')
    print(df.head().to_csv(sep='\t', na_rep='nan'))
# Global matplotlib settings: figure resolution, a font able to render the
# Chinese labels (SimHei, a font commonly available on Windows), and correct
# rendering of the minus sign.
plt.rcParams.update({
    'figure.dpi': 300,
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Correlation heat map of all columns.
corr = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr, cmap='coolwarm', annot=True)
# Rotate the x tick labels 45 degrees, right-aligned, with a small font.
plt.xticks(fontsize=8, rotation=45, ha='right')
plt.yticks(fontsize=5)
plt.title('数据相关性热力图')
print('热力图绘制完成')
plt.show()
# Draw a 2x3 grid of histograms, one panel per column of the dataset.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
# (column name, panel title) pairs, listed in row-major panel order.
_hist_panels = (
    ('Age', '年龄分布'),
    ('Delivery number', '分娩次数分布'),
    ('Delivery time', '分娩时间分布'),
    ('Blood of Pressure', '血压分布'),
    ('Heart Problem', '心脏问题分布'),
    ('Caesarian', '剖腹产分布'),
)
# axes.flat walks the grid row by row, matching the order of the pairs above.
for _hist_ax, (_hist_column, _hist_title) in zip(axes.flat, _hist_panels):
    sns.histplot(df[_hist_column], bins=10, kde=False, ax=_hist_ax)
    _hist_ax.set_title(_hist_title)
plt.tight_layout()
print('子图绘制完成')
plt.show()
# Features are every column except the target; the target is 'Caesarian'.
X = df.drop('Caesarian', axis=1)
y = df['Caesarian']
# Split BEFORE scaling.  Fitting the scaler on the full dataset would let the
# test rows influence the mean/variance used to transform the training rows
# (data leakage), which makes the test scores optimistic.  The same
# random_state on inputs of the same length selects the same rows as before.
_X_train_raw, _X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
# Standardise: learn the statistics from the training rows only, then apply
# that same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(_X_train_raw)
X_test = scaler.transform(_X_test_raw)
# Kept so that any later code referring to X_scaled still works; it now uses
# the training-set statistics.
X_scaled = scaler.transform(X)
print('训练集和测试集划分完成')
# Candidate classifiers, keyed by display name.  The tree-based models are
# seeded (same seed as the train/test split) so that the comparison below and
# the choice of the best model are reproducible from run to run.
models = {
    '逻辑回归': LogisticRegression(),
    '决策树': DecisionTreeClassifier(random_state=42),
    '随机森林': RandomForestClassifier(random_state=42),
    '支持向量机': SVC()
}
# Fit each candidate on the training set and score it on the test set.
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results[model_name] = {'准确率': accuracy, 'F1分数': f1}
# Report accuracy and F1 for every candidate.
print('模型评估结果:')
for model_name, metrics in results.items():
    print(f'{model_name}: 准确率 = {metrics["准确率"]:.4f}, F1分数 = {metrics["F1分数"]:.4f}')
# Select the candidate with the highest test accuracy (first one wins ties).
best_model_name = max(results, key=lambda name: results[name]['准确率'])
best_model = models[best_model_name]
print(f'最佳模型为 {best_model_name}')
# Hyper-parameter grid per candidate.  A single C/penalty grid is only valid
# for logistic regression; passing it to a tree, forest or SVC makes
# GridSearchCV raise "Invalid parameter".  For logistic regression the solver
# is pinned to liblinear because the default solver does not support the l1
# penalty, so half of the original grid could never be fitted.
_param_grids = {
    '逻辑回归': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear'],
    },
    '决策树': {
        'max_depth': [None, 3, 5, 10],
        'min_samples_split': [2, 5, 10],
    },
    '随机森林': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 3, 5, 10],
    },
    '支持向量机': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf'],
    },
}
param_grid = _param_grids[best_model_name]
# 5-fold cross-validated grid search on the training set only.
grid_search = GridSearchCV(best_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f'{best_model_name} 模型调参完成')
# Re-evaluate the tuned estimator on the held-out test set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f'调参后的最佳模型 {best_model_name}: 准确率 = {accuracy:.4f}, F1分数 = {f1:.4f}')
# Draw the SHAP summary plot; the explainer is only built when the selected
# model is logistic regression.
# NOTE(review): indentation is missing in this copy of the script. The three
# shap statements presumably form the body of the `if`; confirm whether the
# final print / plt.show() were meant to be inside it as well.
if best_model_name == '逻辑回归':
# Build a linear explainer from the tuned model and the training matrix.
explainer = shap.LinearExplainer(best_model, X_train)
# SHAP values are computed for the test rows.
shap_values = explainer.shap_values(X_test)
# The column names of X are passed so the plot is labelled by feature.
shap.summary_plot(shap_values, X_test, feature_names=X.columns)
print('SHAP 图绘制完成')
plt.show()