# python打卡DAY22

##注入所需库

import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt

import random

import numpy as np

import time

import shap

# from sklearn.svm import SVC #支持向量机分类器

# # from sklearn.neighbors import KNeighborsClassifier #K近邻分类器

# # from sklearn.linear_model import LogisticRegression #逻辑回归分类器

# import xgboost as xgb #XGBoost分类器

# import lightgbm as lgb #LightGBM分类器

from sklearn.ensemble import RandomForestClassifier #随机森林分类器

# # from catboost import CatBoostClassifier #CatBoost分类器

# # from sklearn.tree import DecisionTreeClassifier #决策树分类器

# # from sklearn.naive_bayes import GaussianNB #高斯朴素贝叶斯分类器

# from skopt import BayesSearchCV

# from skopt.space import Integer

# from deap import base, creator, tools, algorithms

# from sklearn.model_selection import StratifiedKFold, cross_validate # 引入分层 K 折和交叉验证工具

# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # 用于评估分类器性能的指标

from sklearn.metrics import classification_report, confusion_matrix #用于生成分类报告和混淆矩阵

from sklearn.metrics import make_scorer#定义函数

# import warnings #用于忽略警告信息

# warnings.filterwarnings("ignore") # 忽略所有警告信息

#聚类

from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

#3D可视化

from mpl_toolkits.mplot3d import Axes3D

# Matplotlib setup: use a Chinese-capable font and make negative numbers render.
# axes.unicode_minus must be False here: when True, matplotlib draws the Unicode
# minus sign (U+2212), which CJK fonts commonly lack, so negative tick labels
# would show an empty box instead of "-".
plt.rcParams['font.sans-serif'] = ['STHeiti']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 100

# Load the raw data set.
data = pd.read_csv(r'data.csv')

# Fill missing values column by column:
#   - non-object (numeric) columns -> column mean
#   - object (string) columns      -> most frequent value (first mode)
for i in data.columns:
    if data[i].isnull().sum() == 0:
        continue  # nothing to fill in this column
    if data[i].dtype != 'object':
        fill_value = data[i].mean()
    else:
        fill_value = data[i].mode()[0]
    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: the in-place form operates on an intermediate object,
    # is deprecated in recent pandas, and leaves `data` unchanged under
    # copy-on-write.
    data[i] = data[i].fillna(fill_value)

# Ordinal codes for 'Years in current job': labels are listed in code order,
# starting at 0 for '10+ years' and ending at 10 for '< 1 year'.
_tenure_labels = [
    '10+ years',
    '9 years',
    '8 years',
    '7 years',
    '6 years',
    '5 years',
    '4 years',
    '3 years',
    '2 years',
    '1 year',
    '< 1 year',
]
mapping = {label: code for code, label in enumerate(_tenure_labels)}

# Replace the text labels with their integer codes.
data['Years in current job'] = data['Years in current job'].map(mapping)

# Re-read the raw file only to know which column names existed before encoding.
data2 = pd.read_csv(r'data.csv')

# One-hot encode the remaining object columns, dropping the first level of each.
data = pd.get_dummies(data=data, drop_first=True)

# Any column absent from the raw file was created by get_dummies.
dummies_list = [col for col in data.columns if col not in data2.columns]

# Cast the dummy columns to int (get_dummies may produce bool columns).
for i in dummies_list:
    data[i] = data[i].astype(int)

# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print(); doing so emitted an extra "None" line.
data.info()

# Build the feature matrix / target vector and hold out 20% of rows for testing.
from sklearn.model_selection import train_test_split

y = data['Credit Default']
x = data.drop(columns=['Credit Default', 'Id'])

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Resample the training split with SMOTE; the test split is left untouched.
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)


 

# Standardize the feature matrix before clustering. Clustering works on the
# features only (it derives a new feature from them) and does not involve the
# target variable.
scaler = StandardScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)

# #KMeans++

# k_range=range(2,5)

# inertia_value=[]

# silhouette_scores=[]

# ch_scores=[]

# db_scores=[]

# start_time=time.time()

# for k in k_range:

# kmeans=KMeans(n_clusters=k,random_state=42)

# kmeans_label=kmeans.fit_predict(x_scaled)#提供了每个数据点所属的簇的信息,用于区分不同簇的数据点

# inertia_value.append(kmeans.inertia_)

# silhouette=silhouette_score(x_scaled,kmeans_label)

# silhouette_scores.append(silhouette)

# ch=calinski_harabasz_score(x_scaled,kmeans_label)

# ch_scores.append(ch)

# db=davies_bouldin_score(x_scaled,kmeans_label)

# db_scores.append(db)

# # print(f'k={k}\n 惯性:{kmeans.inertia_:.2f}\n轮廓系数:{silhouette:.3f}\n CH系数:{ch:.2f}\n DB{db:.3f}')

# end_time=time.time()

# print(f'聚类分析耗时:{end_time-start_time:.4f}')

# #绘制评估指标图

# plt.figure(figsize=(12,6))

# #肘部法则图

# plt.subplot(2,2,1)

# plt.plot(k_range,inertia_value,marker='o')

# plt.title('肘部法则确定最优聚类数 k(惯性,越小越好)')

# plt.xlabel('聚类数 (k)')

# plt.ylabel('惯性')

# plt.grid(True)

# #轮廓系数图

# plt.subplot(2,2,2)

# plt.plot(k_range,silhouette_scores,marker='o',color='orange')

# plt.title('轮廓系数确定最优聚类数 k(越大越好)')

# plt.xlabel('聚类数 (k)')

# plt.ylabel('轮廓系数')

# plt.grid(True)

# #CH指数图

# plt.subplot(2,2,3)

# plt.plot(k_range,ch_scores,marker='o',color='red')

# plt.title('Calinski-Harabasz 指数确定最优聚类数 k(越大越好)')

# plt.xlabel('聚类数 (k)')

# plt.ylabel('CH 指数')

# plt.grid(True)

# #DB指数图

# plt.subplot(2,2,4)

# plt.plot(k_range,db_scores,marker='o',color='yellow')

# plt.xlabel('聚类数 (k)')

# plt.ylabel('DB 指数')

# plt.grid(True)

# plt.tight_layout()

# plt.show()

# Cluster the standardized features with the chosen k and attach each row's
# cluster label to x as a new column.
selected_k = 3

kmeans = KMeans(n_clusters=selected_k, random_state=42)
kmeans.fit(x_scaled)
kmeans_label = kmeans.labels_
x['KMeans_Cluster'] = kmeans_label

# Reduce the standardized features to 3 principal components.
pca = PCA(n_components=3)
x_pca = pca.fit_transform(x_scaled)

# # ##聚类可视化

# # plt.figure(figsize=(6,5))

# # sns.scatterplot(

# # x=x_pca[:,0],

# # y=x_pca[:,1],

# # hue=kmeans_label,

# # palette='viridis'

# # )

# # plt.title(f'KMean Clustering with k={selected_k} (PCA Visualization)')

# # plt.xlabel('PCA Component 1')

# # plt.ylabel('PCA Component 2')

# # plt.show()

# # #3D可视化

# pca=PCA(n_components=3)

# import plotly.express as px

# import plotly.graph_objects as go

# # 准备数据

# df_pca = pd.DataFrame(x_pca, columns=['PC1', 'PC2', 'PC3'])

# df_pca['Cluster'] = kmeans_label

# # 创建3D散点图

# fig = px.scatter_3d(df_pca, x='PC1', y='PC2', z='PC3', color='Cluster',

# color_continuous_scale=px.colors.sequential.Viridis,

# title=f'KMeans Clustering with k={selected_k} (PCA 3D Visualization)')

# # 调整图形

# fig.update_layout(scene=dict(xaxis_title='PCA Component 1',

# yaxis_title='PCA Component 2',

# zaxis_title='PCA Component 3'),

# width=1200, height=1000)

# # 显示图形

# fig.show()

# ##打印KMeans聚类前几行

# print(f'KMeans Cluster labels(k={selected_k}added to x):')

# print(x[['KMeans_Cluster']].value_counts())

# Fit a random forest that predicts the KMeans cluster label from the original
# features, compute SHAP values for it, and report the elapsed wall-clock time.
start_time = time.time()

x1 = x.drop('KMeans_Cluster', axis=1)
y1 = x['KMeans_Cluster']

rf1_model = RandomForestClassifier(random_state=42, class_weight='balanced')
rf1_model.fit(x1, y1)

explainer = shap.TreeExplainer(rf1_model)
shap_values = explainer.shap_values(x1)

# Depending on the installed SHAP version, shap_values is either a single
# ndarray or a list with one ndarray per class. A list has no .shape attribute,
# so use np.shape, which accepts both.
print(np.shape(shap_values))

end_time = time.time()
print(f'SHAP分析耗时:{end_time-start_time:.4f}')

# # --- 1. SHAP 特征重要性条形图 (Summary Plot - Bar) ---

# print("--- 1. SHAP 特征重要性条形图 ---")

# shap.summary_plot(shap_values[:,:,0],x1,plot_type='bar',show=False)

# plt.title('shap feature importance (bar plot)')

# plt.tight_layout()

# plt.show()

# Dummy-encoded columns singled out for the per-cluster inspection plots.
selected_features = [
    'Purpose_debt consolidation',
    'Home Ownership_Home Mortgage',
    'Purpose_home improvements',
    'Purpose_other',
]

# for feature in selected_features:

# unique_count=x[feature].nunique()

# print(f'{feature}的唯一值数量:{unique_count}')

# if unique_count<10:

# print(f'{feature}可能是离散型变量')

# else:

# print(f'{feature}可能是连续性变量')

# fig,axes=plt.subplots(2,2,figsize=(10,8))

# axes=axes.flatten()

# for i,feature in enumerate(selected_features):

# axes[i].hist(x[feature],bins=10)

# axes[i].set_title(f'histogram of {feature}')

# axes[i].set_xlabel(feature)

# axes[i].set_ylabel('frequency')

# plt.tight_layout()

# plt.show()

# Show how many rows fall into each cluster.
_cluster_sizes = x[['KMeans_Cluster']].value_counts()
print(_cluster_sizes)

# Split x into one sub-frame per cluster label 0..3.
# NOTE(review): KMeans is fitted with selected_k = 3 above, so only labels
# 0, 1 and 2 should occur and x_cluster3 is presumably always empty -- confirm
# whether it is still needed.
_cluster_col = x['KMeans_Cluster']
x_cluster0, x_cluster1, x_cluster2, x_cluster3 = (
    x[_cluster_col == label] for label in range(4)
)

# #簇0

# fig,axes=plt.subplots(2,2,figsize=(6,4))

# axes=axes.flatten()

# for i,feature in enumerate(selected_features):

# sns.countplot(x=x_cluster0[feature],ax=axes[i])

# axes[i].set_title(f'countplot of {feature}')

# axes[i].set_xlabel(feature)

# axes[i].set_ylabel('count')

# plt.tight_layout()

# plt.show()

# #簇1

# fig,axes=plt.subplots(2,2,figsize=(6,4))

# axes=axes.flatten()

# for i,feature in enumerate(selected_features):

# sns.countplot(x=x_cluster1[feature],ax=axes[i])

# axes[i].set_title(f'countplot of {feature}')

# axes[i].set_xlabel(feature)

# axes[i].set_ylabel('count')

# plt.tight_layout()

# plt.show()

# #簇2

# fig,axes=plt.subplots(2,2,figsize=(6,4))

# axes=axes.flatten()

# for i,feature in enumerate(selected_features):

# sns.countplot(x=x_cluster2[feature],ax=axes[i])

# axes[i].set_title(f'countplot of {feature}')

# axes[i].set_xlabel(feature)

# axes[i].set_ylabel('count')

# plt.tight_layout()

# plt.show()


 

# Recursive feature elimination: keep the 3 features ranked best by a random
# forest fitted on the SMOTE-resampled training data.
from sklearn.feature_selection import RFE

print("--- 递归特征消除 (RFE) ---")

base_model = RandomForestClassifier(random_state=42, class_weight='balanced')
rfe = RFE(estimator=base_model, n_features_to_select=3)
rfe.fit(x_train_smote, y_train_smote)

# Reduce both splits to the selected columns.
x_train_rfe = rfe.transform(x_train_smote)
x_test_rfe = rfe.transform(x_test)

# rfe.support_ is a boolean mask over the training columns.
selected_features_rfe = x_train.columns[rfe.support_]

print(f"RFE筛选后保留的特征数量: {len(selected_features_rfe)}")
print(f"保留的特征: {selected_features_rfe}")


 

# 3D scatter plot of the three RFE-selected features, coloured by KMeans cluster.
import plotly.express as px
import plotly.graph_objects as go

x_selected = x[selected_features_rfe]

# Work on an explicit copy: adding the 'cluster' column to a frame that still
# shares data with x_selected can emit SettingWithCopyWarning or, on older
# pandas, leak the new column into x_selected.
df_viz = x_selected.copy()
df_viz['cluster'] = x['KMeans_Cluster']

# The three selected feature names double as axis columns and axis titles.
axis_x, axis_y, axis_z = selected_features_rfe[:3]

fig = px.scatter_3d(
    df_viz,
    x=axis_x,
    y=axis_y,
    z=axis_z,
    color='cluster',
    color_continuous_scale=px.colors.sequential.Viridis,
    title='RFE特征选择的3D可视化',
)
fig.update_layout(
    scene=dict(
        xaxis_title=axis_x,
        yaxis_title=axis_y,
        zaxis_title=axis_z,
    ),
    width=1200,
    height=1000,
)
fig.show()

# Train a random forest on the RFE-selected features and evaluate it on the
# held-out test split.
rf_model_rfe = RandomForestClassifier(random_state=42, class_weight='balanced')

# x_train_rfe was derived from the SMOTE-resampled training features, so it must
# be paired with the resampled target y_train_smote. Using the original y_train
# gives a different number of rows and fit() raises a ValueError.
rf_model_rfe.fit(x_train_rfe, y_train_smote)

rf_pred_rfe = rf_model_rfe.predict(x_test_rfe)

print("\nRFE筛选后随机森林在测试集上的分类报告:")
print(classification_report(y_test, rf_pred_rfe))

print("RFE筛选后随机森林在测试集上的混淆矩阵:")
print(confusion_matrix(y_test, rf_pred_rfe))

# 决策树模型构建(基于处理后的初创企业收购数据)

---

## ✅ 目标

使用 `startup_encoded` 数据集训练一个**决策树分类器**,预测初创公司是否会被收购(`Acquired = 1 or 0`)。

---

## 🔧 步骤 1:准备特征和标签

```python
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd

# 特征矩阵 X(所有输入变量)
X = startup_encoded.drop('Acquired', axis=1)  # 删除目标列

# 目标向量 y(是否被收购)
y = startup_encoded['Acquired']
```

---

## 🔧 步骤 2:划分训练集和测试集

```python
# 按 80% 训练,20% 测试 分割数据
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # 保持类别比例一致
)
```

---

## 🔧 步骤 3:训练决策树模型

```python
# 创建决策树分类器
clf = DecisionTreeClassifier(
    random_state=42,
    max_depth=6,             # 控制树深,防止过拟合
    min_samples_split=10,    # 内部节点分裂最小样本数
    min_samples_leaf=5,      # 叶子节点最小样本数
    class_weight='balanced'  # 处理类别不平衡问题
)

# 训练模型
clf.fit(X_train, y_train)
```

---

## 🔍 步骤 4:模型评估

### (1) 预测测试集结果

```python
y_pred = clf.predict(X_test)
```

### (2) 输出准确率

```python
print("Accuracy:", accuracy_score(y_test, y_pred))
```

> 示例输出:
> ```
> Accuracy: 0.875
> ```

### (3) 混淆矩阵

```python
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
```

> 示例:
> ```
> [[15  3]
>  [ 2 10]]
> ```
> (行为真实类别,列为预测类别)
> - 正确预测未被收购:15
> - 错误预测为被收购:3(实际未被收购)
> - 错误预测为未被收购:2(实际被收购)
> - 正确预测被收购:10

### (4) 分类报告(精确率、召回率、F1-score)

```python
print("Classification Report:")
print(classification_report(y_test, y_pred))
```

> 示例输出片段:
> ```
>               precision    recall  f1-score   support
>            0       0.88      0.83      0.86        18
>            1       0.77      0.83      0.80        12
>     accuracy                           0.84        30
> ```

---

## 📊 步骤 5:可视化决策树(可选)

```python
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

plt.figure(figsize=(20, 10))
plot_tree(clf,
          feature_names=X.columns,
          class_names=['Not Acquired', 'Acquired'],
          filled=True,
          fontsize=10)
plt.title("Decision Tree for Startup Acquisition Prediction", fontsize=16)
plt.show()
```

📌 这将展示模型是如何根据 `Tech_AI`, `Valuation`, `Funding Stage` 等特征进行判断的。

---

## 💡 模型洞察与关键特征

你可以查看哪些特征最重要:

```python
# 获取特征重要性
importances = clf.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# 显示前10个重要特征
print(feature_importance_df.head(10))
```

> 典型输出示例:
>
> | Feature                | Importance |
> |------------------------|------------|
> | Tech_AI                | 0.32       |
> | Valuation              | 0.25       |
> | Funding Stage_Series B | 0.15       |
> | Annual Revenue         | 0.10       |

✅ **结论**:
- 使用 **AI 技术** 和拥有高 **估值** 的公司最可能被收购
- 融资阶段 **Series B** 是并购活跃期

---

## ✅ 总结:这个模型能做什么?

| 功能 | 说明 |
|------|------|
| 🎯 预测新公司被收购概率 | 输入一家公司的信息,输出“是否会被人收购” |
| 🔍 发现关键驱动因素 | 找出哪些技术、行业、财务指标影响最大 |
| 💼 辅助投资决策 | 帮助风投判断退出路径(IPO or 被收购) |

---

## 知识点

- **train_test_split 划分数据**:确保模型在未见数据上验证性能。
- **DecisionTreeClassifier 支持分类任务**:适合处理混合类型特征(数值+虚拟变量)。
- **feature_importances_ 揭示关键变量**:帮助理解模型逻辑,提升可解释性。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值