import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
from matplotlib import rcParams
import warnings
warnings.filterwarnings('ignore')
# Configure matplotlib so Chinese column names and plot labels display correctly
rcParams['font.sans-serif'] = ['SimHei']
rcParams['axes.unicode_minus'] = False
# Load the data
file_path = 'C:/Users/yuanx/Desktop/数据/附件1.xlsx'
df = pd.read_excel(file_path)
# Columns to process
numeric_columns = ['abs赔付差额', 'abs赔付差额/实际赔付金额', 'log(实际赔付差额)']
categorical_columns = ['异常原因', '商品类型']
# Handle missing values
imputer_cat = SimpleImputer(strategy='constant', fill_value='未知')
df['异常原因'] = imputer_cat.fit_transform(df[['异常原因']]).ravel()
df = df.dropna(subset=numeric_columns).copy()
# Build the preprocessor
scaler = StandardScaler()
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop=None)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numeric_columns),
        ('cat', Pipeline(steps=[('imputer', imputer_cat), ('encoder', encoder)]), categorical_columns)
    ],
    remainder='drop'
)
# Build the clustering pipeline
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('kmeans', kmeans)
])
# Fit the pipeline and obtain the cluster labels
labels = pipeline.fit_predict(df)
X_transformed = pipeline[:-1].transform(df)  # use transform, not fit_transform: the preprocessor is already fitted
# ========================
# 1. Silhouette plot (unchanged from the original)
# ========================
fig, ax1 = plt.subplots(1, 1, figsize=(8, 6))
silhouette_avg = silhouette_score(X_transformed, labels)
sample_silhouette_values = silhouette_samples(X_transformed, labels)
y_lower = 10
for i in range(n_clusters):
    ith_cluster_silhouette_values = sample_silhouette_values[labels == i]
    ith_cluster_silhouette_values.sort()
    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i
    color = cm.nipy_spectral(float(i) / n_clusters)
    ax1.fill_betweenx(np.arange(y_lower, y_upper),
                      0, ith_cluster_silhouette_values,
                      facecolor=color, edgecolor=color, alpha=0.7)
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    y_lower = y_upper + 10
ax1.set_title("轮廓图(Silhouette Plot)")
ax1.set_xlabel("轮廓系数值")
ax1.set_ylabel("簇标签")
ax1.axvline(x=silhouette_avg, color="red", linestyle="--", label=f'平均轮廓系数: {silhouette_avg:.2f}')
ax1.legend()
plt.tight_layout()
plt.show()
# ========================
# 2. Inverse-transform the cluster centers for interpretability
# ========================
# Get the cluster centers (in the standardized + one-hot-encoded space)
cluster_centers_encoded = pipeline.named_steps['kmeans'].cluster_centers_
# Helper that maps the encoded cluster centers back to the original feature space
def inverse_transform_cluster_centers(preprocessor, cluster_centers, numeric_cols, cat_cols):
    # The ColumnTransformer outputs the numeric block first, then the one-hot block
    n_numeric = len(numeric_cols)
    # Undo the standardization on the numeric part of the centers
    scaler = preprocessor.named_transformers_['num']
    centers_numeric_scaled = cluster_centers[:, :n_numeric]
    centers_numeric_raw = scaler.inverse_transform(centers_numeric_scaled)
    # Map the categorical part back from the OneHotEncoder output
    encoder = preprocessor.named_transformers_['cat']['encoder']
    cat_encoded_array = cluster_centers[:, n_numeric:]
    cat_data_start_idx = 0
    centers_categorical_raw = []
    for i, cat_col in enumerate(cat_cols):
        # Number of categories encoded for this column
        n_classes = encoder.categories_[i].shape[0]
        cat_end_idx = cat_data_start_idx + n_classes
        # Slice out each cluster's (soft) one-hot vector for this column
        cat_vectors = cat_encoded_array[:, cat_data_start_idx:cat_end_idx]
        # Take the highest-weight category as the representative of each cluster (soft assignment)
        predicted_classes = [encoder.categories_[i][np.argmax(vec)] for vec in cat_vectors]
        centers_categorical_raw.append(predicted_classes)
        cat_data_start_idx = cat_end_idx
    # Assemble the numeric and categorical parts into one DataFrame
    numeric_df = pd.DataFrame(centers_numeric_raw, columns=numeric_cols)
    categorical_df = pd.DataFrame(centers_categorical_raw, index=cat_cols).T
    result = pd.concat([numeric_df, categorical_df], axis=1)
    return result
# Run the inverse transform
cluster_centers_original = inverse_transform_cluster_centers(
    preprocessor, cluster_centers_encoded, numeric_columns, categorical_columns
)
print("各簇聚类中心(原始尺度):")
print(cluster_centers_original.round(4))
# ========================
# 3. Radar-chart visualization of the cluster centers (well suited to showing many features at once)
# ========================
from math import pi
def plot_radar_charts(data_df, title="聚类中心雷达图"):
    categories = data_df.columns.tolist()
    N = len(categories)
    # One angle per feature axis
    angles = [n / float(N) * 2 * pi for n in range(N)]
    angles += angles[:1]  # close the polygon
    # One polar subplot per cluster
    fig, axes = plt.subplots(1, n_clusters, subplot_kw=dict(polar=True), figsize=(15, 6))
    if n_clusters == 1:
        axes = [axes]
    # Min-max normalize for plotting so that no single feature dominates the chart
    normalized_df = data_df.copy()
    for col in numeric_columns:
        min_val, max_val = normalized_df[col].min(), normalized_df[col].max()
        if max_val != min_val:
            normalized_df[col] = (normalized_df[col] - min_val) / (max_val - min_val)
        else:
            normalized_df[col] = 0.5
    for col in categorical_columns:
        # Represent categorical values ordinally (by category order), then rescale to [0, 1]
        le = pd.Categorical(normalized_df[col]).codes
        normalized_df[col] = (le - le.min()) / (le.max() - le.min() + 1e-8)
    for i, ax in enumerate(axes):
        values = normalized_df.iloc[i].values.flatten().tolist()
        values += values[:1]
        ax.plot(angles, values, linewidth=2, linestyle='solid', label=f'簇 {i}')
        ax.fill(angles, values, alpha=0.3)
        ax.set_thetagrids([a * 180 / pi for a in angles[:-1]], categories)
        ax.set_title(f"簇 {i}", size=14, pad=20)
        ax.grid(True)
    fig.suptitle(title, fontsize=16, y=1.05)
    plt.tight_layout()
    plt.show()
# Draw the radar charts
plot_radar_charts(cluster_centers_original[numeric_columns + categorical_columns])
# ========================
# 4. Supplementary: 2-D scatter plot (still usable, but note it shows only the first two encoded dimensions)
# ========================
plt.figure(figsize=(8, 6))
plt.scatter(X_transformed[:, 0], X_transformed[:, 1], c=labels, cmap='viridis', alpha=0.6)
# Cluster-center projection (note: these are coordinates in the standardized + encoded space)
center_proj = pipeline.named_steps['kmeans'].cluster_centers_
plt.scatter(center_proj[:, 0], center_proj[:, 1], marker='X', s=200, c='red', label='聚类中心(编码空间)')
plt.title('聚类结果投影(前两个标准化数值特征)')
plt.xlabel('abs赔付差额(标准化)')
plt.ylabel('abs赔付差额/实际赔付金额(标准化)')
plt.legend()
plt.grid(True)
plt.show()

Before clustering, add a random forest step to compute feature importances and incorporate those importances into the clustering as feature weights; a sketch of one way to do this follows below.
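The note above does not specify a supervision target for the random forest, so the following is a minimal sketch under one common reading: fit a RandomForestClassifier on the preliminary K-Means labels as a pseudo-target (swap in a real label column if that is the intended supervision), take its feature_importances_ over the encoded columns, and rescale those columns before re-running K-Means. The names rf, importances, X_weighted, kmeans_weighted, and labels_weighted are introduced here purely for illustration.

from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the preliminary cluster labels (a pseudo-target used only
# for illustration; replace `labels` with a real target column if one exists)
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_transformed, labels)

# One importance value per encoded column (3 standardized numeric features + one-hot dummies)
importances = rf.feature_importances_

# Scaling each column by sqrt(importance) means the squared Euclidean distance that
# K-Means minimizes weights each feature's squared difference by its importance
X_weighted = X_transformed * np.sqrt(importances)

# Re-run K-Means on the weighted features
kmeans_weighted = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
labels_weighted = kmeans_weighted.fit_predict(X_weighted)
print(f"Average silhouette after weighting: {silhouette_score(X_weighted, labels_weighted):.4f}")

Scaling by the raw importances instead of their square roots would weight the squared differences by the squared importances, which suppresses low-importance features more aggressively; either choice is defensible, but the exponent should be stated explicitly.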