C:\Users\w0928\PycharmProjects\PythonProject5\.venv\Scripts\python.exe C:\Users\w0928\PycharmProjects\PythonProject5\.venv\customer_segmentation.py
正在加载数据...
元数据表 shape: (47, 6)
客户表 shape: (41138, 18)
订单表 shape: (110500, 28)
--- 元数据前5行 ---
ID ... Intro
0 223563eaa3dd11eb91291e0062022f71 ... 该表记录了客户的付款信息,包括客户信息、运输的费用信息等。
1 2235d5faa3dd11eb91291e0062022f71 ... NaN
2 2235f65ca3dd11eb91291e0062022f71 ... NaN
3 2236046ca3dd11eb91291e0062022f71 ... NaN
4 22361308a3dd11eb91291e0062022f71 ... NaN
[5 rows x 6 columns]
Traceback (most recent call last):
File "C:\Users\w0928\PycharmProjects\PythonProject5\.venv\customer_segmentation.py", line 124, in <module>
X_scaled = scaler.fit_transform(X)
File "C:\Users\w0928\PycharmProjects\PythonProject5\.venv\Lib\site-packages\sklearn\utils\_set_output.py", line 316, in wrapped
data_to_wrap = f(self, X, *args, **kwargs)
File "C:\Users\w0928\PycharmProjects\PythonProject5\.venv\Lib\site-packages\sklearn\base.py", line 894, in fit_transform
return self.fit(X, **fit_params).transform(X)
~~~~~~~~^^^^^^^^^^^^^^^^^
File "C:\Users\w0928\PycharmProjects\PythonProject5\.venv\Lib\site-packages\sklearn\preprocessing\_data.py", line 907, in fit
return self.partial_fit(X, y, sample_weight)
~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\w0928\PycharmProjects\PythonProject5\.venv\Lib\site-packages\sklearn\base.py", line 1365, in wrapper
return fit_method(estimator, *args, **kwargs)
File "C:\Users\w0928\PycharmProjects\PythonProject5\.venv\Lib\site-packages\sklearn\preprocessing\_data.py", line 943, in partial_fit
X = validate_data(
self,
...<4 lines>...
reset=first_call,
)
File "C:\Users\w0928\PycharmProjects\PythonProject5\.venv\Lib\site-packages\sklearn\utils\validation.py", line 2954, in validate_data
out = check_array(X, input_name="X", **check_params)
File "C:\Users\w0928\PycharmProjects\PythonProject5\.venv\Lib\site-packages\sklearn\utils\validation.py", line 1128, in check_array
raise ValueError(
...<3 lines>...
)
ValueError: Found array with 0 sample(s) (shape=(0, 3)) while a minimum of 1 is required by StandardScaler.出现了这个错误给我修改代码,顺便把其他错误也改一改代码如下# -*- coding: utf-8 -*-
"""
快递企业客户群识别 —— 基于K-means的客户价值分群
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import warnings
import chardet
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = ['SimHei'] # 支持中文显示
plt.rcParams['axes.unicode_minus'] = False
def get_encoding(file_path, sample_size=1_000_000):
    """Detect the text encoding of ``file_path`` using chardet.

    Only the first ``sample_size`` bytes are read: a prefix is normally
    enough for detection and avoids loading a multi-hundred-MB CSV into
    memory just to sniff its encoding.

    Parameters
    ----------
    file_path : str
        Path of the file to probe.
    sample_size : int, optional
        Maximum number of bytes to feed to chardet (default 1 MB).

    Returns
    -------
    str
        The detected encoding name. Falls back to ``'utf-8'`` when chardet
        returns ``None`` (e.g. for an empty file), so ``pd.read_csv`` never
        receives ``encoding=None`` unintentionally.
    """
    with open(file_path, 'rb') as f:
        raw = f.read(sample_size)
    result = chardet.detect(raw)
    return result['encoding'] or 'utf-8'
# -------------------------------
# Step 1: load the three CSV files
# -------------------------------
print("正在加载数据...")
# Detect the encoding first, then read bwds_meta.csv with it
meta_encoding = get_encoding('bwds_meta.csv')
meta_df = pd.read_csv('bwds_meta.csv', encoding=meta_encoding)
# Detect the encoding first, then read Customer.csv with it
customer_encoding = get_encoding('Customer.csv')
customer_df = pd.read_csv('Customer.csv', encoding=customer_encoding)
# Detect the encoding first, then read order.csv with it
order_encoding = get_encoding('order.csv')
order_df = pd.read_csv('order.csv', encoding=order_encoding)
print(f"元数据表 shape: {meta_df.shape}")
print(f"客户表 shape: {customer_df.shape}")
print(f"订单表 shape: {order_df.shape}")
# -------------------------------
# Step 2: inspect column meanings (optional)
# -------------------------------
print("\n--- 元数据前5行 ---")
print(meta_df.head())
# -------------------------------
# Step 3: data cleaning and preprocessing
# -------------------------------
# Rename the Chinese column headers to English identifiers for easier handling.
# (The left-hand keys must match the CSV headers exactly — do not translate them.)
customer_df.rename(columns={
    '客户账号': 'customer_id',
    '运单数': 'total_shipments',
    '业务量': 'monetary',
    '计费重量': 'charge_weight',
    '主要始发站': 'main_origin',
    '主要终点站': 'main_destination',
    '开始城市': 'origin_city',
    '终点城市': 'dest_city',
    '收益': 'revenue',
    '近期合作月份_time': 'last_month',
    '创收站': 'revenue_station',
    '流失情况': 'churn_status'
}, inplace=True)
order_df.rename(columns={
    '客户账号': 'customer_id',
    '客户名称': 'customer_name',
    '收入金额': 'income_amount',
    '收入月份_time': 'income_month',
    '始发站': 'origin_station',
    '目的站': 'dest_station',
    '录入日期_time': 'entry_date',
    '签字日期_time': 'signed_date'
}, inplace=True)
# Normalise customer_id to a stripped string on BOTH tables so the later
# merges join on identical keys (mixed int/str ids would silently mismatch).
customer_df['customer_id'] = customer_df['customer_id'].astype(str).str.strip()
order_df['customer_id'] = order_df['customer_id'].astype(str).str.strip()
# Parse date columns; unparseable values become NaT instead of raising.
order_df['entry_date'] = pd.to_datetime(order_df['entry_date'], errors='coerce')
order_df['signed_date'] = pd.to_datetime(order_df['signed_date'], errors='coerce')
# -------------------------------
# Step 4: build the per-customer behaviour table (RFM model)
# -------------------------------
# Take ONE timestamp snapshot so Recency and Frequency use the same "now"
# (two separate pd.Timestamp.now() calls could straddle a day boundary).
now = pd.Timestamp.now()
# Recency: days since each customer's most recent shipment (entry_date).
latest_activity = order_df.groupby('customer_id')['entry_date'].max().reset_index()
latest_activity['recency_days'] = (now - latest_activity['entry_date']).dt.days
# Frequency: number of distinct active months within the last 6 months.
recent_orders = order_df[order_df['entry_date'] >= (now - pd.DateOffset(months=6))]
monthly_freq = recent_orders.groupby('customer_id')['income_month'].nunique().reset_index()
monthly_freq.rename(columns={'income_month': 'active_months_last_6'}, inplace=True)
# Monetary: total income per customer over the whole order history.
total_income = order_df.groupby('customer_id')['income_amount'].sum().reset_index()
# Merge all features onto the customer table (left joins keep every customer).
feature_data = customer_df[['customer_id', 'total_shipments', 'monetary']].copy()
feature_data = feature_data.merge(latest_activity[['customer_id', 'recency_days']], on='customer_id', how='left')
feature_data = feature_data.merge(monthly_freq, on='customer_id', how='left')
feature_data = feature_data.merge(total_income, on='customer_id', how='left')
# Fill missing values.
# BUG FIX (cause of the "0 sample(s)" StandardScaler crash):
#  - 'monetary' may arrive as non-numeric text; quantile() then yields NaN,
#    every IQR comparison below evaluates False, and the frame goes empty.
#    Coerce it to numeric first.
#  - if NO customer matched any order, recency_days.max() is NaN and
#    fillna(NaN) leaves NaN behind — fall back to 0 in that case.
feature_data['monetary'] = pd.to_numeric(feature_data['monetary'], errors='coerce').fillna(0)
feature_data['frequency'] = feature_data['active_months_last_6'].fillna(0)  # no recent orders -> 0 active months
max_recency = feature_data['recency_days'].max()
feature_data['recency_days'] = feature_data['recency_days'].fillna(
    max_recency if pd.notna(max_recency) else 0)  # customers with no orders = "least recent"
# Remove extreme outliers with 3*IQR fences (tune per business needs).
# Guards: skip a column whose IQR is 0 or NaN (a degenerate fence would keep
# only a single value), and never apply a mask that would empty the table.
for col in ['recency_days', 'frequency', 'monetary']:
    q1 = feature_data[col].quantile(0.25)
    q3 = feature_data[col].quantile(0.75)
    iqr = q3 - q1
    if pd.isna(iqr) or iqr == 0:
        continue
    mask = (feature_data[col] >= q1 - 3 * iqr) & (feature_data[col] <= q3 + 3 * iqr)
    if mask.any():
        feature_data = feature_data[mask]
# -------------------------------
# Step 5: feature scaling
# -------------------------------
X = feature_data[['recency_days', 'frequency', 'monetary']].copy()
# Coerce to numeric and drop rows the scaler cannot handle (NaN left over
# from the merges); StandardScaler raises on NaN and on 0-row input.
X = X.apply(pd.to_numeric, errors='coerce').dropna()
# Explicit guard for the reported error:
#   ValueError: Found array with 0 sample(s) (shape=(0, 3)) ...
# Fail early with an actionable message instead of a deep sklearn traceback.
if X.empty:
    raise SystemExit("聚类特征表为空:请检查 customer_id 是否能在客户表与订单表之间匹配,以及异常值过滤逻辑!")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(f"用于聚类的数据形状: {X_scaled.shape}")
# KMeans below goes up to k=7, so require a sane minimum sample count.
if X_scaled.shape[0] < 10:
    print("样本数量过少,请检查数据来源或清洗逻辑!")
else:
    # -------------------------------
    # Step 6: choose the optimal number of clusters k
    # -------------------------------
    inertias = []
    silhouettes = []
    for k in range(2, 8):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(X_scaled)
        inertias.append(kmeans.inertia_)
        silhouettes.append(silhouette_score(X_scaled, kmeans.labels_))
    # Plot elbow (WCSS) and silhouette curves side by side to judge k.
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(range(2, 8), inertias, 'bo-', label='WCSS')
    plt.xlabel('聚类数 k')
    plt.ylabel('WCSS(簇内平方和)')
    plt.title('肘部法则 Elbow Method')
    plt.grid(True)
    plt.subplot(1, 2, 2)
    plt.plot(range(2, 8), silhouettes, 'ro--', label='轮廓系数')
    plt.xlabel('聚类数 k')
    plt.ylabel('轮廓系数')
    plt.title('轮廓系数法')
    plt.grid(True)
    plt.tight_layout()
    plt.show()
    # Chosen from the elbow/silhouette plots above; adjust after inspection.
    optimal_k = 4