# Imports (kept consistent with the source document)
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')
# ==============================
# Task 1: data preparation (load the datasets under the documented names)
# ==============================
try:
    # GBK encoding: the CSV exports contain Chinese text.
    order_data = pd.read_csv('meal_order_info.csv', encoding='gbk')
    history_order = pd.read_csv('info_new.csv', encoding='gbk')
    user_info = pd.read_csv('users.csv', encoding='gbk')
    user_loss = pd.read_csv('user_loss.csv', encoding='gbk')
    # BUG FIX: the original printed an undefined name `data`, which raised a
    # NameError that was silently swallowed by the except below. Preview the
    # main order table instead (head only, tab-separated as before).
    print("数据内容:")
    print(order_data.head().to_csv(sep='\t', na_rep='nan'))
except Exception as e:
    print(f"发生错误: {e}")  # report load/print failures instead of crashing
# ==============================
# Task 2: daily diner counts and sales (document requires order status == 1)
# ==============================
def analyze_daily_sales(data):
    """Aggregate valid orders (status == 1) into per-day diner counts and sales.

    Returns a DataFrame with columns use_start_time / daily_diners / daily_sales,
    one row per calendar day, and shows a line chart of both series.
    """
    # Keep completed orders only; copy so the caller's frame is untouched.
    valid_data = data[data['order_status'] == 1].copy()
    # Timestamps arrive as 'YYYY-MM-DD HH:MM' strings; the first 10 chars are the day.
    valid_data['use_start_time'] = valid_data['use_start_time'].str[:10]
    daily_stats = (
        valid_data
        .groupby('use_start_time')
        .agg(daily_diners=('number_consumers', 'sum'),
             daily_sales=('expenditure', 'sum'))
        .reset_index()
    )
    # Line chart, as required by the document.
    plt.figure(figsize=(12, 6))
    for column, label, marker in (('daily_diners', '每日用餐人数', 'o'),
                                  ('daily_sales', '每日销售额', 's')):
        plt.plot(daily_stats['use_start_time'], daily_stats[column],
                 label=label, marker=marker)
    plt.title('餐饮企业每日经营趋势', fontsize=14)
    plt.xlabel('日期', fontsize=12)
    plt.ylabel('数值', fontsize=12)
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid(True)
    plt.show()
    return daily_stats

# Run the daily analysis (order_data was loaded in task 1).
daily_trends = analyze_daily_sales(order_data)
# ==============================
# Task 3: preprocessing (RFM features and churn features)
# ==============================
# -------------------------
# Customer value analysis: build RFM features (R/F/M as defined in the doc)
# -------------------------
def build_rfm(order_data, user_info, rfm_end='2016-08-31'):
    """Build an RFM table (one row per USER_ID) from valid orders.

    R = days between the user's last order and `rfm_end`
    F = number of valid orders
    M = total expenditure

    Only orders with order_status == 1 contribute; users with no valid
    orders drop out of the result.
    """
    merged = pd.merge(user_info, order_data, on='USER_ID', how='left')
    valid_orders = merged[merged['order_status'] == 1].copy()
    # BUG FIX: use_start_time is a string column, so the Timestamp
    # subtraction below raised TypeError. Parse it to datetime first.
    valid_orders['use_start_time'] = pd.to_datetime(valid_orders['use_start_time'])
    end_date = pd.to_datetime(rfm_end)  # hoisted out of the groupby lambda
    rfm = valid_orders.groupby('USER_ID').agg({
        'use_start_time': lambda x: (end_date - x.max()).days,
        'order_number': 'count',
        'expenditure': 'sum'
    }).reset_index()
    rfm.columns = ['USER_ID', 'R', 'F', 'M']
    # Defensive fill: any missing values get the least-valuable profile
    # (max recency, zero frequency / monetary value).
    rfm.fillna({'R': rfm['R'].max(), 'F': 0, 'M': 0}, inplace=True)
    return rfm
# Run the RFM analysis (user_info and order_data were loaded in task 1).
rfm_data = build_rfm(order_data, user_info)
# -------------------------
# Customer churn prediction: build the four indicators from the document
# -------------------------
def build_churn(user_loss_data, history_order_data, churn_end='2016-07-31'):
    """Build churn features (one row per USER_ID) from the order history.

    frequence = number of orders
    recently  = days between the last order and `churn_end`
    amount    = total expenditure
    average   = mean expenditure per order (0 when there are no orders)

    churn_status is 1 when the user has not ordered for more than 90 days
    (the document gives no threshold; >90 days is this script's choice).
    """
    churn_merged = pd.merge(user_loss_data, history_order_data, on='USER_ID', how='left')
    churn_merged['use_start_time'] = pd.to_datetime(churn_merged['use_start_time'])
    end_date = pd.to_datetime(churn_end)  # hoisted out of the groupby lambda
    churn_features = churn_merged.groupby('USER_ID').agg({
        'order_number': 'count',                                # frequence
        'use_start_time': lambda x: (end_date - x.max()).days,  # recently
        'expenditure': ['sum', lambda x: x.sum() / x.count() if x.count() != 0 else 0]  # amount, average
    }).reset_index()
    churn_features.columns = ['USER_ID', 'frequence', 'recently', 'amount', 'average']
    # BUG FIX: the original used np.where but numpy was never imported;
    # the pandas comparison yields the same 0/1 labels (NaN recently -> 0,
    # exactly like np.where would).
    churn_features['churn_status'] = (churn_features['recently'] > 90).astype(int)
    return churn_features
# Build the churn feature table (user_loss and history_order were loaded in task 1).
churn_data = build_churn(user_loss, history_order)
# ==============================
# Task 4: K-Means clustering (3 clusters, per the document)
# ==============================
RFM_FEATURES = ['R', 'F', 'M']

# Standardise R/F/M so no single feature dominates the distance metric.
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_data[RFM_FEATURES])

# Fixed random_state keeps the cluster assignment reproducible.
kmeans = KMeans(n_clusters=3, random_state=42)
rfm_data['cluster'] = kmeans.fit_predict(rfm_scaled)

# Report the centres back on the original (un-scaled) R/F/M axes so the
# per-group characteristics required by the document are readable.
raw_centers = scaler.inverse_transform(kmeans.cluster_centers_)
group_labels = [f'客户群{i}' for i in range(1, 4)]
cluster_centers = pd.DataFrame(raw_centers, columns=RFM_FEATURES, index=group_labels)
print("客户群特征中心:\n", cluster_centers.round(2))
# ==============================
# Task 5: radar-chart visualisation (required by the document)
# ==============================
def plot_radar_chart(centers, features):
    """Draw one closed radar polygon per cluster centre.

    centers  : DataFrame, one row per cluster, one column per feature.
    features : spoke labels, same order as the columns of `centers`.
    """
    n_clusters = centers.shape[0]
    # One spoke per feature, evenly spaced around the circle.
    angles = np.linspace(0, 2 * np.pi, len(features), endpoint=False).tolist()
    angles += angles[:1]  # repeat the first angle to close the polygon
    plt.figure(figsize=(8, 8))
    # BUG FIX: a radar chart needs polar axes; the original plotted the
    # raw angle values on a cartesian figure, producing a line chart.
    ax = plt.subplot(111, polar=True)
    for i in range(n_clusters):
        # Close the value ring by appending the first value again.
        values = centers.iloc[i].tolist() + [centers.iloc[i, 0]]
        ax.plot(angles, values, label=f'客户群{i+1}')
        ax.fill(angles, values, alpha=0.2, edgecolor='black')
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(features, fontsize=10)
    ax.set_title('客户价值聚类雷达图', fontsize=14)
    ax.grid(True, linestyle='--', alpha=0.7)
    plt.legend(loc='upper right')
    plt.show()

# Label order must match the R/F/M column order of cluster_centers.
plot_radar_chart(cluster_centers, ['最近消费天数(R)', '消费次数(F)', '消费金额(M)'])
# ==============================
# Task 6: decision-tree model (CART algorithm, per the document)
# ==============================
churn_feature_cols = ['frequence', 'recently', 'average', 'amount']
X = churn_data[churn_feature_cols]
y = churn_data['churn_status']

# Hold out 20% of the users for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# criterion='gini' is CART; a shallow depth keeps the tree interpretable.
cart_model = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=42)
cart_model.fit(X_train, y_train)
# ==============================
# Task 7: model evaluation (confusion matrix, per the document)
# ==============================
y_pred = cart_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("混淆矩阵:\n", cm)
# BUG FIX: stray pasted text after this print (and a trailing junk line)
# made the original file a syntax error; both are removed.
print(f"精确率:{precision:.2f}, 召回率:{recall:.2f}, F1值:{f1:.2f}")