# -*- coding: utf-8 -*-
"""
京东高价值客户识别与全链路行为预测系统
作者:李梓翀 李富生
数据来源:京东公开数据(模拟生成)
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import random
import time
import os
from faker import Faker
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.metrics import (roc_auc_score, precision_score,
recall_score, f1_score,
mean_absolute_error, roc_curve)
from sklearn.decomposition import PCA
# --------------------------
# 数据爬取与模拟生成
# --------------------------
def generate_jd_simulation_data(num_users=5000, num_records=50000):
"""
模拟生成京东用户行为数据
"""
print("开始生成模拟京东数据...")
fake = Faker('zh_CN')
np.random.seed(42)
# 创建用户基础数据
users = pd.DataFrame({
'user_id': [f'U{str(i).zfill(6)}' for i in range(1, num_users+1)],
'age': np.random.randint(18, 65, num_users),
'gender': np.random.choice(['男', '女'], num_users, p=[0.55, 0.45]),
'city': [fake.city() for _ in range(num_users)],
'is_plus_member': np.random.choice([0, 1], num_users, p=[0.7, 0.3]),
'join_date': [fake.date_between(start_date='-3y', end_date='today') for _ in range(num_users)]
})
# 创建行为数据
behavior_types = ['浏览', '加购', '购买', '评价', '收藏']
categories = {
'家电': ['冰箱', '洗衣机', '空调', '电视', '微波炉'],
'手机': ['智能手机', '配件', '平板', '智能手表'],
'电脑': ['笔记本', '台式机', '显示器', '外设'],
'数码': ['相机', '耳机', '音箱', '存储设备'],
'家居': ['家具', '家纺', '厨具', '灯具']
}
records = []
for _ in range(num_records):
user_id = f'U{str(np.random.randint(1, num_users+1)).zfill(6)}'
behavior_time = fake.date_time_between(start_date='-90d', end_date='now')
# 随机选择品类和子类
main_cat = random.choice(list(categories.keys()))
sub_cat = random.choice(categories[main_cat])
# 行为类型概率分布
behavior_prob = [0.5, 0.2, 0.15, 0.1, 0.05]
behavior_type = np.random.choice(behavior_types, p=behavior_prob)
# 订单相关数据
order_amount = 0
if behavior_type == '购买':
# 高价商品概率
if main_cat == '家电' and np.random.random() < 0.3:
order_amount = np.random.uniform(3000, 20000)
else:
order_amount = np.random.uniform(100, 3000)
# 促销活动参与
is_promotion = 1 if np.random.random() < 0.4 else 0
# 物流评分
delivery_rating = np.random.randint(3, 6) if behavior_type == '购买' else 0
records.append({
'user_id': user_id,
'behavior_time': behavior_time,
'behavior_type': behavior_type,
'main_category': main_cat,
'sub_category': sub_cat,
'order_amount': order_amount,
'is_promotion': is_promotion,
'delivery_rating': delivery_rating
})
# 创建DataFrame
df = pd.DataFrame(records)
# 添加未来行为标签(模拟未来3个月行为)
print("添加未来行为标签...")
user_purchase_future = df[df['behavior_type'] == '购买'].groupby('user_id')['order_amount'].sum().reset_index()
# 修正语法错误:括号匹配
user_purchase_future['will_buy_high_end'] = np.where(
(user_purchase_future['order_amount'] > 5000) &
(np.random.random(len(user_purchase_future)) > 0.3), 1, 0)
# PLUS会员续费倾向 - 修正语法错误
plus_users = users[users['is_plus_member'] == 1]['user_id'].tolist()
user_purchase_future['will_renew_plus'] = np.where(
user_purchase_future['user_id'].isin(plus_users),
np.random.choice([0, 1], len(user_purchase_future)), 0)
# 合并数据
df = pd.merge(df, users, on='user_id', how='left')
df = pd.merge(df, user_purchase_future[['user_id', 'will_buy_high_end', 'will_renew_plus']],
on='user_id', how='left').fillna(0)
# 保存数据
os.makedirs('data', exist_ok=True)
df.to_csv('data/jd_simulated_data.csv', index=False)
print(f"模拟数据生成完成,共 {len(df)} 条记录,保存至 data/jd_simulated_data.csv")
return df
# --------------------------
# 数据预处理
# --------------------------
def preprocess_data(df):
"""数据预处理与特征工程"""
print("\n开始数据预处理与特征工程...")
# 1. 数据清洗
# 过滤异常订单(金额异常)
df = df[df['order_amount'] <= 50000]
# 修复时间戳错误(示例:修复未来时间戳)
current_date = datetime.now()
df = df[df['behavior_time'] <= current_date]
# 2. 特征工程 - 基础特征
# 计算用户活跃天数(最近90天)
active_days = df.groupby('user_id')['behavior_time'].apply(
lambda x: x.dt.date.nunique()).reset_index(name='active_days')
# 促销敏感度(参与促销活动比例)
promo_sensitivity = df[df['is_promotion'] == 1].groupby('user_id').size().reset_index(name='promo_count')
total_actions = df.groupby('user_id').size().reset_index(name='total_actions')
promo_sensitivity = pd.merge(promo_sensitivity, total_actions, on='user_id')
promo_sensitivity['promo_sensitivity'] = promo_sensitivity['promo_count'] / promo_sensitivity['total_actions']
# 品类浏览集中度
category_concentration = df.groupby(['user_id', 'main_category']).size().reset_index(name='category_count')
category_concentration = category_concentration.groupby('user_id')['category_count'].apply(
lambda x: (x.max() / x.sum())).reset_index(name='category_concentration')
# 3. 高价值客户标签定义
high_value_criteria = df.groupby('user_id').agg(
total_spend=('order_amount', 'sum'),
purchase_count=('behavior_type', lambda x: (x == '购买').sum()),
category_count=('main_category', 'nunique')
).reset_index()
high_value_criteria['is_high_value'] = np.where(
(high_value_criteria['total_spend'] > 5000) |
(high_value_criteria['purchase_count'] > 8) |
(high_value_criteria['category_count'] >= 3), 1, 0)
# 4. 合并特征
features = pd.merge(active_days, promo_sensitivity[['user_id', 'promo_sensitivity']], on='user_id')
features = pd.merge(features, category_concentration, on='user_id')
features = pd.merge(features, high_value_criteria[['user_id', 'is_high_value']], on='user_id')
# 5. 添加用户基本信息
user_base = df[['user_id', 'age', 'gender', 'city', 'is_plus_member', 'join_date']].drop_duplicates()
features = pd.merge(features, user_base, on='user_id')
# 6. 添加时间相关特征
df['last_activity'] = df.groupby('user_id')['behavior_time'].transform('max')
features['last_activity_gap'] = (datetime.now() - features['join_date']).dt.days
# 7. 添加行为统计特征
behavior_counts = pd.crosstab(df['user_id'], df['behavior_type']).reset_index()
features = pd.merge(features, behavior_counts, on='user_id')
# 8. 品类偏好特征
for cat in ['家电', '手机', '电脑', '数码', '家居']:
cat_users = df[df['main_category'] == cat]['user_id'].unique()
features[f'prefers_{cat}'] = np.where(features['user_id'].isin(cat_users), 1, 0)
print(f"特征工程完成,共生成 {len(features.columns)} 个特征")
return features
# --------------------------
# 探索性数据分析 (EDA)
# --------------------------
def perform_eda(df, features):
"""执行探索性数据分析"""
print("\n开始探索性数据分析...")
# 设置绘图风格
sns.set_style("whitegrid")
plt.figure(figsize=(18, 12))
# 1. 用户行为类型分布
plt.subplot(2, 2, 1)
behavior_counts = df['behavior_type'].value_counts()
sns.barplot(x=behavior_counts.index, y=behavior_counts.values, palette="viridis")
plt.title('用户行为类型分布')
plt.ylabel('数量')
# 2. PLUS会员与非会员客单价对比
plt.subplot(2, 2, 2)
purchase_df = df[df['behavior_type'] == '购买']
sns.boxplot(x='is_plus_member', y='order_amount', data=purchase_df, palette="Set2")
plt.title('PLUS会员 vs 非会员客单价对比')
plt.xlabel('PLUS会员')
plt.ylabel('订单金额')
# 3. 物流评分与复购率关系
plt.subplot(2, 2, 3)
# 计算复购率
repurchase_users = purchase_df.groupby('user_id').filter(lambda x: len(x) > 1)['user_id'].unique()
purchase_df['is_repurchase'] = purchase_df['user_id'].isin(repurchase_users).astype(int)
# 按物流评分分组计算复购率
delivery_repurchase = purchase_df.groupby('delivery_rating')['is_repurchase'].mean().reset_index()
sns.lineplot(x='delivery_rating', y='is_repurchase', data=delivery_repurchase,
marker='o', linewidth=2.5, color='darkorange')
plt.title('物流评分对复购率的影响')
plt.xlabel('物流评分')
plt.ylabel('复购率')
plt.ylim(0, 1)
# 4. 高价值客户特征热力图
plt.subplot(2, 2, 4)
corr_matrix = features[['active_days', 'promo_sensitivity', 'category_concentration',
'购买', '加购', 'is_high_value']].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('行为特征相关性')
plt.tight_layout()
plt.savefig('results/eda_results.png', dpi=300)
plt.show()
# 5. 高价值客户人口统计特征
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.countplot(x='gender', hue='is_high_value', data=features, palette="Set1")
plt.title('高价值客户性别分布')
plt.xlabel('性别')
plt.ylabel('数量')
plt.subplot(1, 2, 2)
sns.boxplot(x='is_high_value', y='age', data=features, palette="Set2")
plt.title('高价值客户年龄分布')
plt.xlabel('是否高价值客户')
plt.ylabel('年龄')
plt.tight_layout()
plt.savefig('results/high_value_demographics.png', dpi=300)
plt.show()
print("EDA分析完成,结果保存至 results/ 目录")
# --------------------------
# 预测模型构建
# --------------------------
def build_prediction_models(features, target_column):
"""构建预测模型"""
print(f"\n构建预测模型: {target_column}")
# 1. 数据准备
# 选择特征
model_features = features.drop(['user_id', 'join_date', 'will_buy_high_end', 'will_renew_plus'], axis=1, errors='ignore')
# 处理分类变量
categorical_cols = ['gender', 'city']
model_features = pd.get_dummies(model_features, columns=categorical_cols, drop_first=True)
# 定义目标变量
y = features[target_column]
# 2. 划分训练集/测试集
X_train, X_test, y_train, y_test = train_test_split(
model_features, y, test_size=0.25, random_state=42, stratify=y)
# 3. 模型初始化
models = {
'XGBoost': RandomForestClassifier(
n_estimators=150,
max_depth=8,
min_samples_split=10,
class_weight='balanced',
random_state=42
),
'随机森林': RandomForestClassifier(
n_estimators=150,
max_depth=8,
min_samples_split=10,
class_weight='balanced',
random_state=42
),
'逻辑回归': LogisticRegression(
max_iter=1000,
class_weight='balanced',
penalty='l2',
C=0.1,
random_state=42,
solver='liblinear'
)
}
# 4. 模型训练与评估
results = {}
feature_importances = {}
for name, model in models.items():
print(f"训练 {name} 模型...")
start_time = time.time()
model.fit(X_train, y_train)
train_time = time.time() - start_time
# 预测
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba') else [0]*len(y_test)
# 关键指标计算
auc = roc_auc_score(y_test, y_proba) if len(np.unique(y_test)) > 1 else 0.5
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
# 自定义加权MAE(高价商品权重更高)
weights = np.where(features.loc[y_test.index, 'is_high_value'] == 1, 2.0, 1.0)
mae = mean_absolute_error(y_test, y_proba, sample_weight=weights) if len(np.unique(y_test)) > 1 else 0
results[name] = {
'AUC': auc,
'精确率': precision,
'召回率': recall,
'F1分数': f1,
'加权MAE': mae,
'训练时间(秒)': train_time
}
# 保存重要特征
if hasattr(model, 'feature_importances_'):
feat_imp = pd.Series(model.feature_importances_, index=X_train.columns)
feature_importances[name] = feat_imp.sort_values(ascending=False)
# 绘制ROC曲线
if len(np.unique(y_test)) > 1:
plt.figure(figsize=(8, 6))
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr, label=f'{name} (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('假阳性率')
plt.ylabel('真阳性率')
plt.title(f'{target_column} ROC曲线')
plt.legend()
plt.savefig(f'results/{target_column}_{name}_roc_curve.png', dpi=300)
plt.close()
# 5. 特征重要性可视化
for model_name, imp in feature_importances.items():
plt.figure(figsize=(10, 8))
imp.head(15).sort_values().plot(kind='barh')
plt.title(f'{model_name} - 特征重要性 (Top 15)')
plt.tight_layout()
plt.savefig(f'results/{target_column}_{model_name}_feature_importance.png', dpi=300)
plt.close()
return results, models
# --------------------------
# 客户分群与画像
# --------------------------
def customer_segmentation(features):
"""客户分群与画像生成"""
print("\n进行客户分群...")
# 选择特征
cluster_features = features[[
'active_days', 'promo_sensitivity', 'category_concentration',
'浏览', '加购', '购买', 'age'
]]
# 标准化
scaler = StandardScaler()
X_cluster = scaler.fit_transform(cluster_features)
# KMeans聚类
kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
features['cluster'] = kmeans.fit_predict(X_cluster)
# 分析群体特征
cluster_profiles = features.groupby('cluster').agg({
'active_days': 'mean',
'promo_sensitivity': 'mean',
'category_concentration': 'mean',
'浏览': 'mean',
'加购': 'mean',
'购买': 'mean',
'age': 'mean',
'is_high_value': 'mean',
'will_buy_high_end': 'mean',
'will_renew_plus': 'mean',
'is_plus_member': 'mean'
}).reset_index()
# 重命名集群
cluster_names = {
0: '低价值观望者',
1: '高价值忠诚客户',
2: '年轻活跃用户',
3: '促销敏感型用户',
4: '高消费低频用户'
}
cluster_profiles['cluster_name'] = cluster_profiles['cluster'].map(cluster_names)
features['cluster_name'] = features['cluster'].map(cluster_names)
# 可视化 - 群体价值分布
plt.figure(figsize=(10, 6))
sns.barplot(x='cluster_name', y='is_high_value', data=cluster_profiles, palette="viridis")
plt.title('各客户群体高价值比例')
plt.xlabel('客户群体')
plt.ylabel('高价值客户比例')
plt.xticks(rotation=15)
plt.tight_layout()
plt.savefig('results/cluster_high_value_distribution.png', dpi=300)
plt.close()
# 保存客户画像
cluster_profiles.to_csv('results/customer_cluster_profiles.csv', index=False)
features.to_csv('results/customer_segmented_data.csv', index=False)
print("客户分群完成,结果保存至 results/ 目录")
return cluster_profiles
# --------------------------
# 生成业务报告
# --------------------------
def generate_business_report(cluster_profiles, model_results):
"""生成业务策略报告"""
print("\n生成业务策略报告...")
report = """
# 京东高价值客户识别与行为预测分析报告
## 1. 项目概述
本项目通过分析京东用户行为数据,构建高价值客户识别模型并预测其全链路行为。研究目标包括:
- 建立高价值客户评估体系
- 预测高价商品购买概率(家电3C等)
- 预测PLUS会员续费倾向
- 提出精准营销策略
## 2. 关键发现
### 2.1 高价值客户特征
- 高价值客户占比: {:.1f}%
- 高价值客户主要特征:
- 活跃天数比普通客户高{:.1f}倍
- 促销敏感度比普通客户高{:.1f}%
- 跨品类消费比例比普通客户高{:.1f}倍
### 2.2 客户群体分析
我们识别出5类典型客户群体:
""".format(
cluster_profiles['is_high_value'].mean() * 100,
cluster_profiles[cluster_profiles['is_high_value'] > 0.5]['active_days'].mean() /
cluster_profiles[cluster_profiles['is_high_value'] < 0.3]['active_days'].mean(),
(cluster_profiles[cluster_profiles['is_high_value'] > 0.5]['promo_sensitivity'].mean() -
cluster_profiles[cluster_profiles['is_high_value'] < 0.3]['promo_sensitivity'].mean()) * 100,
cluster_profiles[cluster_profiles['is_high_value'] > 0.5]['category_concentration'].mean() /
cluster_profiles[cluster_profiles['is_high_value'] < 0.3]['category_concentration'].mean()
)
for _, row in cluster_profiles.iterrows():
report += "- **{}**: {:.1f}%为高价值客户,平均年龄{:.1f}岁,主要特征:{}\n".format(
row['cluster_name'],
row['is_high_value'] * 100,
row['age'],
get_cluster_description(row)
)
report += "\n### 2.3 预测模型性能\n"
# 高价商品购买预测结果
report += "**高价商品购买预测**:\n"
for model, metrics in model_results['will_buy_high_end'].items():
report += ("- {}: AUC={:.3f}, 精确率={:.3f}, 召回率={:.3f}, "
"F1={:.3f}\n").format(
model, metrics['AUC'], metrics['精确率'],
metrics['召回率'], metrics['F1分数'])
# PLUS会员续费预测结果
report += "\n**PLUS会员续费预测**:\n"
for model, metrics in model_results['will_renew_plus'].items():
report += ("- {}: AUC={:.3f}, 精确率={:.3f}, 召回率={:.3f}, "
"F1={:.3f}\n").format(
model, metrics['AUC'], metrics['精确率'],
metrics['召回率'], metrics['F1分数'])
report += """
## 3. 业务建议
### 3.1 高价值客户运营策略
- **高价值忠诚客户**: 提供专属客服、优先配送和限量商品访问权限
- **高消费低频用户**: 通过个性化推荐提高购买频率,推送高端新品
- **促销敏感型用户**: 定向发送优惠券和限时促销信息
### 3.2 PLUS会员增长策略
- 针对高价值客户群体提供专属会员优惠
- 预测有流失风险的会员,提供续费激励
- 为新会员提供首单立减优惠
### 3.3 家电3C品类增长策略
- 对高价商品潜在购买者提供分期免息服务
- 结合用户浏览行为推送相关配件和延保服务
- 针对跨品类用户提供组合优惠
## 4. 实施计划
1. 部署预测模型到京东营销系统
2. 开发客户分群运营平台
3. 设计个性化营销活动
4. 建立效果监测指标体系
"""
# 保存报告
os.makedirs('results', exist_ok=True)
with open('results/business_report.md', 'w', encoding='utf-8') as f:
f.write(report)
print("业务报告生成完成,保存至 results/business_report.md")
return report
def get_cluster_description(row):
"""生成客户群体描述"""
desc_map = {
'低价值观望者': "浏览多购买少,促销敏感度低",
'高价值忠诚客户': "高活跃、高消费、多品类购买",
'年轻活跃用户': "活跃度高但消费水平中等",
'促销敏感型用户': "对促销活动高度敏感,购买集中在促销期",
'高消费低频用户': "购买频次低但单次消费金额高"
}
return desc_map.get(row['cluster_name'], "未知群体")
# --------------------------
# 主执行流程
# --------------------------
if __name__ == "__main__":
# 创建结果目录
os.makedirs('data', exist_ok=True)
os.makedirs('results', exist_ok=True)
# 生成模拟数据
if not os.path.exists('data/jd_simulated_data.csv'):
df = generate_jd_simulation_data()
else:
df = pd.read_csv('data/jd_simulated_data.csv', parse_dates=['behavior_time'])
print("加载现有模拟数据...")
# 预处理与特征工程
features = preprocess_data(df)
# 探索性数据分析
perform_eda(df, features)
# 构建预测模型
model_results = {}
high_end_results, high_end_models = build_prediction_models(features, 'will_buy_high_end')
model_results['will_buy_high_end'] = high_end_results
plus_renew_results, plus_models = build_prediction_models(features, 'will_renew_plus')
model_results['will_renew_plus'] = plus_renew_results
# 客户分群
cluster_profiles = customer_segmentation(features)
# 生成业务报告
report = generate_business_report(cluster_profiles, model_results)
print("\n" + "="*50)
print("京东高价值客户分析完成!")
print("="*50)
print("结果文件:")
print("- 原始数据: data/jd_simulated_data.csv")
print("- 特征数据: results/customer_segmented_data.csv")
print("- 客户画像: results/customer_cluster_profiles.csv")
print("- 分析报告: results/business_report.md")
print("- 可视化图表: results/ 目录下的图片文件") 这段代码报了如下的错误 Cell In[2], line 564
561 print("加载现有模拟数据...")
563 # 预处理与特征工程
--> 564 features = preprocess_data(df)
566 # 探索性数据分析
567 perform_eda(df, features)
Cell In[2], line 180
178 # 6. 添加时间相关特征
179 df['last_activity'] = df.groupby('user_id')['behavior_time'].transform('max')
--> 180 features['last_activity_gap'] = (datetime.now() - features['join_date']).dt.days
182 # 7. 添加行为统计特征
183 behavior_counts = pd.crosstab(df['user_id'], df['behavior_type']).reset_index()
File c:\Users\lzc\anaconda3\envs\lzc\lib\site-packages\pandas\core\ops\common.py:72, in _unpack_zerodim_and_defer.<locals>.new_method(self, other)
68 return NotImplemented
70 other = item_from_zerodim(other)
---> 72 return method(self, other)
File c:\Users\lzc\anaconda3\envs\lzc\lib\site-packages\pandas\core\arraylike.py:114, in OpsMixin.__rsub__(self, other)
112 @unpack_zerodim_and_defer("__rsub__")
113 def __rsub__(self, other):
--> 114 return self._arith_method(other, roperator.rsub)
...
File c:\Users\lzc\anaconda3\envs\lzc\lib\site-packages\pandas\core\roperator.py:15, in rsub(left, right)
14 def rsub(left, right):
---> 15 return right - left
TypeError: unsupported operand type(s) for -: 'Timestamp' and 'datetime.date'这个报错是怎么回事,帮我改进一下