import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
from sklearn.utils import resample
import warnings
# Suppress every warning raised by the imported libraries for the rest of the
# session. NOTE(review): this also hides deprecation warnings from
# pandas/seaborn/scikit-learn; consider narrowing the filter while debugging.
warnings.filterwarnings('ignore')
# Configure a Chinese font (SimHei) because the plot titles and labels in this
# file are Chinese. NOTE(review): SimHei must be installed on the machine;
# presumably missing glyphs are drawn as boxes otherwise - confirm.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Presumably needed so the minus sign on axis ticks still renders with the CJK
# font selected above - verify against the matplotlib rcParams documentation.
plt.rcParams['axes.unicode_minus'] = False
class TelcoChurnAnalyzer:
    """Churn-analysis pipeline: load, clean, explore, model, evaluate, report.

    Every stage is a separate public method that stores its outputs on the
    instance, so the stages can either be chained with
    :meth:`run_complete_analysis` or executed one at a time (for example one
    method call per notebook cell), as long as they are called in pipeline
    order.
    """

    # Display labels shared by the plots and the text report.
    _CONTRACT_LABELS = {
        'Month-to-month': '按月合同',
        'One year': '一年合同',
        'Two year': '两年合同',
    }
    _INTERNET_LABELS = {
        'Fiber optic': '光纤',
        'DSL': 'DSL',
        'No': '无网络服务',
    }
    _PAYMENT_LABELS = {
        'Electronic check': '电子支票',
        'Mailed check': '邮寄支票',
        'Bank transfer (automatic)': '银行转账',
        'Credit card (automatic)': '信用卡',
    }
    _YES_NO_LABELS = {'Yes': '是', 'No': '否'}
    _CHURN_LABELS = {0: '未流失', 1: '流失'}
    _CHURN_COLORS = {0: '#66b3ff', 1: '#ff6666'}

    # Models that are trained on standardized features.
    _SCALED_MODELS = ('逻辑回归', '支持向量机')
    # Suffix appended to the result key of a grid-searched model.
    _OPTIMIZED_SUFFIX = ' (优化版)'

    def __init__(self, file_path):
        """Remember the data location and initialise empty pipeline state.

        Args:
            file_path: Path of the Excel (or CSV) file holding the raw data.
        """
        self.file_path = file_path
        self.df = None             # raw data, set by load_and_explore()
        self.df_processed = None   # cleaned data, set by preprocess_data()
        self.models = {}           # model name -> fitted estimator
        self.results = {}          # model name -> predictions / scores
        # Populated by train_models().
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.X_test_scaled = None
        self.scaler = None

    def load_and_explore(self):
        """Load the dataset and print its shape, columns, dtypes and gaps.

        Files whose name ends in ``.csv`` are read with ``pd.read_csv``;
        everything else is read with ``pd.read_excel`` as before.

        Returns:
            True when the file was loaded, False when reading failed (the
            error is printed rather than raised).
        """
        print("数据加载与探索")
        try:
            if str(self.file_path).lower().endswith('.csv'):
                self.df = pd.read_csv(self.file_path)
            else:
                self.df = pd.read_excel(self.file_path)
            print("数据加载成功!")
        except Exception as e:
            # Top-level boundary: report the problem and let the caller stop.
            print(f"数据加载失败: {e}")
            return False

        print(f"数据集形状: {self.df.shape}")
        print(f"列名: {list(self.df.columns)}")

        print("数据基本信息:")
        # DataFrame.info() prints by itself and returns None, so it must not
        # be wrapped in print() (that emitted a stray "None").
        self.df.info()

        print("缺失值统计:")
        missing_data = self.df.isnull().sum()
        print(missing_data[missing_data > 0])
        return True

    def preprocess_data(self):
        """Clean the raw frame and encode the target column.

        Converts ``TotalCharges`` to numeric, median-fills missing numeric
        values, drops ``customerID``, folds the "No internet service" /
        "No phone service" levels into "No", and label-encodes ``Churn``.

        Returns:
            The cleaned DataFrame (also stored in ``self.df_processed``).
        """
        print("数据预处理")
        df_clean = self.df.copy()

        # Blank strings in TotalCharges become NaN here.
        df_clean['TotalCharges'] = pd.to_numeric(df_clean['TotalCharges'], errors='coerce')

        for col in ('TotalCharges', 'MonthlyCharges', 'tenure'):
            if col in df_clean.columns and df_clean[col].isna().any():
                # Assign the result back instead of calling fillna(inplace=True)
                # on a column selection, which may act on a temporary copy.
                df_clean[col] = df_clean[col].fillna(df_clean[col].median())
                print(f"已填充 {col} 的缺失值")

        if 'customerID' in df_clean.columns:
            df_clean = df_clean.drop('customerID', axis=1)

        service_columns = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                           'TechSupport', 'StreamingTV', 'StreamingMovies', 'MultipleLines']
        no_service = {'No internet service': 'No', 'No phone service': 'No'}
        for col in service_columns:
            if col in df_clean.columns:
                df_clean[col] = df_clean[col].replace(no_service)

        le = LabelEncoder()
        df_clean['Churn'] = le.fit_transform(df_clean['Churn'])
        print(f"目标变量编码: {dict(zip(le.classes_, le.transform(le.classes_)))}")

        self.df_processed = df_clean
        print("数据预处理完成")
        return df_clean

    def feature_engineering(self):
        """Derive additional features from the preprocessed data.

        Adds ``AvgMonthlyCharge``, ``NumServices``, ``HasMultipleServices``
        and the binned ``CustomerValue``, ``TenureGroup`` and
        ``TotalChargesGroup`` columns.

        Returns:
            A new DataFrame; ``self.df_processed`` is left untouched.
        """
        print("特征工程")
        df = self.df_processed.copy()

        # Average charge per month; fall back to the monthly charge when the
        # customer has no tenure yet.
        df['AvgMonthlyCharge'] = np.where(
            df['tenure'] > 0,
            df['TotalCharges'] / df['tenure'],
            df['MonthlyCharges']
        )

        # Number of subscribed add-on services (vectorized count of "Yes").
        service_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                        'TechSupport', 'StreamingTV', 'StreamingMovies']
        df['NumServices'] = (df[service_cols] == 'Yes').sum(axis=1)
        df['HasMultipleServices'] = (df['NumServices'] > 2).astype(int)

        # include_lowest=True keeps values equal to the first bin edge (for
        # example tenure == 0) in the first group instead of producing NaN.
        df['CustomerValue'] = pd.cut(df['MonthlyCharges'],
                                     bins=[0, 35, 70, 100, 200],
                                     labels=['低价值', '中等价值', '高价值', '极高价值'],
                                     include_lowest=True)
        df['TenureGroup'] = pd.cut(df['tenure'],
                                   bins=[0, 12, 24, 48, 72],
                                   labels=['新客户', '常规客户', '忠诚客户', 'VIP客户'],
                                   include_lowest=True)
        df['TotalChargesGroup'] = pd.cut(df['TotalCharges'],
                                         bins=[0, 1000, 3000, 5000, 10000],
                                         labels=['低消费', '中等消费', '高消费', '极高消费'],
                                         include_lowest=True)

        print("特征工程完成")
        return df

    def _plot_churn_rate_bar(self, column, labels, palette, title, xlabel):
        """Draw the mean churn rate per level of ``column`` on the current axes."""
        churn_rate = self.df_processed.groupby(column)['Churn'].mean().sort_values(ascending=False)
        tick_labels = [labels.get(level, level) for level in churn_rate.index]
        sns.barplot(x=tick_labels, y=churn_rate.values, palette=palette)
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel('流失率')
        plt.xticks(rotation=45)

    def exploratory_analysis(self):
        """Plot the target distribution, key features and correlations."""
        print("探索性数据分析")
        df = self.df_processed

        plt.figure(figsize=(15, 10))

        # 1. Target distribution. Sort by class so that labels and colours
        #    follow the class value rather than whichever class is larger.
        plt.subplot(2, 3, 1)
        churn_counts = df['Churn'].value_counts().sort_index()
        plt.pie(churn_counts,
                labels=[self._CHURN_LABELS.get(c, c) for c in churn_counts.index],
                autopct='%1.1f%%',
                colors=[self._CHURN_COLORS.get(c, '#cccccc') for c in churn_counts.index],
                startangle=90)
        plt.title('客户流失分布')

        # 2. Distributions of the key numeric variables (subplots 2-4).
        numeric_names = {
            'tenure': '在网时长(月)',
            'MonthlyCharges': '月费用(美元)',
            'TotalCharges': '总费用(美元)',
        }
        for position, (feature, feature_name) in enumerate(numeric_names.items(), 2):
            plt.subplot(2, 3, position)
            sns.histplot(data=df, x=feature, hue='Churn', kde=True, alpha=0.6)
            plt.title(f'{feature_name}分布')
            plt.xlabel(feature_name)

        # 3. Churn rate by contract type and by internet service.
        plt.subplot(2, 3, 5)
        self._plot_churn_rate_bar('Contract', self._CONTRACT_LABELS, 'viridis',
                                  '合同类型 vs 流失率', '合同类型')
        plt.subplot(2, 3, 6)
        self._plot_churn_rate_bar('InternetService', self._INTERNET_LABELS, 'rocket',
                                  '网络服务类型 vs 流失率', '网络服务类型')
        plt.tight_layout()
        plt.show()

        # 4. Correlation heat map of the numeric columns (lower triangle).
        plt.figure(figsize=(12, 8))
        correlation_matrix = df.select_dtypes(include=[np.number]).corr()
        mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
        sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm',
                    center=0, square=True, fmt='.2f')
        plt.title('特征相关性热力图')
        plt.tight_layout()
        plt.show()

        # 5. Per-feature drill-down.
        self._detailed_feature_analysis()

    def _detailed_feature_analysis(self):
        """Plot churn rate (categorical) or distribution (numeric) per feature."""
        df = self.df_processed

        feature_labels = {
            'Contract': '合同类型',
            'InternetService': '网络服务类型',
            'PaymentMethod': '支付方式',
            'tenure': '在网时长(月)',
            'MonthlyCharges': '月费用(美元)',
            'OnlineSecurity': '在线安全服务',
        }
        # Value labels are kept per feature: "No" means "no internet service"
        # for InternetService but a plain "no" for OnlineSecurity, so a single
        # flat mapping cannot hold both.
        value_labels = {
            'Contract': self._CONTRACT_LABELS,
            'InternetService': self._INTERNET_LABELS,
            'PaymentMethod': self._PAYMENT_LABELS,
            'OnlineSecurity': self._YES_NO_LABELS,
        }

        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        axes = axes.ravel()
        overall_rate = df['Churn'].mean()

        for i, (feature, feature_label) in enumerate(feature_labels.items()):
            ax = axes[i]
            is_categorical = (not pd.api.types.is_numeric_dtype(df[feature])
                              or df[feature].nunique() < 10)
            if is_categorical:
                churn_rate = df.groupby(feature)['Churn'].mean().sort_values(ascending=False)
                labels = value_labels.get(feature, {})
                index_labels = [labels.get(level, level) for level in churn_rate.index]
                sns.barplot(x=index_labels, y=churn_rate.values, ax=ax, palette='Set2')
                ax.set_title(f'{feature_label} - 流失率')
                ax.set_xlabel(feature_label)
                ax.set_ylabel('流失率')
                ax.tick_params(axis='x', rotation=45)
                ax.axhline(y=overall_rate, color='red', linestyle='--',
                           label='平均流失率')
                if i == 0:
                    ax.legend()
            else:
                sns.boxplot(data=df, x='Churn', y=feature, ax=ax, palette='Set3')
                ax.set_title(f'{feature_label} - 分布')
                ax.set_xlabel('是否流失')
                ax.set_ylabel(feature_label)
                ax.set_xticklabels(['未流失', '流失'])

        plt.tight_layout()
        plt.show()

    def prepare_modeling_data(self):
        """Build the one-hot encoded feature matrix and the target vector.

        Returns:
            Tuple ``(X_encoded, y)``.
        """
        print("准备建模数据")
        df = self.feature_engineering()

        categorical_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
                                'PhoneService', 'MultipleLines', 'InternetService',
                                'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                                'TechSupport', 'StreamingTV', 'StreamingMovies',
                                'Contract', 'PaperlessBilling', 'PaymentMethod',
                                'CustomerValue', 'TenureGroup', 'TotalChargesGroup']
        numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges',
                              'AvgMonthlyCharge', 'NumServices']

        # Keep only the features that actually exist in this dataset.
        features_to_use = [f for f in numerical_features + categorical_features
                           if f in df.columns]
        X = df[features_to_use]
        y = df['Churn']

        # Only object/category columns are one-hot encoded; numeric columns
        # listed as categorical (e.g. a 0/1 flag) are passed through as-is.
        categorical_columns = X.select_dtypes(include=['object', 'category']).columns
        X_encoded = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

        print(f"最终特征数量: {X_encoded.shape[1]}")
        print(f"目标变量分布: {y.value_counts().to_dict()}")
        return X_encoded, y

    def handle_imbalance(self, X, y):
        """Oversample class 1 (with replacement) to the size of class 0.

        Args:
            X: Feature DataFrame.
            y: Target Series aligned with ``X`` (0 = stayed, 1 = churned).

        Returns:
            Tuple ``(X_balanced, y_balanced)``.
        """
        print("处理数据不平衡")
        target = y.name if y.name is not None else 'Churn'
        data = pd.concat([X, y.rename(target)], axis=1)

        majority_class = data[data[target] == 0]
        minority_class = data[data[target] == 1]
        print(f"多数类样本数: {len(majority_class)}")
        print(f"少数类样本数: {len(minority_class)}")

        minority_upsampled = resample(minority_class,
                                      replace=True,
                                      n_samples=len(majority_class),
                                      random_state=42)

        balanced_data = pd.concat([majority_class, minority_upsampled])
        X_balanced = balanced_data.drop(target, axis=1)
        y_balanced = balanced_data[target]
        print(f"平衡后样本数: {len(X_balanced)}")
        print(f"平衡后目标变量分布: {y_balanced.value_counts().to_dict()}")
        return X_balanced, y_balanced

    def train_models(self):
        """Train and evaluate the four baseline classifiers.

        The data is split first and only the training part is oversampled, so
        that duplicated minority rows can never appear in both the training
        and the test set. Fitted models and their predictions are stored in
        ``self.models`` / ``self.results``; the splits are stored on ``self``
        for the later stages.
        """
        print("模型训练")
        X, y = self.prepare_modeling_data()

        # Split BEFORE balancing; the test set keeps the real class ratio.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
        X_train, y_train = self.handle_imbalance(X_train, y_train)

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        print(f"训练集形状: {X_train.shape}")
        print(f"测试集形状: {X_test.shape}")

        models = {
            '逻辑回归': LogisticRegression(random_state=42, max_iter=1000),
            '随机森林': RandomForestClassifier(random_state=42, n_estimators=200),
            '梯度提升': GradientBoostingClassifier(random_state=42, n_estimators=200),
            '支持向量机': SVC(probability=True, random_state=42)
        }

        for name, model in models.items():
            print(f"--- {name} ---")
            # Linear / kernel models use the standardized features.
            if name in self._SCALED_MODELS:
                fit_X, eval_X = X_train_scaled, X_test_scaled
            else:
                fit_X, eval_X = X_train, X_test

            model.fit(fit_X, y_train)
            y_pred = model.predict(eval_X)
            y_pred_proba = model.predict_proba(eval_X)[:, 1]

            # NOTE: the folds are cut from the oversampled training set, so
            # these scores are optimistic; the held-out test AUC printed below
            # is the more trustworthy number.
            cv_scores = cross_val_score(model, fit_X, y_train, cv=5, scoring='roc_auc')

            self.models[name] = model
            self.results[name] = {
                'model': model,
                'predictions': y_pred,
                'probabilities': y_pred_proba,
                'cv_score_mean': cv_scores.mean(),
                'cv_score_std': cv_scores.std()
            }

            print(classification_report(y_test, y_pred))
            auc_score = roc_auc_score(y_test, y_pred_proba)
            print(f"AUC分数: {auc_score:.4f}")
            print(f"交叉验证AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

        self.X_test = X_test
        self.y_test = y_test
        self.X_train = X_train
        self.y_train = y_train
        self.X_test_scaled = X_test_scaled
        self.scaler = scaler

    def optimize_best_model(self):
        """Grid-search the baseline model with the best cross-validated AUC.

        Only the random forest and gradient boosting models have a parameter
        grid; for any other winner the step is skipped. The tuned model is
        stored under ``"<name> (优化版)"`` in ``self.results``.
        """
        print("模型优化")
        # Ignore already-optimized entries so the method can be re-run.
        base_names = [name for name in self.results
                      if not name.endswith(self._OPTIMIZED_SUFFIX)]
        if not base_names:
            print("跳过优化,使用默认参数")
            return
        best_model_name = max(base_names,
                              key=lambda name: self.results[name]['cv_score_mean'])
        print(f"选择最佳模型进行优化: {best_model_name}")

        search_spaces = {
            '随机森林': (
                RandomForestClassifier(random_state=42),
                {
                    'n_estimators': [100, 200, 300],
                    'max_depth': [10, 20, None],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4]
                },
            ),
            '梯度提升': (
                GradientBoostingClassifier(random_state=42),
                {
                    'n_estimators': [100, 200, 300],
                    'learning_rate': [0.05, 0.1, 0.15],
                    'max_depth': [3, 4, 5]
                },
            ),
        }
        if best_model_name not in search_spaces:
            print("跳过优化,使用默认参数")
            return
        model, param_grid = search_spaces[best_model_name]
        X_train, X_test = self.X_train, self.X_test
        y_train, y_test = self.y_train, self.y_test

        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc',
                                   n_jobs=-1, verbose=1)
        grid_search.fit(X_train, y_train)
        print(f"最佳参数: {grid_search.best_params_}")
        print(f"最佳分数: {grid_search.best_score_:.4f}")

        best_model = grid_search.best_estimator_
        y_pred_optimized = best_model.predict(X_test)
        y_pred_proba_optimized = best_model.predict_proba(X_test)[:, 1]
        print(f"优化后的 {best_model_name} 性能:")
        print(classification_report(y_test, y_pred_optimized))
        optimized_auc = roc_auc_score(y_test, y_pred_proba_optimized)
        print(f"优化后AUC分数: {optimized_auc:.4f}")

        optimized_name = f'{best_model_name}{self._OPTIMIZED_SUFFIX}'
        self.models[optimized_name] = best_model
        self.results[optimized_name] = {
            'model': best_model,
            'predictions': y_pred_optimized,
            'probabilities': y_pred_proba_optimized,
            'cv_score_mean': grid_search.best_score_,
            'cv_score_std': grid_search.cv_results_['std_test_score'][grid_search.best_index_]
        }

    def evaluate_models(self):
        """Plot confusion matrices, ROC curves and precision-recall curves."""
        print("模型评估")
        n_models = len(self.results)
        if n_models == 0:
            return

        # 1. Confusion matrices: one panel per stored model (including an
        #    optimized one), two panels per row.
        n_cols = 2
        n_rows = int(np.ceil(n_models / n_cols))
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 6 * n_rows), squeeze=False)
        axes = axes.ravel()
        for ax, (name, result) in zip(axes, self.results.items()):
            cm = confusion_matrix(self.y_test, result['predictions'])
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                        xticklabels=['预测未流失', '预测流失'],
                        yticklabels=['实际未流失', '实际流失'])
            ax.set_title(f'{name}\n混淆矩阵')
            ax.set_xlabel('预测标签')
            ax.set_ylabel('真实标签')
        for ax in axes[n_models:]:
            ax.set_visible(False)  # hide unused panels of the grid
        plt.tight_layout()
        plt.show()

        scored = {name: result['probabilities']
                  for name, result in self.results.items()
                  if 'probabilities' in result}

        # 2. ROC curves.
        plt.figure(figsize=(10, 8))
        for name, probabilities in scored.items():
            fpr, tpr, _ = roc_curve(self.y_test, probabilities)
            auc_score = roc_auc_score(self.y_test, probabilities)
            plt.plot(fpr, tpr, label=f'{name} (AUC = {auc_score:.4f})', linewidth=2)
        plt.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='随机分类器')
        plt.xlabel('假正率 (False Positive Rate)')
        plt.ylabel('真正率 (True Positive Rate)')
        plt.title('ROC曲线比较')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()

        # 3. Precision-recall curves.
        plt.figure(figsize=(10, 8))
        for name, probabilities in scored.items():
            precision, recall, _ = precision_recall_curve(self.y_test, probabilities)
            plt.plot(recall, precision, label=name, linewidth=2)
        plt.xlabel('召回率 (Recall)')
        plt.ylabel('精度 (Precision)')
        plt.title('精度-召回率曲线')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()

    def feature_importance_analysis(self):
        """Rank features by the random forest's importances and plot them.

        Returns:
            DataFrame with ``feature`` and ``importance`` columns sorted by
            descending importance, or None when no random forest was trained.
        """
        print("特征重要性分析")
        rf_model = next((result['model'] for name, result in self.results.items()
                         if '随机森林' in name), None)
        if rf_model is None:
            print("未找到随机森林模型")
            return None

        # Reuse the training columns; rebuild them only if training state is
        # not available on this instance.
        X_train = getattr(self, 'X_train', None)
        if X_train is not None:
            feature_names = X_train.columns
        else:
            feature_names = self.prepare_modeling_data()[0].columns

        importance_df = (
            pd.DataFrame({'feature': feature_names,
                          'importance': rf_model.feature_importances_})
            .sort_values('importance', ascending=False)
            .reset_index(drop=True)
        )

        plt.figure(figsize=(12, 10))
        top_features = importance_df.head(20)
        plt.subplot(2, 1, 1)
        sns.barplot(data=top_features, x='importance', y='feature', palette='viridis')
        plt.title('Top 20 特征重要性')
        plt.xlabel('重要性得分')
        plt.ylabel('特征')

        plt.subplot(2, 1, 2)
        cumulative_importance = importance_df['importance'].cumsum()
        plt.plot(range(1, len(cumulative_importance) + 1), cumulative_importance, 'b-')
        plt.axhline(y=0.8, color='r', linestyle='--', label='80% 重要性')
        plt.axhline(y=0.9, color='g', linestyle='--', label='90% 重要性')
        plt.xlabel('特征数量')
        plt.ylabel('累积重要性')
        plt.title('特征累积重要性')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

        print("Top 10 最重要的特征:")
        # Number by position in the sorted table, not by the original index.
        for rank, row in enumerate(importance_df.head(10).itertuples(index=False), 1):
            print(f" {rank:2d}. {row.feature}: {row.importance:.4f}")
        return importance_df

    def generate_business_insights(self, importance_df):
        """Print findings, risk profiles and recommendations.

        Args:
            importance_df: Output of :meth:`feature_importance_analysis`; may
                be None or empty, in which case the model-based section
                reports that no importances are available.
        """
        print("="*60)
        print("业务洞察和建议")
        print("="*60)
        df = self.df_processed

        # 1. Key findings.
        print("关键发现:")
        contract_analysis = df.groupby('Contract')['Churn'].agg(['count', 'mean'])
        contract_analysis['percentage'] = contract_analysis['count'] / len(df) * 100
        print("合同类型分析:")
        for contract, row in contract_analysis.iterrows():
            contract_name = self._CONTRACT_LABELS.get(contract, contract)
            # iterrows() upcasts the count to float; show it as an integer.
            print(f" {contract_name}: {int(row['count'])} 客户 ({row['percentage']:.1f}%), "
                  f"流失率: {row['mean']:.1%}")

        internet_analysis = df.groupby('InternetService')['Churn'].agg(['count', 'mean'])
        print("网络服务分析:")
        for service, row in internet_analysis.iterrows():
            service_name = self._INTERNET_LABELS.get(service, service)
            print(f" {service_name}: 流失率 {row['mean']:.1%}")

        # 2. High-risk customer profiles.
        print("高风险客户画像:")
        high_risk_profiles = [
            "按月合同 + 光纤网络 + 电子支付",
            "新客户(在网<12个月) + 高月费",
            "无在线安全服务 + 无技术支持",
            "月费 > $70 + 服务数量 < 2"
        ]
        for i, profile in enumerate(high_risk_profiles, 1):
            print(f" {i}. {profile}")

        # 3. Insights derived from the top-5 model features.
        print("基于模型的特征洞察:")
        insight_rules = [
            ('Contract', " 合同类型是预测流失的最重要因素"),
            ('tenure', " 在网时长显著影响客户忠诚度"),
            ('MonthlyCharges', " 月费用与流失风险相关"),
            ('InternetService', " 网络服务类型影响客户满意度"),
        ]
        if importance_df is None or len(importance_df) == 0:
            print(" 未找到随机森林模型")
        else:
            already_printed = set()
            for feature in importance_df.head(5)['feature'].tolist():
                # First matching keyword wins, as in an if/elif chain.
                message = next((text for keyword, text in insight_rules
                                if keyword in feature), None)
                if message is not None and message not in already_printed:
                    already_printed.add(message)
                    print(message)

        # 4. Recommended actions.
        print("具体建议措施:")
        recommendations = [
            "重点监控按月合同客户,提供转长期合同优惠",
            "改善光纤服务质量,减少服务中断",
            "为新客户提供更好的入门体验和专属支持",
            "推广在线安全和技术支持服务",
            "对高价值客户提供个性化服务和专属优惠",
            "优化电子支付体验,减少支付失败"
        ]
        for i, rec in enumerate(recommendations, 1):
            print(f" {i}. {rec}")

        # 5. Expected outcomes.
        print("预期效果:")
        expected_outcomes = [
            "降低按月合同客户流失率 15-20%",
            "提高新客户留存率 10-15%",
            "减少光纤客户投诉 25-30%",
            "提升客户生命周期价值 8-12%"
        ]
        for outcome in expected_outcomes:
            print(f" {outcome}")
        print("="*60)

    def run_complete_analysis(self):
        """Run every pipeline stage in order; stop early if loading fails."""
        print("开始电信客户流失分析...")
        # 1. Load
        if not self.load_and_explore():
            return
        # 2. Preprocess
        self.preprocess_data()
        # 3. Explore
        self.exploratory_analysis()
        # 4. Train
        self.train_models()
        # 5. Tune
        self.optimize_best_model()
        # 6. Evaluate
        self.evaluate_models()
        # 7. Feature importance
        importance_df = self.feature_importance_analysis()
        # 8. Business insights
        self.generate_business_insights(importance_df)
        print("分析完成!")
# Usage example
#
# Author's question (originally appended to the last line of this file, where
# it was a syntax error): "This code runs in Jupyter - how should it be changed
# to run as several small cells?"
#
# No structural change is needed, because every pipeline stage is its own
# method and keeps its state on the analyzer object. In a notebook:
#   cell 1: the imports and the TelcoChurnAnalyzer class definition
#   cell 2: analyzer = TelcoChurnAnalyzer(file_path)
#   cell 3: analyzer.load_and_explore()
#   cell 4: analyzer.preprocess_data()
#   cell 5: analyzer.exploratory_analysis()
#   cell 6: analyzer.train_models()
#   cell 7: analyzer.optimize_best_model()
#   cell 8: analyzer.evaluate_models()
#   cell 9: importance_df = analyzer.feature_importance_analysis()
#   cell 10: analyzer.generate_business_insights(importance_df)
# The cells must be executed in this order, since each stage reads what the
# previous ones stored on the analyzer.
if __name__ == "__main__":
    # Replace this path with the actual location of your data file.
    file_path = r"C:\Users\Huahai\Desktop\人工智能技术实验项目\WA_Fn-UseC_-Telco-Customer-Churn.xlsx"
    # Create the analyzer and run the whole pipeline in one go.
    analyzer = TelcoChurnAnalyzer(file_path)
    analyzer.run_complete_analysis()