red5phone features

本文介绍 red5phone 基于 FC(Flash Client,Flash 客户端)实现的呼叫方案:支持 Flash 客户端之间、Flash 客户端与软电话(如 X-Lite)以及 Flash 客户端与 Avaya 系统之间的互通,涵盖基本呼叫、呼叫转移、保持/恢复、音乐保持、双线路、SIP 消息、视频呼叫和会议等功能。

1.FC(flash client) To FC

2.FC to softphone(such as xlite)

3.FC to avaya

 

features:

1.basic call

2.transfer

3.hold/unhold

4.music hold/unhold

5.2 lines

6.sip message

7.video call

8.conference

 

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.utils import resample

warnings.filterwarnings('ignore')

# Configure a font that can render the Chinese plot labels, and keep the
# minus sign rendering correctly with that font.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False


class TelcoChurnAnalyzer:
    """Load, explore, model and report on a telco customer-churn dataset.

    Typical use is ``TelcoChurnAnalyzer(path).run_complete_analysis()``; the
    individual steps can also be called one by one in the same order as
    ``run_complete_analysis`` does.
    """

    def __init__(self, file_path):
        """Store the dataset path and initialise empty state.

        Args:
            file_path: Path of the dataset file read by ``load_and_explore``.
        """
        self.file_path = file_path
        self.df = None            # raw data, set by load_and_explore()
        self.df_processed = None  # cleaned data, set by preprocess_data()
        self.models = {}          # name -> fitted estimator, set by train_models()
        self.results = {}         # name -> dict of model / predictions / scores
        # Train/test matrices, set by train_models().
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.X_test_scaled = None

    def load_and_explore(self):
        """Read the dataset and print shape, columns, dtypes and missing counts.

        Files whose name ends in ``.csv`` are read with ``pd.read_csv``; any
        other path is read with ``pd.read_excel`` as before.

        Returns:
            bool: True when the file was loaded, False when reading failed
            (the error is printed, not raised).
        """
        print("数据加载与探索")

        try:
            if str(self.file_path).lower().endswith('.csv'):
                self.df = pd.read_csv(self.file_path)
            else:
                self.df = pd.read_excel(self.file_path)
        except Exception as e:
            # Deliberate best-effort boundary: report and let the caller stop.
            print(f"数据加载失败: {e}")
            return False
        print("数据加载成功!")

        print(f"数据集形状: {self.df.shape}")
        print(f"列名: {list(self.df.columns)}")

        print("数据基本信息:")
        # DataFrame.info() prints by itself and returns None, so it must not
        # be wrapped in print().
        self.df.info()

        print("缺失值统计:")
        missing_data = self.df.isnull().sum()
        print(missing_data[missing_data > 0])

        return True

    def preprocess_data(self):
        """Clean the raw data and encode the target column.

        Converts ``TotalCharges`` to numeric, fills missing numeric values
        with the column median, drops ``customerID``, folds the
        "No internet service" / "No phone service" values into "No", and
        label-encodes ``Churn``.

        Returns:
            pandas.DataFrame: the cleaned frame (also stored in
            ``self.df_processed``).
        """
        print("数据预处理")

        df_clean = self.df.copy()

        # Non-numeric entries in TotalCharges (e.g. blank strings) become NaN.
        df_clean['TotalCharges'] = pd.to_numeric(df_clean['TotalCharges'], errors='coerce')

        numerical_cols = ['TotalCharges', 'MonthlyCharges', 'tenure']
        for col in numerical_cols:
            if col in df_clean.columns and df_clean[col].isna().any():
                # Assign the result back: an inplace fillna on a column
                # selection is chained assignment and may not modify df_clean.
                df_clean[col] = df_clean[col].fillna(df_clean[col].median())
                print(f"已填充 {col} 的缺失值")

        if 'customerID' in df_clean.columns:
            df_clean = df_clean.drop('customerID', axis=1)

        service_columns = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                           'TechSupport', 'StreamingTV', 'StreamingMovies',
                           'MultipleLines']
        no_service_map = {'No internet service': 'No', 'No phone service': 'No'}
        for col in service_columns:
            if col in df_clean.columns:
                df_clean[col] = df_clean[col].replace(no_service_map)

        le = LabelEncoder()
        df_clean['Churn'] = le.fit_transform(df_clean['Churn'])
        print(f"目标变量编码: {dict(zip(le.classes_, le.transform(le.classes_)))}")

        self.df_processed = df_clean
        print("数据预处理完成")

        return df_clean

    def feature_engineering(self):
        """Derive additional features from the preprocessed data.

        Returns:
            pandas.DataFrame: a copy of ``self.df_processed`` with
            ``AvgMonthlyCharge``, ``NumServices``, ``HasMultipleServices``,
            ``CustomerValue``, ``TenureGroup`` and ``TotalChargesGroup`` added.
        """
        print("特征工程")

        df = self.df_processed.copy()

        # Average charge per month; fall back to MonthlyCharges when tenure
        # is 0 to avoid using the division result for those rows.
        df['AvgMonthlyCharge'] = np.where(
            df['tenure'] > 0,
            df['TotalCharges'] / df['tenure'],
            df['MonthlyCharges']
        )

        # Number of add-on services each customer has subscribed to.
        service_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                        'TechSupport', 'StreamingTV', 'StreamingMovies']
        df['NumServices'] = (df[service_cols] == 'Yes').sum(axis=1)

        df['HasMultipleServices'] = (df['NumServices'] > 2).astype(int)

        # include_lowest=True keeps values equal to the first bin edge
        # (e.g. tenure == 0) in the first bucket instead of producing NaN.
        df['CustomerValue'] = pd.cut(df['MonthlyCharges'],
                                     bins=[0, 35, 70, 100, 200],
                                     labels=['低价值', '中等价值', '高价值', '极高价值'],
                                     include_lowest=True)

        df['TenureGroup'] = pd.cut(df['tenure'],
                                   bins=[0, 12, 24, 48, 72],
                                   labels=['新客户', '常规客户', '忠诚客户', 'VIP客户'],
                                   include_lowest=True)

        df['TotalChargesGroup'] = pd.cut(df['TotalCharges'],
                                         bins=[0, 1000, 3000, 5000, 10000],
                                         labels=['低消费', '中等消费', '高消费', '极高消费'],
                                         include_lowest=True)

        print("特征工程完成")
        return df

    def exploratory_analysis(self):
        """Plot the target distribution, key features and a correlation map."""
        print("探索性数据分析")

        df = self.df_processed

        plt.figure(figsize=(15, 10))

        # 1. Target distribution. Sort by class index so the slices line up
        # with the labels (0 = not churned, 1 = churned).
        plt.subplot(2, 3, 1)
        churn_counts = df['Churn'].value_counts().sort_index()
        plt.pie(churn_counts, labels=['未流失', '流失'], autopct='%1.1f%%',
                colors=['#66b3ff', '#ff6666'], startangle=90)
        plt.title('客户流失分布')

        # 2. Distributions of the main numeric variables (subplots 2-4).
        numeric_titles = {
            'tenure': '在网时长(月)',
            'MonthlyCharges': '月费用(美元)',
            'TotalCharges': '总费用(美元)',
        }
        for position, (feature, feature_name) in enumerate(numeric_titles.items(), 2):
            plt.subplot(2, 3, position)
            sns.histplot(data=df, x=feature, hue='Churn', kde=True, alpha=0.6)
            plt.title(f'{feature_name}分布')
            plt.xlabel(feature_name)

        # 3. Churn rate by contract type and by internet service.
        plt.subplot(2, 3, 5)
        contract_churn = df.groupby('Contract')['Churn'].mean().sort_values(ascending=False)
        contract_labels = {
            'Month-to-month': '按月合同',
            'One year': '一年合同',
            'Two year': '两年合同'
        }
        contract_index = [contract_labels.get(x, x) for x in contract_churn.index]
        sns.barplot(x=contract_index, y=contract_churn.values, palette='viridis')
        plt.title('合同类型 vs 流失率')
        plt.xlabel('合同类型')
        plt.ylabel('流失率')
        plt.xticks(rotation=45)

        plt.subplot(2, 3, 6)
        internet_churn = df.groupby('InternetService')['Churn'].mean().sort_values(ascending=False)
        internet_labels = {
            'Fiber optic': '光纤',
            'DSL': 'DSL',
            'No': '无网络服务'
        }
        internet_index = [internet_labels.get(x, x) for x in internet_churn.index]
        sns.barplot(x=internet_index, y=internet_churn.values, palette='rocket')
        plt.title('网络服务类型 vs 流失率')
        plt.xlabel('网络服务类型')
        plt.ylabel('流失率')
        plt.xticks(rotation=45)

        plt.tight_layout()
        plt.show()

        # 4. Correlation heatmap of numeric columns (upper triangle masked).
        plt.figure(figsize=(12, 8))
        numerical_df = df.select_dtypes(include=[np.number])
        correlation_matrix = numerical_df.corr()
        mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
        sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm',
                    center=0, square=True, fmt='.2f')
        plt.title('特征相关性热力图')
        plt.tight_layout()
        plt.show()

        # 5. Per-feature detail plots.
        self._detailed_feature_analysis()

    def _detailed_feature_analysis(self):
        """Plot churn rate (categorical) or box plots (numeric) per feature."""
        df = self.df_processed

        feature_labels = {
            'Contract': '合同类型',
            'InternetService': '网络服务类型',
            'PaymentMethod': '支付方式',
            'tenure': '在网时长(月)',
            'MonthlyCharges': '月费用(美元)',
            'OnlineSecurity': '在线安全服务'
        }

        # Display names for category values shared by all features.
        value_labels = {
            'Month-to-month': '按月合同',
            'One year': '一年合同',
            'Two year': '两年合同',
            'Fiber optic': '光纤',
            'DSL': 'DSL',
            'Electronic check': '电子支票',
            'Mailed check': '邮寄支票',
            'Bank transfer (automatic)': '银行转账',
            'Credit card (automatic)': '信用卡',
            'Yes': '是',
            'No': '否'
        }
        # 'No' means something different for InternetService than for the
        # yes/no service columns; a single dict cannot hold both meanings
        # because the later duplicate key silently wins.
        feature_value_labels = {
            'InternetService': {'No': '无网络服务'},
        }

        important_features = ['Contract', 'InternetService', 'PaymentMethod',
                              'tenure', 'MonthlyCharges', 'OnlineSecurity']

        fig, axes = plt.subplots(2, 3, figsize=(18, 12))
        axes = axes.ravel()

        for i, feature in enumerate(important_features):
            ax = axes[i]
            feature_label = feature_labels.get(feature, feature)

            if df[feature].dtype == 'object' or df[feature].nunique() < 10:
                # Categorical feature: churn rate per category.
                churn_rate = df.groupby(feature)['Churn'].mean().sort_values(ascending=False)
                overrides = feature_value_labels.get(feature, {})
                index_labels = [overrides.get(x, value_labels.get(x, x))
                                for x in churn_rate.index]

                sns.barplot(x=index_labels, y=churn_rate.values, ax=ax, palette='Set2')
                ax.set_title(f'{feature_label} - 流失率')
                ax.set_xlabel(feature_label)
                ax.set_ylabel('流失率')
                ax.tick_params(axis='x', rotation=45)
                # Reference line: overall churn rate.
                ax.axhline(y=df['Churn'].mean(), color='red', linestyle='--',
                           label='平均流失率')
                if i == 0:
                    ax.legend()
            else:
                # Numeric feature: distribution split by churn flag.
                sns.boxplot(data=df, x='Churn', y=feature, ax=ax, palette='Set3')
                ax.set_title(f'{feature_label} - 分布')
                ax.set_xlabel('是否流失')
                ax.set_ylabel(feature_label)
                ax.set_xticklabels(['未流失', '流失'])

        plt.tight_layout()
        plt.show()

    def prepare_modeling_data(self):
        """Build the one-hot encoded feature matrix and the target vector.

        Returns:
            tuple: ``(X_encoded, y)`` where ``X_encoded`` is the feature frame
            after ``pd.get_dummies(..., drop_first=True)`` and ``y`` is the
            encoded ``Churn`` column.
        """
        print("准备建模数据")

        df = self.feature_engineering()

        categorical_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
                                'PhoneService', 'MultipleLines', 'InternetService',
                                'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                                'TechSupport', 'StreamingTV', 'StreamingMovies',
                                'Contract', 'PaperlessBilling', 'PaymentMethod',
                                'CustomerValue', 'TenureGroup', 'TotalChargesGroup']

        numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges',
                              'AvgMonthlyCharge', 'NumServices']

        # Keep only the columns that actually exist in this dataset.
        features_to_use = [f for f in numerical_features + categorical_features
                           if f in df.columns]

        X = df[features_to_use]
        y = df['Churn']

        # Only object/category columns are one-hot encoded; numeric columns
        # pass through unchanged.
        categorical_columns = X.select_dtypes(include=['object', 'category']).columns
        X_encoded = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

        print(f"最终特征数量: {X_encoded.shape[1]}")
        print(f"目标变量分布: {y.value_counts().to_dict()}")

        return X_encoded, y

    def handle_imbalance(self, X, y):
        """Upsample the minority class (Churn == 1) to the majority size.

        Args:
            X: feature frame.
            y: target series named ``Churn`` sharing ``X``'s index.

        Returns:
            tuple: ``(X_balanced, y_balanced)`` with both classes equally
            represented, rows shuffled with a fixed seed.
        """
        print("处理数据不平衡")

        data = pd.concat([X, y], axis=1)

        majority_class = data[data['Churn'] == 0]
        minority_class = data[data['Churn'] == 1]

        print(f"多数类样本数: {len(majority_class)}")
        print(f"少数类样本数: {len(minority_class)}")

        # Sample the minority class with replacement up to the majority size.
        minority_upsampled = resample(minority_class,
                                      replace=True,
                                      n_samples=len(majority_class),
                                      random_state=42)

        balanced_data = pd.concat([majority_class, minority_upsampled])
        # Shuffle so the rows are not grouped by class.
        balanced_data = balanced_data.sample(frac=1, random_state=42)

        X_balanced = balanced_data.drop('Churn', axis=1)
        y_balanced = balanced_data['Churn']

        print(f"平衡后样本数: {len(X_balanced)}")
        print(f"平衡后目标变量分布: {y_balanced.value_counts().to_dict()}")

        return X_balanced, y_balanced

    def train_models(self):
        """Train and evaluate the four baseline classifiers.

        Results are stored per model in ``self.results``; the train/test
        matrices are kept on the instance for the later steps.
        """
        print("模型训练")

        X, y = self.prepare_modeling_data()

        # Split BEFORE resampling. Upsampling first would put copies of the
        # same minority rows into both train and test sets and inflate every
        # test metric.
        X_train_raw, X_test, y_train_raw, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )

        # Balance the training portion only; the test set keeps the original
        # class distribution.
        X_train, y_train = self.handle_imbalance(X_train_raw, y_train_raw)

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        print(f"训练集形状: {X_train.shape}")
        print(f"测试集形状: {X_test.shape}")

        models = {
            '逻辑回归': LogisticRegression(random_state=42, max_iter=1000),
            '随机森林': RandomForestClassifier(random_state=42, n_estimators=200),
            '梯度提升': GradientBoostingClassifier(random_state=42, n_estimators=200),
            '支持向量机': SVC(probability=True, random_state=42)
        }
        # These models are fitted on the standardised features.
        scaled_models = {'逻辑回归', '支持向量机'}

        for name, model in models.items():
            print(f"--- {name} ---")

            if name in scaled_models:
                fit_X, eval_X = X_train_scaled, X_test_scaled
            else:
                fit_X, eval_X = X_train, X_test

            model.fit(fit_X, y_train)
            y_pred = model.predict(eval_X)
            y_pred_proba = model.predict_proba(eval_X)[:, 1]

            # NOTE(review): CV runs on the already-upsampled training set, so
            # duplicated minority rows can fall into different folds and the
            # CV AUC is optimistic; the held-out test AUC is the honest number.
            cv_scores = cross_val_score(model, fit_X, y_train, cv=5, scoring='roc_auc')

            self.results[name] = {
                'model': model,
                'predictions': y_pred,
                'probabilities': y_pred_proba,
                'cv_score_mean': cv_scores.mean(),
                'cv_score_std': cv_scores.std()
            }

            print(classification_report(y_test, y_pred))
            auc_score = roc_auc_score(y_test, y_pred_proba)
            print(f"AUC分数: {auc_score:.4f}")
            print(f"交叉验证AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

        self.models = models
        self.X_test = X_test
        self.y_test = y_test
        self.X_train = X_train
        self.y_train = y_train
        self.X_test_scaled = X_test_scaled

    def optimize_best_model(self):
        """Grid-search the best baseline model if it is a tree ensemble.

        The best model is the one with the highest mean CV AUC. Only random
        forest and gradient boosting have a search space; for any other
        winner the step is skipped. The tuned model is stored in
        ``self.results`` under ``'<name> (优化版)'``.
        """
        print("模型优化")

        # Only baseline entries carry a CV score; entries added by a previous
        # optimisation run do not and must not be candidates.
        candidates = {name: result for name, result in self.results.items()
                      if 'cv_score_mean' in result}
        if not candidates:
            print("跳过优化,使用默认参数")
            return

        best_model_name = max(candidates, key=lambda x: candidates[x]['cv_score_mean'])

        print(f"选择最佳模型进行优化: {best_model_name}")

        search_spaces = {
            '随机森林': (
                RandomForestClassifier(random_state=42),
                {
                    'n_estimators': [100, 200, 300],
                    'max_depth': [10, 20, None],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4]
                },
            ),
            '梯度提升': (
                GradientBoostingClassifier(random_state=42),
                {
                    'n_estimators': [100, 200, 300],
                    'learning_rate': [0.05, 0.1, 0.15],
                    'max_depth': [3, 4, 5]
                },
            ),
        }

        if best_model_name not in search_spaces:
            print("跳过优化,使用默认参数")
            return

        model, param_grid = search_spaces[best_model_name]
        X_train, X_test = self.X_train, self.X_test
        y_train, y_test = self.y_train, self.y_test

        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc',
                                   n_jobs=-1, verbose=1)
        grid_search.fit(X_train, y_train)

        print(f"最佳参数: {grid_search.best_params_}")
        print(f"最佳分数: {grid_search.best_score_:.4f}")

        best_model = grid_search.best_estimator_
        y_pred_optimized = best_model.predict(X_test)
        y_pred_proba_optimized = best_model.predict_proba(X_test)[:, 1]

        print(f"优化后的 {best_model_name} 性能:")
        print(classification_report(y_test, y_pred_optimized))
        optimized_auc = roc_auc_score(y_test, y_pred_proba_optimized)
        print(f"优化后AUC分数: {optimized_auc:.4f}")

        self.results[f'{best_model_name} (优化版)'] = {
            'model': best_model,
            'predictions': y_pred_optimized,
            'probabilities': y_pred_proba_optimized
        }

    def evaluate_models(self):
        """Plot confusion matrices, ROC curves and precision-recall curves."""
        print("模型评估")

        # 1. Confusion matrices: two per row, as many rows as needed so that
        # an optimised model added after the four baselines is shown too.
        entries = list(self.results.items())
        n_cols = 2
        n_rows = max(1, -(-len(entries) // n_cols))  # ceiling division
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 6 * n_rows),
                                 squeeze=False)
        axes = axes.ravel()

        for ax, (name, result) in zip(axes, entries):
            cm = confusion_matrix(self.y_test, result['predictions'])
            sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                        xticklabels=['预测未流失', '预测流失'],
                        yticklabels=['实际未流失', '实际流失'])
            ax.set_title(f'{name}\n混淆矩阵')
            ax.set_xlabel('预测标签')
            ax.set_ylabel('真实标签')

        # Hide grid cells that have no model to show.
        for ax in axes[len(entries):]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()

        # 2. ROC curves.
        plt.figure(figsize=(10, 8))
        for name, result in self.results.items():
            if 'probabilities' in result:
                fpr, tpr, _ = roc_curve(self.y_test, result['probabilities'])
                auc_score = roc_auc_score(self.y_test, result['probabilities'])
                plt.plot(fpr, tpr, label=f'{name} (AUC = {auc_score:.4f})', linewidth=2)

        plt.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='随机分类器')
        plt.xlabel('假正率 (False Positive Rate)')
        plt.ylabel('真正率 (True Positive Rate)')
        plt.title('ROC曲线比较')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()

        # 3. Precision-recall curves.
        plt.figure(figsize=(10, 8))
        for name, result in self.results.items():
            if 'probabilities' in result:
                precision, recall, _ = precision_recall_curve(self.y_test,
                                                              result['probabilities'])
                plt.plot(recall, precision, label=name, linewidth=2)

        plt.xlabel('召回率 (Recall)')
        plt.ylabel('精度 (Precision)')
        plt.title('精度-召回率曲线')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()

    def feature_importance_analysis(self):
        """Plot and print feature importances of the random-forest model.

        Returns:
            pandas.DataFrame | None: features sorted by descending importance
            with columns ``feature`` and ``importance``; None when no
            random-forest result is available.
        """
        print("特征重要性分析")

        # First stored result whose name mentions the random forest.
        rf_model = next((result['model'] for name, result in self.results.items()
                         if '随机森林' in name), None)

        if rf_model is None:
            print("未找到随机森林模型")
            return None

        # Reuse the training columns; rebuild the matrix only if training
        # data was not kept on the instance.
        if self.X_train is not None:
            feature_names = self.X_train.columns
        else:
            X, _ = self.prepare_modeling_data()
            feature_names = X.columns

        importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': rf_model.feature_importances_
        }).sort_values('importance', ascending=False)

        plt.figure(figsize=(12, 10))
        top_features = importance_df.head(20)

        plt.subplot(2, 1, 1)
        sns.barplot(data=top_features, x='importance', y='feature', palette='viridis')
        plt.title('Top 20 特征重要性')
        plt.xlabel('重要性得分')
        plt.ylabel('特征')

        # Cumulative importance with 80% / 90% reference lines.
        plt.subplot(2, 1, 2)
        cumulative_importance = importance_df['importance'].cumsum()
        plt.plot(range(1, len(cumulative_importance) + 1), cumulative_importance, 'b-')
        plt.axhline(y=0.8, color='r', linestyle='--', label='80% 重要性')
        plt.axhline(y=0.9, color='g', linestyle='--', label='90% 重要性')
        plt.xlabel('特征数量')
        plt.ylabel('累积重要性')
        plt.title('特征累积重要性')
        plt.legend()
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        print("Top 10 最重要的特征:")
        # Number by rank position; the frame's index still holds the
        # pre-sort column positions and is not a rank.
        for rank, row in enumerate(importance_df.head(10).itertuples(index=False), 1):
            print(f" {rank:2d}. {row.feature}: {row.importance:.4f}")

        return importance_df

    def generate_business_insights(self, importance_df):
        """Print findings, risk profiles and recommendations.

        Args:
            importance_df: frame returned by ``feature_importance_analysis``;
                may be None, in which case the model-based insights are
                skipped.
        """
        print("="*60)
        print("业务洞察和建议")
        print("="*60)

        df = self.df_processed

        # 1. Key findings.
        print("关键发现:")

        contract_analysis = df.groupby('Contract')['Churn'].agg(['count', 'mean'])
        contract_analysis['percentage'] = contract_analysis['count'] / len(df) * 100

        print("合同类型分析:")
        contract_labels = {
            'Month-to-month': '按月合同',
            'One year': '一年合同',
            'Two year': '两年合同'
        }
        for contract, row in contract_analysis.iterrows():
            contract_name = contract_labels.get(contract, contract)
            # iterrows() upcasts the row to float; cast the count back so it
            # does not print as e.g. "3875.0".
            print(f" {contract_name}: {int(row['count'])} 客户 ({row['percentage']:.1f}%), "
                  f"流失率: {row['mean']:.1%}")

        internet_analysis = df.groupby('InternetService')['Churn'].agg(['count', 'mean'])
        print("网络服务分析:")
        internet_labels = {
            'Fiber optic': '光纤',
            'DSL': 'DSL',
            'No': '无网络服务'
        }
        for service, row in internet_analysis.iterrows():
            service_name = internet_labels.get(service, service)
            print(f" {service_name}: 流失率 {row['mean']:.1%}")

        # 2. High-risk customer profiles.
        print("高风险客户画像:")
        high_risk_profiles = [
            "按月合同 + 光纤网络 + 电子支付",
            "新客户(在网<12个月) + 高月费",
            "无在线安全服务 + 无技术支持",
            "月费 > $70 + 服务数量 < 2"
        ]
        for i, profile in enumerate(high_risk_profiles, 1):
            print(f" {i}. {profile}")

        # 3. Insights derived from the top model features.
        print("基于模型的特征洞察:")
        feature_messages = [
            ('Contract', " 合同类型是预测流失的最重要因素"),
            ('tenure', " 在网时长显著影响客户忠诚度"),
            ('MonthlyCharges', " 月费用与流失风险相关"),
            ('InternetService', " 网络服务类型影响客户满意度"),
        ]
        top_features = ([] if importance_df is None
                        else importance_df.head(5)['feature'].tolist())
        for feature in top_features:
            # First matching keyword wins, in the order listed above.
            for keyword, message in feature_messages:
                if keyword in feature:
                    print(message)
                    break

        # 4. Recommendations.
        print("具体建议措施:")
        recommendations = [
            "重点监控按月合同客户,提供转长期合同优惠",
            "改善光纤服务质量,减少服务中断",
            "为新客户提供更好的入门体验和专属支持",
            "推广在线安全和技术支持服务",
            "对高价值客户提供个性化服务和专属优惠",
            "优化电子支付体验,减少支付失败"
        ]
        for i, rec in enumerate(recommendations, 1):
            print(f" {i}. {rec}")

        # 5. Expected outcomes.
        print("预期效果:")
        expected_outcomes = [
            "降低按月合同客户流失率 15-20%",
            "提高新客户留存率 10-15%",
            "减少光纤客户投诉 25-30%",
            "提升客户生命周期价值 8-12%"
        ]
        for outcome in expected_outcomes:
            print(f" {outcome}")

        print("="*60)

    def run_complete_analysis(self):
        """Run every step of the analysis in order; stop if loading fails."""
        print("开始电信客户流失分析...")

        # 1. Load
        if not self.load_and_explore():
            return

        # 2. Preprocess
        self.preprocess_data()

        # 3. Explore
        self.exploratory_analysis()

        # 4. Train
        self.train_models()

        # 5. Optimise
        self.optimize_best_model()

        # 6. Evaluate
        self.evaluate_models()

        # 7. Feature importance
        importance_df = self.feature_importance_analysis()

        # 8. Business insights
        self.generate_business_insights(importance_df)

        print("分析完成!")


# Usage example
if __name__ == "__main__":
    # Replace this with the path to your own copy of the dataset.
    file_path = r"C:\Users\Huahai\Desktop\人工智能技术实验项目\WA_Fn-UseC_-Telco-Customer-Churn.xlsx"

    # Create the analyzer and run the whole pipeline.
    analyzer = TelcoChurnAnalyzer(file_path)
    analyzer.run_complete_analysis()

# Poster's question (translated from the original comment): this code is run
# in Jupyter; how should it be modified to split it into several smaller
# cells? One way: keep the imports and the class definition in the first
# cells, create `analyzer` in the next, then call each step of
# run_complete_analysis() in its own cell, in the same order.
11-26
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值