FM:MONTH_NAMES_GET 获取月份描述信息

本博客主要介绍 FM:MONTH_NAMES_GET,它可用于获取月份描述信息。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

# Import required libraries
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import logging
import sys
import os
from matplotlib.patches import Patch

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger('PowerBI_YoY_Chart')


def configure_chinese_font():
    """Select a font able to render Chinese text and install it in rcParams.

    Looks first for a well-known CJK family among the fonts matplotlib has
    already discovered, then (on Windows only) for a few font files under
    ``C:\\Windows\\Fonts``. Falls back to ``sans-serif`` when nothing is found.

    Returns:
        bool: True when a Chinese-capable font was configured, False when the
        fallback was used or an error occurred (the error is logged, never
        raised, so that chart rendering can still proceed).
    """
    try:
        # Commonly installed Chinese-capable font families, in preference order
        font_names = ['SimHei', 'Microsoft YaHei', 'Arial Unicode MS', 'WenQuanYi Micro Hei', 'SimSun']
        available_fonts = [f.name for f in fm.fontManager.ttflist]

        # Log a sample of the available fonts to help diagnose rendering issues
        logger.info(f"可用字体: {', '.join(available_fonts[:10])}...")

        for font_name in font_names:
            if font_name in available_fonts:
                plt.rcParams['font.family'] = font_name
                logger.info(f"使用中文字体: {font_name}")
                return True

        # Fall back to probing Windows system font files directly
        if sys.platform == 'win32':
            font_paths = [
                r'C:\Windows\Fonts\simhei.ttf',  # SimHei
                r'C:\Windows\Fonts\msyh.ttc',    # Microsoft YaHei
                r'C:\Windows\Fonts\simsun.ttc',  # SimSun
                r'C:\Windows\Fonts\STSONG.TTF'   # STSong
            ]
            for font_path in font_paths:
                if os.path.exists(font_path):
                    font_prop = fm.FontProperties(fname=font_path)
                    plt.rcParams['font.family'] = font_prop.get_name()
                    logger.info(f"使用系统字体: {font_path}")
                    return True

        # Nothing suitable found: use the default family and warn
        plt.rcParams['font.family'] = ['sans-serif']
        logger.warning("未找到中文字体,可能无法正确显示中文")
        return False
    except Exception as e:
        logger.error(f"字体配置错误: {str(e)}")
        return False


# Configure the font at import time so every figure created below uses it
configure_chinese_font()
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly with CJK fonts

# Canonical three-letter month abbreviations, in calendar order
_MONTH_ABBREVIATIONS = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')


def sort_months(months):
    """Order month labels as: English month names, numeric months, 'Average'.

    Args:
        months: iterable of month labels (strings or numbers); NaN entries
            are skipped.

    Returns:
        list[str]: English months in calendar order, then numeric months in
        ascending order, then ``'Average'`` if any label equalled "average"
        (case-insensitive).

    NOTE: labels are normalised on the way through ("january 2024" becomes
    'Jan', "01" becomes '1', "average" becomes 'Average'), so a returned label
    is not guaranteed to equal the raw value it came from.
    """
    month_order = {
        'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
        'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
        'January': 1, 'February': 2, 'March': 3, 'April': 4, 'June': 6,
        'July': 7, 'August': 8, 'September': 9, 'October': 10,
        'November': 11, 'December': 12
    }

    numeric_months = []
    en_month_list = []
    average_month = None

    for month in months:
        if pd.isna(month):
            continue

        str_month = str(month).strip()
        if str_month.lower() == 'average':
            average_month = 'Average'
        elif str_month in month_order:
            en_month_list.append(str_month)
        elif str_month.isdigit():
            numeric_months.append(int(str_month))
        else:
            # Try to recognise other spellings by their three-letter prefix
            str_month_lower = str_month.lower()
            for abbreviation in _MONTH_ABBREVIATIONS:
                if str_month_lower.startswith(abbreviation.lower()):
                    en_month_list.append(abbreviation)
                    break
            else:
                # Unrecognised labels are kept as-is (sorted after December)
                en_month_list.append(str_month)

    # Sort each group and concatenate
    en_month_list.sort(key=lambda x: month_order.get(x, month_order.get(x.capitalize(), 13)))
    numeric_months.sort()

    sorted_months = en_month_list + [str(m) for m in numeric_months]
    if average_month:
        sorted_months.append(average_month)
    return sorted_months


def _describe_diff(diff_p):
    """Return ``(label, color, marker)`` for a percentage-point difference.

    The value is rounded to the one decimal place that is displayed *before*
    the direction is decided, so the label, colour and marker always agree:
    anything that would print as zero is shown as a plain ``0.0%`` (no "+"
    or "-" sign) with the neutral grey dash marker.
    """
    shown = round(diff_p, 1)
    if shown > 0:
        return f'+{shown:.1f}%', '#d62728', '^'  # red, upward triangle
    if shown < 0:
        return f'{shown:.1f}%', '#2ca02c', 'v'   # green, downward triangle
    return '0.0%', '#7f7f7f', '_'                # grey, horizontal dash


# Main plotting function
def plot_yoy_chart(dataset):
    """Draw the year-over-year grouped bar chart for the two latest years.

    Args:
        dataset: DataFrame with the columns ``Month``, ``QTY``, ``Year`` and
            ``Percentage``.

    Returns:
        The ``matplotlib.pyplot`` module, with the current figure holding
        either the chart or a message explaining why it could not be drawn.
        Exceptions are logged and rendered as a red error message instead of
        being propagated.
    """
    try:
        logger.info("开始生成图表...")

        # Large canvas to leave room for the bigger fonts used below
        fig, ax = plt.subplots(figsize=(14, 8))

        # Work on a copy so the caller's data is never modified
        df = dataset.copy()
        logger.info(f"原始数据行数: {len(df)}")

        # Verify that the required columns are present
        required_columns = ['Month', 'QTY', 'Year', 'Percentage']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            error_msg = f"缺少必需列: {', '.join(missing_columns)}"
            logger.error(error_msg)
            ax.text(0.5, 0.5, error_msg, ha='center', va='center', fontsize=16)
            plt.tight_layout()
            return plt

        # Clean the data
        df = df.dropna(subset=required_columns)

        # Convert types first: comparing a text-typed QTY column with 0 would
        # raise, so the positivity filter must run on numeric values.
        df['Year'] = pd.to_numeric(df['Year'], errors='coerce').astype('Int64')
        df['Percentage'] = pd.to_numeric(df['Percentage'], errors='coerce')
        df['QTY'] = pd.to_numeric(df['QTY'], errors='coerce')
        df = df.dropna(subset=['Year', 'Percentage', 'QTY'])
        df = df[df['QTY'] > 0]  # drop invalid quantities

        logger.info(f"清洗后数据行数: {len(df)}")

        # Make sure something is left to plot
        if df.empty:
            ax.text(0.5, 0.5, '没有有效数据', ha='center', va='center', fontsize=16)
            plt.tight_layout()
            return plt

        # At least two distinct years are needed for a comparison
        years = sorted(df['Year'].unique())
        if len(years) < 2:
            ax.text(0.5, 0.5, '需要至少两年数据才能生成图表', ha='center', va='center', fontsize=16)
            plt.tight_layout()
            return plt

        # Compare the two most recent years
        prev_year, curr_year = years[-2], years[-1]
        all_months = sort_months(df['Month'].unique())
        logger.info(f"排序后的月份: {all_months}")

        # Collect the values to plot, one entry per month present in both years
        data = {
            'months': [],
            'prev_qty': [],
            'curr_qty': [],
            'diff_percent': [],
            'is_average': []  # flags the 'Average' group
        }

        for month in all_months:
            prev_data = df[(df['Year'] == prev_year) & (df['Month'] == month)]
            curr_data = df[(df['Year'] == curr_year) & (df['Month'] == month)]

            if not prev_data.empty and not curr_data.empty:
                data['months'].append(month)
                data['prev_qty'].append(prev_data['QTY'].values[0])
                data['curr_qty'].append(curr_data['QTY'].values[0])

                # Difference between the two Percentage values, scaled by 100
                prev_p = prev_data['Percentage'].values[0]
                curr_p = curr_data['Percentage'].values[0]
                data['diff_percent'].append((curr_p - prev_p) * 100)

                data['is_average'].append(str(month).strip().lower() == 'average')

        # Nothing to compare if the two years share no month
        if not data['months']:
            ax.text(0.5, 0.5, f'{prev_year} 和 {curr_year} 无共同月份数据',
                    ha='center', va='center', fontsize=16)
            plt.tight_layout()
            return plt

        width = 0.35
        x_pos = range(len(data['months']))

        # Draw both years' bars; the 'Average' group gets its own colour pair
        for i, (month, prev_qty, curr_qty) in enumerate(
                zip(data['months'], data['prev_qty'], data['curr_qty'])):
            is_avg = data['is_average'][i]
            prev_color = '#C04F15' if is_avg else '#00B0F0'
            curr_color = '#F6C6AD' if is_avg else '#A0D1FF'

            prev_bar = ax.bar(
                x_pos[i] - width/2,
                prev_qty,
                width,
                label=f'{prev_year}' if i == 0 else None,
                color=prev_color
            )
            curr_bar = ax.bar(
                x_pos[i] + width/2,
                curr_qty,
                width,
                label=f'{curr_year}' if i == 0 else None,
                color=curr_color
            )

            # Value label just above each bar
            for bar in [prev_bar, curr_bar]:
                height = bar[0].get_height()
                ax.text(
                    bar[0].get_x() + bar[0].get_width()/2,
                    height * 1.02,
                    f'{int(height):,}',
                    ha='center',
                    va='bottom',
                    fontweight='bold',
                    fontsize=10
                )

        # Tallest bar: every vertical offset below is a fraction of it
        max_qty = max(max(data['prev_qty']), max(data['curr_qty']))

        # Leave head-room above the bars for the frame, marker and label
        ax.set_ylim(0, max_qty * 1.6)

        fixed_text_offset = max_qty * 0.06  # gap between marker and percentage label
        frame_offset = max_qty * 0.05       # gap between bar top and frame, clears the value labels
        vert_height = max_qty * 0.08        # height of the frame's vertical strokes

        # For every month draw the dashed bracket, the marker and the label
        for i, (prev_height, curr_height, diff_p) in enumerate(
                zip(data['prev_qty'], data['curr_qty'], data['diff_percent'])):
            frame_left = x_pos[i] - width/2
            frame_right = x_pos[i] + width/2

            # Both strokes end at the same height so the top line stays level
            frame_top = max(prev_height, curr_height) + frame_offset + vert_height

            # Left stroke, starting just above the previous-year bar
            ax.plot([frame_left, frame_left],
                    [prev_height + frame_offset, frame_top],
                    '--', color='gray', linewidth=1.0)
            # Right stroke, starting just above the current-year bar
            ax.plot([frame_right, frame_right],
                    [curr_height + frame_offset, frame_top],
                    '--', color='gray', linewidth=1.0)
            # Top line joining the two strokes
            ax.plot([frame_left, frame_right],
                    [frame_top, frame_top],
                    '--', color='gray', linewidth=1.0)

            arrow_y = frame_top + max_qty * 0.05
            text_y = arrow_y + fixed_text_offset
            center_x = (frame_left + frame_right) / 2

            # Label text, colour and marker all derive from the rounded value,
            # so a zero difference is shown without a "+" sign.
            label, arrow_color, marker = _describe_diff(diff_p)

            ax.scatter(
                center_x, arrow_y,
                marker=marker,
                color=arrow_color,
                s=80,
                zorder=5
            )

            ax.text(
                center_x, text_y,
                label,
                ha='center',
                va='bottom',
                fontsize=11,
                fontweight='bold',
                color=arrow_color
            )

        # Axis labels, title and tick labels
        ax.set_xlabel('Month', fontsize=20)
        ax.set_ylabel('QTY', fontsize=20)
        ax.set_title('YoY OTS QTY Change', fontsize=28, pad=20)
        ax.set_xticks(x_pos)
        ax.set_xticklabels(data['months'], fontsize=15, rotation=45, ha='right')

        # Dashed horizontal grid lines only
        ax.grid(True, axis='y', linestyle='--', linewidth=0.8, alpha=0.7)
        ax.grid(False, axis='x')

        ax.tick_params(axis='y', labelsize=14)

        # Custom legend (regular, non-Average colours) in the upper right
        legend_elements = [
            Patch(facecolor='#00B0F0', label=f'{prev_year}'),
            Patch(facecolor='#A0D1FF', label=f'{curr_year}')
        ]
        ax.legend(
            handles=legend_elements,
            fontsize=14,
            loc='upper right',
            frameon=False
        )

        # Hide every spine, then bring back a light-grey bottom axis
        for spine in ['top', 'right', 'left', 'bottom']:
            ax.spines[spine].set_visible(False)
        ax.spines['bottom'].set_visible(True)
        ax.spines['bottom'].set_color('#cccccc')

        plt.subplots_adjust(top=0.88, bottom=0.15)
        plt.tight_layout()

        logger.info("图表生成成功")
        return plt

    except Exception as e:
        # Log the traceback and show the error on a small figure instead
        logger.exception("图表生成过程中发生错误")

        plt.figure(figsize=(8, 4))
        error_msg = f"图表生成错误: {str(e)}"
        plt.text(0.5, 0.5, error_msg, ha='center', va='center', fontsize=14, color='red')
        plt.tight_layout()
        return plt


# Entry point executed inside Power BI
def main(dataset):
    """Render the chart for ``dataset``, or a message figure when it is empty.

    Args:
        dataset: DataFrame supplied by the Power BI Python visual, or None.

    Any exception is logged and shown as a red message figure rather than
    propagated.
    """
    try:
        if dataset is None or dataset.empty:
            plt.figure(figsize=(8, 4))
            plt.text(0.5, 0.5, "没有提供数据", ha='center', va='center', fontsize=16)
            plt.tight_layout()
            plt.show()
            return

        plot = plot_yoy_chart(dataset)
        plot.show()

    except Exception as e:
        logger.exception("主函数执行错误")
        plt.figure(figsize=(8, 4))
        plt.text(0.5, 0.5, f"执行错误: {str(e)}", ha='center', va='center', fontsize=14, color='red')
        plt.tight_layout()
        plt.show()


# Run the entry point (Power BI provides the `dataset` variable automatically)
main(dataset)

# Request that accompanied this script: when the difference between the two
# percentages is 0, the percentage label above the arrow must not show a "+"
# sign. This is implemented in _describe_diff() above.
最新发布
08-09
import streamlit as st import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from pyspark.sql import SparkSession from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder from pyspark.ml import Pipeline from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator import joblib import os import time import warnings from io import BytesIO import platform from pathlib import Path def safe_path(path): """处理Windows长路径问题""" if platform.system() == 'Windows': try: import ntpath return ntpath.realpath(path) except: return str(Path(path).resolve()) return path # 忽略警告 warnings.filterwarnings("ignore") # 设置中文字体 plt.rcParams['font.sans-serif'] = ['SimHei'] plt.rcParams['axes.unicode_minus'] = False # 页面设置 st.set_page_config( page_title="精准营销系统", page_icon="📊", layout="wide", initial_sidebar_state="expanded" ) # 自定义CSS样式 st.markdown(""" <style> .stApp { background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%); font-family: 'Helvetica Neue', Arial, sans-serif; } .header { background: linear-gradient(90deg, #1a237e 0%, #283593 100%); color: white; padding: 1.5rem; border-radius: 0.75rem; box-shadow: 0 4px 12px rgba(0,0,0,0.1); margin-bottom: 2rem; } .card { background: white; border-radius: 0.75rem; padding: 1rem; margin-bottom: 1.5rem; box-shadow: 0 4px 12px rgba(0,0,0,0.08); transition: transform 0.3s ease; } .card:hover { transform: translateY(-5px); box-shadow: 0 6px 16px rgba(0,0,0,0.12); } .stButton button { background: linear-gradient(90deg, #3949ab 0%, #1a237e 100%) !important; color: white !important; border: none !important; border-radius: 0.5rem; padding: 0.75rem 1.5rem; font-size: 1rem; font-weight: 600; transition: all 0.3s ease; width: 100%; } .stButton button:hover { transform: scale(1.05); box-shadow: 0 4px 8px rgba(57, 73, 171, 0.4); } .feature-box { 
background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%); border-radius: 0.75rem; padding: 1.5rem; margin-bottom: 1.5rem; } .result-box { background: linear-gradient(135deg, #e8f5e9 0%, #c8e6c9 100%); border-radius: 0.75rem; padding: 1.5rem; margin-top: 1.5rem; } .model-box { background: linear-gradient(135deg, #fff3e0 0%, #ffe0b2 100%); border-radius: 0.75rem; padding: 1.5rem; margin-top: 1.5rem; } .stProgress > div > div > div { background: linear-gradient(90deg, #2ecc71 0%, #27ae60 100%) !important; } .metric-card { background: white; border-radius: 0.75rem; padding: 1rem; text-align: center; box-shadow: 0 4px 8px rgba(0,0,0,0.06); } .metric-value { font-size: 1.8rem; font-weight: 700; color: #1a237e; } .metric-label { font-size: 0.9rem; color: #5c6bc0; margin-top: 0.5rem; } .highlight { background: linear-gradient(90deg, #ffeb3b 0%, #fbc02d 100%); padding: 0.2rem 0.5rem; border-radius: 0.25rem; font-weight: 600; } .stDataFrame { border-radius: 0.75rem; box-shadow: 0 4px 8px rgba(0,0,0,0.06); } .convert-high { background-color: #c8e6c9 !important; color: #388e3c !important; font-weight: 700; } .convert-low { background-color: #ffcdd2 !important; color: #c62828 !important; font-weight: 600; } </style> """, unsafe_allow_html=True) # 创建Spark会话 def create_spark_session(): return SparkSession.builder \ .appName("TelecomPrecisionMarketing") \ .config("spark.driver.memory", "4g") \ .config("spark.executor.memory", "4g") \ .getOrCreate() # 数据预处理函数 - 修改后 def preprocess_data(df): """ 数据预处理函数 参数: df: 原始数据 (DataFrame) 返回: 预处理后的数据 (DataFrame) """ # 1. 选择关键特征 - 使用实际存在的列名 available_features = [col for col in df.columns if col in [ 'AGE', 'GENDER', 'ONLINE_DAY', 'TERM_CNT', 'IF_YHTS', 'MKT_STAR_GRADE_NAME', 'PROM_AMT_MONTH', 'is_rh_next' # 目标变量 ]] # 确保目标变量存在 if 'is_rh_next' not in available_features: st.error("错误:数据集中缺少目标变量 'is_rh_next'") return df # 只保留需要的列 df = df[available_features].copy() # 2. 
处理缺失值 # 数值特征用均值填充 numeric_cols = ['AGE', 'ONLINE_DAY', 'TERM_CNT', 'PROM_AMT_MONTH'] for col in numeric_cols: if col in df.columns: mean_val = df[col].mean() df[col].fillna(mean_val, inplace=True) # 分类特征用众数填充 categorical_cols = ['GENDER', 'MKT_STAR_GRADE_NAME', 'IF_YHTS'] for col in categorical_cols: if col in df.columns: mode_val = df[col].mode()[0] df[col].fillna(mode_val, inplace=True) # 3. 异常值处理(使用IQR方法) def handle_outliers(series): Q1 = series.quantile(0.25) Q3 = series.quantile(0.75) IQR = Q3 - Q1 lower_bound = Q1 - 1.5 * IQR upper_bound = Q3 + 1.5 * IQR return series.clip(lower_bound, upper_bound) for col in numeric_cols: if col in df.columns: df[col] = handle_outliers(df[col]) return df # 标题区域 st.markdown(""" <div class="header"> <h1 style='text-align: center; margin: 0;'>精准营销系统</h1> <p style='text-align: center; margin: 0.5rem 0 0; font-size: 1.1rem;'>基于机器学习的单宽转融预测</p> </div> """, unsafe_allow_html=True) # 页面布局 col1, col2 = st.columns([1, 1.5]) # 左侧区域 - 图片和简介 with col1: st.markdown(""" <div class="card"> <h2>📱 智能营销系统</h2> <p>预测单宽带用户转化为融合套餐用户的可能性</p> </div> """, unsafe_allow_html=True) # 使用在线图片作为占位符 st.image("https://images.unsplash.com/photo-1551836022-d5d88e9218df?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=1200&q=80", caption="精准营销系统示意图", width=600) st.markdown(""" <div class="card"> <h4>📈 系统功能</h4> <ul> <li>用户转化可能性预测</li> <li>高精度机器学习模型</li> <li>可视化数据分析</li> <li>精准营销策略制定</li> </ul> </div> """, unsafe_allow_html=True) # 右侧区域 - 功能选择 with col2: st.markdown(""" <div class="card"> <h3>📋 请选择操作类型</h3> <p>您可以选择数据分析或使用模型进行预测</p> </div> """, unsafe_allow_html=True) # 功能选择 option = st.radio("", ["📊 数据分析 - 探索数据并训练模型", "🔍 预测分析 - 预测用户转化可能性"], index=0, label_visibility="hidden") # 数据分析部分 if "数据分析" in option: st.markdown(""" <div class="card"> <h3>数据分析与模型训练</h3> <p>上传数据并训练预测模型</p> </极客时间> """, unsafe_allow_html=True) # 上传训练数据 train_file = st.file_uploader("上传数据集 (CSV格式, GBK编码)", type=["csv"]) if train_file is not None: 
try: # 读取数据 train_data = pd.read_csv(train_file, encoding='GBK') # 显示数据预览 with st.expander("数据预览", expanded=True): st.dataframe(train_data.head()) col1, col2 = st.columns(2) col1.metric("总样本数", train_data.shape[0]) col2.metric("特征数量", train_data.shape[1] - 1) # 数据预处理 st.subheader("数据预处理") with st.spinner("数据预处理中..."): processed_data = preprocess_data(train_data) st.success("✅ 数据预处理完成") # 可视化数据分布 st.subheader("数据分布分析") # 目标变量分布 st.markdown("**目标变量分布 (is_rh_next)**") fig, ax = plt.subplots(figsize=(8, 5)) sns.countplot(x='is_rh_next', data=processed_data, palette='viridis') plt.title('用户转化分布 (0:未转化, 1:转化)') plt.xlabel('是否转化') plt.ylabel('用户数量') st.pyplot(fig) # 数值特征分布 st.markdown("**数值特征分布**") numeric_cols = ['AGE', 'ONLINE_DAY', 'TERM_CNT', 'PROM_AMT_MONTH'] # 动态计算子图布局 num_features = len(numeric_cols) if num_features > 0: ncols = 2 nrows = (num_features + ncols - 1) // ncols # 向上取整 fig, axes = plt.subplots(nrows, ncols, figsize=(14, 4*nrows)) # 将axes展平为一维数组 if nrows > 1 or ncols > 1: axes = axes.flatten() else: axes = [axes] # 单个子图时确保axes是列表 for i, col in enumerate(numeric_cols): if col in processed_data.columns and i < len(axes): sns.histplot(processed_data[col], kde=True, ax=axes[i], color='skyblue') axes[i].set_title(f'{col}分布') axes[i].set_xlabel('') # 隐藏多余的子图 for j in range(i+1, len(axes)): axes[j].set_visible(False) plt.tight_layout() st.pyplot(fig) else: st.warning("没有可用的数值特征") # 特征相关性分析 st.markdown("**特征相关性热力图**") corr_cols = numeric_cols + ['is_rh_next'] if len(corr_cols) > 1: corr_data = processed_data[corr_cols].corr() fig, ax = plt.subplots(figsize=(12, 8)) sns.heatmap(corr_data, annot=True, fmt=".2f", cmap='coolwarm', ax=ax) plt.title('特征相关性热力图') st.pyplot(fig) else: st.warning("特征不足,无法生成相关性热力图") # 模型训练 st.subheader("模型训练") # 训练参数设置 col1, col2 = st.columns(2) test_size = col1.slider("测试集比例", 0.1, 0.4, 0.2, 0.05) random_state = col2.number_input("随机种子", 0, 100, 42) # 开始训练按钮 if st.button("开始训练模型", use_container_width=True): with 
st.spinner("模型训练中,请稍候..."): # 创建Spark会话 spark = create_spark_session() # 将Pandas DataFrame转换为Spark DataFrame spark_df = spark.createDataFrame(processed_data) # 划分训练集和测试集 train_df, test_df = spark_df.randomSplit([1.0 - test_size, test_size], seed=random_state) # 特征工程 # 分类特征编码 categorical_cols = ['GENDER', 'MKT_STAR_GRADE_NAME', 'IF_YHTS'] # 只处理存在的分类特征 existing_cat_cols = [col for col in categorical_cols if col in processed_data.columns] indexers = [StringIndexer(inputCol=col, outputCol=col+"_index") for col in existing_cat_cols] encoders = [OneHotEncoder(inputCol=col+"_index", outputCol=col+"_encoded") for col in existing_cat_cols] # 数值特征 numeric_cols = ['AGE', 'ONLINE_DAY', 'TERM_CNT', 'PROM_AMT_MONTH'] # 组合所有特征 feature_cols = numeric_cols + [col+"_encoded" for col in existing_cat_cols] assembler = VectorAssembler(inputCols=feature_cols, outputCol="features") # 目标变量索引 label_indexer = StringIndexer(inputCol="is_rh_next", outputCol="label") # 构建模型 lr = LogisticRegression(featuresCol="features", labelCol="label") dt = DecisionTreeClassifier(featuresCol="features", labelCol="label") rf = RandomForestClassifier(featuresCol="features", labelCol="label") # 创建管道 pipeline_lr = Pipeline(stages=indexers + encoders + [assembler, label_indexer, lr]) pipeline_dt = Pipeline(stages=indexers + encoders + [assembler, label_indexer, dt]) pipeline_rf = Pipeline(stages=indexers + encoders + [assembler, label_indexer, rf]) # 训练模型 model_lr = pipeline_lr.fit(train_df) model_dt = pipeline_dt.fit(train_df) model_rf = pipeline_rf.fit(train_df) # 评估模型 evaluator_auc = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction") evaluator_acc = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy") evaluator_f1 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1") def evaluate_model(model, data): predictions = model.transform(data) auc = evaluator_auc.evaluate(predictions) acc = 
evaluator_acc.evaluate(predictions) f1 = evaluator_f1.evaluate(predictions) return {"AUC": auc, "Accuracy": acc, "F1": f1} results = { "Logistic Regression": evaluate_model(model_lr, test_df), "Decision Tree": evaluate_model(model_dt, test_df), "Random Forest": evaluate_model(model_rf, test_df) } # 保存结果 st.session_state.model_results = results st.session_state.best_model = model_rf # 默认使用随机森林作为最佳模型 st.session_state.spark = spark st.success("🎉 模型训练完成!") # 显示模型性能 st.subheader("模型性能评估") # 转换为DataFrame展示 results_df = pd.DataFrame(results).T st.dataframe(results_df.style.format("{:.4f}").background_gradient(cmap='Blues')) # 可视化比较 fig, ax = plt.subplots(figsize=(10, 6)) results_df.plot(kind='bar', ax=ax) plt.title('模型性能比较') plt.ylabel('分数') plt.xticks(rotation=15) plt.legend(loc='upper right') st.pyplot(fig) # 特征重要性(随机森林) st.subheader("随机森林特征重要性") rf_model = model_rf.stages[-1] feature_importances = rf_model.featureImportances.toArray() feature_names = numeric_cols + [f"{col}_encoded" for col in existing_cat_cols] importance_df = pd.DataFrame({ "Feature": feature_names, "Importance": feature_importances }).sort_values("Importance", ascending=False).head(10) fig, ax = plt.subplots(figsize=(10, 6)) sns.barplot(x="Importance", y="Feature", data=importance_df, palette="viridis", ax=ax) plt.title('Top 10 重要特征') st.pyplot(fig) # 保存模型 model_path = "best_model" model_rf.write().overwrite().save(model_path) st.session_state.model_path = model_path except Exception as e: st.error(f"数据处理错误: {str(e)}") # 预测分析部分 else: st.markdown(""" <div class="card"> <h3>用户转化预测</h3> <p>预测单宽带用户转化为融合套餐的可能性</p> </div> """, unsafe_allow_html=True) # 上传预测数据 predict_file = st.file_uploader("上传预测数据 (CSV格式, GBK编码)", type=["csv"]) if predict_file is not None: try: # 读取数据 predict_data = pd.read_csv(predict_file, encoding='GBK') # 显示数据预览 with st.expander("数据预览", expanded=True): st.dataframe(predict_data.head()) # 检查是否有模型 if "model_path" not in st.session_state or not 
os.path.exists(st.session_state.model_path): st.warning("⚠️ 未找到训练好的模型,请先训练模型") st.stop() # 开始预测按钮 if st.button("开始预测", use_container_width=True): with st.spinner("预测进行中,请稍候..."): # 数据预处理 processed_data = preprocess_data(predict_data) # 创建Spark会话 if "spark" not in st.session_state: spark = create_spark_session() st.session_state.spark = spark else: spark = st.session_state.spark # 将Pandas DataFrame转换为Spark DataFrame spark_df = spark.createDataFrame(processed_data) # 加载模型 best_model = st.session_state.best_model # 生成预测结果 predictions = best_model.transform(spark_df) # 提取预测结果 predictions_df = predictions.select( "CCUST_ROW_ID", "probability", "prediction" ).toPandas() # 解析概率值 predictions_df['转化概率'] = predictions_df['probability'].apply(lambda x: float(x[1])) predictions_df['预测结果'] = predictions_df['prediction'].apply(lambda x: "可能转化" if x == 1.0 else "可能不转化") # 添加转化可能性等级 predictions_df['转化可能性'] = pd.cut( predictions_df['转化概率'], bins=[0, 0.3, 0.7, 1], labels=["低可能性", "中可能性", "高可能性"] ) # 保存结果 st.session_state.prediction_results = predictions_df st.success("✅ 预测完成!") except Exception as e: st.error(f"预测错误: {str(e)}") # 显示预测结果 if "prediction_results" in st.session_state: st.markdown(""" <div class="card"> <h3>预测结果</h3> <p>用户转化可能性评估报告</p> </div> """, unsafe_allow_html=True) result_df = st.session_state.prediction_results # 转化可能性分布 st.subheader("转化可能性分布概览") col1, col2, col3 = st.columns(3) high_conv = (result_df["转化可能性"] == "高可能性").sum() med_conv = (result_df["转化可能性"] == "中可能性").sum() low_conv = (result_df["转化可能性"] == "低可能性").sum() col1.markdown(f""" <div class="metric-card"> <div class="metric-value">{high_conv}</div> <div class="metric-label">高可能性用户</div> </div> """, unsafe_allow_html=True) col2.markdown(f""" <div class="metric-card"> <div class="metric-value">{med_conv}</div> <div class="metric-label">中可能性用户</div> </div> """, unsafe_allow_html=True) col3.markdown(f""" <div class="metric-card"> <div class="metric-value">{low_conv}</div> <div 
class="metric-label">低可能性用户</div> </div> """, unsafe_allow_html=True) # 转化可能性分布图 fig, ax = plt.subplots(figsize=(8, 5)) conv_counts = result_df["转化可能性"].value_counts() conv_counts.plot(kind='bar', color=['#4CAF50', '#FFC107', '#F44336'], ax=ax) plt.title('用户转化可能性分布') plt.xlabel('可能性等级') plt.ylabel('用户数量') st.pyplot(fig) # 详细预测结果 st.subheader("详细预测结果") # 样式函数 def color_convert(val): if val == "高可能性": return "background-color: #c8e6c9; color: #388e3c;" elif val == "中可能性": return "background-color: #fff9c4; color: #f57f17;" else: return "background-color: #ffcdd2; color: #c62828;" # 格式化显示 display_df = result_df[["CCUST_ROW_ID", "转化概率", "预测结果", "转化可能性"]] styled_df = display_df.style.format({ "转化概率": "{:.2%}" }).applymap(color_convert, subset=["转化可能性"]) st.dataframe(styled_df, height=400) # 下载结果 csv = display_df.to_csv(index=False).encode("utf-8") st.download_button( label="下载预测结果", data=csv, file_name="用户转化预测结果.csv", mime="text/csv", use_container_width=True ) # 页脚 st.markdown("---") st.markdown(""" <div style="text-align: center; color: #5c6bc0; font-size: 0.9rem; padding: 1rem;"> © 2023 精准营销系统 | 基于Spark和Streamlit开发 </div> """, unsafe_allow_html=True) 将上述所给代码,不使用spark,仿照如下所给代码,完成算法和模型调优等操作 import streamlit as st import pandas as pd import numpy as np import joblib import os import time import pandas as pd import numpy as np import matplotlib.pyplot as plt import matplotlib as mpl import matplotlib.font_manager as fm import seaborn as sns from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix from sklearn.preprocessing import StandardScaler from imblearn.over_sampling import SMOTE from sklearn.impute import SimpleImputer import warnings warnings.filterwarnings("ignore") plt.rcParams['font.sans-serif'] = ['SimHei'] plt.rcParams['axes.unicode_minus'] = False # 正确显示负号 # 页面设置 st.set_page_config( page_title="风控违约预测系统", page_icon="📊", layout="wide", 
initial_sidebar_state="expanded" ) # 自定义CSS样式 st.markdown(""" <style> .stApp { background: linear-gradient(135deg, #f5f7fa 0%, #e4edf5 100%); font-family: 'Helvetica Neue', Arial, sans-serif; } .header { background: linear-gradient(90deg, #2c3e50 0%, #4a6491 100%); color: white; padding: 1.5rem; border-radius: 0.75rem; box-shadow: 0 4px 12px rgba(0,0,0,0.1); margin-bottom: 2rem; } .card { background: white; border-radius: 0.75rem; padding: 1.5rem; margin-bottom: 1.5rem; box-shadow: 0 4px 12px rgba(0,0,0,0.08); transition: transform 0.3s ease; } .card:hover { transform: translateY(-5px); box-shadow: 0 6px 16px rgba(0,0,0,0.12); } .stButton button { background: linear-gradient(90deg, #3498db 0%, #1a5276 100%) !important; color: white !important; border: none !important; border-radius: 0.5rem; padding: 0.75rem 1.5rem; font-size: 1rem; font-weight: 600; transition: all 0.3s ease; width: 100%; } .stButton button:hover { transform: scale(1.05); box-shadow: 0 4px 8px rgba(52, 152, 219, 0.4); } .feature-box { background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%); border-radius: 0.75rem; padding: 1.5rem; margin-bottom: 1.5rem; } .result-box { background: linear-gradient(135deg, #e8f5e9 0%, #c8e6c9 100%); border-radius: 0.75rem; padding: 1.5rem; margin-top: 1.5rem; } .model-box { background: linear-gradient(135deg, #fff3e0 0%, #ffe0b2 100%); border-radius: 0.75rem; padding: 1.5rem; margin-top: 1.5rem; } .stProgress > div > div > div { background: linear-gradient(90deg, #2ecc71 0%, #27ae60 100%) !important; } .metric-card { background: white; border-radius: 0.75rem; padding: 1rem; text-align: center; box-shadow: 0 4px 8px rgba(0,0,0,0.06); } .metric-value { font-size: 1.8rem; font-weight: 700; color: #2c3e50; } .metric-label { font-size: 0.9rem; color: #7f8c8d; margin-top: 0.5rem; } .highlight { background: linear-gradient(90deg, #ffeb3b 0%, #fbc02d 100%); padding: 0.2rem 0.5rem; border-radius: 0.25rem; font-weight: 600; } .stDataFrame { border-radius: 0.75rem; 
box-shadow: 0 4px 8px rgba(0,0,0,0.06); } .risk-high { background-color: #ffcdd2 !important; color: #c62828 !important; font-weight: 700; } .risk-medium { background-color: #fff9c4 !important; color: #f57f17 !important; font-weight: 600; } .risk-low { background-color: #c8e6c9 !important; color: #388e3c !important; } </style> """, unsafe_allow_html=True) def preprocess_loan_data(data_old): """ 训练时数据预处理函数,返回处理后的数据和推理时需要的参数 参数: data_old: 原始训练数据 (DataFrame) 返回: processed_data: 预处理后的训练数据 (DataFrame) preprocessor_params: 推理时需要的预处理参数 (dict) """ # 1. 创建原始数据副本 loan_data = data_old.copy() # 2. 保存要删除的列列表 drop_list = ['id','member_id', 'term', 'pymnt_plan', 'initial_list_status', 'sub_grade', 'emp_title', 'issue_d', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d', 'url','desc','next_pymnt_d'] loan_data.drop([col for col in drop_list if col in loan_data.columns], axis=1, inplace=True, errors='ignore') # 3. 删除缺失值超过90%的列 #todo 自己补齐删除代码 missing_ratio = loan_data.isnull().sum() / len(loan_data) loan_data.drop(missing_ratio[missing_ratio > 0.9].index, axis=1, inplace=True, errors='ignore') # 4. 删除值全部相同的列 #todo 自己补齐删除代码 constant_cols = loan_data.columns[loan_data.nunique() <= 1] loan_data.drop(constant_cols, axis=1, inplace=True, errors='ignore') # 5. 处理特殊数值列 loans = loan_data # 修正变量名 loans["int_rate"] = loans["int_rate"].astype(str).str.rstrip('%').astype("float") loans["revol_util"] = loans["revol_util"].astype(str).str.rstrip('%').astype("float") # 6. 
缺失值处理 ## 识别分类列和数值列 objectColumns = loans.select_dtypes(include=["object"]).columns.tolist() numColumns = loans.select_dtypes(include=[np.number]).columns.tolist() ## 保存分类列的列名 categorical_columns = objectColumns.copy() ## 填充分类变量缺失值 loans[objectColumns] = loans[objectColumns].fillna("Unknown") ## 填充数值变量缺失值并保存均值 imr = SimpleImputer(missing_values=np.nan, strategy="mean") loans[numColumns] = imr.fit_transform(loans[numColumns]) # 保存数值列的均值 numerical_means = {col: imr.statistics_[i] for i, col in enumerate(numColumns)} # 8. 特征衍生 loans["installment_feat"] = loans["installment"] / ((loans["annual_inc"] + 1) / 12) # 9. 目标变量编码 status_mapping = { "Current": 0, "Issued": 0, "Fully Paid": 0, "In Grace Period": 1, "Late (31-120 days)": 1, "Late (16-30 days)": 1, "Charged Off": 1, "Does not meet the credit policy. Status:Charged Off": 1, "Does not meet the credit policy. Status:Fully Paid": 0, "Default": 0 } loans["loan_status"] = loans["loan_status"].map(status_mapping) # 10. 有序特征映射 mapping_dict = { "emp_length": { "10+ years": 10, "9 years": 9, "8 years": 8, "7 years": 7, "6 years": 6, "5 years": 5, "4 years": 4, "3 years": 3, "2 years": 2, "1 year": 1, "< 1 year": 0, "Unknown": 0 }, "grade": { "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7 } } loans = loans.replace(mapping_dict) # 11. One-hot编码 n_columns = ["home_ownership", "verification_status", "purpose", "application_type"] dummy_df = pd.get_dummies(loans[n_columns], drop_first=False) loans = pd.concat([loans, dummy_df], axis=1) loans.drop(n_columns, axis=1, inplace=True) # 保存One-hot编码后的列名 onehot_columns = n_columns onehot_encoder_columns = dummy_df.columns.tolist() # 12. 特征缩放 # 识别需要缩放的数值列 numeric_cols = loans.select_dtypes(include=["int", "float"]).columns.tolist() if 'loan_status' in numeric_cols: numeric_cols.remove('loan_status') # 创建并拟合缩放器 sc = StandardScaler() if numeric_cols: loans[numeric_cols] = sc.fit_transform(loans[numeric_cols]) # 保存缩放列名 scaled_columns = numeric_cols # 13. 
def _percent_to_float(series):
    """Convert values such as ' 13.5%' to floats; unparseable entries become NaN."""
    cleaned = series.astype(str).str.strip().str.rstrip("%")
    return pd.to_numeric(cleaned, errors="coerce")


def preprocess_loan_data_inference(data_old, preprocessor_params):
    """Apply the training-time preprocessing to raw inference data.

    The steps are replayed in the same order the training function uses
    (impute -> derive feature -> ordinal mapping -> one-hot -> scale), so
    the model sees features built the same way as during training.

    Args:
        data_old: Raw inference data (DataFrame). It is not modified.
        preprocessor_params: Dict saved by the training preprocessing with
            the keys 'drop_list', 'categorical_columns', 'numerical_means',
            'mapping_dict', 'onehot_columns', 'onehot_encoder_columns',
            'scaler', 'scaled_columns' and 'final_columns'.

    Returns:
        DataFrame whose columns are exactly ``final_columns``, in that order.
    """
    params = preprocessor_params

    # 1. Remove the columns that were discarded during training.
    loans = data_old.drop(columns=params["drop_list"], errors="ignore").copy()

    # 2. Percentage strings -> floats.
    for col in ("int_rate", "revol_util"):
        if col in loans.columns:
            loans[col] = _percent_to_float(loans[col])

    # 3. Missing values. This must happen BEFORE the derived feature is
    #    computed: 'installment_feat' has no stored mean, so a NaN in its
    #    inputs would otherwise survive all the way to the model.
    for col, mean_value in params["numerical_means"].items():
        if col in loans.columns:
            loans[col] = pd.to_numeric(loans[col], errors="coerce").fillna(mean_value)
    for col in params["categorical_columns"]:
        if col in loans.columns:
            loans[col] = loans[col].fillna("Unknown")

    # 4. Derived feature (same formula as in training).
    if "installment" in loans.columns and "annual_inc" in loans.columns:
        loans["installment_feat"] = loans["installment"] / ((loans["annual_inc"] + 1) / 12)

    # 5. Ordinal mappings; values unseen during training fall back to 0.
    for col, mapping in params["mapping_dict"].items():
        if col in loans.columns:
            loans[col] = loans[col].map(mapping).fillna(0).astype(int)

    # 6. One-hot encoding aligned to the training dummy columns. Encoding all
    #    present columns at once and reindexing keeps every column's dummies;
    #    the previous per-column loop overwrote earlier columns with zeros.
    expected_dummies = list(params["onehot_encoder_columns"])
    present = [col for col in params["onehot_columns"] if col in loans.columns]
    if present:
        dummies = pd.get_dummies(loans[present], columns=present, dtype=int)
    else:
        dummies = pd.DataFrame(index=loans.index)
    dummies = dummies.reindex(columns=expected_dummies, fill_value=0)
    loans = pd.concat([loans.drop(columns=present), dummies], axis=1)

    # 7. Scaling. The scaler must receive every fitted column in the fitted
    #    order, so absent columns are passed as NaN and set to 0 afterwards.
    #    NOTE(review): 0 is the training mean only if 'scaler' is a
    #    StandardScaler and its transform() passes NaN through - confirm.
    scaled_columns = list(params["scaled_columns"])
    if scaled_columns and len(loans) > 0:
        to_scale = loans.reindex(columns=scaled_columns).apply(pd.to_numeric, errors="coerce")
        scaled = pd.DataFrame(
            np.asarray(params["scaler"].transform(to_scale)),
            index=loans.index,
            columns=scaled_columns,
        ).fillna(0.0)
        loans = pd.concat(
            [loans.drop(columns=scaled_columns, errors="ignore"), scaled], axis=1
        )

    # 8. Final alignment: add missing columns as 0, drop extras, fix order.
    return loans.reindex(columns=params["final_columns"], fill_value=0)
col1.slider("测试集比例", 0.1, 0.4, 0.2, 0.1) n_estimators = col2.slider("树的数量", 10, 500, 100, 10) max_depth = col1.slider("最大深度", 2, 30, 10, 1) random_state = col2.number_input("随机种子", 0, 100, 42) # 开始训练按钮 if st.button("开始训练模型", use_container_width=True): with st.spinner("模型训练中,请稍候..."): # 模拟数据处理 progress_bar = st.progress(0) train_data,preprocessor_params = preprocess_loan_data(train_data_old) joblib.dump(preprocessor_params, 'loan_preprocessor_params.pkl') # 步骤1: 数据预处理 time.sleep(1) progress_bar.progress(25) st.success("✅ 数据预处理完成") # 步骤2: 特征工程 time.sleep(1) progress_bar.progress(50) st.success("✅ 特征工程完成") # 步骤3: 模型训练 time.sleep(2) progress_bar.progress(75) # 实际训练代码 (简化版) X = train_data.drop("loan_status", axis=1) y = train_data["loan_status"] # 划分训练测试集 #todo 自己补齐数据划分代码 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y) # 训练模型 #todo 自己补齐调用随机森林算法完成模型的训练 model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state, n_jobs=-1) model.fit(X_train, y_train) # 保存模型 joblib.dump(model, "risk_model.pkl") # 步骤4: 模型评估 time.sleep(1) progress_bar.progress(100) # 评估模型 #todo 自己补齐调用预测函数完成测试集推理预测 y_pred = model.predict(X_test) y_proba = model.predict_proba(X_test)[:, 1] accuracy = accuracy_score(y_test, y_pred) auc = roc_auc_score(y_test, y_proba) # 保存评估结果 st.session_state.model_trained = True st.session_state.accuracy = accuracy st.session_state.auc = auc st.session_state.y_test = y_test st.session_state.y_pred = y_pred st.success("🎉 模型训练完成!") # 显示模型性能 st.subheader("模型性能评估") col1, col2 = st.columns(2) col1.markdown(f""" <div class="metric-card"> <div class="metric-value">{accuracy*100:.1f}%</div> <div class="metric-label">准确率</div> </div> """, unsafe_allow_html=True) col2.markdown(f""" <div class="metric-card"> <div class="metric-value">{auc:.3f}</div> <div class="metric-label">AUC 分数</div> </div> """, unsafe_allow_html=True) # 混淆矩阵 st.subheader("混淆矩阵") cm = 
confusion_matrix(y_test, y_pred) fig, ax = plt.subplots(figsize=(6, 4)) sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax) ax.set_xlabel("预测标签") ax.set_ylabel("真实标签") ax.set_title("混淆矩阵") st.pyplot(fig) # 特征重要性 st.subheader("特征重要性") feature_importance = pd.DataFrame({ "特征": X.columns, "重要性": model.feature_importances_ }).sort_values("重要性", ascending=False).head(10) fig, ax = plt.subplots(figsize=(10, 6)) sns.barplot(x="重要性", y="特征", data=feature_importance, palette="viridis", ax=ax) ax.set_title("Top 10 重要特征") st.pyplot(fig) except Exception as e: st.error(f"数据处理错误: {str(e)}") # 推理预测部分 else: st.markdown(""" <div class="model-box"> <h4>风险预测</h4> <p>上传需要预测的数据,生成违约风险评估报告</p> </div> """, unsafe_allow_html=True) # 上传预测数据 predict_file = st.file_uploader("上传预测数据 (CSV格式)", type=["csv"]) if predict_file is not None: try: # 读取数据 predict_data = pd.read_csv(predict_file) # 显示数据预览 with st.expander("数据预览", expanded=True): st.dataframe(predict_data.head()) st.info(f"数据集包含 {predict_data.shape[0]} 个样本,{predict_data.shape[1]} 个特征") # 检查是否有模型 if not os.path.exists("risk_model.pkl"): st.warning("⚠️ 未找到训练好的模型,请先训练模型或使用示例数据") # 使用示例模型 if st.button("使用示例模型进行预测", use_container_width=True): st.info("正在使用预训练的示例模型进行预测...") # 创建示例模型 X = np.random.rand(100, 10) y = np.random.randint(0, 2, 100) model = RandomForestClassifier(n_estimators=50, random_state=42) model.fit(X, y) # 生成预测结果 predictions = model.predict(predict_data.values) probas = model.predict_proba(predict_data.values)[:, 1] # 创建结果DataFrame result_df = pd.DataFrame({ "客户ID": predict_data["member_id"], "违约概率": probas, "预测标签": predictions }) # 添加风险等级 result_df["风险等级"] = pd.cut( result_df["违约概率"], bins=[0, 0.2, 0.5, 1], labels=["低风险", "中风险", "高风险"], include_lowest=True ) # 保存结果 st.session_state.prediction_results = result_df else: # 加载模型 model = joblib.load("risk_model.pkl") preprocessor_params = joblib.load('loan_preprocessor_params.pkl') # 开始预测按钮 if st.button("开始风险预测", use_container_width=True): with st.spinner("预测进行中,请稍候..."): 
# 模拟预测过程 progress_bar = st.progress(0) # 预处理推理数据 #todo 自己补齐调用推理数据处理函数完成推理数据的清洗 processed_inference = preprocess_loan_data_inference(predict_data, preprocessor_params) # 步骤1: 数据预处理 time.sleep(1) progress_bar.progress(25) # 步骤2: 特征工程 time.sleep(1) progress_bar.progress(50) # 步骤3: 模型预测 time.sleep(1) progress_bar.progress(75) # 生成预测结果 predictions = model.predict(processed_inference.values) probas = model.predict_proba(processed_inference.values)[:, 1] # 创建结果DataFrame result_df = pd.DataFrame({ "客户ID": predict_data["member_id"], "违约概率": probas, "预测标签": predictions }) # 添加风险等级 result_df["风险等级"] = pd.cut( result_df["违约概率"], bins=[0, 0.2, 0.5, 1], labels=["低风险", "中风险", "高风险"], include_lowest=True ) # 步骤4: 生成报告 time.sleep(1) progress_bar.progress(100) # 保存结果 st.session_state.prediction_results = result_df st.success("✅ 预测完成!") except Exception as e: st.error(f"预测错误: {str(e)}") # 显示预测结果 if "prediction_results" in st.session_state: st.markdown(""" <div class="result-box"> <h4>预测结果</h4> <p>客户违约风险评估报告</p> </div> """, unsafe_allow_html=True) result_df = st.session_state.prediction_results # 风险分布 st.subheader("风险分布概览") col1, col2, col3 = st.columns(3) high_risk = (result_df["风险等级"] == "高风险").sum() med_risk = (result_df["风险等级"] == "中风险").sum() low_risk = (result_df["风险等级"] == "低风险").sum() col1.markdown(f""" <div class="metric-card"> <div class="metric-value risk-high">{high_risk}</div> <div class="metric-label">高风险客户</div> </div> """, unsafe_allow_html=True) col2.markdown(f""" <div class="metric-card"> <div class="metric-value risk-medium">{med_risk}</div> <div class="metric-label">中风险客户</div> </div> """, unsafe_allow_html=True) col3.markdown(f""" <div class="metric-card"> <div class="metric-value risk-low">{low_risk}</div> <div class="metric-label">低风险客户</div> </div> """, unsafe_allow_html=True) # 风险分布图 fig, ax = plt.subplots(figsize=(8, 4)) risk_counts = result_df["风险等级"].value_counts() risk_counts.plot(kind="bar", color=["#4CAF50", "#FFC107", "#F44336"], ax=ax) 
ax.set_title("客户风险等级分布") ax.set_xlabel("风险等级") ax.set_ylabel("客户数量") st.pyplot(fig) # 详细预测结果 st.subheader("详细预测结果") # 样式函数 def color_risk(val): if val == "高风险": return "background-color: #ffcdd2; color: #c62828;" elif val == "中风险": return "background-color: #fff9c4; color: #f57f17;" else: return "background-color: #c8e6c9; color: #388e3c;" # 格式化显示 styled_df = result_df.style.applymap(color_risk, subset=["风险等级"]) st.dataframe(styled_df.format({ "违约概率": "{:.2%}" }), height=400) # 下载结果 csv = result_df.to_csv(index=False).encode("utf-8") st.download_button( label="下载预测结果", data=csv, file_name="风险预测结果.csv", mime="text/csv", use_container_width=True ) # 页脚 st.markdown("---") st.markdown(""" <div style="text-align: center; color: #7f8c8d; font-size: 0.9rem; padding: 1rem;"> © 2023 风控违约预测系统 | 基于Streamlit开发 </div> """, unsafe_allow_html=True)
07-03
import streamlit as st import pandas as pd import numpy as np import joblib import os import time import pandas as pd import numpy as np import matplotlib.pyplot as plt import matplotlib as mpl import matplotlib.font_manager as fm import seaborn as sns from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix from sklearn.preprocessing import StandardScaler from imblearn.over_sampling import SMOTE from sklearn.impute import SimpleImputer import warnings warnings.filterwarnings(“ignore”) plt.rcParams[‘font.sans-serif’] = [‘SimHei’] plt.rcParams[‘axes.unicode_minus’] = False # 正确显示负号 页面设置 st.set_page_config( page_title=“风控违约预测系统”, page_icon=“📊”, layout=“wide”, initial_sidebar_state=“expanded” ) 自定义CSS样式 st.markdown(“”" <style> .stApp { background: linear-gradient(135deg, #f5f7fa 0%, #e4edf5 100%); font-family: 'Helvetica Neue', Arial, sans-serif; } .header { background: linear-gradient(90deg, #2c3e50 0%, #4a6491 100%); color: white; padding: 1.5rem; border-radius: 0.75rem; box-shadow: 0 4px 12px rgba(0,0,0,0.1); margin-bottom: 2rem; } .card { background: white; border-radius: 0.75rem; padding: 1.5rem; margin-bottom: 1.5rem; box-shadow: 0 4px 12px rgba(0,0,0,0.08); transition: transform 0.3s ease; } .card:hover { transform: translateY(-5px); box-shadow: 0 6px 16px rgba(0,0,0,0.12); } .stButton button { background: linear-gradient(90deg, #3498db 0%, #1a5276 100%) !important; color: white !important; border: none !important; border-radius: 0.5rem; padding: 0.75rem 1.5rem; font-size: 1rem; font-weight: 600; transition: all 0.3s ease; width: 100%; } .stButton button:hover { transform: scale(1.05); box-shadow: 0 4px 8px rgba(52, 152, 219, 0.4); } .feature-box { background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%); border-radius: 0.75rem; padding: 1.5rem; margin-bottom: 1.5rem; } .result-box { background: linear-gradient(135deg, #e8f5e9 0%, 
#c8e6c9 100%); border-radius: 0.75rem; padding: 1.5rem; margin-top: 1.5rem; } .model-box { background: linear-gradient(135deg, #fff3e0 0%, #ffe0b2 100%); border-radius: 0.75rem; padding: 1.5rem; margin-top: 1.5rem; } .stProgress > div > div > div { background: linear-gradient(90deg, #2ecc71 0%, #27ae60 100%) !important; } .metric-card { background: white; border-radius: 0.75rem; padding: 1rem; text-align: center; box-shadow: 0 4px 8px rgba(0,0,0,0.06); } .metric-value { font-size: 1.8rem; font-weight: 700; color: #2c3e50; } .metric-label { font-size: 0.9rem; color: #7f8c8d; margin-top: 0.5rem; } .highlight { background: linear-gradient(90deg, #ffeb3b 0%, #fbc02d 100%); padding: 0.2rem 0.5rem; border-radius: 0.25rem; font-weight: 600; } .stDataFrame { border-radius: 0.75rem; box-shadow: 0 4px 8px rgba(0,0,0,0.06); } .risk-high { background-color: #ffcdd2 !important; color: #c62828 !important; font-weight: 700; } .risk-medium { background-color: #fff9c4 !important; color: #f57f17 !important; font-weight: 600; } .risk-low { background-color: #c8e6c9 !important; color: #388e3c !important; } </style> “”", unsafe_allow_html=True) def preprocess_loan_data(data_old): “”" 训练时数据预处理函数,返回处理后的数据和推理时需要的参数 参数: data_old: 原始训练数据 (DataFrame) 返回: processed_data: 预处理后的训练数据 (DataFrame) preprocessor_params: 推理时需要的预处理参数 (dict) “”" # 1. 创建原始数据副本 loan_data = data_old.copy() # 2. 保存要删除的列列表 drop_list = [‘id’,‘member_id’, ‘term’, ‘pymnt_plan’, ‘initial_list_status’, ‘sub_grade’, ‘emp_title’, ‘issue_d’, ‘title’, ‘zip_code’, ‘addr_state’, ‘earliest_cr_line’, ‘last_pymnt_d’, ‘last_credit_pull_d’, ‘url’,‘desc’,‘next_pymnt_d’] loan_data.drop([col for col in drop_list if col in loan_data.columns], axis=1, inplace=True, errors=‘ignore’) # 3. 删除缺失值超过90%的列 #todo 自己补齐删除代码 missing_ratio = loan_data.isnull().sum() / len(loan_data) loan_data.drop(missing_ratio[missing_ratio > 0.9].index, axis=1, inplace=True, errors=‘ignore’) # 4. 
删除值全部相同的列 #todo 自己补齐删除代码 constant_cols = loan_data.columns[loan_data.nunique() <= 1] loan_data.drop(constant_cols, axis=1, inplace=True, errors=‘ignore’) # 5. 处理特殊数值列 loans = loan_data # 修正变量名 loans[“int_rate”] = loans[“int_rate”].astype(str).str.rstrip(‘%’).astype(“float”) loans[“revol_util”] = loans[“revol_util”].astype(str).str.rstrip(‘%’).astype(“float”) # 6. 缺失值处理 ## 识别分类列和数值列 objectColumns = loans.select_dtypes(include=[“object”]).columns.tolist() numColumns = loans.select_dtypes(include=[np.number]).columns.tolist() ## 保存分类列的列名 categorical_columns = objectColumns.copy() ## 填充分类变量缺失值 loans[objectColumns] = loans[objectColumns].fillna(“Unknown”) ## 填充数值变量缺失值并保存均值 imr = SimpleImputer(missing_values=np.nan, strategy=“mean”) loans[numColumns] = imr.fit_transform(loans[numColumns]) # 保存数值列的均值 numerical_means = {col: imr.statistics_[i] for i, col in enumerate(numColumns)} # 8. 特征衍生 loans[“installment_feat”] = loans[“installment”] / ((loans[“annual_inc”] + 1) / 12) # 9. 目标变量编码 status_mapping = { “Current”: 0, “Issued”: 0, “Fully Paid”: 0, “In Grace Period”: 1, “Late (31-120 days)”: 1, “Late (16-30 days)”: 1, “Charged Off”: 1, “Does not meet the credit policy. Status:Charged Off”: 1, “Does not meet the credit policy. Status:Fully Paid”: 0, “Default”: 0 } loans[“loan_status”] = loans[“loan_status”].map(status_mapping) # 10. 有序特征映射 mapping_dict = { “emp_length”: { “10+ years”: 10, “9 years”: 9, “8 years”: 8, “7 years”: 7, “6 years”: 6, “5 years”: 5, “4 years”: 4, “3 years”: 3, “2 years”: 2, “1 year”: 1, “< 1 year”: 0, “Unknown”: 0 }, “grade”: { “A”: 1, “B”: 2, “C”: 3, “D”: 4, “E”: 5, “F”: 6, “G”: 7 } } loans = loans.replace(mapping_dict) # 11. 
One-hot编码 n_columns = [“home_ownership”, “verification_status”, “purpose”, “application_type”] dummy_df = pd.get_dummies(loans[n_columns], drop_first=False) loans = pd.concat([loans, dummy_df], axis=1) loans.drop(n_columns, axis=1, inplace=True) # 保存One-hot编码后的列名 onehot_columns = n_columns onehot_encoder_columns = dummy_df.columns.tolist() # 12. 特征缩放 # 识别需要缩放的数值列 numeric_cols = loans.select_dtypes(include=[“int”, “float”]).columns.tolist() if ‘loan_status’ in numeric_cols: numeric_cols.remove(‘loan_status’) # 创建并拟合缩放器 sc = StandardScaler() if numeric_cols: loans[numeric_cols] = sc.fit_transform(loans[numeric_cols]) # 保存缩放列名 scaled_columns = numeric_cols # 13. 保存最终列结构(在SMOTE之前) #final_columns = loans.columns.tolist().remove(‘loan_status’) final_columns = loans.columns[loans.columns != ‘loan_status’].tolist() # 14. 处理不平衡数据(SMOTE过采样) X = loans.drop(“loan_status”, axis=1) y = loans[“loan_status”] os = SMOTE(random_state=42) X_res, y_res = os.fit_resample(X, y) # 15. 合并为最终DataFrame processed_data = pd.concat([X_res, y_res], axis=1) processed_data.columns = list(X.columns) + [“loan_status”] # 16. 创建推理时需要的参数字典 preprocessor_params = { # 1. 删除的列 ‘drop_list’: drop_list, # 2. 分类列缺失值填充 ‘categorical_columns’: categorical_columns, # 3. 数值列填充均值 ‘numerical_means’: numerical_means, # 4. 有序特征映射 ‘mapping_dict’: mapping_dict, # 5. One-hot配置 ‘onehot_columns’: onehot_columns, ‘onehot_encoder_columns’: onehot_encoder_columns, # 6. 缩放器及缩放列 ‘scaler’: sc, # 已拟合的StandardScaler实例 ‘scaled_columns’: scaled_columns, # 7. 最终列结构(训练后的列顺序) ‘final_columns’: final_columns } return processed_data, preprocessor_params def preprocess_loan_data_inference(data_old, preprocessor_params): “”" 推理时数据处理函数 参数: data_old: 原始推理数据 (DataFrame) preprocessor_params: 从训练过程保存的预处理参数 (dict) 返回: processed_data: 预处理后的推理数据 (DataFrame) “”" # 1. 复制数据避免污染原始数据 loanss = data_old.copy() # 2. 
删除训练时确定的列 drop_list = preprocessor_params[‘drop_list’] loans = loanss.drop(columns=[col for col in drop_list if col in loanss.columns], axis=1, errors=‘ignore’) # 3. 处理特殊数值列(百分比转换) if ‘int_rate’ in loans: loans[“int_rate”] = loans[“int_rate”].astype(str).str.rstrip(‘%’).astype(“float”) if ‘revol_util’ in loans: loans[“revol_util”] = loans[“revol_util”].astype(str).str.rstrip(‘%’).astype(“float”) # 4. 特征衍生(使用训练时相同公式) if ‘installment’ in loans and ‘annual_inc’ in loans: loans[“installment_feat”] = loans[“installment”] / ((loans[“annual_inc”] + 1) / 12) # 5. 有序特征映射(使用训练时的映射字典) mapping_dict = preprocessor_params[‘mapping_dict’] for col, mapping in mapping_dict.items(): if col in loans: # 处理未知值,默认为0 loans[col] = loans[col].map(mapping).fillna(0).astype(int) # 6. 缺失值处理(使用训练时保存的策略) # 分类变量 cat_cols = preprocessor_params[‘categorical_columns’] for col in cat_cols: if col in loans: loans[col] = loans[col].fillna(“Unknown”) # 数值变量(使用训练时保存的均值) num_means = preprocessor_params[‘numerical_means’] for col, mean_value in num_means.items(): if col in loans: loans[col] = loans[col].fillna(mean_value) # 7. One-hot编码(对齐训练时的列结构) n_columns = preprocessor_params[‘onehot_columns’] expected_dummy_columns = preprocessor_params[‘onehot_encoder_columns’] # 创建空DataFrame用于存储结果 dummy_df = pd.DataFrame(columns=expected_dummy_columns) # 为每个分类列生成dummy变量 for col in n_columns: if col in loans: # 为当前列生成dummies col_dummies = pd.get_dummies(loans[col], prefix=col) # 对齐训练时的列结构 for expected_col in expected_dummy_columns: if expected_col in col_dummies: dummy_df[expected_col] = col_dummies[expected_col] else: # 如果该列不存在,则创建全0列 dummy_df[expected_col] = 0 # 合并dummy变量 loans = pd.concat([loans, dummy_df], axis=1) # 删除原始分类列 loans.drop(columns=[col for col in n_columns if col in loans.columns], inplace=True, errors=‘ignore’) # 8. 
特征缩放(使用训练时的缩放器参数) sc = preprocessor_params[‘scaler’] scaled_cols = [col for col in preprocessor_params[‘scaled_columns’] if col in loans.columns] if scaled_cols: loans[scaled_cols] = sc.transform(loans[scaled_cols]) # 9. 对齐最终特征列(确保与训练数据相同) final_columns = preprocessor_params[‘final_columns’] # 添加缺失列(用0填充) for col in final_columns: if col not in loans.columns: loans[col] = 0 # 移除多余列并保持顺序 processed_data = loans[final_columns] print(loans.columns) return processed_data 标题区域 st.markdown(“”" <div class="header"> <h1 style='text-align: center; margin: 0;'>风控违约预测系统</h1> <p style='text-align: center; margin: 0.5rem 0 0; font-size: 1.1rem;'>基于机器学习的信贷风险评估与预测</p> </div> """, unsafe_allow_html=True) 页面布局 col1, col2 = st.columns([1, 1.5]) 左侧区域 - 图片和简介 with col1: st.markdown(“”" 智能风控系统 利用先进机器学习技术预测信贷违约风险 “”", unsafe_allow_html=True) 使用在线图片作为占位符 st.image(“https://images.unsplash.com/photo-1553877522-43269d4ea984?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=1200&q=80”, caption=“智能风控系统示意图”, use_column_width=True) st.markdown(“”" 📈 系统功能 客户违约风险预测 高精度机器学习模型 可视化风险评估 批量数据处理 “”", unsafe_allow_html=True) 右侧区域 - 功能选择 with col2: st.markdown(“”" 请选择操作类型 您可以选择训练新模型或使用现有模型进行预测 “”", unsafe_allow_html=True) 功能选择 option = st.radio(“”, [“🚀 训练新模型 - 使用新数据训练预测模型”, “🔍 推理预测 - 使用模型预测违约风险”], index=0, label_visibility=“hidden”) # 模型训练部分 if “训练新模型” in option: st.markdown(“”" 模型训练 上传训练数据并训练新的预测模型 “”“, unsafe_allow_html=True) # 上传训练数据 train_file = st.file_uploader(“上传训练数据 (CSV格式)”, type=[“csv”]) if train_file is not None: try: # 读取数据 train_data_old = pd.read_csv(train_file) # 显示数据预览 with st.expander(“数据预览”, expanded=True): st.dataframe(train_data_old.head()) col1, col2, col3 = st.columns(3) col1.metric(“总样本数”, train_data_old.shape[0]) col2.metric(“特征数量”, train_data_old.shape[1] - 1) # 训练参数设置 st.subheader(“训练参数”) col1, col2 = st.columns(2) test_size = col1.slider(“测试集比例”, 0.1, 0.4, 0.2, 0.1) n_estimators = col2.slider(“树的数量”, 10, 500, 100, 10) max_depth = 
col1.slider(“最大深度”, 2, 30, 10, 1) random_state = col2.number_input(“随机种子”, 0, 100, 42) # 开始训练按钮 if st.button(“开始训练模型”, use_container_width=True): with st.spinner(“模型训练中,请稍候…”): # 模拟数据处理 progress_bar = st.progress(0) train_data,preprocessor_params = preprocess_loan_data(train_data_old) joblib.dump(preprocessor_params, ‘loan_preprocessor_params.pkl’) # 步骤1: 数据预处理 time.sleep(1) progress_bar.progress(25) st.success(”✅ 数据预处理完成") # 步骤2: 特征工程 time.sleep(1) progress_bar.progress(50) st.success(“✅ 特征工程完成”) # 步骤3: 模型训练 time.sleep(2) progress_bar.progress(75) # 实际训练代码 (简化版) X = train_data.drop(“loan_status”, axis=1) y = train_data[“loan_status”] # 划分训练测试集 #todo 自己补齐数据划分代码 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y) # 训练模型 #todo 自己补齐调用随机森林算法完成模型的训练 model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state, n_jobs=-1) model.fit(X_train, y_train) # 保存模型 joblib.dump(model, “risk_model.pkl”) # 步骤4: 模型评估 time.sleep(1) progress_bar.progress(100) # 评估模型 #todo 自己补齐调用预测函数完成测试集推理预测 y_pred = model.predict(X_test) y_proba = model.predict_proba(X_test)[:, 1] accuracy = accuracy_score(y_test, y_pred) auc = roc_auc_score(y_test, y_proba) # 保存评估结果 st.session_state.model_trained = True st.session_state.accuracy = accuracy st.session_state.auc = auc st.session_state.y_test = y_test st.session_state.y_pred = y_pred st.success(“🎉 模型训练完成!”) # 显示模型性能 st.subheader(“模型性能评估”) col1, col2 = st.columns(2) col1.markdown(f"“” {accuracy*100:.1f}% 准确率 “”“, unsafe_allow_html=True) col2.markdown(f”“” {auc:.3f} AUC 分数 “”“, unsafe_allow_html=True) # 混淆矩阵 st.subheader(“混淆矩阵”) cm = confusion_matrix(y_test, y_pred) fig, ax = plt.subplots(figsize=(6, 4)) sns.heatmap(cm, annot=True, fmt=“d”, cmap=“Blues”, ax=ax) ax.set_xlabel(“预测标签”) ax.set_ylabel(“真实标签”) ax.set_title(“混淆矩阵”) st.pyplot(fig) # 特征重要性 st.subheader(“特征重要性”) feature_importance = pd.DataFrame({ “特征”: X.columns, “重要性”: 
model.feature_importances_ }).sort_values(“重要性”, ascending=False).head(10) fig, ax = plt.subplots(figsize=(10, 6)) sns.barplot(x=“重要性”, y=“特征”, data=feature_importance, palette=“viridis”, ax=ax) ax.set_title(“Top 10 重要特征”) st.pyplot(fig) except Exception as e: st.error(f"数据处理错误: {str(e)}”) # 推理预测部分 else: st.markdown(“”" 风险预测 上传需要预测的数据,生成违约风险评估报告 “”“, unsafe_allow_html=True) # 上传预测数据 predict_file = st.file_uploader(“上传预测数据 (CSV格式)”, type=[“csv”]) if predict_file is not None: try: # 读取数据 predict_data = pd.read_csv(predict_file) # 显示数据预览 with st.expander(“数据预览”, expanded=True): st.dataframe(predict_data.head()) st.info(f"数据集包含 {predict_data.shape[0]} 个样本,{predict_data.shape[1]} 个特征”) # 检查是否有模型 if not os.path.exists(“risk_model.pkl”): st.warning(“⚠️ 未找到训练好的模型,请先训练模型或使用示例数据”) # 使用示例模型 if st.button(“使用示例模型进行预测”, use_container_width=True): st.info(“正在使用预训练的示例模型进行预测…”) # 创建示例模型 X = np.random.rand(100, 10) y = np.random.randint(0, 2, 100) model = RandomForestClassifier(n_estimators=50, random_state=42) model.fit(X, y) # 生成预测结果 predictions = model.predict(predict_data.values) probas = model.predict_proba(predict_data.values)[:, 1] # 创建结果DataFrame result_df = pd.DataFrame({ “客户ID”: predict_data[“member_id”], “违约概率”: probas, “预测标签”: predictions }) # 添加风险等级 result_df[“风险等级”] = pd.cut( result_df[“违约概率”], bins=[0, 0.2, 0.5, 1], labels=[“低风险”, “中风险”, “高风险”], include_lowest=True ) # 保存结果 st.session_state.prediction_results = result_df else: # 加载模型 model = joblib.load(“risk_model.pkl”) preprocessor_params = joblib.load(‘loan_preprocessor_params.pkl’) # 开始预测按钮 if st.button(“开始风险预测”, use_container_width=True): with st.spinner(“预测进行中,请稍候…”): # 模拟预测过程 progress_bar = st.progress(0) # 预处理推理数据 #todo 自己补齐调用推理数据处理函数完成推理数据的清洗 processed_inference = preprocess_loan_data_inference(predict_data, preprocessor_params) # 步骤1: 数据预处理 time.sleep(1) progress_bar.progress(25) # 步骤2: 特征工程 time.sleep(1) progress_bar.progress(50) # 步骤3: 模型预测 time.sleep(1) progress_bar.progress(75) # 生成预测结果 predictions = 
model.predict(processed_inference.values) probas = model.predict_proba(processed_inference.values)[:, 1] # 创建结果DataFrame result_df = pd.DataFrame({ “客户ID”: predict_data[“member_id”], “违约概率”: probas, “预测标签”: predictions }) # 添加风险等级 result_df[“风险等级”] = pd.cut( result_df[“违约概率”], bins=[0, 0.2, 0.5, 1], labels=[“低风险”, “中风险”, “高风险”], include_lowest=True ) # 步骤4: 生成报告 time.sleep(1) progress_bar.progress(100) # 保存结果 st.session_state.prediction_results = result_df st.success(“✅ 预测完成!”) except Exception as e: st.error(f"预测错误: {str(e)}“) # 显示预测结果 if “prediction_results” in st.session_state: st.markdown(”“” 预测结果 客户违约风险评估报告 “”“, unsafe_allow_html=True) result_df = st.session_state.prediction_results # 风险分布 st.subheader(“风险分布概览”) col1, col2, col3 = st.columns(3) high_risk = (result_df[“风险等级”] == “高风险”).sum() med_risk = (result_df[“风险等级”] == “中风险”).sum() low_risk = (result_df[“风险等级”] == “低风险”).sum() col1.markdown(f”“” {high_risk} 高风险客户 “”“, unsafe_allow_html=True) col2.markdown(f”“” {med_risk} 中风险客户 “”“, unsafe_allow_html=True) col3.markdown(f”“” {low_risk} 低风险客户 “”“, unsafe_allow_html=True) # 风险分布图 fig, ax = plt.subplots(figsize=(8, 4)) risk_counts = result_df[“风险等级”].value_counts() risk_counts.plot(kind=“bar”, color=[”#4CAF50", “#FFC107”, “#F44336”], ax=ax) ax.set_title(“客户风险等级分布”) ax.set_xlabel(“风险等级”) ax.set_ylabel(“客户数量”) st.pyplot(fig) # 详细预测结果 st.subheader(“详细预测结果”) # 样式函数 def color_risk(val): if val == “高风险”: return “background-color: #ffcdd2; color: #c62828;” elif val == “中风险”: return “background-color: #fff9c4; color: #f57f17;” else: return “background-color: #c8e6c9; color: #388e3c;” # 格式化显示 styled_df = result_df.style.applymap(color_risk, subset=[“风险等级”]) st.dataframe(styled_df.format({ “违约概率”: “{:.2%}” }), height=400) # 下载结果 csv = result_df.to_csv(index=False).encode(“utf-8”) st.download_button( label=“下载预测结果”, data=csv, file_name=“风险预测结果.csv”, mime=“text/csv”, use_container_width=True ) 页脚 st.markdown(“—”) st.markdown(“”" <div style="text-align: center; color: 
#7f8c8d; font-size: 0.9rem; padding: 1rem;"> © 2023 风控违约预测系统 | 基于Streamlit开发 </div> """, unsafe_allow_html=True) 根据如上代码,仿照如下要求,给出结果完整代码 大数据挖掘:精准营销 一、题目背景 某电信运营商为提升用户 ARPU(每用户平均收入),计划对单宽带用户推广 “单宽转融” 业务(即单宽带用户加装移动网业务,形成融合套餐)。为实现精准营销,需通过数据挖掘技术预测单宽带用户转化为融合套餐用户的可能性,从而针对性制定营销策略。现有一批单宽带用户的行为数据,要求通过数据分析和建模,构建高效的预测模型,辅助运营决策。 二、数据集介绍 1、数据来源:某运营商单宽转融用户的历史数据,包含用户基础信息、资费信息、电信行为数据、客户标签及 DPI 上网行为数据。 2、数据规模:50万+条记录,100+个字段。 3、关键字段说明: 1)用户属性:AGE(年龄),GENDER(性别),ONLINE_DAY(在网天数) 2)消费行为:STMT_AMT(出账金额),PROM_AMT(套餐价格),AVG_STMT_AMT(月均消费) 3)网络使用:DUR(上网时长),DWN_VOL(下载流量),TERM_CNT(接入终端数) 4)业务标签:IF_YHTS(是否投诉),MKT_STAR_GRADE_NAME(用户星级) 5)目标变量(标签):is_rh_next,表示用户是否转为融合套餐(1 为转化,0 为未转化)。 三、题目要求 1、使用 Python 进行数据分析与预处理: 1)加载数据并检查数据质量(缺失值、异常值)。 2)进行特征工程:删除无意义特征、处理缺失值、离散特征编码、标准化 / 归一化。 3)可视化数据分布,分析关键特征与目标变量的相关性。 2、使用 Spark 进行模型训练与测试: 1)构建逻辑回归、决策树、随机森林三种模型。 2)调优模型参数,对比评估指标(准确率、召回率、F1 值、AUC)。 3)选择最优模型,并解释特征重要性。 3、输出要求: 1)给出数据预处理的关键步骤及代码。 2)展示各模型的训练结果与对比分析。 3)说明最终选择的模型及理由。 数据集文件名为Single_breadth_to_melt.csv 文件为gbk编码前一百行数据为 BIL_MONTH ASSET_ROW_ID CCUST_ROW_ID BELONG_CITY MKT_CHANNEL_NAME MKT_CHANNEL_SUB_NAME PREPARE_FLG SERV_START_DT COMB_STAT_NAME FIBER_ACCESS_CATEGORY … AVG_STMT_AMT_LV is_kdts is_itv_up is_mobile_up if_zzzw_up itv_cnt itv_day serv_in_time PROM_AMT_MONTH is_rh_next 0 201706 1-1E6Z49HF 1-UTSNWVU 杭州 NaN 其它部门-未知部门细分-未知 … 0 20140126 现行 普通宽带 … c30-59 0 0 0 0 0 0 41 44.44 0.0 1 201706 3-J591KYI 1-LKFKET 杭州 NaN 其它部门-未知部门细分-未知 … 0 20160406 现行 普通宽带 … e89-129 0 0 0 0 0 0 14 100.00 0.0 2 201706 1-F3YGP4D 1-6T16M75 杭州 营业厅 营业厅-营业服务中心-城市 … 0 20100112 现行 普通宽带 … c30-59 0 0 0 0 0 28 89 44.44 0.0 3 201706 1-1AITRLCN 1-1AB5KV9U 杭州 NaN 其它部门-未知部门细分-未知 … 0 20131017 现行 普通宽带 … c30-59 1 0 0 0 0 10 44 55.56 0.0 4 201706 1-132ZSIVX 1-LPVY5O 杭州 10000号 其它部门-10000客服部-城市 … 0 20130209 现行 普通宽带 … d59-89 0 0 0 0 0 0 52 0.00 0.0
07-02
import streamlit as st import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from pyspark.sql import SparkSession from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder from pyspark.ml import Pipeline from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator import joblib import os import time import warnings from io import BytesIO import platform from pathlib import Path def safe_path(path): “”“处理Windows长路径问题”“” if platform.system() == ‘Windows’: try: import ntpath return ntpath.realpath(path) except: return str(Path(path).resolve()) return path 忽略警告 warnings.filterwarnings(“ignore”) 设置中文字体 plt.rcParams[‘font.sans-serif’] = [‘SimHei’] plt.rcParams[‘axes.unicode_minus’] = False 页面设置 st.set_page_config( page_title=“精准营销系统”, page_icon=“📊”, layout=“wide”, initial_sidebar_state=“expanded” ) 自定义CSS样式 st.markdown(“”" <style> .stApp { background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%); font-family: 'Helvetica Neue', Arial, sans-serif; } .header { background: linear-gradient(90deg, #1a237e 0%, #283593 100%); color: white; padding: 1.5rem; border-radius: 0.75rem; box-shadow: 0 4px 12px rgba(0,0,0,0.1); margin-bottom: 2rem; } .card { background: white; border-radius: 0.75rem; padding: 1rem; margin-bottom: 1.5rem; box-shadow: 0 4px 12px rgba(0,0,0,0.08); transition: transform 0.3s ease; } .card:hover { transform: translateY(-5px); box-shadow: 0 6px 16px rgba(0,0,0,0.12); } .stButton button { background: linear-gradient(90deg, #3949ab 0%, #1a237e 100%) !important; color: white !important; border: none !important; border-radius: 0.5rem; padding: 0.75rem 1.5rem; font-size: 1rem; font-weight: 600; transition: all 0.3s ease; width: 100%; } .stButton button:hover { transform: scale(1.05); box-shadow: 0 4px 8px rgba(57, 73, 171, 0.4); } .feature-box { background: 
linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%); border-radius: 0.75rem; padding: 1.5rem; margin-bottom: 1.5rem; } .result-box { background: linear-gradient(135deg, #e8f5e9 0%, #c8e6c9 100%); border-radius: 0.75rem; padding: 1.5rem; margin-top: 1.5rem; } .model-box { background: linear-gradient(135deg, #fff3e0 0%, #ffe0b2 100%); border-radius: 0.75rem; padding: 1.5rem; margin-top: 1.5rem; } .stProgress > div > div > div { background: linear-gradient(90deg, #2ecc71 0%, #27ae60 100%) !important; } .metric-card { background: white; border-radius: 0.75rem; padding: 1rem; text-align: center; box-shadow: 0 4px 8px rgba(0,0,0,0.06); } .metric-value { font-size: 1.8rem; font-weight: 700; color: #1a237e; } .metric-label { font-size: 0.9rem; color: #5c6bc0; margin-top: 0.5rem; } .highlight { background: linear-gradient(90deg, #ffeb3b 0%, #fbc02d 100%); padding: 0.2rem 0.5rem; border-radius: 0.25rem; font-weight: 600; } .stDataFrame { border-radius: 0.75rem; box-shadow: 0 4px 8px rgba(0,0,0,0.06); } .convert-high { background-color: #c8e6c9 !important; color: #388e3c !important; font-weight: 700; } .convert-low { background-color: #ffcdd2 !important; color: #c62828 !important; font-weight: 600; } </style> “”", unsafe_allow_html=True) 创建Spark会话 def create_spark_session(): return SparkSession.builder .appName(“TelecomPrecisionMarketing”) .config(“spark.driver.memory”, “4g”) .config(“spark.executor.memory”, “4g”) .getOrCreate() 数据预处理函数 - 修改后 def preprocess_data(df): “”" 数据预处理函数 参数: df: 原始数据 (DataFrame) 返回: 预处理后的数据 (DataFrame) “”" # 1. 选择关键特征 - 使用实际存在的列名 available_features = [col for col in df.columns if col in [ ‘AGE’, ‘GENDER’, ‘ONLINE_DAY’, ‘TERM_CNT’, ‘IF_YHTS’, ‘MKT_STAR_GRADE_NAME’, ‘PROM_AMT_MONTH’, ‘is_rh_next’ # 目标变量 ]] # 确保目标变量存在 if 'is_rh_next' not in available_features: st.error("错误:数据集中缺少目标变量 'is_rh_next'") return df # 只保留需要的列 df = df[available_features].copy() # 2. 
处理缺失值 # 数值特征用均值填充 numeric_cols = ['AGE', 'ONLINE_DAY', 'TERM_CNT', 'PROM_AMT_MONTH'] for col in numeric_cols: if col in df.columns: mean_val = df[col].mean() df[col].fillna(mean_val, inplace=True) # 分类特征用众数填充 categorical_cols = ['GENDER', 'MKT_STAR_GRADE_NAME', 'IF_YHTS'] for col in categorical_cols: if col in df.columns: mode_val = df[col].mode()[0] df[col].fillna(mode_val, inplace=True) # 3. 异常值处理(使用IQR方法) def handle_outliers(series): Q1 = series.quantile(0.25) Q3 = series.quantile(0.75) IQR = Q3 - Q1 lower_bound = Q1 - 1.5 * IQR upper_bound = Q3 + 1.5 * IQR return series.clip(lower_bound, upper_bound) for col in numeric_cols: if col in df.columns: df[col] = handle_outliers(df[col]) return df 标题区域 st.markdown(“”" <div class="header"> <h1 style='text-align: center; margin: 0;'>精准营销系统</h1> <p style='text-align: center; margin: 0.5rem 0 0; font-size: 1.1rem;'>基于机器学习的单宽转融预测</p> </div> """, unsafe_allow_html=True) 页面布局 col1, col2 = st.columns([1, 1.5]) 左侧区域 - 图片和简介 with col1: st.markdown(“”" 📱 智能营销系统 预测单宽带用户转化为融合套餐用户的可能性 “”", unsafe_allow_html=True) # 使用在线图片作为占位符 st.image("https://images.unsplash.com/photo-1551836022-d5d88e9218df?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=1200&q=80", caption="精准营销系统示意图", width=600) st.markdown(""" <div class="card"> <h4>📈 系统功能</h4> <ul> <li>用户转化可能性预测</li> <li>高精度机器学习模型</li> <li>可视化数据分析</li> <li>精准营销策略制定</li> </ul> </div> """, unsafe_allow_html=True) 右侧区域 - 功能选择 with col2: st.markdown(“”" 📋 请选择操作类型 您可以选择数据分析或使用模型进行预测 “”", unsafe_allow_html=True) # 功能选择 option = st.radio("", ["📊 数据分析 - 探索数据并训练模型", "🔍 预测分析 - 预测用户转化可能性"], index=0, label_visibility="hidden") # 数据分析部分 if "数据分析" in option: st.markdown(""" <div class="card"> <h3>数据分析与模型训练</h3> <p>上传数据并训练预测模型</p> </极客时间> """, unsafe_allow_html=True) # 上传训练数据 train_file = st.file_uploader("上传数据集 (CSV格式, GBK编码)", type=["csv"]) if train_file is not None: try: # 读取数据 train_data = pd.read_csv(train_file, encoding='GBK') # 显示数据预览 with 
st.expander("数据预览", expanded=True): st.dataframe(train_data.head()) col1, col2 = st.columns(2) col1.metric("总样本数", train_data.shape[0]) col2.metric("特征数量", train_data.shape[1] - 1) # 数据预处理 st.subheader("数据预处理") with st.spinner("数据预处理中..."): processed_data = preprocess_data(train_data) st.success("✅ 数据预处理完成") # 可视化数据分布 st.subheader("数据分布分析") # 目标变量分布 st.markdown("**目标变量分布 (is_rh_next)**") fig, ax = plt.subplots(figsize=(8, 5)) sns.countplot(x='is_rh_next', data=processed_data, palette='viridis') plt.title('用户转化分布 (0:未转化, 1:转化)') plt.xlabel('是否转化') plt.ylabel('用户数量') st.pyplot(fig) # 数值特征分布 st.markdown("**数值特征分布**") numeric_cols = ['AGE', 'ONLINE_DAY', 'TERM_CNT', 'PROM_AMT_MONTH'] # 动态计算子图布局 num_features = len(numeric_cols) if num_features > 0: ncols = 2 nrows = (num_features + ncols - 1) // ncols # 向上取整 fig, axes = plt.subplots(nrows, ncols, figsize=(14, 4*nrows)) # 将axes展平为一维数组 if nrows > 1 or ncols > 1: axes = axes.flatten() else: axes = [axes] # 单个子图时确保axes是列表 for i, col in enumerate(numeric_cols): if col in processed_data.columns and i < len(axes): sns.histplot(processed_data[col], kde=True, ax=axes[i], color='skyblue') axes[i].set_title(f'{col}分布') axes[i].set_xlabel('') # 隐藏多余的子图 for j in range(i+1, len(axes)): axes[j].set_visible(False) plt.tight_layout() st.pyplot(fig) else: st.warning("没有可用的数值特征") # 特征相关性分析 st.markdown("**特征相关性热力图**") corr_cols = numeric_cols + ['is_rh_next'] if len(corr_cols) > 1: corr_data = processed_data[corr_cols].corr() fig, ax = plt.subplots(figsize=(12, 8)) sns.heatmap(corr_data, annot=True, fmt=".2f", cmap='coolwarm', ax=ax) plt.title('特征相关性热力图') st.pyplot(fig) else: st.warning("特征不足,无法生成相关性热力图") # 模型训练 st.subheader("模型训练") # 训练参数设置 col1, col2 = st.columns(2) test_size = col1.slider("测试集比例", 0.1, 0.4, 0.2, 0.05) random_state = col2.number_input("随机种子", 0, 100, 42) # 开始训练按钮 if st.button("开始训练模型", use_container_width=True): with st.spinner("模型训练中,请稍候..."): # 创建Spark会话 spark = create_spark_session() # 将Pandas DataFrame转换为Spark 
DataFrame spark_df = spark.createDataFrame(processed_data) # 划分训练集和测试集 train_df, test_df = spark_df.randomSplit([1.0 - test_size, test_size], seed=random_state) # 特征工程 # 分类特征编码 categorical_cols = ['GENDER', 'MKT_STAR_GRADE_NAME', 'IF_YHTS'] # 只处理存在的分类特征 existing_cat_cols = [col for col in categorical_cols if col in processed_data.columns] indexers = [StringIndexer(inputCol=col, outputCol=col+"_index") for col in existing_cat_cols] encoders = [OneHotEncoder(inputCol=col+"_index", outputCol=col+"_encoded") for col in existing_cat_cols] # 数值特征 numeric_cols = ['AGE', 'ONLINE_DAY', 'TERM_CNT', 'PROM_AMT_MONTH'] # 组合所有特征 feature_cols = numeric_cols + [col+"_encoded" for col in existing_cat_cols] assembler = VectorAssembler(inputCols=feature_cols, outputCol="features") # 目标变量索引 label_indexer = StringIndexer(inputCol="is_rh_next", outputCol="label") # 构建模型 lr = LogisticRegression(featuresCol="features", labelCol="label") dt = DecisionTreeClassifier(featuresCol="features", labelCol="label") rf = RandomForestClassifier(featuresCol="features", labelCol="label") # 创建管道 pipeline_lr = Pipeline(stages=indexers + encoders + [assembler, label_indexer, lr]) pipeline_dt = Pipeline(stages=indexers + encoders + [assembler, label_indexer, dt]) pipeline_rf = Pipeline(stages=indexers + encoders + [assembler, label_indexer, rf]) # 训练模型 model_lr = pipeline_lr.fit(train_df) model_dt = pipeline_dt.fit(train_df) model_rf = pipeline_rf.fit(train_df) # 评估模型 evaluator_auc = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction") evaluator_acc = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy") evaluator_f1 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1") def evaluate_model(model, data): predictions = model.transform(data) auc = evaluator_auc.evaluate(predictions) acc = evaluator_acc.evaluate(predictions) f1 = evaluator_f1.evaluate(predictions) return {"AUC": auc, 
"Accuracy": acc, "F1": f1} results = { "Logistic Regression": evaluate_model(model_lr, test_df), "Decision Tree": evaluate_model(model_dt, test_df), "Random Forest": evaluate_model(model_rf, test_df) } # 保存结果 st.session_state.model_results = results st.session_state.best_model = model_rf # 默认使用随机森林作为最佳模型 st.session_state.spark = spark st.success("🎉 模型训练完成!") # 显示模型性能 st.subheader("模型性能评估") # 转换为DataFrame展示 results_df = pd.DataFrame(results).T st.dataframe(results_df.style.format("{:.4f}").background_gradient(cmap='Blues')) # 可视化比较 fig, ax = plt.subplots(figsize=(10, 6)) results_df.plot(kind='bar', ax=ax) plt.title('模型性能比较') plt.ylabel('分数') plt.xticks(rotation=15) plt.legend(loc='upper right') st.pyplot(fig) # 特征重要性(随机森林) st.subheader("随机森林特征重要性") rf_model = model_rf.stages[-1] feature_importances = rf_model.featureImportances.toArray() feature_names = numeric_cols + [f"{col}_encoded" for col in existing_cat_cols] importance_df = pd.DataFrame({ "Feature": feature_names, "Importance": feature_importances }).sort_values("Importance", ascending=False).head(10) fig, ax = plt.subplots(figsize=(10, 6)) sns.barplot(x="Importance", y="Feature", data=importance_df, palette="viridis", ax=ax) plt.title('Top 10 重要特征') st.pyplot(fig) # 保存模型 model_path = "best_model" model_rf.write().overwrite().save(model_path) st.session_state.model_path = model_path except Exception as e: st.error(f"数据处理错误: {str(e)}") # 预测分析部分 else: st.markdown(""" <div class="card"> <h3>用户转化预测</h3> <p>预测单宽带用户转化为融合套餐的可能性</p> </div> """, unsafe_allow_html=True) # 上传预测数据 predict_file = st.file_uploader("上传预测数据 (CSV格式, GBK编码)", type=["csv"]) if predict_file is not None: try: # 读取数据 predict_data = pd.read_csv(predict_file, encoding='GBK') # 显示数据预览 with st.expander("数据预览", expanded=True): st.dataframe(predict_data.head()) # 检查是否有模型 if "model_path" not in st.session_state or not os.path.exists(st.session_state.model_path): st.warning("⚠️ 未找到训练好的模型,请先训练模型") st.stop() # 开始预测按钮 if st.button("开始预测", 
use_container_width=True): with st.spinner("预测进行中,请稍候..."): # 数据预处理 processed_data = preprocess_data(predict_data) # 创建Spark会话 if "spark" not in st.session_state: spark = create_spark_session() st.session_state.spark = spark else: spark = st.session_state.spark # 将Pandas DataFrame转换为Spark DataFrame spark_df = spark.createDataFrame(processed_data) # 加载模型 best_model = st.session_state.best_model # 生成预测结果 predictions = best_model.transform(spark_df) # 提取预测结果 predictions_df = predictions.select( "CCUST_ROW_ID", "probability", "prediction" ).toPandas() # 解析概率值 predictions_df['转化概率'] = predictions_df['probability'].apply(lambda x: float(x[1])) predictions_df['预测结果'] = predictions_df['prediction'].apply(lambda x: "可能转化" if x == 1.0 else "可能不转化") # 添加转化可能性等级 predictions_df['转化可能性'] = pd.cut( predictions_df['转化概率'], bins=[0, 0.3, 0.7, 1], labels=["低可能性", "中可能性", "高可能性"] ) # 保存结果 st.session_state.prediction_results = predictions_df st.success("✅ 预测完成!") except Exception as e: st.error(f"预测错误: {str(e)}") # 显示预测结果 if "prediction_results" in st.session_state: st.markdown(""" <div class="card"> <h3>预测结果</h3> <p>用户转化可能性评估报告</p> </div> """, unsafe_allow_html=True) result_df = st.session_state.prediction_results # 转化可能性分布 st.subheader("转化可能性分布概览") col1, col2, col3 = st.columns(3) high_conv = (result_df["转化可能性"] == "高可能性").sum() med_conv = (result_df["转化可能性"] == "中可能性").sum() low_conv = (result_df["转化可能性"] == "低可能性").sum() col1.markdown(f""" <div class="metric-card"> <div class="metric-value">{high_conv}</div> <div class="metric-label">高可能性用户</div> </div> """, unsafe_allow_html=True) col2.markdown(f""" <div class="metric-card"> <div class="metric-value">{med_conv}</div> <div class="metric-label">中可能性用户</div> </div> """, unsafe_allow_html=True) col3.markdown(f""" <div class="metric-card"> <div class="metric-value">{low_conv}</div> <div class="metric-label">低可能性用户</div> </div> """, unsafe_allow_html=True) # 转化可能性分布图 fig, ax = plt.subplots(figsize=(8, 5)) conv_counts = 
result_df["转化可能性"].value_counts() conv_counts.plot(kind='bar', color=['#4CAF50', '#FFC107', '#F44336'], ax=ax) plt.title('用户转化可能性分布') plt.xlabel('可能性等级') plt.ylabel('用户数量') st.pyplot(fig) # 详细预测结果 st.subheader("详细预测结果") # 样式函数 def color_convert(val): if val == "高可能性": return "background-color: #c8e6c9; color: #388e3c;" elif val == "中可能性": return "background-color: #fff9c4; color: #f57f17;" else: return "background-color: #ffcdd2; color: #c62828;" # 格式化显示 display_df = result_df[["CCUST_ROW_ID", "转化概率", "预测结果", "转化可能性"]] styled_df = display_df.style.format({ "转化概率": "{:.2%}" }).applymap(color_convert, subset=["转化可能性"]) st.dataframe(styled_df, height=400) # 下载结果 csv = display_df.to_csv(index=False).encode("utf-8") st.download_button( label="下载预测结果", data=csv, file_name="用户转化预测结果.csv", mime="text/csv", use_container_width=True ) 页脚 st.markdown(“—”) st.markdown(“”" <div style="text-align: center; color: #5c6bc0; font-size: 0.9rem; padding: 1rem;"> © 2023 精准营销系统 | 基于Spark和Streamlit开发 </div> """, unsafe_allow_html=True) 将上述所给代码,不使用spark,仿照如下所给代码,完成算法和模型调优等操作 import streamlit as st import pandas as pd import numpy as np import joblib import os import time import pandas as pd import numpy as np import matplotlib.pyplot as plt import matplotlib as mpl import matplotlib.font_manager as fm import seaborn as sns from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix from sklearn.preprocessing import StandardScaler from imblearn.over_sampling import SMOTE from sklearn.impute import SimpleImputer import warnings warnings.filterwarnings("ignore") plt.rcParams[‘font.sans-serif’] = [‘SimHei’] plt.rcParams[‘axes.unicode_minus’] = False # 正确显示负号 页面设置 st.set_page_config( page_title=“风控违约预测系统”, page_icon=“📊”, layout=“wide”, initial_sidebar_state=“expanded” ) 自定义CSS样式 st.markdown(“”" <style> .stApp { background: linear-gradient(135deg, #f5f7fa 0%, #e4edf5 100%); 
font-family: 'Helvetica Neue', Arial, sans-serif; } .header { background: linear-gradient(90deg, #2c3e50 0%, #4a6491 100%); color: white; padding: 1.5rem; border-radius: 0.75rem; box-shadow: 0 4px 12px rgba(0,0,0,0.1); margin-bottom: 2rem; } .card { background: white; border-radius: 0.75rem; padding: 1.5rem; margin-bottom: 1.5rem; box-shadow: 0 4px 12px rgba(0,0,0,0.08); transition: transform 0.3s ease; } .card:hover { transform: translateY(-5px); box-shadow: 0 6px 16px rgba(0,0,0,0.12); } .stButton button { background: linear-gradient(90deg, #3498db 0%, #1a5276 100%) !important; color: white !important; border: none !important; border-radius: 0.5rem; padding: 0.75rem 1.5rem; font-size: 1rem; font-weight: 600; transition: all 0.3s ease; width: 100%; } .stButton button:hover { transform: scale(1.05); box-shadow: 0 4px 8px rgba(52, 152, 219, 0.4); } .feature-box { background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%); border-radius: 0.75rem; padding: 1.5rem; margin-bottom: 1.5rem; } .result-box { background: linear-gradient(135deg, #e8f5e9 0%, #c8e6c9 100%); border-radius: 0.75rem; padding: 1.5rem; margin-top: 1.5rem; } .model-box { background: linear-gradient(135deg, #fff3e0 0%, #ffe0b2 100%); border-radius: 0.75rem; padding: 1.5rem; margin-top: 1.5rem; } .stProgress > div > div > div { background: linear-gradient(90deg, #2ecc71 0%, #27ae60 100%) !important; } .metric-card { background: white; border-radius: 0.75rem; padding: 1rem; text-align: center; box-shadow: 0 4px 8px rgba(0,0,0,0.06); } .metric-value { font-size: 1.8rem; font-weight: 700; color: #2c3e50; } .metric-label { font-size: 0.9rem; color: #7f8c8d; margin-top: 0.5rem; } .highlight { background: linear-gradient(90deg, #ffeb3b 0%, #fbc02d 100%); padding: 0.2rem 0.5rem; border-radius: 0.25rem; font-weight: 600; } .stDataFrame { border-radius: 0.75rem; box-shadow: 0 4px 8px rgba(0,0,0,0.06); } .risk-high { background-color: #ffcdd2 !important; color: #c62828 !important; font-weight: 700; } 
.risk-medium { background-color: #fff9c4 !important; color: #f57f17 !important; font-weight: 600; } .risk-low { background-color: #c8e6c9 !important; color: #388e3c !important; } </style> “”", unsafe_allow_html=True) def preprocess_loan_data(data_old): “”" 训练时数据预处理函数,返回处理后的数据和推理时需要的参数 参数: data_old: 原始训练数据 (DataFrame) 返回: processed_data: 预处理后的训练数据 (DataFrame) preprocessor_params: 推理时需要的预处理参数 (dict) """ # 1. 创建原始数据副本 loan_data = data_old.copy() # 2. 保存要删除的列列表 drop_list = ['id','member_id', 'term', 'pymnt_plan', 'initial_list_status', 'sub_grade', 'emp_title', 'issue_d', 'title', 'zip_code', 'addr_state', 'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d', 'url','desc','next_pymnt_d'] loan_data.drop([col for col in drop_list if col in loan_data.columns], axis=1, inplace=True, errors='ignore') # 3. 删除缺失值超过90%的列 #todo 自己补齐删除代码 missing_ratio = loan_data.isnull().sum() / len(loan_data) loan_data.drop(missing_ratio[missing_ratio > 0.9].index, axis=1, inplace=True, errors='ignore') # 4. 删除值全部相同的列 #todo 自己补齐删除代码 constant_cols = loan_data.columns[loan_data.nunique() <= 1] loan_data.drop(constant_cols, axis=1, inplace=True, errors='ignore') # 5. 处理特殊数值列 loans = loan_data # 修正变量名 loans["int_rate"] = loans["int_rate"].astype(str).str.rstrip('%').astype("float") loans["revol_util"] = loans["revol_util"].astype(str).str.rstrip('%').astype("float") # 6. 缺失值处理 ## 识别分类列和数值列 objectColumns = loans.select_dtypes(include=["object"]).columns.tolist() numColumns = loans.select_dtypes(include=[np.number]).columns.tolist() ## 保存分类列的列名 categorical_columns = objectColumns.copy() ## 填充分类变量缺失值 loans[objectColumns] = loans[objectColumns].fillna("Unknown") ## 填充数值变量缺失值并保存均值 imr = SimpleImputer(missing_values=np.nan, strategy="mean") loans[numColumns] = imr.fit_transform(loans[numColumns]) # 保存数值列的均值 numerical_means = {col: imr.statistics_[i] for i, col in enumerate(numColumns)} # 8. 特征衍生 loans["installment_feat"] = loans["installment"] / ((loans["annual_inc"] + 1) / 12) # 9. 
目标变量编码 status_mapping = { "Current": 0, "Issued": 0, "Fully Paid": 0, "In Grace Period": 1, "Late (31-120 days)": 1, "Late (16-30 days)": 1, "Charged Off": 1, "Does not meet the credit policy. Status:Charged Off": 1, "Does not meet the credit policy. Status:Fully Paid": 0, "Default": 0 } loans["loan_status"] = loans["loan_status"].map(status_mapping) # 10. 有序特征映射 mapping_dict = { "emp_length": { "10+ years": 10, "9 years": 9, "8 years": 8, "7 years": 7, "6 years": 6, "5 years": 5, "4 years": 4, "3 years": 3, "2 years": 2, "1 year": 1, "< 1 year": 0, "Unknown": 0 }, "grade": { "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7 } } loans = loans.replace(mapping_dict) # 11. One-hot编码 n_columns = ["home_ownership", "verification_status", "purpose", "application_type"] dummy_df = pd.get_dummies(loans[n_columns], drop_first=False) loans = pd.concat([loans, dummy_df], axis=1) loans.drop(n_columns, axis=1, inplace=True) # 保存One-hot编码后的列名 onehot_columns = n_columns onehot_encoder_columns = dummy_df.columns.tolist() # 12. 特征缩放 # 识别需要缩放的数值列 numeric_cols = loans.select_dtypes(include=["int", "float"]).columns.tolist() if 'loan_status' in numeric_cols: numeric_cols.remove('loan_status') # 创建并拟合缩放器 sc = StandardScaler() if numeric_cols: loans[numeric_cols] = sc.fit_transform(loans[numeric_cols]) # 保存缩放列名 scaled_columns = numeric_cols # 13. 保存最终列结构(在SMOTE之前) #final_columns = loans.columns.tolist().remove('loan_status') final_columns = loans.columns[loans.columns != 'loan_status'].tolist() # 14. 处理不平衡数据(SMOTE过采样) X = loans.drop("loan_status", axis=1) y = loans["loan_status"] os = SMOTE(random_state=42) X_res, y_res = os.fit_resample(X, y) # 15. 合并为最终DataFrame processed_data = pd.concat([X_res, y_res], axis=1) processed_data.columns = list(X.columns) + ["loan_status"] # 16. 创建推理时需要的参数字典 preprocessor_params = { # 1. 删除的列 'drop_list': drop_list, # 2. 分类列缺失值填充 'categorical_columns': categorical_columns, # 3. 数值列填充均值 'numerical_means': numerical_means, # 4. 
有序特征映射 'mapping_dict': mapping_dict, # 5. One-hot配置 'onehot_columns': onehot_columns, 'onehot_encoder_columns': onehot_encoder_columns, # 6. 缩放器及缩放列 'scaler': sc, # 已拟合的StandardScaler实例 'scaled_columns': scaled_columns, # 7. 最终列结构(训练后的列顺序) 'final_columns': final_columns } return processed_data, preprocessor_params def preprocess_loan_data_inference(data_old, preprocessor_params): “”" 推理时数据处理函数 参数: data_old: 原始推理数据 (DataFrame) preprocessor_params: 从训练过程保存的预处理参数 (dict) 返回: processed_data: 预处理后的推理数据 (DataFrame) """ # 1. 复制数据避免污染原始数据 loanss = data_old.copy() # 2. 删除训练时确定的列 drop_list = preprocessor_params['drop_list'] loans = loanss.drop(columns=[col for col in drop_list if col in loanss.columns], axis=1, errors='ignore') # 3. 处理特殊数值列(百分比转换) if 'int_rate' in loans: loans["int_rate"] = loans["int_rate"].astype(str).str.rstrip('%').astype("float") if 'revol_util' in loans: loans["revol_util"] = loans["revol_util"].astype(str).str.rstrip('%').astype("float") # 4. 特征衍生(使用训练时相同公式) if 'installment' in loans and 'annual_inc' in loans: loans["installment_feat"] = loans["installment"] / ((loans["annual_inc"] + 1) / 12) # 5. 有序特征映射(使用训练时的映射字典) mapping_dict = preprocessor_params['mapping_dict'] for col, mapping in mapping_dict.items(): if col in loans: # 处理未知值,默认为0 loans[col] = loans[col].map(mapping).fillna(0).astype(int) # 6. 缺失值处理(使用训练时保存的策略) # 分类变量 cat_cols = preprocessor_params['categorical_columns'] for col in cat_cols: if col in loans: loans[col] = loans[col].fillna("Unknown") # 数值变量(使用训练时保存的均值) num_means = preprocessor_params['numerical_means'] for col, mean_value in num_means.items(): if col in loans: loans[col] = loans[col].fillna(mean_value) # 7. 
One-hot编码(对齐训练时的列结构) n_columns = preprocessor_params['onehot_columns'] expected_dummy_columns = preprocessor_params['onehot_encoder_columns'] # 创建空DataFrame用于存储结果 dummy_df = pd.DataFrame(columns=expected_dummy_columns) # 为每个分类列生成dummy变量 for col in n_columns: if col in loans: # 为当前列生成dummies col_dummies = pd.get_dummies(loans[col], prefix=col) # 对齐训练时的列结构 for expected_col in expected_dummy_columns: if expected_col in col_dummies: dummy_df[expected_col] = col_dummies[expected_col] else: # 如果该列不存在,则创建全0列 dummy_df[expected_col] = 0 # 合并dummy变量 loans = pd.concat([loans, dummy_df], axis=1) # 删除原始分类列 loans.drop(columns=[col for col in n_columns if col in loans.columns], inplace=True, errors='ignore') # 8. 特征缩放(使用训练时的缩放器参数) sc = preprocessor_params['scaler'] scaled_cols = [col for col in preprocessor_params['scaled_columns'] if col in loans.columns] if scaled_cols: loans[scaled_cols] = sc.transform(loans[scaled_cols]) # 9. 对齐最终特征列(确保与训练数据相同) final_columns = preprocessor_params['final_columns'] # 添加缺失列(用0填充) for col in final_columns: if col not in loans.columns: loans[col] = 0 # 移除多余列并保持顺序 processed_data = loans[final_columns] print(loans.columns) return processed_data 标题区域 st.markdown(“”" <div class="header"> <h1 style='text-align: center; margin: 0;'>风控违约预测系统</h1> <p style='text-align: center; margin: 0.5rem 0 0; font-size: 1.1rem;'>基于机器学习的信贷风险评估与预测</p> </div> """, unsafe_allow_html=True) 页面布局 col1, col2 = st.columns([1, 1.5]) 左侧区域 - 图片和简介 with col1: st.markdown(“”" 智能风控系统 利用先进机器学习技术预测信贷违约风险 “”", unsafe_allow_html=True) # 使用在线图片作为占位符 st.image("https://images.unsplash.com/photo-1553877522-43269d4ea984?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=1200&q=80", caption="智能风控系统示意图", use_column_width=True) st.markdown(""" <div class="feature-box"> <h4>📈 系统功能</h4> <ul> <li>客户违约风险预测</li> <li>高精度机器学习模型</li> <li>可视化风险评估</li> <li>批量数据处理</li> </ul> </div> """, unsafe_allow_html=True) 右侧区域 - 功能选择 with col2: st.markdown(“”" 请选择操作类型 
您可以选择训练新模型或使用现有模型进行预测 “”", unsafe_allow_html=True) # 功能选择 option = st.radio("", ["🚀 训练新模型 - 使用新数据训练预测模型", "🔍 推理预测 - 使用模型预测违约风险"], index=0, label_visibility="hidden") # 模型训练部分 if "训练新模型" in option: st.markdown(""" <div class="model-box"> <h4>模型训练</h4> <p>上传训练数据并训练新的预测模型</p> </div> """, unsafe_allow_html=True) # 上传训练数据 train_file = st.file_uploader("上传训练数据 (CSV格式)", type=["csv"]) if train_file is not None: try: # 读取数据 train_data_old = pd.read_csv(train_file) # 显示数据预览 with st.expander("数据预览", expanded=True): st.dataframe(train_data_old.head()) col1, col2, col3 = st.columns(3) col1.metric("总样本数", train_data_old.shape[0]) col2.metric("特征数量", train_data_old.shape[1] - 1) # 训练参数设置 st.subheader("训练参数") col1, col2 = st.columns(2) test_size = col1.slider("测试集比例", 0.1, 0.4, 0.2, 0.1) n_estimators = col2.slider("树的数量", 10, 500, 100, 10) max_depth = col1.slider("最大深度", 2, 30, 10, 1) random_state = col2.number_input("随机种子", 0, 100, 42) # 开始训练按钮 if st.button("开始训练模型", use_container_width=True): with st.spinner("模型训练中,请稍候..."): # 模拟数据处理 progress_bar = st.progress(0) train_data,preprocessor_params = preprocess_loan_data(train_data_old) joblib.dump(preprocessor_params, 'loan_preprocessor_params.pkl') # 步骤1: 数据预处理 time.sleep(1) progress_bar.progress(25) st.success("✅ 数据预处理完成") # 步骤2: 特征工程 time.sleep(1) progress_bar.progress(50) st.success("✅ 特征工程完成") # 步骤3: 模型训练 time.sleep(2) progress_bar.progress(75) # 实际训练代码 (简化版) X = train_data.drop("loan_status", axis=1) y = train_data["loan_status"] # 划分训练测试集 #todo 自己补齐数据划分代码 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y) # 训练模型 #todo 自己补齐调用随机森林算法完成模型的训练 model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state, n_jobs=-1) model.fit(X_train, y_train) # 保存模型 joblib.dump(model, "risk_model.pkl") # 步骤4: 模型评估 time.sleep(1) progress_bar.progress(100) # 评估模型 #todo 自己补齐调用预测函数完成测试集推理预测 y_pred = model.predict(X_test) y_proba = 
model.predict_proba(X_test)[:, 1] accuracy = accuracy_score(y_test, y_pred) auc = roc_auc_score(y_test, y_proba) # 保存评估结果 st.session_state.model_trained = True st.session_state.accuracy = accuracy st.session_state.auc = auc st.session_state.y_test = y_test st.session_state.y_pred = y_pred st.success("🎉 模型训练完成!") # 显示模型性能 st.subheader("模型性能评估") col1, col2 = st.columns(2) col1.markdown(f""" <div class="metric-card"> <div class="metric-value">{accuracy*100:.1f}%</div> <div class="metric-label">准确率</div> </div> """, unsafe_allow_html=True) col2.markdown(f""" <div class="metric-card"> <div class="metric-value">{auc:.3f}</div> <div class="metric-label">AUC 分数</div> </div> """, unsafe_allow_html=True) # 混淆矩阵 st.subheader("混淆矩阵") cm = confusion_matrix(y_test, y_pred) fig, ax = plt.subplots(figsize=(6, 4)) sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax) ax.set_xlabel("预测标签") ax.set_ylabel("真实标签") ax.set_title("混淆矩阵") st.pyplot(fig) # 特征重要性 st.subheader("特征重要性") feature_importance = pd.DataFrame({ "特征": X.columns, "重要性": model.feature_importances_ }).sort_values("重要性", ascending=False).head(10) fig, ax = plt.subplots(figsize=(10, 6)) sns.barplot(x="重要性", y="特征", data=feature_importance, palette="viridis", ax=ax) ax.set_title("Top 10 重要特征") st.pyplot(fig) except Exception as e: st.error(f"数据处理错误: {str(e)}") # 推理预测部分 else: st.markdown(""" <div class="model-box"> <h4>风险预测</h4> <p>上传需要预测的数据,生成违约风险评估报告</p> </div> """, unsafe_allow_html=True) # 上传预测数据 predict_file = st.file_uploader("上传预测数据 (CSV格式)", type=["csv"]) if predict_file is not None: try: # 读取数据 predict_data = pd.read_csv(predict_file) # 显示数据预览 with st.expander("数据预览", expanded=True): st.dataframe(predict_data.head()) st.info(f"数据集包含 {predict_data.shape[0]} 个样本,{predict_data.shape[1]} 个特征") # 检查是否有模型 if not os.path.exists("risk_model.pkl"): st.warning("⚠️ 未找到训练好的模型,请先训练模型或使用示例数据") # 使用示例模型 if st.button("使用示例模型进行预测", use_container_width=True): st.info("正在使用预训练的示例模型进行预测...") # 创建示例模型 X = np.random.rand(100, 10) y 
= np.random.randint(0, 2, 100) model = RandomForestClassifier(n_estimators=50, random_state=42) model.fit(X, y) # 生成预测结果 predictions = model.predict(predict_data.values) probas = model.predict_proba(predict_data.values)[:, 1] # 创建结果DataFrame result_df = pd.DataFrame({ "客户ID": predict_data["member_id"], "违约概率": probas, "预测标签": predictions }) # 添加风险等级 result_df["风险等级"] = pd.cut( result_df["违约概率"], bins=[0, 0.2, 0.5, 1], labels=["低风险", "中风险", "高风险"], include_lowest=True ) # 保存结果 st.session_state.prediction_results = result_df else: # 加载模型 model = joblib.load("risk_model.pkl") preprocessor_params = joblib.load('loan_preprocessor_params.pkl') # 开始预测按钮 if st.button("开始风险预测", use_container_width=True): with st.spinner("预测进行中,请稍候..."): # 模拟预测过程 progress_bar = st.progress(0) # 预处理推理数据 #todo 自己补齐调用推理数据处理函数完成推理数据的清洗 processed_inference = preprocess_loan_data_inference(predict_data, preprocessor_params) # 步骤1: 数据预处理 time.sleep(1) progress_bar.progress(25) # 步骤2: 特征工程 time.sleep(1) progress_bar.progress(50) # 步骤3: 模型预测 time.sleep(1) progress_bar.progress(75) # 生成预测结果 predictions = model.predict(processed_inference.values) probas = model.predict_proba(processed_inference.values)[:, 1] # 创建结果DataFrame result_df = pd.DataFrame({ "客户ID": predict_data["member_id"], "违约概率": probas, "预测标签": predictions }) # 添加风险等级 result_df["风险等级"] = pd.cut( result_df["违约概率"], bins=[0, 0.2, 0.5, 1], labels=["低风险", "中风险", "高风险"], include_lowest=True ) # 步骤4: 生成报告 time.sleep(1) progress_bar.progress(100) # 保存结果 st.session_state.prediction_results = result_df st.success("✅ 预测完成!") except Exception as e: st.error(f"预测错误: {str(e)}") # 显示预测结果 if "prediction_results" in st.session_state: st.markdown(""" <div class="result-box"> <h4>预测结果</h4> <p>客户违约风险评估报告</p> </div> """, unsafe_allow_html=True) result_df = st.session_state.prediction_results # 风险分布 st.subheader("风险分布概览") col1, col2, col3 = st.columns(3) high_risk = (result_df["风险等级"] == "高风险").sum() med_risk = (result_df["风险等级"] == "中风险").sum() low_risk = 
(result_df["风险等级"] == "低风险").sum() col1.markdown(f""" <div class="metric-card"> <div class="metric-value risk-high">{high_risk}</div> <div class="metric-label">高风险客户</div> </div> """, unsafe_allow_html=True) col2.markdown(f""" <div class="metric-card"> <div class="metric-value risk-medium">{med_risk}</div> <div class="metric-label">中风险客户</div> </div> """, unsafe_allow_html=True) col3.markdown(f""" <div class="metric-card"> <div class="metric-value risk-low">{low_risk}</div> <div class="metric-label">低风险客户</div> </div> """, unsafe_allow_html=True) # 风险分布图 fig, ax = plt.subplots(figsize=(8, 4)) risk_counts = result_df["风险等级"].value_counts() risk_counts.plot(kind="bar", color=["#4CAF50", "#FFC107", "#F44336"], ax=ax) ax.set_title("客户风险等级分布") ax.set_xlabel("风险等级") ax.set_ylabel("客户数量") st.pyplot(fig) # 详细预测结果 st.subheader("详细预测结果") # 样式函数 def color_risk(val): if val == "高风险": return "background-color: #ffcdd2; color: #c62828;" elif val == "中风险": return "background-color: #fff9c4; color: #f57f17;" else: return "background-color: #c8e6c9; color: #388e3c;" # 格式化显示 styled_df = result_df.style.applymap(color_risk, subset=["风险等级"]) st.dataframe(styled_df.format({ "违约概率": "{:.2%}" }), height=400) # 下载结果 csv = result_df.to_csv(index=False).encode("utf-8") st.download_button( label="下载预测结果", data=csv, file_name="风险预测结果.csv", mime="text/csv", use_container_width=True ) 页脚 st.markdown(“—”) st.markdown(“”" <div style="text-align: center; color: #7f8c8d; font-size: 0.9rem; padding: 1rem;"> © 2023 风控违约预测系统 | 基于Streamlit开发 </div> """, unsafe_allow_html=True)
07-03
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值