import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
import time
import warnings
from io import BytesIO
import platform
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
def safe_path(path):
"""处理Windows长路径问题"""
if platform.system() == 'Windows':
try:
import ntpath
return ntpath.realpath(path)
except:
return str(Path(path).resolve())
return path
# 忽略警告
warnings.filterwarnings("ignore")
# 页面设置
st.set_page_config(
page_title="精准营销系统",
page_icon="📊",
layout="wide",
initial_sidebar_state="expanded"
)
# 自定义CSS样式
st.markdown("""
<style>
.stApp {
background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
font-family: 'Helvetica Neue', Arial, sans-serif;
}
.header {
background: linear-gradient(90deg, #1a237e 0%, #283593 100%);
color: white;
padding: 1.5rem;
border-radius: 0.75rem;
box-shadow: 0 4px 12px rgba(0,0,0,0.1);
margin-bottom: 2rem;
}
.card {
background: white;
border-radius: 0.75rem;
padding: 1rem;
margin-bottom: 1.5rem;
box-shadow: 0 4px 12px rgba(0,0,0,0.08);
transition: transform 0.3s ease;
}
.card:hover {
transform: translateY(-5px);
box-shadow: 0 6px 16px rgba(0,0,0,0.12);
}
.stButton button {
background: linear-gradient(90deg, #3949ab 0%, #1a237e 100%) !important;
color: white !important;
border: none !important;
border-radius: 0.5rem;
padding: 0.75rem 1.5rem;
font-size: 1rem;
font-weight: 600;
transition: all 0.3s ease;
width: 100%;
}
.stButton button:hover {
transform: scale(1.05);
box-shadow: 0 4px 8px rgba(57, 73, 171, 0.4);
}
.feature-box {
background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%);
border-radius: 0.75rem;
padding: 1.5rem;
margin-bottom: 1.5rem;
}
.result-box {
background: linear-gradient(135deg, #e8f5e9 0%, #c8e6c9 100%);
border-radius: 0.75rem;
padding: 1.5rem;
margin-top: 1.5rem;
}
.model-box {
background: linear-gradient(135deg, #fff3e0 0%, #ffe0b2 100%);
border-radius: 0.75rem;
padding: 1.5rem;
margin-top: 1.5rem;
}
.stProgress > div > div > div {
background: linear-gradient(90deg, #2ecc71 0%, #27ae60 100%) !important;
}
.metric-card {
background: white;
border-radius: 0.75rem;
padding: 1rem;
text-align: center;
box-shadow: 0 4px 8px rgba(0,0,0,0.06);
}
.metric-value {
font-size: 1.8rem;
font-weight: 700;
color: #1a237e;
}
.metric-label {
font-size: 0.9rem;
color: #5c6bc0;
margin-top: 0.5rem;
}
.highlight {
background: linear-gradient(90deg, #ffeb3b 0%, #fbc02d 100%);
padding: 0.2rem 0.5rem;
border-radius: 0.25rem;
font-weight: 600;
}
.stDataFrame {
border-radius: 0.75rem;
box-shadow: 0 4px 8px rgba(0,0,0,0.06);
}
.convert-high {
background-color: #c8e6c9 !important;
color: #388e3c !important;
font-weight: 700;
}
.convert-low {
background-color: #ffcdd2 !important;
color: #c62828 !important;
font-weight: 600;
}
</style>
""", unsafe_allow_html=True)
def preprocess_data_train(df):
"""
训练时数据预处理函数
返回处理后的数据和推理时需要的参数
"""
# 1. 复制数据避免污染原始数据
data = df.copy()
# 2. 选择关键特征
available_features = [col for col in data.columns if col in [
'AGE', 'GENDER', 'ONLINE_DAY', 'TERM_CNT',
'IF_YHTS', 'MKT_STAR_GRADE_NAME', 'PROM_AMT_MONTH',
'is_rh_next' # 目标变量
]]
# 确保目标变量存在
if 'is_rh_next' not in available_features:
st.error("错误:数据集中缺少目标变量 'is_rh_next'")
return data, None
data = data[available_features]
# 3. 处理缺失值
# 数值特征用均值填充
numeric_cols = ['AGE', 'ONLINE_DAY', 'TERM_CNT', 'PROM_AMT_MONTH']
for col in numeric_cols:
if col in data.columns:
mean_val = data[col].mean()
data[col].fillna(mean_val, inplace=True)
# 分类特征用众数填充
categorical_cols = ['GENDER', 'MKT_STAR_GRADE_NAME', 'IF_YHTS']
for col in categorical_cols:
if col in data.columns:
mode_val = data[col].mode()[0]
data[col].fillna(mode_val, inplace=True)
# 4. 异常值处理(使用IQR方法)
def handle_outliers(series):
Q1 = series.quantile(0.25)
Q3 = series.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
return series.clip(lower_bound, upper_bound)
for col in numeric_cols:
if col in data.columns:
data[col] = handle_outliers(data[col])
# 5. 保存预处理参数
preprocessor_params = {
# 数值特征均值
'numerical_means': {col: data[col].mean() for col in numeric_cols if col in data.columns},
# 分类特征众数
'categorical_modes': {col: data[col].mode()[0] for col in categorical_cols if col in data.columns},
# 特征列表
'features': available_features,
# 数值特征列表
'numeric_cols': numeric_cols,
# 分类特征列表
'categorical_cols': categorical_cols,
# 异常值处理边界
'outlier_bounds': {}
}
# 计算并保存异常值边界
for col in numeric_cols:
if col in data.columns:
Q1 = data[col].quantile(0.25)
Q3 = data[col].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
preprocessor_params['outlier_bounds'][col] = (lower_bound, upper_bound)
return data, preprocessor_params
def preprocess_data_inference(df, preprocessor_params):
"""
推理时数据预处理函数
"""
# 1. 复制数据避免污染原始数据
data = df.copy()
# 2. 只保留需要的特征
if 'features' in preprocessor_params:
data = data[preprocessor_params['features']]
# 3. 处理缺失值
# 数值特征用训练集的均值填充
if 'numerical_means' in preprocessor_params:
for col, mean_val in preprocessor_params['numerical_means'].items():
if col in data.columns:
data[col].fillna(mean_val, inplace=True)
# 分类特征用训练集的众数填充
if 'categorical_modes' in preprocessor_params:
for col, mode_val in preprocessor_params['categorical_modes'].items():
if col in data.columns:
data[col].fillna(mode_val, inplace=True)
# 4. 异常值处理(使用训练集的边界)
if 'outlier_bounds' in preprocessor_params:
for col, bounds in preprocessor_params['outlier_bounds'].items():
if col in data.columns:
lower_bound, upper_bound = bounds
data[col] = data[col].clip(lower_bound, upper_bound)
return data
# 标题区域
st.markdown("""
<div class="header">
<h1 style='text-align: center; margin: 0;'>精准营销系统</h1>
<p style='text-align: center; margin: 0.5rem 0 0; font-size: 1.1rem;'>基于机器学习的单宽转融预测</p>
</div>
""", unsafe_allow_html=True)
# 页面布局
col1, col2 = st.columns([1, 1.5])
# 左侧区域 - 图片和简介
with col1:
st.markdown("""
<div class="card">
<h2>📱 智能营销系统</h2>
<p>预测单宽带用户转化为融合套餐用户的可能性</p>
</div>
""", unsafe_allow_html=True)
# 使用在线图片作为占位符
st.image("https://images.unsplash.com/photo-1551836022-d5d88e9218df?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=1200&q=80",
caption="精准营销系统示意图", width=600)
st.markdown("""
<div class="card">
<h4>📈 系统功能</h4>
<ul>
<li>用户转化可能性预测</li>
<li>高精度机器学习模型</li>
<li>可视化数据分析</li>
<li>精准营销策略制定</li>
</ul>
</div>
""", unsafe_allow_html=True)
# 右侧区域 - 功能选择
with col2:
st.markdown("""
<div class="card">
<h3>📋 请选择操作类型</h3>
<p>您可以选择数据分析或使用模型进行预测</p>
</div>
""", unsafe_allow_html=True)
# 功能选择
option = st.radio("", ["📊 数据分析 - 探索数据并训练模型", "🔍 预测分析 - 预测用户转化可能性"],
index=0, label_visibility="hidden")
# 数据分析部分
if "数据分析" in option:
st.markdown("""
<div class="card">
<h3>数据分析与模型训练</h3>
<p>上传数据并训练预测模型</p>
</div>
""", unsafe_allow_html=True)
# 上传训练数据
train_file = st.file_uploader("上传数据集 (CSV格式, GBK编码)", type=["csv"])
if train_file is not None:
try:
# 读取数据
train_data = pd.read_csv(train_file, encoding='GBK')
# 显示数据预览
with st.expander("数据预览", expanded=True):
st.dataframe(train_data.head())
col1, col2 = st.columns(2)
col1.metric("总样本数", train_data.shape[0])
col2.metric("特征数量", train_data.shape[1] - 1)
# 数据预处理
st.subheader("数据预处理")
with st.spinner("数据预处理中..."):
processed_data, preprocessor_params = preprocess_data_train(train_data)
joblib.dump(preprocessor_params, 'preprocessor_params.pkl')
st.success("✅ 数据预处理完成")
# 可视化数据分布
st.subheader("数据分布分析")
# 目标变量分布
st.markdown("**目标变量分布 (is_rh_next)**")
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='is_rh_next', data=processed_data, palette='viridis')
plt.title('用户转化分布 (0:未转化, 1:转化)')
plt.xlabel('是否转化')
plt.ylabel('用户数量')
st.pyplot(fig)
# 数值特征分布
st.markdown("**数值特征分布**")
numeric_cols = ['AGE', 'ONLINE_DAY', 'TERM_CNT', 'PROM_AMT_MONTH']
# 动态计算子图布局
num_features = len(numeric_cols)
if num_features > 0:
ncols = 2
nrows = (num_features + ncols - 1) // ncols # 向上取整
fig, axes = plt.subplots(nrows, ncols, figsize=(14, 4*nrows))
# 将axes展平为一维数组
if nrows > 1 or ncols > 1:
axes = axes.flatten()
else:
axes = [axes] # 单个子图时确保axes是列表
for i, col in enumerate(numeric_cols):
if col in processed_data.columns and i < len(axes):
sns.histplot(processed_data[col], kde=True, ax=axes[i], color='skyblue')
axes[i].set_title(f'{col}分布')
axes[i].set_xlabel('')
# 隐藏多余的子图
for j in range(i+1, len(axes)):
axes[j].set_visible(False)
plt.tight_layout()
st.pyplot(fig)
else:
st.warning("没有可用的数值特征")
# 特征相关性分析
st.markdown("**特征相关性热力图**")
corr_cols = numeric_cols + ['is_rh_next']
if len(corr_cols) > 1:
corr_data = processed_data[corr_cols].corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_data, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
plt.title('特征相关性热力图')
st.pyplot(fig)
else:
st.warning("特征不足,无法生成相关性热力图")
# 模型训练
st.subheader("模型训练")
# 训练参数设置
col1, col2 = st.columns(2)
test_size = col1.slider("测试集比例", 0.1, 0.4, 0.2, 0.05)
random_state = col2.number_input("随机种子", 0, 100, 42)
n_estimators = col1.slider("树的数量", 10, 500, 100, 10)
max_depth = col2.slider("最大深度", 2, 30, 10, 1)
# 开始训练按钮
if st.button("开始训练模型", use_container_width=True):
with st.spinner("模型训练中,请稍候..."):
progress_bar = st.progress(0)
# 步骤1: 特征工程
X = processed_data.drop('is_rh_next', axis=1)
y = processed_data['is_rh_next']
# 处理分类特征
categorical_cols = ['GENDER', 'MKT_STAR_GRADE_NAME', 'IF_YHTS']
existing_cat_cols = [col for col in categorical_cols if col in X.columns]
# 创建预处理管道
numeric_features = ['AGE', 'ONLINE_DAY', 'TERM_CNT', 'PROM_AMT_MONTH']
numeric_transformer = Pipeline(steps=[
('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, existing_cat_cols)
])
# 步骤2: 处理不平衡数据
os = SMOTE(random_state=random_state)
X_res, y_res = os.fit_resample(X, y)
# 划分训练测试集
X_train, X_test, y_train, y_test = train_test_split(
X_res, y_res, test_size=test_size, random_state=random_state, stratify=y_res
)
progress_bar.progress(30)
time.sleep(0.5)
# 步骤3: 模型训练
model = RandomForestClassifier(
n_estimators=n_estimators,
max_depth=max_depth,
random_state=random_state,
n_jobs=-1
)
# 创建完整管道
clf = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', model)
])
clf.fit(X_train, y_train)
progress_bar.progress(80)
time.sleep(0.5)
# 步骤4: 模型评估
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
f1 = f1_score(y_test, y_pred)
# 保存模型
joblib.dump(clf, "marketing_model.pkl")
st.session_state.model = clf
st.session_state.preprocessor_params = preprocessor_params
progress_bar.progress(100)
st.success("🎉 模型训练完成!")
# 显示模型性能
st.subheader("模型性能评估")
col1, col2, col3 = st.columns(3)
col1.markdown(f"""
<div class="metric-card">
<div class="metric-value">{accuracy*100:.1f}%</div>
<div class="metric-label">准确率</div>
</div>
""", unsafe_allow_html=True)
col2.markdown(f"""
<div class="metric-card">
<div class="metric-value">{auc:.3f}</div>
<div class="metric-label">AUC 分数</div>
</div>
""", unsafe_allow_html=True)
col3.markdown(f"""
<div class="metric-card">
<div class="metric-value">{f1:.3f}</div>
<div class="metric-label">F1 分数</div>
</div>
""", unsafe_allow_html=True)
# 混淆矩阵
st.subheader("混淆矩阵")
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("预测标签")
ax.set_ylabel("真实标签")
ax.set_title("混淆矩阵")
st.pyplot(fig)
# 特征重要性
st.subheader("特征重要性")
# 获取特征名称
feature_names = numeric_features.copy()
if 'cat' in clf.named_steps['preprocessor'].named_transformers_:
ohe = clf.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']
cat_feature_names = ohe.get_feature_names_out(existing_cat_cols)
feature_names.extend(cat_feature_names)
# 获取特征重要性
feature_importances = clf.named_steps['classifier'].feature_importances_
importance_df = pd.DataFrame({
"特征": feature_names,
"重要性": feature_importances
}).sort_values("重要性", ascending=False).head(10)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x="重要性", y="特征", data=importance_df, palette="viridis", ax=ax)
ax.set_title("Top 10 重要特征")
st.pyplot(fig)
except Exception as e:
st.error(f"数据处理错误: {str(e)}")
# 预测分析部分
else:
st.markdown("""
<div class="card">
<h3>用户转化预测</h3>
<p>预测单宽带用户转化为融合套餐的可能性</p>
</div>
""", unsafe_allow_html=True)
# 上传预测数据
predict_file = st.file_uploader("上传预测数据 (CSV格式, GBK编码)", type=["csv"])
if predict_file is not None:
try:
# 读取数据
predict_data = pd.read_csv(predict_file, encoding='GBK')
# 显示数据预览
with st.expander("数据预览", expanded=True):
st.dataframe(predict_data.head())
# 检查是否有模型
if not os.path.exists("marketing_model.pkl") or not os.path.exists("preprocessor_params.pkl"):
st.warning("⚠️ 未找到训练好的模型,请先训练模型")
st.stop()
# 开始预测按钮
if st.button("开始预测", use_container_width=True):
with st.spinner("预测进行中,请稍候..."):
progress_bar = st.progress(0)
# 加载预处理参数
preprocessor_params = joblib.load('preprocessor_params.pkl')
# 数据预处理
processed_data = preprocess_data_inference(predict_data, preprocessor_params)
progress_bar.progress(30)
time.sleep(0.5)
# 加载模型
model = joblib.load("marketing_model.pkl")
# 生成预测结果
predictions = model.predict(processed_data)
probas = model.predict_proba(processed_data)[:, 1]
progress_bar.progress(80)
time.sleep(0.5)
# 创建结果DataFrame
if 'CCUST_ROW_ID' in predict_data.columns:
customer_ids = predict_data['CCUST_ROW_ID']
else:
customer_ids = range(1, len(predict_data) + 1)
result_df = pd.DataFrame({
"客户ID": customer_ids,
"转化概率": probas,
"预测结果": predictions
})
# 添加转化可能性等级
result_df['预测标签'] = result_df['预测结果'].apply(lambda x: "可能转化" if x == 1 else "可能不转化")
result_df['转化可能性'] = pd.cut(
result_df['转化概率'],
bins=[0, 0.3, 0.7, 1],
labels=["低可能性", "中可能性", "高可能性"],
include_lowest=True
)
# 保存结果
st.session_state.prediction_results = result_df
progress_bar.progress(100)
st.success("✅ 预测完成!")
except Exception as e:
st.error(f"预测错误: {str(e)}")
# 显示预测结果
if "prediction_results" in st.session_state:
st.markdown("""
<div class="card">
<h3>预测结果</h3>
<p>用户转化可能性评估报告</p>
</div>
""", unsafe_allow_html=True)
result_df = st.session_state.prediction_results
# 转化可能性分布
st.subheader("转化可能性分布概览")
col1, col2, col3 = st.columns(3)
high_conv = (result_df["转化可能性"] == "高可能性").sum()
med_conv = (result_df["转化可能性"] == "中可能性").sum()
low_conv = (result_df["转化可能性"] == "低可能性").sum()
col1.markdown(f"""
<div class="metric-card">
<div class="metric-value">{high_conv}</div>
<div class="metric-label">高可能性用户</div>
</div>
""", unsafe_allow_html=True)
col2.markdown(f"""
<div class="metric-card">
<div class="metric-value">{med_conv}</div>
<div class="metric-label">中可能性用户</div>
</div>
""", unsafe_allow_html=True)
col3.markdown(f"""
<div class="metric-card">
<div class="metric-value">{low_conv}</div>
<div class="metric-label">低可能性用户</div>
</div>
""", unsafe_allow_html=True)
# 转化可能性分布图
fig, ax = plt.subplots(figsize=(8, 5))
conv_counts = result_df["转化可能性"].value_counts()
conv_counts.plot(kind='bar', color=['#4CAF50', '#FFC107', '#F44336'], ax=ax)
plt.title('用户转化可能性分布')
plt.xlabel('可能性等级')
plt.ylabel('用户数量')
st.pyplot(fig)
# 详细预测结果
st.subheader("详细预测结果")
# 样式函数
def color_convert(val):
if val == "高可能性":
return "background-color: #c8e6c9; color: #388e3c;"
elif val == "中可能性":
return "background-color: #fff9c4; color: #f57f17;"
else:
return "background-color: #ffcdd2; color: #c62828;"
# 格式化显示
display_df = result_df[["客户ID", "转化概率", "预测标签", "转化可能性"]]
styled_df = display_df.style.format({
"转化概率": "{:.2%}"
}).applymap(color_convert, subset=["转化可能性"])
st.dataframe(styled_df, height=400)
# 下载结果
csv = display_df.to_csv(index=False).encode("utf-8")
st.download_button(
label="下载预测结果",
data=csv,
file_name="用户转化预测结果.csv",
mime="text/csv",
use_container_width=True
)
# 页脚
st.markdown("---")
st.markdown("""
<div style="text-align: center; color: #5c6bc0; font-size: 0.9rem; padding: 1rem;">
© 2023 精准营销系统 | 基于Sklearn和Streamlit开发
</div>
""", unsafe_allow_html=True)
执行上述代码,出现如下报错,给出修改后完整代码
数据处理错误: Input y contains NaN.