import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
import os
import time
import warnings
import tempfile
import subprocess
import sys
# 忽略警告
warnings.filterwarnings("ignore")
# 检查Java版本
def check_java_version():
try:
java_version = subprocess.check_output(['java', '-version'], stderr=subprocess.STDOUT, text=True)
st.info(f"Java版本信息:\n{java_version}")
if 'version "1.8' in java_version:
st.success("检测到Java 8 (1.8.x),已启用兼容模式")
st.warning("注意:Spark 3.0+ 官方推荐使用 Java 11,但我们将尝试兼容 Java 8")
elif 'version "11' in java_version or 'version "17' in java_version:
st.success("Java版本兼容")
else:
st.warning(f"检测到未知Java版本: {java_version}")
except Exception as e:
st.error(f"无法检查Java版本: {str(e)}")
st.stop()
# 设置中文字体
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# 页面设置
st.set_page_config(
page_title="精准营销系统",
page_icon="📊",
layout="wide",
initial_sidebar_state="expanded"
)
# 自定义CSS样式 - 修复拼写错误
st.markdown("""
<style>
.stApp {
background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
font-family: 'Helvetica Neue', Arial, sans-serif;
}
.header {
background: linear-gradient(90deg, #1a237e 0%, #283593 100%);
color: white;
padding: 1.5rem;
border-radius: 0.75rem;
box-shadow: 0 4px 12px rgba(0,0,0,0.1);
margin-bottom: 2rem;
}
.card {
background: white;
border-radius: 0.75rem;
padding: 1rem;
margin-bottom: 1.5rem;
box-shadow: 0 4px 12px rgba(0,0,0,0.08);
transition: transform 0.3s ease;
}
.card:hover {
transform: translateY(-5px);
box-shadow: 0 6px 16px rgba(0,0,0,0.12);
}
.stButton button {
background: linear-gradient(90deg, #3949ab 0%, #1a237e 100%) !important;
color: white !important;
border: none !important;
border-radius: 0.5rem;
padding: 0.75rem 1.5rem;
font-size: 1rem;
font-weight: 600;
transition: all 0.3s ease;
width: 100%;
}
.stButton button:hover {
transform: scale(1.05);
box-shadow: 0 4px 8px rgba(57, 73, 171, 0.4);
}
.feature-box {
background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%);
border-radius: 0.75rem;
padding: 1.5rem;
margin-bottom: 1.5rem;
}
.result-box {
background: linear-gradient(135deg, #e8f5e9 0%, #c8e6c9 100%);
border-radius: 0.75rem;
padding: 1.5rem;
margin-top: 1.5rem;
}
.model-box {
background: linear-gradient(135deg, #fff3e0 0%, #ffe0b2 100%);
border-radius: 0.75rem;
padding: 1.5rem;
margin-top: 1.5rem;
}
.stProgress > div > div > div {
background: linear-gradient(90deg, #2ecc71 0%, #27ae60 100%) !important;
}
.metric-card {
background: white;
border-radius: 0.75rem;
padding: 1rem;
text-align: center;
box-shadow: 0 4px 8px rgba(0,0,0,0.06);
}
.metric-value {
font-size: 1.8rem;
font-weight: 700;
color: #1a237e;
}
.metric-label {
font-size: 0.9rem;
color: #5c6bc0;
margin-top: 0.5rem;
}
.highlight {
background: linear-gradient(90deg, #ffeb3b 0%, #fbc02d 100%);
padding: 0.2rem 0.5rem;
border-radius: 0.25rem;
font-weight: 600;
}
.stDataFrame {
border-radius: 0.75rem;
box-shadow: 0 4px 8px rgba(0,0,0,0.06);
}
.convert-high {
background-color: #c8e6c9 !important;
color: #388e3c !important;
font-weight: 700;
}
.convert-low {
background-color: #ffcdd2 !important;
color: #c62828 !important;
font-weight: 600;
}
.java-warning {
background-color: #fff3cd;
border-left: 4px solid #ffc107;
padding: 1rem;
margin-bottom: 1.5rem;
border-radius: 0 0.25rem 0.25rem 0;
}
</style>
""", unsafe_allow_html=True)
# 创建Spark会话 - 兼容Java 8
def create_spark_session():
# 针对Java 8的兼容性配置
# 1. 使用更小的内存配置避免资源问题
# 2. 添加Java 8特定的兼容性配置
# 3. 使用更低的Spark版本兼容性设置
return SparkSession.builder \
.appName("TelecomPrecisionMarketing") \
.config("spark.driver.memory", "1g") \
.config("spark.executor.memory", "1g") \
.config("spark.sql.shuffle.partitions", "4") \
.config("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true -XX:+UseG1GC") \
.config("spark.executor.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true -XX:+UseG1GC") \
.config("spark.network.timeout", "800s") \
.config("spark.executor.heartbeatInterval", "60s") \
.config("spark.sql.legacy.allowUntypedScalaUDF", "true") \
.getOrCreate()
# 数据预处理函数 - 优化版
def preprocess_data(df):
"""
优化后的数据预处理函数
参数: df: 原始数据 (DataFrame)
返回: 预处理后的数据 (DataFrame)
"""
# 1. 选择关键特征
available_features = [col for col in df.columns if col in [
'AGE', 'GENDER', 'ONLINE_DAY', 'TERM_CNT',
'IF_YHTS', 'MKT_STAR_GRADE_NAME', 'PROM_AMT_MONTH',
'is_rh_next' # 目标变量
]]
# 确保目标变量存在
if 'is_rh_next' not in available_features:
st.error("错误:数据集中缺少目标变量 'is_rh_next'")
return df
# 只保留需要的列
df = df[available_features].copy()
# 2. 处理缺失值
# 数值特征用中位数填充(比均值更鲁棒)
numeric_cols = ['AGE', 'ONLINE_DAY', 'TERM_CNT', 'PROM_AMT_MONTH']
for col in numeric_cols:
if col in df.columns:
median_val = df[col].median()
df[col].fillna(median_val, inplace=True)
# 分类特征用众数填充
categorical_cols = ['GENDER', 'MKT_STAR_GRADE_NAME', 'IF_YHTS']
for col in categorical_cols:
if col in df.columns:
mode_val = df[col].mode()[0] if not df[col].mode().empty else '未知'
df[col].fillna(mode_val, inplace=True)
# 3. 异常值处理(使用IQR方法)
def handle_outliers(series):
Q1 = series.quantile(0.25)
Q3 = series.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
return series.clip(lower_bound, upper_bound)
for col in numeric_cols:
if col in df.columns:
df[col] = handle_outliers(df[col])
return df
# 标题区域
st.markdown("""
<div class="header">
<h1 style='text-align: center; margin: 0;'>精准营销系统</h1>
<p style='text-align: center; margin: 0.5rem 0 0; font-size: 1.1rem;'>基于机器学习的单宽转融预测</p>
</div>
""", unsafe_allow_html=True)
# Java版本检查
check_java_version()
# 页面布局
col1, col2 = st.columns([1, 1.5])
# 左侧区域 - 图片和简介
with col1:
st.markdown("""
<div class="card">
<h2>📱 智能营销系统</h2>
<p>预测单宽带用户转化为融合套餐用户的可能性</p>
</div>
""", unsafe_allow_html=True)
# 使用在线图片作为占位符
st.image("https://images.unsplash.com/photo-1551836022-d5d88e9218df?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=1200&q=80",
caption="精准营销系统示意图", use_column_width=True)
st.markdown("""
<div class="card">
<h4>📈 系统功能</h4>
<ul>
<li>用户转化可能性预测</li>
<li>高精度机器学习模型</li>
<li>可视化数据分析</li>
<li>精准营销策略制定</li>
</ul>
</div>
""", unsafe_allow_html=True)
# 右侧区域 - 功能选择
with col2:
st.markdown("""
<div class="card">
<h3>📋 请选择操作类型</h3>
<p>您可以选择数据分析或使用模型进行预测</p>
</div>
""", unsafe_allow_html=True)
# 功能选择 - 添加标签
option = st.radio("操作类型",
["📊 数据分析 - 探索数据并训练模型",
"🔍 预测分析 - 预测用户转化可能性"],
index=0)
# 数据分析部分
if "数据分析" in option:
st.markdown("""
<div class="card">
<h3>数据分析与模型训练</h3>
<p>上传数据并训练预测模型</p>
</div>
""", unsafe_allow_html=True)
# 上传训练数据
train_file = st.file_uploader("上传数据集 (CSV格式, GBK编码)", type=["csv"])
if train_file is not None:
try:
# 读取数据
train_data = pd.read_csv(train_file, encoding='GBK')
# 显示数据预览
with st.expander("数据预览", expanded=True):
st.dataframe(train_data.head())
col1, col2 = st.columns(2)
col1.metric("总样本数", train_data.shape[0])
col2.metric("特征数量", train_data.shape[1] - 1)
# 数据预处理
st.subheader("数据预处理")
with st.spinner("数据预处理中..."):
processed_data = preprocess_data(train_data)
st.success("✅ 数据预处理完成")
# 可视化数据分布
st.subheader("数据分布分析")
# 目标变量分布
st.markdown("**目标变量分布 (is_rh_next)**")
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='is_rh_next', data=processed_data, palette='viridis')
plt.title('用户转化分布 (0:未转化, 1:转化)')
plt.xlabel('是否转化')
plt.ylabel('用户数量')
st.pyplot(fig)
# 数值特征分布
st.markdown("**数值特征分布**")
numeric_cols = ['AGE', 'ONLINE_DAY', 'TERM_CNT', 'PROM_AMT_MONTH']
# 动态计算子图布局
num_features = len(numeric_cols)
if num_features > 0:
ncols = 2
nrows = (num_features + ncols - 1) // ncols # 向上取整
fig, axes = plt.subplots(nrows, ncols, figsize=(14, 4*nrows))
# 将axes展平为一维数组
if nrows > 1 or ncols > 1:
axes = axes.flatten()
else:
axes = [axes] # 单个子图时确保axes是列表
for i, col in enumerate(numeric_cols):
if col in processed_data.columns and i < len(axes):
sns.histplot(processed_data[col], kde=True, ax=axes[i], color='skyblue')
axes[i].set_title(f'{col}分布')
axes[i].set_xlabel('')
# 隐藏多余的子图
for j in range(i+1, len(axes)):
axes[j].set_visible(False)
plt.tight_layout()
st.pyplot(fig)
else:
st.warning("没有可用的数值特征")
# 特征相关性分析
st.markdown("**特征相关性热力图**")
corr_cols = numeric_cols + ['is_rh_next']
if len(corr_cols) > 1:
corr_data = processed_data[corr_cols].corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_data, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
plt.title('特征相关性热力图')
st.pyplot(fig)
else:
st.warning("特征不足,无法生成相关性热力图")
# 模型训练
st.subheader("模型训练")
# 训练参数设置
col1, col2 = st.columns(2)
test_size = col1.slider("测试集比例", 0.1, 0.4, 0.2, 0.05)
random_state = col2.number_input("随机种子", 0, 100, 42)
# 开始训练按钮
if st.button("开始训练模型", use_container_width=True):
# 创建临时目录用于存储模型
with tempfile.TemporaryDirectory() as tmp_dir:
model_path = os.path.join(tmp_dir, "best_model")
progress_bar = st.progress(0)
status_text = st.empty()
# 步骤1: 创建Spark会话
status_text.text("步骤1/7: 初始化Spark会话...")
spark = create_spark_session()
progress_bar.progress(15)
# 步骤2: 转换为Spark DataFrame
status_text.text("步骤2/7: 转换数据为Spark格式...")
spark_df = spark.createDataFrame(processed_data)
progress_bar.progress(30)
# 步骤3: 划分训练集和测试集
status_text.text("步骤3/7: 划分训练集和测试集...")
train_df, test_df = spark_df.randomSplit([1.0 - test_size, test_size], seed=random_state)
progress_bar.progress(40)
# 步骤4: 特征工程
status_text.text("步骤4/7: 特征工程处理...")
categorical_cols = ['GENDER', 'MKT_STAR_GRADE_NAME', 'IF_YHTS']
existing_cat_cols = [col for col in categorical_cols if col in processed_data.columns]
# 创建特征处理管道
indexers = [StringIndexer(inputCol=col, outputCol=col+"_index") for col in existing_cat_cols]
encoders = [OneHotEncoder(inputCol=col+"_index", outputCol=col+"_encoded") for col in existing_cat_cols]
numeric_cols = ['AGE', 'ONLINE_DAY', 'TERM_CNT', 'PROM_AMT_MONTH']
feature_cols = numeric_cols + [col+"_encoded" for col in existing_cat_cols]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
label_indexer = StringIndexer(inputCol="is_rh_next", outputCol="label")
progress_bar.progress(50)
# 步骤5: 构建模型
status_text.text("步骤5/7: 构建和训练模型...")
# 使用更简单的模型配置 - 针对Java 8优化
rf = RandomForestClassifier(
featuresCol="features",
labelCol="label",
numTrees=30, # 减少树的数量以适应Java 8
maxDepth=4, # 限制深度
seed=random_state,
maxBins=32 # 减少bin数量以提高兼容性
)
pipeline = Pipeline(stages=indexers + encoders + [assembler, label_indexer, rf])
model = pipeline.fit(train_df)
progress_bar.progress(80)
# 步骤6: 评估模型
status_text.text("步骤6/7: 评估模型性能...")
predictions = model.transform(test_df)
evaluator_auc = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction")
evaluator_acc = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
auc = evaluator_auc.evaluate(predictions)
acc = evaluator_acc.evaluate(predictions)
results = {
"Random Forest": {"AUC": auc, "Accuracy": acc}
}
progress_bar.progress(95)
# 步骤7: 保存结果
status_text.text("步骤7/7: 保存模型和结果...")
model.write().overwrite().save(model_path)
st.session_state.model_results = results
st.session_state.best_model = model
st.session_state.model_path = model_path
st.session_state.spark = spark
progress_bar.progress(100)
st.success("🎉 模型训练完成!")
# 显示模型性能
st.subheader("模型性能评估")
results_df = pd.DataFrame(results).T
st.dataframe(results_df.style.format("{:.4f}").background_gradient(cmap='Blues'))
# 特征重要性
st.subheader("特征重要性")
rf_model = model.stages[-1]
feature_importances = rf_model.featureImportances.toArray()
importance_df = pd.DataFrame({
"Feature": feature_cols,
"Importance": feature_importances
}).sort_values("Importance", ascending=False).head(10)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x="Importance", y="Feature", data=importance_df, palette="viridis", ax=ax)
plt.title('Top 10 重要特征')
st.pyplot(fig)
except Exception as e:
st.error(f"模型训练错误: {str(e)}")
st.error("提示:Java 8兼容性问题可能导致此错误,请尝试升级到Java 11")
# 预测分析部分
else:
st.markdown("""
<div class="card">
<h3>用户转化预测</h3>
<p>预测单宽带用户转化为融合套餐的可能性</p>
</div>
""", unsafe_allow_html=True)
# 上传预测数据
predict_file = st.file_uploader("上传预测数据 (CSV格式, GBK编码)", type=["csv"])
if predict_file is not None:
try:
# 读取数据
predict_data = pd.read_csv(predict_file, encoding='GBK')
# 显示数据预览
with st.expander("数据预览", expanded=True):
st.dataframe(predict_data.head())
# 检查是否有模型
if "model_path" not in st.session_state:
st.warning("⚠️ 未找到训练好的模型,请先训练模型")
st.stop()
# 开始预测按钮
if st.button("开始预测", use_container_width=True):
progress_bar = st.progress(0)
status_text = st.empty()
# 步骤1: 数据预处理
status_text.text("步骤1/4: 数据预处理中...")
processed_data = preprocess_data(predict_data)
progress_bar.progress(25)
# 步骤2: 创建Spark会话
status_text.text("步骤2/4: 初始化Spark会话...")
if "spark" not in st.session_state:
spark = create_spark_session()
st.session_state.spark = spark
else:
spark = st.session_state.spark
progress_bar.progress(50)
# 步骤3: 预测
status_text.text("步骤3/4: 进行预测...")
spark_df = spark.createDataFrame(processed_data)
best_model = st.session_state.best_model
predictions = best_model.transform(spark_df)
progress_bar.progress(75)
# 步骤4: 处理结果
status_text.text("步骤4/4: 处理预测结果...")
predictions_df = predictions.select(
"CCUST_ROW_ID",
"probability",
"prediction"
).toPandas()
# 解析概率值
predictions_df['转化概率'] = predictions_df['probability'].apply(lambda x: float(x[1]))
predictions_df['预测结果'] = predictions_df['prediction'].apply(lambda x: "可能转化" if x == 1.0 else "可能不转化")
# 添加转化可能性等级
predictions_df['转化可能性'] = pd.cut(
predictions_df['转化概率'],
bins=[0, 0.3, 0.7, 1],
labels=["低可能性", "中可能性", "高可能性"]
)
# 保存结果
st.session_state.prediction_results = predictions_df
progress_bar.progress(100)
st.success("✅ 预测完成!")
except Exception as e:
st.error(f"预测错误: {str(e)}")
st.error("提示:Java 8兼容性问题可能导致此错误,请尝试升级到Java 11")
# 显示预测结果
if "prediction_results" in st.session_state:
st.markdown("""
<div class="card">
<h3>预测结果</h3>
<p>用户转化可能性评估报告</p>
</div>
""", unsafe_allow_html=True)
result_df = st.session_state.prediction_results
# 转化可能性分布
st.subheader("转化可能性分布概览")
col1, col2, col3 = st.columns(3)
high_conv = (result_df["转化可能性"] == "高可能性").sum()
med_conv = (result_df["转化可能性"] == "中可能性").sum()
low_conv = (result_df["转化可能性"] == "低可能性").sum()
col1.markdown(f"""
<div class="metric-card">
<div class="metric-value">{high_conv}</div>
<div class="metric-label">高可能性用户</div>
</div>
""", unsafe_allow_html=True)
col2.markdown(f"""
<div class="metric-card">
<div class="metric-value">{med_conv}</div>
<div class="metric-label">中可能性用户</div>
</div>
""", unsafe_allow_html=True)
col3.markdown(f"""
<div class="metric-card">
<div class="metric-value">{low_conv}</div>
<div class="metric-label">低可能性用户</div>
</div>
""", unsafe_allow_html=True)
# 转化可能性分布图 - 修复拼写错误
fig, ax = plt.subplots(figsize=(8, 5))
conv_counts = result_df["转化可能性"].value_counts()
conv_counts.plot(kind='bar', color=['#4CAF50', '#FFC107', '#F44336'], ax=ax)
plt.title('用户转化可能性分布')
plt.xlabel('可能性等级')
plt.ylabel('用户数量')
st.pyplot(fig)
# 详细预测结果
st.subheader("详细预测结果")
# 样式函数
def color_convert(val):
if val == "高可能性":
return "background-color: #c8e6c9; color: #388e3c;"
elif val == "中可能性":
return "background-color: #fff9c4; color: #f57f17;"
else:
return "background-color: #ffcdd2; color: #c62828;"
# 格式化显示
display_df = result_df[["CCUST_ROW_ID", "转化概率", "预测结果", "转化可能性"]]
styled_df = display_df.style.format({
"转化概率": "{:.2%}"
}).applymap(color_convert, subset=["转化可能性"])
st.dataframe(styled_df, height=400)
# 下载结果
csv = display_df.to_csv(index=False).encode("utf-8")
st.download_button(
label="下载预测结果",
data=csv,
file_name="用户转化预测结果.csv",
mime="text/csv",
use_container_width=True
)
# 页脚
st.markdown("---")
st.markdown("""
<div style="text-align: center; color: #5c6bc0; font-size: 0.9rem; padding: 1rem;">
© 2023 精准营销系统 | 基于Spark和Streamlit开发 | Java 8兼容模式
</div>
""", unsafe_allow_html=True)
执行上述代码,系统已成功配置java17,但是仍然在spark初始会话卡住,给出修改后完整代码
最新发布