import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from prophet import Prophet
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import seaborn as sns
import time
import os
# 进度打印函数
def print_progress(step, message, indent=0):
indent_str = " " * indent
timestamp = time.strftime("%H:%M:%S", time.localtime())
print(f"[{timestamp}] {indent_str}步骤 {step}: {message}")
# 设置绘图风格
plt.style.use('seaborn-whitegrid')
sns.set_palette("Set2")
# 1. 数据预处理函数
def preprocess_data():
print_progress(1, "开始数据预处理")
print_progress(1.1, "读取附件1和附件2数据", indent=1)
# 读取数据
df_items = pd.read_excel(r"D:\虚拟c盘\大学\竞赛\数学建模\2023年C题\C题\附件1.xlsx", engine='openpyxl')
df_sales = pd.read_excel(r"D:\虚拟c盘\大学\竞赛\数学建模\2023年C题\C题\附件2.xlsx", engine='openpyxl')
print_progress(1.2, f"合并数据集: 单品编码={len(df_items)}条, 销售记录={len(df_sales)}条", indent=1)
# 合并数据集
df_merged = pd.merge(df_sales, df_items, on='单品编码', how='left')
print_progress(1.3, "处理退货数据", indent=1)
# 优化退货处理
df_merged['金额'] = df_merged['销量(千克)'] * df_merged['销售单价(元/千克)']
is_return = df_merged['销售类型'] == '退货'
df_merged.loc[is_return, ['金额', '销量(千克)']] *= -1
print_progress(1, f"数据预处理完成, 共处理{len(df_merged)}条记录")
return df_merged, df_items
# 2. 创建透视表函数
def create_pivot_tables(df_merged, df_items):
print_progress(2, "开始创建透视表")
print_progress(2.1, "计算每日指标", indent=1)
# 使用单次分组计算所有指标
daily_metrics = df_merged.groupby(['单品编码', '单品名称', '销售日期']).agg(
平均价格=('销售单价(元/千克)', 'mean'),
总销量=('销量(千克)', 'sum'),
总金额=('金额', 'sum')
).reset_index()
print_progress(2.2, "计算退货率", indent=1)
# 优化退货率计算
sales_mask = df_merged['销售类型'] == '销售'
return_mask = df_merged['销售类型'] == '退货'
total_sales = df_merged[sales_mask].groupby('单品编码')['金额'].sum()
total_returns = df_merged[return_mask].groupby('单品编码')['金额'].sum().abs()
return_rate = (total_returns / total_sales).reset_index(name='退货率')
print_progress(2.3, "创建透视表", indent=1)
# 创建透视表函数
def create_pivot(metric):
pivot_df = daily_metrics.pivot_table(
index=['单品编码', '单品名称'],
columns='销售日期',
values=metric,
fill_value=0
).reset_index()
return pd.merge(
df_items[['单品编码', '单品名称', '分类名称']].merge(return_rate, on='单品编码', how='left'),
pivot_df,
on=['单品编码', '单品名称'],
how='left'
)
print_progress(2, "透视表创建完成")
return {
'price': create_pivot('平均价格'),
'vol': create_pivot('总销量'),
'sales': create_pivot('总金额')
}
# 3. 保存结果函数
def save_results(final_dfs):
print_progress(3, "开始保存结果")
for name, df in final_dfs.items():
file_path = f'D:\\虚拟c盘\\大学\\竞赛\\数学建模\\2023年C题\\C235问题二output_{name}.xlsx'
print_progress(3.1, f"保存 {name} 数据到: {file_path}", indent=1)
df.to_excel(file_path, index=False, engine='openpyxl')
print_progress(3, "结果保存完成")
# 4. 模型评估函数
def evaluate_prophet_performance(y_true, y_pred):
"""计算Prophet模型预测的评估指标"""
return {
'R²': r2_score(y_true, y_pred),
'MAE': mean_absolute_error(y_true, y_pred),
'RMSE': np.sqrt(mean_squared_error(y_true, y_pred))
}
# 5. 优化Prophet分析流程并添加可视化
def prophet_analysis_with_visualization(target_category='花叶类'):
print_progress(4, "开始Prophet分析")
print_progress(4.1, "读取预处理数据", indent=1)
# 读取数据
sales_df = pd.read_excel(
r"D:\虚拟c盘\大学\竞赛\数学建模\2023年C题\C题\结果\品类级销售数据.xlsx"
).rename(columns={'销售日期': 'ds'})
price_df = pd.read_excel(
r"D:\虚拟c盘\大学\竞赛\数学建模\2023年C题\C题\结果\单品级销售数据.xlsx"
).rename(columns={'销售日期': 'ds'})
# === 新增:日期验证和清洗 ===
print_progress(4.2, "验证和清洗日期数据", indent=1)
# 1. 确保日期列是datetime类型
sales_df['ds'] = pd.to_datetime(sales_df['ds'], errors='coerce')
price_df['ds'] = pd.to_datetime(price_df['ds'], errors='coerce')
# 2. 删除无效日期
initial_count = len(sales_df)
sales_df = sales_df.dropna(subset=['ds'])
price_df = price_df.dropna(subset=['ds'])
removed_count = initial_count - len(sales_df)
print_progress(4.2, f"移除了 {removed_count} 条无效日期记录", indent=2)
# 3. 验证日期范围
min_date = sales_df['ds'].min()
max_date = sales_df['ds'].max()
print_progress(4.2, f"有效日期范围: {min_date} 到 {max_date}", indent=2)
# 4. 检查日期格式问题
date_issues = sales_df[sales_df['ds'].dt.month == 0]
if not date_issues.empty:
print_progress(4.2, f"发现 {len(date_issues)} 条月份为0的异常记录", indent=2)
sales_df = sales_df[sales_df['ds'].dt.month != 0]
# 结果存储
results = []
categories = sales_df.columns[1:]
print_progress(4.2, f"开始分析 {len(categories)} 个品类", indent=1)
# 分析每个品类
# 分析每个品类
for i, col in enumerate(categories):
start_time = time.time()
print_progress(4.2, f"分析品类 {i+1}/{len(categories)}: {col}", indent=2)
# 检查列是否存在
if col not in sales_df.columns:
print_progress(4.2, f"警告: 列 {col} 不在销售数据中,跳过", indent=3)
continue
# 准备数据 - 确保创建新的DataFrame
temp_df = sales_df[['ds', col]].copy()
temp_df = temp_df.rename(columns={col: 'y'})
# 确保y是数值类型 - 使用整个列
try:
# 正确使用pd.to_numeric
temp_df.loc[:, 'y'] = pd.to_numeric(temp_df['y'], errors='coerce')
except Exception as e:
print_progress(4.2, f"转换数值失败: {e},跳过该品类", indent=3)
continue
# 过滤正数记录
df = temp_df[temp_df['y'] > 0]
# 检查数据是否为空
if df.empty:
print_progress(4.2, f"警告: {col} 没有有效销售数据,跳过分析", indent=3)
continue
print_progress(4.3, f"数据准备完成: {len(df)}条记录", indent=3)
# 合并价格数据
merged_df = pd.merge(df, price_df, on='ds', how='left')
# 初始化模型
model = Prophet(
yearly_seasonality=True,
weekly_seasonality=True,
daily_seasonality=False,
changepoint_prior_scale=0.05
)
model.add_country_holidays(country_name='CN')
# 添加回归量(如果存在)
if col in price_df.columns:
print_progress(4.4, f"添加回归量: {col}", indent=3)
model.add_regressor(col)
# 训练模型
print_progress(4.5, "训练模型中...", indent=3)
model.fit(merged_df)
# 预测
print_progress(4.6, "生成预测", indent=3)
future = model.make_future_dataframe(periods=0)
future = pd.merge(future, price_df, on='ds')
forecast = model.predict(future)
# 评估模型
metrics = evaluate_prophet_performance(merged_df['y'], forecast['yhat'])
results.append({
'品类名称': col,
**metrics
})
elapsed = time.time() - start_time
print_progress(4.2, f"完成分析 {col}: R²={metrics['R²']:.3f}, 用时 {elapsed:.1f}秒", indent=2)
# 如果是目标品类,生成详细可视化
if col == target_category:
print_progress(4.7, f"为 {target_category} 生成可视化图表", indent=2)
# 1. 整体拟合结果图
plt.figure(figsize=(14, 8))
plt.plot(merged_df['ds'], merged_df['y'], 'b.', alpha=0.5, label='实际值')
plt.plot(forecast['ds'], forecast['yhat'], 'r-', linewidth=2, label='预测值')
plt.fill_between(
forecast['ds'],
forecast['yhat_lower'],
forecast['yhat_upper'],
color='r',
alpha=0.1
)
plt.title(f'{target_category}销售总量拟合结果', fontsize=16)
plt.xlabel('日期', fontsize=12)
plt.ylabel('销售量', fontsize=12)
plt.legend()
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plot_path = f'{target_category}_销售总量拟合结果.png'
plt.savefig(plot_path, dpi=300)
plt.close()
print_progress(4.7, f"保存拟合结果图: {plot_path}", indent=3)
# 2. 各成分分解图
components = ['trend', 'holidays', 'weekly', 'yearly']
if col in price_df.columns:
components.append(col) # 添加外生变量
fig, axes = plt.subplots(len(components), 1, figsize=(14, 10))
fig.suptitle(f'{target_category}销售总量各成分分解', fontsize=16)
for i, comp in enumerate(components):
ax = axes[i]
if comp == 'trend':
ax.plot(forecast['ds'], forecast[comp], 'g-', linewidth=2)
ax.set_title('趋势项', fontsize=12)
elif comp == 'holidays':
holiday_vals = forecast[comp].dropna()
if not holiday_vals.empty:
ax.bar(holiday_vals.index, holiday_vals, color='orange')
ax.set_title('节假日项', fontsize=12)
else:
ax.text(0.5, 0.5, '无节假日数据', ha='center', va='center', fontsize=12)
elif comp == 'weekly':
# 获取一周内的模式
weekly = forecast[['ds', 'weekly']].copy()
weekly['day_of_week'] = weekly['ds'].dt.day_name()
weekly_avg = weekly.groupby('day_of_week')['weekly'].mean()
# 按星期顺序排序
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekly_avg = weekly_avg.reindex(days)
weekly_avg.plot(kind='bar', ax=ax, color='purple')
ax.set_title('周度周期项', fontsize=12)
ax.set_xticklabels(['周一', '周二', '周三', '周四', '周五', '周六', '周日'])
elif comp == 'yearly':
# 获取一年内的模式
yearly = forecast[['ds', 'yearly']].copy()
yearly['day_of_year'] = yearly['ds'].dt.dayofyear
yearly_avg = yearly.groupby('day_of_year')['yearly'].mean()
yearly_avg.plot(ax=ax, color='brown')
ax.set_title('年度周期项', fontsize=12)
else: # 外生变量
ax.plot(forecast['ds'], forecast[comp], 'm-', linewidth=2)
ax.set_title(f'外生变量项: {comp}', fontsize=12)
ax.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout(rect=[0, 0, 1, 0.96]) # 为标题留出空间
plot_path = f'{target_category}_各成分分解.png'
plt.savefig(plot_path, dpi=300)
plt.close()
print_progress(4.7, f"保存成分分解图: {plot_path}", indent=3)
# 3. 价格与销售量关系图
plt.figure(figsize=(10, 6))
plt.scatter(
merged_df[col],
merged_df['y'],
alpha=0.6,
c=pd.to_datetime(merged_df['ds']).astype(int),
cmap='viridis'
)
plt.colorbar(label='日期')
plt.title(f'{target_category}价格与销售量关系', fontsize=14)
plt.xlabel('价格 (元/千克)', fontsize=12)
plt.ylabel('销售量 (千克)', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plot_path = f'{target_category}_价格与销售量关系.png'
plt.savefig(plot_path, dpi=300)
plt.close()
print_progress(4.7, f"保存价格-销售量关系图: {plot_path}", indent=3)
print_progress(4, "Prophet分析完成")
return pd.DataFrame(results).set_index('品类名称')
# 主流程
def main():
print("=" * 70)
print("开始执行蔬菜销售预测分析程序")
print("=" * 70)
# 数据预处理
df_merged, df_items = preprocess_data()
# 创建透视表
final_dfs = create_pivot_tables(df_merged, df_items)
# 保存结果
save_results(final_dfs)
# Prophet分析与可视化
print_progress(5, "开始Prophet时间序列分析")
prophet_results = prophet_analysis_with_visualization(target_category='花叶类')
# 输出结果
print("\n" + "=" * 70)
print("表3 各品类蔬菜的 Prophet 模型拟合水平")
print("=" * 70)
print(prophet_results.round(2))
print("\n" + "=" * 70)
print("程序执行完成!所有分析结果已保存")
print("=" * 70)
if __name__ == "__main__":
main()报错结果是:Traceback (most recent call last):
File "d:/虚拟c盘/大学/竞赛/数学建模/2023年C题/C235问题二prophet.py", line 348, in <module>
main()
File "d:/虚拟c盘/大学/竞赛/数学建模/2023年C题/C235问题二prophet.py", line 334, in main
prophet_results = prophet_analysis_with_visualization(target_category='花叶类')
File "d:/虚拟c盘/大学/竞赛/数学建模/2023年C题/C235问题二prophet.py", line 209, in prophet_analysis_with_visualization
metrics = evaluate_prophet_performance(merged_df['y'], forecast['yhat'])
File "d:/虚拟c盘/大学/竞赛/数学建模/2023年C题/C235问题二prophet.py", line 101, in evaluate_prophet_performance
'R²': r2_score(y_true, y_pred),
File "D:\lib\site-packages\sklearn\utils\_param_validation.py", line 214, in wrapper
return func(*args, **kwargs)
File "D:\lib\site-packages\sklearn\metrics\_regression.py", line 989, in r2_score
y_type, y_true, y_pred, multioutput = _check_reg_targets(
File "D:\lib\site-packages\sklearn\metrics\_regression.py", line 99, in _check_reg_targets
check_consistent_length(y_true, y_pred)
File "D:\lib\site-packages\sklearn\utils\validation.py", line 407, in check_consistent_length
raise ValueError(
ValueError: Found input variables with inconsistent numbers of samples: [278077, 46599]
最新发布