E - Frequent values

This problem asks you to answer range queries over a sequence. The input contains multiple test cases, each consisting of a sequence of integers given in non-decreasing order followed by a series of queries. For each query, find the most frequently occurring value among the integers in the given range and output how many times it occurs.

Time Limit: 2000MS Memory Limit: 65536KB 64bit IO Format: %I64d & %I64u


Description

You are given a sequence of n integers a1, a2, ..., an in non-decreasing order. In addition to that, you are given several queries consisting of indices i and j (1 ≤ i ≤ j ≤ n). For each query, determine the most frequent value among the integers ai, ..., aj.

Input

The input consists of several test cases. Each test case starts with a line containing two integers n and q (1 ≤ n, q ≤ 100000). The next line contains n integers a1, ..., an (-100000 ≤ ai ≤ 100000, for each i ∈ {1, ..., n}) separated by spaces. You can assume that for each i ∈ {1, ..., n-1}: ai ≤ ai+1. The following q lines contain one query each, consisting of two integers i and j (1 ≤ i ≤ j ≤ n), which indicate the boundary indices for the query.

The last test case is followed by a line containing a single 0.

Output

For each query, print one line with one integer: The number of occurrences of the most frequent value within the given range.

Sample Input

10 3
-1 -1 1 1 1 1 3 10 10 10
2 3
1 10
5 10
0

Sample Output

1
4
3
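The post stops at the sample data, so below is a minimal sketch of one common way to solve it (my own illustration, not the original author's solution). Because the array is non-decreasing, equal values form contiguous runs, so each query decomposes into a partial run on the left, a partial run on the right, and a range-maximum query over the lengths of the complete runs in between, answered here with a sparse table. All identifiers (runId, runEnd, lenOfRun, rangeMax) are names chosen for this sketch.

```cpp
#include <cstdio>
#include <vector>
#include <algorithm>
using namespace std;

int main() {
    int n, q;
    while (scanf("%d", &n) == 1 && n != 0) {
        scanf("%d", &q);
        vector<int> a(n + 1), runId(n + 1), runEnd(n + 1);
        for (int i = 1; i <= n; ++i) scanf("%d", &a[i]);

        // Group equal values into runs; lenOfRun[r] is the length of run r.
        vector<int> lenOfRun;
        for (int i = 1; i <= n; ++i) {
            if (i == 1 || a[i] != a[i - 1]) lenOfRun.push_back(0);
            runId[i] = (int)lenOfRun.size() - 1;
            ++lenOfRun.back();
        }
        // runEnd[i]: last index of the run that contains position i.
        for (int i = n; i >= 1; --i)
            runEnd[i] = (i < n && a[i + 1] == a[i]) ? runEnd[i + 1] : i;

        // Sparse table over run lengths for O(1) range-maximum queries.
        int m = (int)lenOfRun.size();
        int LOG = 1;
        while ((1 << LOG) < m) ++LOG;
        vector<vector<int>> st(LOG + 1, vector<int>(m));
        st[0] = lenOfRun;
        for (int k = 1; (1 << k) <= m; ++k)
            for (int r = 0; r + (1 << k) <= m; ++r)
                st[k][r] = max(st[k - 1][r], st[k - 1][r + (1 << (k - 1))]);
        auto rangeMax = [&](int l, int r) {          // max of lenOfRun[l..r]
            int k = 31 - __builtin_clz(r - l + 1);   // GCC builtin, competitive style
            return max(st[k][l], st[k][r - (1 << k) + 1]);
        };

        while (q--) {
            int i, j;
            scanf("%d %d", &i, &j);
            if (runId[i] == runId[j]) {              // query lies inside one run
                printf("%d\n", j - i + 1);
                continue;
            }
            int leftPart = runEnd[i] - i + 1;        // tail of the run holding i
            int startJ = runEnd[j] - lenOfRun[runId[j]] + 1;
            int rightPart = j - startJ + 1;          // head of the run holding j
            int ans = max(leftPart, rightPart);
            if (runId[i] + 1 <= runId[j] - 1)        // complete runs strictly between
                ans = max(ans, rangeMax(runId[i] + 1, runId[j] - 1));
            printf("%d\n", ans);
        }
    }
    return 0;
}
```

On the sample input above this prints 1, 4 and 3, matching the sample output; preprocessing is O(n log n) per test case and each query is answered in O(1).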