PythonTest100例-03: The Perfect Square Problem

This post solves a small number-theory puzzle in Python: find an integer that is a perfect square after adding 100 to it, and again after adding 268. The code loops over candidates, checks whether each square root comes out as a whole number, and prints every value that satisfies both conditions.
# Find an integer that becomes a perfect square when 100 is added to it,
# and a perfect square again when a further 168 is added. What is the number?
from math import sqrt

for i in range(1, 10000):
    x = sqrt(i + 100)
    y = sqrt(i + 100 + 168)
    # Both roots must be whole numbers for i to qualify.
    if int(x) == x and int(y) == y:
        print(i)
Solution idea: suppose the number is i. For each candidate, take the square root of i + 100 as x and the square root of i + 100 + 168 as y. If both x and y are whole numbers (int(x) == x and int(y) == y), then i + 100 and i + 268 are both perfect squares, so i is printed. Within range(1, 10000) the loop prints 21, 261, and 1581.
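
Comparing a float square root against its int cast works for numbers this small, but floating-point error can bite for larger inputs. A minimal alternative sketch (assuming Python 3.8+ for math.isqrt) that stays in exact integer arithmetic:

from math import isqrt

def is_perfect_square(n):
    # isqrt(n) is the exact integer square root (Python 3.8+),
    # so n is a perfect square iff its isqrt squares back to n.
    r = isqrt(n)
    return r * r == n

for i in range(1, 10000):
    if is_perfect_square(i + 100) and is_perfect_square(i + 268):
        print(i)  # prints 21, 261, 1581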
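
The search can also be replaced by a little algebra. Writing i + 100 = x² and i + 268 = y² gives y² − x² = 168, i.e. (y − x)(y + x) = 168, so it suffices to enumerate the factor pairs of 168 in which both factors are even (their sum 2y must be even, and an even product rules out two odd factors). A sketch of that approach:

# (y - x) * (y + x) = 168; let a = y - x and b = y + x with a < b.
for a in range(2, 14, 2):  # a * a < 168 forces a <= 12
    if 168 % a == 0:
        b = 168 // a
        if b % 2 == 0:  # a and b must share parity for x, y to be integers
            y = (a + b) // 2
            x = (b - a) // 2
            i = x * x - 100
            if i > 0:  # keep positive solutions only
                print(i)  # prints 1581, 261, 21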