#!/usr/bin/env python
# coding: utf-8
# Imports
# In[1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
# pd.read_csv() is the pandas function for reading structured data from a CSV file into a DataFrame
# df = pd.read_csv('movies_dataset.csv')
# print(df)
# print(df.info())
# The full dataset is too large (high memory use, slow to process), so we sample target_rows rows instead
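# A lighter-weight alternative (a sketch, not used below): read only the columns
# you need and downcast dtypes up front; the column names here assume the schema
# used later in this notebook (User_Id, Movie_Name, Genre, Rating)
# df_small = pd.read_csv(
#     'movies_dataset.csv',
#     usecols=['User_Id', 'Movie_Name', 'Genre', 'Rating'],
#     dtype={'Rating': 'float32'},
# )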
# In[2]:
# Sampling parameters
target_rows = 20000  # number of rows to sample
chunksize = 10000  # rows per chunk (tune to available memory, e.g. 5000 or 20000)
sample_fraction = 0.05  # sampling fraction per chunk (0.05 = 5%; tune to total file size)
# Collect the per-chunk samples in a list
sample_list = []
# Read in chunks and sample each one
for chunk in pd.read_csv('movies_dataset.csv', chunksize=chunksize, index_col=0):
    # Randomly sample the current chunk
    # Option 1: sample by fraction (more flexible, adapts to chunks of different sizes)
    chunk_sample = chunk.sample(frac=sample_fraction, random_state=42)
    # Option 2: sample a fixed number of rows per chunk (e.g. 10)
    # chunk_sample = chunk.sample(n=min(10, len(chunk)), random_state=42)
    # Append the sampled rows
    sample_list.append(chunk_sample)
# Concatenate the per-chunk samples
df_sample = pd.concat(sample_list, ignore_index=True)
df = df_sample.reset_index(drop=True)
# Downsample to the target row count (if the concatenated sample exceeds target_rows)
if len(df) > target_rows:
    df = df.sample(n=target_rows, random_state=42)
    df = df.reset_index(drop=True)
# Inspect the sampled data
print(f"Shape after sampling: {df.shape}")
print(df.head())
# In[3]:
df.to_csv(f'movies_dataset{target_rows}.csv', index=False)
# Add noise to the dataset
# In[4]:
import pandas as pd
import numpy as np
import random
import string
# df = pd.read_csv(f'movies_dataset{target_rows}.csv')
# --------------------------- 1. Insert missing values (NaN) ---------------------------
# Rating column: 10% missing
rating_nan_idx = np.random.choice(df.index, size=int(len(df)*0.1), replace=False)
df.loc[rating_nan_idx, 'Rating'] = np.nan
# Movie_Name column: 5% missing
movie_nan_idx = np.random.choice(df.index, size=int(len(df)*0.05), replace=False)
df.loc[movie_nan_idx, 'Movie_Name'] = np.nan
# Genre column: 5% missing
genre_nan_idx = np.random.choice(df.index, size=int(len(df)*0.05), replace=False)
df.loc[genre_nan_idx, 'Genre'] = np.nan
# User_Id column: 3% missing
user_nan_idx = np.random.choice(df.index, size=int(len(df)*0.03), replace=False)
df.loc[user_nan_idx, 'User_Id'] = np.nan
# --------------------------- 2. Insert outliers ---------------------------
# Rating column: 5% outliers (outside the 0-5 range)
rating_outlier_idx = np.random.choice(df.index, size=int(len(df)*0.05), replace=False)
df.loc[rating_outlier_idx, 'Rating'] = np.random.choice([6.0, -1.0, 10.0, -5.0], size=len(rating_outlier_idx))
# User_Id column: 3% outliers (implausibly large values)
max_user_id = df['User_Id'].max()
user_outlier_idx = np.random.choice(df.index, size=int(len(df)*0.03), replace=False)
df.loc[user_outlier_idx, 'User_Id'] = max_user_id * 10 + np.random.randint(1, 1000, size=len(user_outlier_idx))
# --------------------------- 3. Insert formatting errors ---------------------------
# Movie_Name column: 5% formatting errors (garbage characters appended)
movie_format_idx = np.random.choice(df.index, size=int(len(df)*0.05), replace=False)
def add_garbage(text):
    if pd.isna(text):
        return text
    garbage = ''.join(random.choices(string.punctuation + string.digits, k=3))
    return f"{text}{garbage}"
df.loc[movie_format_idx, 'Movie_Name'] = df.loc[movie_format_idx, 'Movie_Name'].apply(add_garbage)
# Genre column: 5% formatting errors (separator replaced or removed)
genre_format_idx = np.random.choice(df.index, size=int(len(df)*0.05), replace=False)
def corrupt_genre(genre):
    if pd.isna(genre):
        return genre
    # 50% chance to replace the separator, 50% chance to remove it
    if random.random() < 0.5:
        return genre.replace('|', '/')  # swap in a different separator
    else:
        return genre.replace('|', '')  # drop the separator
df.loc[genre_format_idx, 'Genre'] = df.loc[genre_format_idx, 'Genre'].apply(corrupt_genre)
# --------------------------- 4. Insert duplicate rows ---------------------------
# Randomly duplicate 50 rows and append them
duplicate_rows = df.sample(n=50, replace=True)
df = pd.concat([df, duplicate_rows], ignore_index=True)
# --------------------------- 5. Shuffle the dataset (optional, adds randomness) ---------------------------
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
# --------------------------- 6. Inspect the noisy data ---------------------------
print("\nShape after inserting noise:", df.shape)
print("\nFirst 10 rows after inserting noise:")
print(df.head(10))
# Count missing values
print("\nMissing values per column:")
print(df.isnull().sum())
# Count Rating outliers (outside the 0-5 range)
rating_outliers = df[(df['Rating'] < 0) | (df['Rating'] > 5)]
print(f"\nNumber of Rating outliers: {len(rating_outliers)}")
# In[5]:
# Save the noisy dataset
df.to_csv(f'movie_data{target_rows}_with_noise.csv', index=False, encoding='utf-8')
print(f"\nNoisy dataset saved as: movie_data{target_rows}_with_noise.csv")
# df.to_csv('movie_data_with_noise.csv', index=False, encoding='utf-8')
# print("\nNoisy dataset saved as: movie_data_with_noise.csv")
# Clean the noise and extract the year
# In[6]:
import pandas as pd
import numpy as np
import re
# Load the noisy dataset
# df = pd.read_csv(f'movie_data{target_rows}_with_noise.csv')
# df = pd.read_csv('movie_data_with_noise.csv')
print("Original noisy data shape:", df.shape)
# Rows with missing Movie_Name: drop them outright
df = df.dropna(subset=['Movie_Name']).reset_index(drop=True)
print("Shape after dropping rows with missing Movie_Name:", df.shape)
# Extract the year into a new Year column
# Regex: \((\d{4})\) captures any 4-digit year inside parentheses
# expand=False returns a Series, which can be assigned directly
df['Year'] = df['Movie_Name'].str.extract(r'\((\d{4})\)', expand=False)
# Convert Year to a nullable integer type (supports NaN, avoids errors when no year is present)
df['Year'] = df['Year'].astype('Int64')  # note: capital-I 'Int64' is pandas' nullable integer dtype
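# Why the nullable dtype matters (a minimal sketch on toy data):
# pd.Series([1998.0, np.nan]).astype('Int64')  # works: [1998, <NA>]
# pd.Series([1998.0, np.nan]).astype('int64')  # raises: NaN cannot be stored in a plain integer column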
# Clean the Movie_Name column: remove the year part and tidy up whitespace
# \s* matches any spaces before the year so none are left behind
# First strip the year (plus any trailing garbage), then strip() leading/trailing spaces, then collapse internal runs of spaces
df['Movie_Name'] = (
    df['Movie_Name']
    .str.replace(r'\s*\((\d{4})\)[\W\d]*$', '', regex=True)  # remove the year and any appended garbage characters
    .str.strip()  # trim leading/trailing spaces
    .str.replace(r'\s+', ' ', regex=True)  # collapse consecutive spaces
)
# Fix Genre formatting errors: normalize the separator back to |
def fix_genre(genre):
    if pd.isna(genre):
        return genre
    # Replace every non-| separator (e.g. /) with |: any single character that is
    # neither a letter nor a digit gets replaced, then runs of | are collapsed
    # Caveat: this also splits on spaces and hyphens inside genre names (e.g. 'Sci-Fi' -> 'Sci|Fi')
    return re.sub(r'\|{2,}', '|', re.sub(r'[^a-zA-Z0-9]', '|', genre)).strip('|')
df['Genre'] = df['Genre'].apply(fix_genre)
# Handle Rating outliers: clip to the 0-5 range
df['Rating'] = df['Rating'].clip(lower=0.0, upper=5.0)
# Handle missing Ratings: fill with the median (alternative: drop via df = df.dropna(subset=['Rating']))
rating_median = df['Rating'].median()
df['Rating'] = df['Rating'].fillna(rating_median)
# Handle User_Id outliers: drop implausibly large values (more than 10x the original maximum)
# First estimate the pre-noise maximum (assuming the pre-noise maximum was reasonable)
# Note: if the original User_Id values were randomly generated, this threshold may need adjusting
original_max_user_id = df['User_Id'].dropna().quantile(0.95)  # 95th percentile of non-missing values as an estimate of the original maximum
df = df[~((df['User_Id'] > original_max_user_id * 10) & pd.notna(df['User_Id']))].reset_index(drop=True)
print("Shape after dropping User_Id outliers:", df.shape)
# Handle missing User_Id: fill with a sentinel (alternatives: fill with the mode, or drop via df = df.dropna(subset=['User_Id']))
# user_id_mode = df['User_Id'].mode()[0]
df['User_Id'] = df['User_Id'].fillna(-1)  # -1 marks a missing User_Id
# Handle missing Genre: fill with "Unknown" (alternative: drop via df = df.dropna(subset=['Genre']))
df['Genre'] = df['Genre'].fillna("Unknown")
# Use -1 to mark a missing Year
df['Year'] = df['Year'].fillna(-1)
# Drop duplicate rows (across all columns); note: deduplicate only after outliers and other noise have been handled
df = df.drop_duplicates().reset_index(drop=True)
print("Shape after dropping duplicates:", df.shape)
# Fix dtypes (make sure the numeric columns have the right types)
df['User_Id'] = df['User_Id'].astype(int)
df['Rating'] = df['Rating'].astype(float)
df['Year'] = df['Year'].astype(int)
# Final validation
print("\nShape after cleaning:", df.shape)
print("\nFirst 10 rows after cleaning:")
print(df.head(10))
print("\nMissing values per column:")
print(df.isnull().sum())
print("\nRating range:", df['Rating'].min(), "~", df['Rating'].max())
print(f"Year range: {df['Year'].min()} ~ {df['Year'].max()}")  # note: the -1 placeholder for missing years shows up as the minimum
# # Save the cleaned dataset
# df.to_csv('movie_data_with_year.csv', index=False, encoding='utf-8')
# print("\nCleaned dataset saved as: movie_data_with_year.csv")
# In[7]:
df.to_csv(f'movie_data{target_rows}_deal.csv', index=False, encoding='utf-8')
print(f"\nCleaned dataset saved as: movie_data{target_rows}_deal.csv")
# df.to_csv('movie_data_deal.csv', index=False, encoding='utf-8')
# print("\nCleaned dataset saved as: movie_data_deal.csv")
# Load the dataset
# In[8]:
df = pd.read_csv(f'movie_data{target_rows}_deal.csv')
# df = pd.read_csv('movie_data_deal.csv')
# Basic dataset info
# In[9]:
print(f"数据集形状: {df.shape}")
print(f"行数(样本量): {df.shape[0]}")
print(f"列数(特征数): {df.shape[1]}")
print("\n列名:")
for i, col in enumerate(df.columns, 1):#序号从 1 开始计数
print(f" {i}. {col}")
print("\n重复列名检查:")
duplicate_cols = df.columns[df.columns.duplicated()].tolist()
if duplicate_cols:
print(f" 发现重复列名: {duplicate_cols}")
else:
print(" 无重复列名")
print("\n前5行数据预览:")
print(df.head())
print("\n数据类型:")
print(df.dtypes)
# Missing values
# In[10]:
print("\n\n缺失值普查")
print("-"*40)
#isnull():DataFrame 的方法,用于检测缺失值。
# sum():对 isnull() 的结果进行求和计算。
missing_info = df.isnull().sum()
# print(missing_info)
missing_pct = (missing_info / len(df) * 100).round(2)#返回行数(样本数量)
# print(df.columns)
print("各列缺失值统计:")
for col in df.columns:
missing_count = missing_info[col]
missing_percent = missing_pct[col]
print(f" {col}: {missing_count}个缺失值 ({missing_percent}%)")
print(f"\n总缺失值比例: {(df.isnull().sum().sum() / (df.shape[0] * df.shape[1]) * 100).round(2)}%")
# Special placeholder detection
# In[11]:
# Detect missing values encoded as special placeholder strings
special_missing = ['', 'NA', 'N/A', 'NaN', 'null', 'NULL', '-', '?', '未知']  # '未知' is the Chinese word for "unknown", kept as data
# include=['object']: select only object-dtype columns
# .columns: the column names of the filtered DataFrame (an Index, directly iterable)
for col in df.select_dtypes(include=['object']).columns:
    special_counts = df[col].isin(special_missing).sum()
    if special_counts > 0:
        print(f"  Column {col}: {special_counts} placeholder-encoded missing values found")
# Duplicate detection
# In[12]:
print("\n\n重复值普查")
print("-"*40)
duplicate_rows = df.duplicated().sum()
print(f"完全重复的行数: {duplicate_rows}")
if duplicate_rows > 0:
print("重复行的示例:")
print(df[df.duplicated(keep=False)])
#keep 参数:控制重复行的标记规则
# keep=False:标记所有重复行(包括首次出现)
# keep='first'(默认):仅标记首次出现的重复行
# keep='last':仅标记最后出现的重复行
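# Tiny demo of keep on toy data (a sketch, not part of the analysis):
# toy = pd.DataFrame({'a': [1, 1, 2]})
# toy.duplicated(keep='first')  # [False, True, False]
# toy.duplicated(keep='last')   # [True, False, False]
# toy.duplicated(keep=False)    # [True, True, False]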
# Per-feature analysis (refined version)
# In[13]:
# =============================================================================
# Feature analysis: dig into each feature with suitable charts and statistics
# =============================================================================
print("\n" + "="*60)
print("Feature analysis: suitable charts and statistics per feature")
print("="*60)
# --------------------------- Rating analysis ---------------------------
print("\n", "-"*20, "1. Rating feature analysis", "-"*20)
# Core summary statistics
rating_stats = df['Rating'].describe()
print("\n📊 Rating summary statistics:")
print(rating_stats)
# Detailed statistics
total_movies = len(df)
rating_variance = df['Rating'].var()
rating_std = df['Rating'].std()
rating_skew = df['Rating'].skew()
rating_kurtosis = df['Rating'].kurtosis()
print(f"\n📈 Rating detail statistics:")
print(f"  Variance: {rating_variance:.4f}")  # larger variance = more spread-out ratings
print(f"  Std dev: {rating_std:.4f}")  # reflects how much ratings fluctuate
print(f"  Skewness: {rating_skew:.4f}")  # >0 right-skewed, <0 left-skewed, ~0 symmetric
print(f"  Kurtosis: {rating_kurtosis:.4f}")  # pandas reports excess kurtosis: >0 heavier tails than normal, <0 lighter
# Extreme-value analysis
max_rating_count = len(df[df['Rating'] == 5.0])
min_rating_count = len(df[df['Rating'] == 0.0])
print(f"\n📊 Rating extremes:")
print(f"  Total movies: {total_movies}")
print(f"  Movies at the maximum rating (5.0): {max_rating_count}, share: {max_rating_count/total_movies:.2%}")
print(f"  Movies at the minimum rating (0.0): {min_rating_count}, share: {min_rating_count/total_movies:.2%}")
# Distribution shape
print(f"\n📊 Rating distribution shape:")
if rating_skew < -0.5:
    print("  ✅ Rating is left-skewed: most users lean towards high ratings")
elif rating_skew > 0.5:
    print("  ✅ Rating is right-skewed: most users lean towards low ratings")
else:
    print("  ✅ Rating is roughly symmetric: ratings are fairly balanced")
# Rating-bin analysis
rating_bins = pd.cut(df['Rating'], bins=[0, 1, 2, 3, 4, 5], include_lowest=True)
rating_distribution = rating_bins.value_counts().sort_index()
print(f"\n📊 Rating distribution by bin:")
for interval, count in rating_distribution.items():
    print(f"  {interval}: {count} movies, share: {count/total_movies:.2%}")
# --------------------------- Genre analysis ---------------------------
print("\n", "-"*20, "2. Genre feature analysis", "-"*20)
# Genre frequency counts
genre_counts = df['Genre'].value_counts()
genre_top10 = genre_counts.head(10)
print(f"\n📊 Genre frequencies (Top 10):")
for genre, count in genre_top10.items():
    percentage = count/total_movies*100
    print(f"  {genre}: {count} occurrences, share: {percentage:.2f}%")
# Genre diversity
unique_genres = df['Genre'].nunique()
print(f"\n📊 Genre diversity:")
print(f"  Distinct genre combinations: {unique_genres}")
print(f"  Average movies per genre combination: {total_movies/unique_genres:.1f}")
# Ratings by genre
genre_rating_stats = df.groupby('Genre')['Rating'].agg(['mean', 'median', 'std', 'count']).round(2)
genre_rating_stats = genre_rating_stats.sort_values('count', ascending=False).head(10)
print(f"\n📊 Rating statistics for the Top 10 genres:")
print(genre_rating_stats)
# Genre stability (smaller std = more consistent ratings)
stable_genres = genre_rating_stats.sort_values('std').head(5)
volatile_genres = genre_rating_stats.sort_values('std', ascending=False).head(5)
print(f"\n📊 Genre stability:")
print(f"  Most stable genres (smallest std):")
for genre, row in stable_genres.iterrows():
    print(f"    {genre}: std={row['std']}, mean={row['mean']}")
print(f"  Most volatile genres (largest std):")
for genre, row in volatile_genres.iterrows():
    print(f"    {genre}: std={row['std']}, mean={row['mean']}")
# --------------------------- Year analysis ---------------------------
print("\n", "-"*20, "3. Year feature analysis", "-"*20)
df_year_valid = df[df['Year'] != -1].copy()  # filter out missing years (.copy() avoids SettingWithCopyWarning when adding 'Decade' below)
if len(df_year_valid) > 0:
    # Year range
    earliest_year = df_year_valid['Year'].min()
    latest_year = df_year_valid['Year'].max()
    year_range = latest_year - earliest_year
    print(f"\n📊 Year range:")
    print(f"  Movies with a valid year: {len(df_year_valid)}")
    print(f"  Year span: {earliest_year} - {latest_year} ({year_range} years)")
    print(f"  Earliest year: {earliest_year}, movies: {len(df_year_valid[df_year_valid['Year'] == earliest_year])}")
    print(f"  Latest year: {latest_year}, movies: {len(df_year_valid[df_year_valid['Year'] == latest_year])}")
    # By decade
    df_year_valid['Decade'] = (df_year_valid['Year'] // 10) * 10
    decade_stats = df_year_valid.groupby('Decade').agg({
        'Rating': ['mean', 'count'],
        'Year': ['min', 'max']
    }).round(2)
    print(f"\n📊 Decade analysis:")
    for decade, row in decade_stats.iterrows():
        mean_rating = row[('Rating', 'mean')]
        count = row[('Rating', 'count')]
        year_min = row[('Year', 'min')]
        year_max = row[('Year', 'max')]
        print(f"  {decade}s: {count} movies, mean rating={mean_rating}, years {year_min}-{year_max}")
    # Year distribution
    year_distribution = df_year_valid['Year'].value_counts().sort_index()
    peak_year = year_distribution.idxmax()
    peak_count = year_distribution.max()
    print(f"\n📊 Year distribution:")
    print(f"  Year with the most movies: {peak_year}, count: {peak_count}")
    print(f"  Average movies per year: {len(df_year_valid)/len(year_distribution):.1f}")
    # Rating trend by decade
    decade_trend = df_year_valid.groupby('Decade')['Rating'].mean()
    print(f"\n📊 Mean rating by decade:")
    for decade, rating in decade_trend.items():
        print(f"  {decade}s: mean rating={rating:.2f}")
    # Rating change between consecutive decades
    if len(decade_trend) > 1:
        trend_change = decade_trend.diff().dropna()
        print(f"\n📊 Rating change between decades:")
        for i, (decade, change) in enumerate(trend_change.items()):
            prev_decade = decade_trend.index[i]
            print(f"  {prev_decade}s → {decade}s: change={change:+.2f}")
else:
    print("⚠️ No valid year data")
# --------------------------- User_Id analysis ---------------------------
print("\n", "-"*20, "4. User_Id feature analysis", "-"*20)
df_user_valid = df[df['User_Id'] != -1]  # filter out missing user IDs
if len(df_user_valid) > 0:
    # User behaviour
    user_stats = df_user_valid.groupby('User_Id')['Rating'].agg(['count', 'mean', 'std'])
    total_users = len(user_stats)
    print(f"\n📊 User behaviour:")
    print(f"  Valid users: {total_users}")
    print(f"  Total rating records: {len(df_user_valid)}")
    print(f"  Average ratings per user: {len(df_user_valid)/total_users:.1f}")
    # Rating frequency per user
    user_freq_stats = user_stats['count'].describe()
    print(f"\n📊 User rating frequency:")
    print(f"  Ratings-per-user statistics:")
    print(f"    min: {user_freq_stats['min']}")
    print(f"    mean: {user_freq_stats['mean']:.1f}")
    print(f"    median: {user_freq_stats['50%']}")
    print(f"    max: {user_freq_stats['max']}")
    # User segmentation
    high_freq_users = user_stats[user_stats['count'] > 50]  # heavy raters
    medium_freq_users = user_stats[(user_stats['count'] >= 10) & (user_stats['count'] <= 50)]  # medium raters
    low_freq_users = user_stats[user_stats['count'] < 10]  # light raters
    print(f"\n📊 User segmentation:")
    print(f"  Heavy raters (>50 ratings): {len(high_freq_users)}, share: {len(high_freq_users)/total_users:.2%}")
    print(f"  Medium raters (10-50 ratings): {len(medium_freq_users)}, share: {len(medium_freq_users)/total_users:.2%}")
    print(f"  Light raters (<10 ratings): {len(low_freq_users)}, share: {len(low_freq_users)/total_users:.2%}")
    # Extreme raters
    extreme_high_users = user_stats[user_stats['mean'] > 4.5]  # extremely generous raters
    extreme_low_users = user_stats[user_stats['mean'] < 2.0]  # extremely harsh raters
    normal_users = user_stats[(user_stats['mean'] >= 2.0) & (user_stats['mean'] <= 4.5)]  # everyone else
    print(f"\n📊 Extreme raters:")
    print(f"  Extremely generous (mean >4.5): {len(extreme_high_users)}, share: {len(extreme_high_users)/total_users:.2%}")
    print(f"  Extremely harsh (mean <2.0): {len(extreme_low_users)}, share: {len(extreme_low_users)/total_users:.2%}")
    print(f"  Typical raters: {len(normal_users)}, share: {len(normal_users)/total_users:.2%}")
    # Rating consistency per user (smaller std = more consistent)
    consistent_users = user_stats.sort_values('std').head(5)
    inconsistent_users = user_stats.sort_values('std', ascending=False).head(5)
    print(f"\n📊 User rating consistency:")
    print(f"  Most consistent users (smallest std):")
    for user_id, row in consistent_users.iterrows():
        print(f"    User {user_id}: std={row['std']:.2f}, mean={row['mean']:.2f}, ratings={row['count']}")
    print(f"  Least consistent users (largest std):")
    for user_id, row in inconsistent_users.iterrows():
        print(f"    User {user_id}: std={row['std']:.2f}, mean={row['mean']:.2f}, ratings={row['count']}")
else:
    print("⚠️ No valid user data")
# Distribution and trend analysis
# In[14]:
# --------------------------- Distribution and trend analysis ---------------------------
print("\n" + "="*50)
print("Distribution and trend analysis")
print("="*50)
# --------------------------- Rating distribution shape ---------------------------
print("\nRating distribution shape:")
# Skewness coefficient (<0 left-skewed, >0 right-skewed, ~0 symmetric)
# Note: scipy.stats.skew defaults to the biased estimator, while pandas' .skew()
# is bias-corrected, so the two can differ slightly on the same data
skewness = stats.skew(df['Rating'])
print(f"Rating skewness: {skewness:.2f}")
if skewness < -0.5:
    print("Rating is left-skewed: most users lean towards high ratings")
elif skewness > 0.5:
    print("Rating is right-skewed: most users lean towards low ratings")
else:
    print("Rating is roughly symmetric: ratings are fairly balanced")
# --------------------------- Year distribution ---------------------------
print("\nYear distribution:")
if len(df_year_valid) > 0:
    year_dist = df_year_valid['Year'].value_counts().sort_index()
    print(f"Year coverage: {df_year_valid['Year'].min()} - {df_year_valid['Year'].max()}")
    print(f"Year with the most movies: {year_dist.idxmax()}, count: {year_dist.max()}")
else:
    print("No valid year data")
# Multivariate relationship analysis
# In[15]:
# --------------------------- Correlation and association analysis ---------------------------
print("\n" + "="*50)
print("Correlation and association analysis")
print("="*50)
# --------------------------- Numeric feature correlation ---------------------------
print("\nNumeric feature correlation:")
# Select the numeric features (invalid Years already filtered out)
numeric_cols = ['Rating', 'Year']
df_corr = df_year_valid[numeric_cols]
corr_matrix = df_corr.corr()
print("Pearson correlation of numeric features:")
print(corr_matrix)
# --------------------------- ANOVA (effect of Genre on Rating) ---------------------------
print("\nANOVA (effect of Genre on Rating):")
# Restrict to the Top 10 genres (avoids too many categories)
top_genres = genre_top10.index.tolist()
df_anova = df[df['Genre'].isin(top_genres)]
# Extract the Rating values of each Genre group
genre_rating_groups = [group['Rating'].values for name, group in df_anova.groupby('Genre')]
# Run one-way ANOVA
f_stat, p_value = stats.f_oneway(*genre_rating_groups)
print(f"ANOVA F statistic: {f_stat:.2f}")
print(f"p-value: {p_value:.6f}")
if p_value < 0.05:
    print("Conclusion: Ratings differ significantly across genres (p<0.05)")
else:
    print("Conclusion: No significant rating difference across genres (p≥0.05)")
# Movie feature analysis
# In[16]:
# --------------------------- Business-oriented deep dive ---------------------------
print("\n" + "="*50)
print("Business deep dive: what the Top 20 highest-rated movies share")
print("="*50)
# Select the Top 20 movies (by average rating desc, ties broken by rating count desc)
top20_movies = df.groupby('Movie_Name').agg(
    avg_rating=('Rating', 'mean'),
    rating_count=('Rating', 'count'),
    genre=('Genre', 'first'),
    year=('Year', 'first')
).sort_values(by=['avg_rating', 'rating_count'], ascending=False).head(20)
print("Top 20 highest-rated movies:")
print(top20_movies)
# Shared characteristics
print("\nShared characteristics of the Top 20:")
# Genre distribution
top20_genres = top20_movies['genre'].value_counts()
print(f"Dominant genres in the Top 20: {top20_genres.index.tolist()}")
# Year distribution
top20_years = top20_movies[top20_movies['year'] != -1]['year'].value_counts()
if len(top20_years) > 0:
    print(f"Dominant years in the Top 20: {top20_years.index.tolist()}")
    print(f"Average year of the Top 20: {top20_movies[top20_movies['year'] != -1]['year'].mean():.0f}")
# Plotting the basic statistics
# In[17]:
import matplotlib.font_manager as fm
# List the CJK-capable fonts available on this system
chinese_fonts = [f.name for f in fm.fontManager.ttflist if 'Sim' in f.name or 'Hei' in f.name or 'Song' in f.name or 'Kai' in f.name]
print("Available CJK fonts:", chinese_fonts)
# Print full info (name + path) for Microsoft YaHei variants
for font in fm.fontManager.ttflist:
    if 'YaHei' in font.name or '微软雅黑' in font.name:
        print(f"Font name: {font.name}")
        print(f"Font path: {font.fname}")
        print("-" * 50)
for f in fm.fontManager.ttflist:
    if 'hei' in f.name.lower() or '黑' in f.name:
        print(f"Font name: {f.name}, font file: {f.fname}")
# In[18]:
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import seaborn as sns
# Configure a CJK font (fixes Chinese glyphs rendering as boxes)
# Option 1: point at a system font file
font_path = 'C:/Windows/Fonts/msyh.ttc'  # Microsoft YaHei
# Check that the font file exists
import os
if os.path.exists(font_path):
    # Option 1: a FontProperties object, applied per-text
    my_font = fm.FontProperties(fname=font_path)
    # Option 2: set the global font (recommended)
    plt.rcParams['font.family'] = ['Microsoft YaHei', 'SimHei', 'DejaVu Sans']
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei', 'SimHei', 'DejaVu Sans']
    plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly
    print(f"✅ CJK font configured: {font_path}")
else:
    # Fall back to other fonts if Microsoft YaHei is unavailable
    print("⚠️ Microsoft YaHei not found, falling back to other fonts")
    plt.rcParams['font.family'] = ['SimHei', 'DejaVu Sans']
    plt.rcParams['font.sans-serif'] = ['SimHei', 'DejaVu Sans']
    plt.rcParams['axes.unicode_minus'] = False
    my_font = None
# Seaborn theme. IMPORTANT: sns.set_theme()/sns.set() reset matplotlib's rcParams
# to seaborn defaults, silently discarding the font settings made above. That is
# why a "global" font set before calling sns.set() appears not to work, while
# per-text set_fontproperties() still does. Set the theme first, then re-apply
# the font settings (mirroring the choices above).
sns.set_theme(style="whitegrid")
plt.rcParams['font.family'] = ['Microsoft YaHei', 'SimHei', 'DejaVu Sans']
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei', 'SimHei', 'DejaVu Sans']
plt.rcParams['axes.unicode_minus'] = False
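# Optional sanity check (a minimal sketch): render a CJK title and save it; if
# the configured font is unavailable, matplotlib falls back and logs a
# "Glyph ... missing from current font" warning
fig_check, ax_check = plt.subplots(figsize=(2, 1))
ax_check.set_title('中文字体测试 -1.5')  # also exercises axes.unicode_minus
fig_check.savefig('font_check.png', dpi=100)
plt.close(fig_check)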
# --------------------------- Visualisation ---------------------------
print("\n" + "="*50)
print("Visual analysis (charts generated)")
print("="*50)
# --------------------------- Rating distribution (histogram + box plot) ---------------------------
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
# Histogram
sns.histplot(df['Rating'], bins=10, kde=True, ax=axes[0])
axes[0].set_title('Rating histogram')
axes[0].set_xlabel('Rating')
axes[0].set_ylabel('Number of movies')
# Box plot
sns.boxplot(x=df['Rating'], ax=axes[1])
axes[1].set_title('Rating box plot')
axes[1].set_xlabel('Rating')
plt.tight_layout()
plt.savefig('images/rating_distribution.png', dpi=300)
plt.close()
# --------------------------- Genre share (pie chart) ---------------------------
plt.figure(figsize=(10, 8))
genre_top8 = genre_counts.head(8).copy()  # keep the Top 8, lump the rest into "Other"
genre_top8['Other'] = genre_counts.iloc[8:].sum()
# plt.pie(genre_top8.values, labels=genre_top8.index, autopct='%1.1f%%', startangle=90)
wedges, texts, autotexts = plt.pie(
    genre_top8.values,
    labels=genre_top8.index,  # label list, including "Other"
    autopct='%1.1f%%',
    startangle=90
)
# Apply the font to every label, including "Other"
for text in texts:
    if my_font is not None:
        text.set_fontproperties(my_font)
    else:
        text.set_fontfamily(plt.rcParams['font.family'][0])
plt.title('Genre share (Top 8 + Other)')
plt.axis('equal')
plt.savefig('images/genre_pie.png', dpi=300)
plt.close()
# --------------------------- Mean rating by decade (line chart) ---------------------------
if len(decade_stats) > 0:
    plt.figure(figsize=(10, 6))
    # Use the Rating means from decade_stats
    sns.lineplot(x=decade_stats.index, y=decade_stats[('Rating', 'mean')], marker='o')
    plt.title('Mean movie rating by decade')
    plt.xlabel('Decade')
    plt.ylabel('Mean rating')
    plt.xticks(decade_stats.index)
    plt.grid(True)
    plt.savefig('images/decade_rating_trend.png', dpi=300)
    plt.close()
# --------------------------- Rating by Top 10 genre (box plots) ---------------------------
plt.figure(figsize=(12, 8))
sns.boxplot(x='Genre', y='Rating', data=df_anova)
plt.xticks(rotation=45, ha='right')
plt.title('Rating distribution across the Top 10 genres')
plt.xlabel('Genre')
plt.ylabel('Rating')
plt.tight_layout()
plt.savefig('images/genre_rating_boxplot.png', dpi=300)
plt.close()
# --------------------------- Year vs Rating (scatter plot) ---------------------------
# Regression line: red (color='red'), shows the linear trend between year and rating
if len(df_year_valid) > 0:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x='Year', y='Rating', data=df_year_valid, alpha=0.5)
    sns.regplot(x='Year', y='Rating', data=df_year_valid, scatter=False, color='red')
    plt.title('Year vs Rating scatter plot')
    plt.xlabel('Year')
    plt.ylabel('Rating')
    plt.grid(True)
    plt.savefig('images/year_rating_corr.png', dpi=300)
    plt.close()
print("All charts saved as PNG files!")
print("\nData analysis complete!")
# ### Machine learning
# Data preprocessing
# In[22]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer, StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,
                              RandomForestClassifier, GradientBoostingClassifier)
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score,
                             accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, silhouette_score,
                             calinski_harabasz_score, davies_bouldin_score)
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
# --------------------------- Data preprocessing ---------------------------
df['User_Id'] = df['User_Id'].astype(str)
# Build a unique movie identifier (Movie_Id)
movie_le = LabelEncoder()
df['Movie_Id'] = movie_le.fit_transform(df['Movie_Name'])
# Handle the multi-label Genre column (split, then one-hot encode)
df['Genre_List'] = df['Genre'].str.split('|')  # split the multi-label string
mlb = MultiLabelBinarizer()
genre_encoded = pd.DataFrame(mlb.fit_transform(df['Genre_List']), columns=mlb.classes_, index=df.index)
df = pd.concat([df, genre_encoded], axis=1)  # append the one-hot Genre features
# Encode User_Id with LabelEncoder
# (note: LabelEncoder codes are arbitrary ordinals; fine for tree models, but
# linear models will treat them as magnitudes)
user_le = LabelEncoder()
df['User_Id_Encoded'] = user_le.fit_transform(df['User_Id'])
# Rating prediction (regression task)
# In[ ]:
# --------------------------- Feature selection and train/test split ---------------------------
# Features: User_Id_Encoded, Movie_Id, Year, plus the one-hot Genre columns
feature_cols = ['User_Id_Encoded', 'Movie_Id', 'Year'] + list(mlb.classes_)
X = df[feature_cols]
y = df['Rating']  # target: the rating
# 80% train / 20% test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# --------------------------- Model training ---------------------------
# Several regression models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}
# Train and evaluate every model
results = {}
for model_name, model in models.items():
    # Train
    model.fit(X_train, y_train)
    # Predict
    y_pred = model.predict(X_test)
    # Metrics
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # Store the results
    results[model_name] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R²': r2,
        'predictions': y_pred
    }
# --------------------------- Evaluation and results ---------------------------
# Print each model's metrics
print("="*50)
print("Model evaluation results")
print("="*50)
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    print(f"  MSE (mean squared error): {metrics['MSE']:.4f}")
    print(f"  RMSE (root mean squared error): {metrics['RMSE']:.4f}")
    print(f"  MAE (mean absolute error): {metrics['MAE']:.4f}")
    print(f"  R² (coefficient of determination): {metrics['R²']:.4f}")
# Compare true vs predicted ratings on the test set
print("\n" + "="*50)
print("Test set: true vs predicted ratings")
print("="*50)
test_results = pd.DataFrame({
    'true rating': y_test.values,
    'Linear Regression': results['Linear Regression']['predictions'],
    'Random Forest': results['Random Forest']['predictions'],
    'Gradient Boosting': results['Gradient Boosting']['predictions']
})
print(test_results)
# --------------------------- Prediction example ---------------------------
# Example: predict the rating of a new sample
print("\n" + "="*50)
print("New-sample prediction example")
print("="*50)
# Build a new sample (User_Id=-1, Movie_Name='Sleepers', Year=1996, Genre='Thriller')
new_sample = {
    'User_Id': '-1',
    'Movie_Name': 'Sleepers',
    'Year': 1996,
    'Genre': 'Thriller'
}
# Encode the new sample's features
# (note: LabelEncoder.transform raises a ValueError on labels never seen during
# fit; MultiLabelBinarizer.transform ignores unknown classes with a warning)
new_sample['User_Id_Encoded'] = user_le.transform([new_sample['User_Id']])[0]
new_sample['Movie_Id'] = movie_le.transform([new_sample['Movie_Name']])[0]
# One-hot encode the Genre
new_genre_list = new_sample['Genre'].split('|')
new_genre_encoded = mlb.transform([new_genre_list])[0]
# Assemble the full feature vector
new_features = np.array([
    new_sample['User_Id_Encoded'],
    new_sample['Movie_Id'],
    new_sample['Year']
] + list(new_genre_encoded)).reshape(1, -1)
# Predict the rating with every model
for model_name, model in models.items():
    predicted_rating = model.predict(new_features)[0]
    print(f"{model_name} predicted rating: {predicted_rating:.2f}")
# User clustering
# In[25]:
# --------------------------- Clustering: users ---------------------------
print("="*60)
print("Clustering analysis: users")
print("="*60)
# Build user features: mean rating, rating count, genre preferences (mean of each one-hot Genre column)
# One-step alternative (superseded by the three explicit steps below, kept for reference):
# user_features = df.groupby('User_Id').agg({
#     'Rating': ['mean', 'count'],
#     **{genre: 'mean' for genre in mlb.classes_}
# }).reset_index()
# user_features.columns = ['User_Id'] + ['_'.join(col).strip() for col in user_features.columns[1:]]
# Step 1: aggregate the Rating column (mean rating, rating count)
user_rating_features = df.groupby('User_Id')['Rating'].agg(['mean', 'count']).reset_index()
user_rating_features.columns = ['User_Id', 'Rating_mean', 'Rating_count']
# Step 2: aggregate the genre columns (preference strength per genre)
genre_cols = list(mlb.classes_)  # all genre column names
user_genre_features = df.groupby('User_Id')[genre_cols].mean().reset_index()
# Step 3: merge the feature sets
user_features = pd.merge(user_rating_features, user_genre_features, on='User_Id', how='inner')
# Standardize the features (clustering algorithms are scale-sensitive)
scaler = StandardScaler()
user_features_scaled = scaler.fit_transform(user_features.drop('User_Id', axis=1))
# Try several clustering algorithms
cluster_algorithms = {
    'K-Means (K=3)': KMeans(n_clusters=3, random_state=42),
    'DBSCAN': DBSCAN(eps=0.5, min_samples=2),
    'Hierarchical (K=3)': AgglomerativeClustering(n_clusters=3)
}
# Fit each clustering model and evaluate it
for alg_name, alg in cluster_algorithms.items():
    clusters = alg.fit_predict(user_features_scaled)
    user_features[alg_name.replace(' ', '_')] = clusters
    # Evaluate (DBSCAN may label noise points -1, which must be filtered out)
    if alg_name != 'DBSCAN' or len(set(clusters)) > 1:
        valid_clusters = clusters[clusters != -1]
        valid_features = user_features_scaled[clusters != -1]
        if len(set(valid_clusters)) > 1:
            silhouette = silhouette_score(valid_features, valid_clusters)
            calinski = calinski_harabasz_score(valid_features, valid_clusters)
            davies = davies_bouldin_score(valid_features, valid_clusters)
            print(f"\n{alg_name} clustering:")
            print(f"  Number of clusters: {len(set(clusters))}")
            print(f"  Silhouette score: {silhouette:.4f} (closer to 1 is better)")
            print(f"  Calinski-Harabasz index: {calinski:.4f} (larger is better)")
            print(f"  Davies-Bouldin index: {davies:.4f} (smaller is better)")
# Visualise the user clusters (PCA to 2 components)
pca = PCA(n_components=2)
user_pca = pca.fit_transform(user_features_scaled)
user_features['PCA1'] = user_pca[:, 0]
user_features['PCA2'] = user_pca[:, 1]
plt.figure(figsize=(12, 6))
for i, alg_name in enumerate(cluster_algorithms.keys()):
    plt.subplot(1, 3, i+1)
    cluster_col = alg_name.replace(' ', '_')
    sns.scatterplot(x='PCA1', y='PCA2', hue=cluster_col, data=user_features, palette='viridis')
    plt.title(f'User clusters: {alg_name}')
    plt.xlabel('PCA1')
    plt.ylabel('PCA2')
    plt.legend(title='Cluster')
plt.tight_layout()
plt.savefig('user_clustering.png', dpi=300)
plt.close()
# Summarise the user clusters
print("\nUser clustering overview (K-Means):")
user_cluster_summary = user_features.groupby('K-Means_(K=3)').agg({
    'User_Id': 'count',
    'Rating_mean': 'mean',
    'Rating_count': 'mean'
}).rename(columns={'User_Id': 'users', 'Rating_mean': 'mean rating', 'Rating_count': 'mean rating count'})
print(user_cluster_summary)
# Movie clustering
# In[ ]:
# --------------------------- Clustering: movies ---------------------------
print("\n" + "="*60)
print("Clustering analysis: movies")
print("="*60)
# Build movie features: mean rating, year, genre encoding (mean of the one-hot Genre columns)
movie_features = df.groupby('Movie_Name').agg({
    'Rating': 'mean',  # mean rating
    'Year': 'first',  # release year
    **{genre: 'mean' for genre in mlb.classes_}  # genre encoding (0/1)
}).reset_index()
# Standardize the features
movie_features_scaled = scaler.fit_transform(movie_features.drop('Movie_Name', axis=1))
# Fit K-Means (K=3)
kmeans_movie = KMeans(n_clusters=3, random_state=42)
movie_features['Cluster'] = kmeans_movie.fit_predict(movie_features_scaled)
# Evaluate the movie clustering
silhouette_movie = silhouette_score(movie_features_scaled, movie_features['Cluster'])
calinski_movie = calinski_harabasz_score(movie_features_scaled, movie_features['Cluster'])
print("Movie K-Means clustering quality:")
print(f"  Silhouette score: {silhouette_movie:.4f}")
print(f"  Calinski-Harabasz index: {calinski_movie:.4f}")
# Visualise the movie clusters (PCA projection)
movie_pca = pca.fit_transform(movie_features_scaled)
movie_features['PCA1'] = movie_pca[:, 0]
movie_features['PCA2'] = movie_pca[:, 1]
plt.figure(figsize=(8, 6))
sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=movie_features, palette='viridis', s=100)
for i, row in movie_features.iterrows():
    plt.text(row['PCA1'], row['PCA2'], row['Movie_Name'].split()[0], fontsize=8)
plt.title('Movie clusters (PCA projection)')
plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.legend(title='Cluster')
plt.tight_layout()
plt.savefig('movie_clustering.png', dpi=300)
plt.close()
# List the movie clusters
print("\nMovie clustering results:")
for cluster in sorted(movie_features['Cluster'].unique()):
    cluster_movies = movie_features[movie_features['Cluster'] == cluster]['Movie_Name'].tolist()
    print(f"\nCluster {cluster} movies: {', '.join(cluster_movies)}")
# Classification analysis
# In[ ]:
# --------------------------- Classification: rating-level prediction ---------------------------
print("\n" + "="*60)
print("Classification analysis: rating-level prediction")
print("="*60)
# Define the class labels: bucket ratings into 3 levels
# High: >4.0, Medium: 2.0-4.0, Low: <2.0
df['Rating_Class'] = pd.cut(df['Rating'], bins=[-float('inf'), 2.0, 4.0, float('inf')],
                            labels=['Low', 'Medium', 'High'])
# Feature selection
feature_cols = ['User_Id_Encoded', 'Movie_Id', 'Year'] + list(mlb.classes_)
X = df[feature_cols]
y = df['Rating_Class']
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Try several classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}
# Train each classifier and evaluate it
classification_results = {}
for clf_name, clf in classifiers.items():
    # Train
    clf.fit(X_train, y_train)
    # Predict
    y_pred = clf.predict(X_test)
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    cm = confusion_matrix(y_test, y_pred)
    # Store the results
    classification_results[clf_name] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'F1': f1,
        'confusion matrix': cm,
        'predictions': y_pred
    }
    # Print the metrics
    print(f"\n{clf_name} classification results:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1: {f1:.4f}")
    print(f"  Confusion matrix:\n{cm}")
# Visualise the confusion matrix of every classifier
plt.figure(figsize=(10, 6))
for i, (clf_name, metrics) in enumerate(classification_results.items()):
    plt.subplot(1, 3, i+1)
    sns.heatmap(metrics['confusion matrix'], annot=True, fmt='d', cmap='Blues',
                xticklabels=['Low', 'Medium', 'High'],
                yticklabels=['Low', 'Medium', 'High'])
    plt.title(f'{clf_name} confusion matrix')
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
plt.tight_layout()
plt.savefig('classification_confusion_matrix.png', dpi=300)
plt.close()
# Compare true vs predicted levels on the test set
print("\nTest set: true vs predicted levels:")
test_results = pd.DataFrame({
    'true level': y_test.values,
    **{clf_name: metrics['predictions'] for clf_name, metrics in classification_results.items()}
})
print(test_results)
# =============================================================================
# Extra: data statistics, machine-learning modelling and visualisation
# =============================================================================
# In[19]:
print("\n" + "="*60)
print("新增:数据统计与机器学习建模")
print("="*60)
# 1. 对特定数据列进行统计计算
print("\n1. 特定数据列统计计算")
print("-"*40)
# Rating列详细统计
print("Rating列统计信息:")
rating_stats = df['Rating'].describe()
print(rating_stats)
# 计算方差和标准差
rating_variance = df['Rating'].var()
rating_std = df['Rating'].std()
print(f"Rating方差:{rating_variance:.4f}")
print(f"Rating标准差:{rating_std:.4f}")
# 偏度和峰度
rating_skew = df['Rating'].skew()
rating_kurtosis = df['Rating'].kurtosis()
print(f"Rating偏度:{rating_skew:.4f}")
print(f"Rating峰度:{rating_kurtosis:.4f}")
# Year statistics (missing values filtered out)
df_year_valid = df[df['Year'] != -1]
if len(df_year_valid) > 0:
    print("\nYear statistics (valid years):")
    year_stats = df_year_valid['Year'].describe()
    print(year_stats)
    print(f"Year variance: {df_year_valid['Year'].var():.2f}")
    print(f"Year std dev: {df_year_valid['Year'].std():.2f}")
# User_Id statistics
# Note: the ML-preprocessing cell above cast User_Id to str; coerce it back to
# numeric here so the -1 comparison and describe()/var() behave as intended
user_id_numeric = pd.to_numeric(df['User_Id'], errors='coerce')
df_user_valid = df[user_id_numeric != -1].copy()
df_user_valid['User_Id'] = user_id_numeric[user_id_numeric != -1]
if len(df_user_valid) > 0:
    print("\nUser_Id statistics (valid users):")
    user_stats = df_user_valid['User_Id'].describe()
    print(user_stats)
    print(f"User_Id variance: {df_user_valid['User_Id'].var():.2f}")
    print(f"User_Id std dev: {df_user_valid['User_Id'].std():.2f}")
# In[20]:
# 2. Preprocessing and feature engineering
print("\n2. Preprocessing and feature engineering")
print("-"*40)
# Build the dataset for machine learning
# Pick the numeric and categorical features
numeric_features = ['Rating', 'Year']
# Handle missing values
df_ml = df.copy()
# Coerce User_Id back to numeric (it was cast to str in the ML-preprocessing cell)
df_ml['User_Id'] = pd.to_numeric(df_ml['User_Id'], errors='coerce').fillna(-1)
# Replace the -1 sentinels with a sensible value
df_ml['Year'] = df_ml['Year'].replace(-1, df_ml[df_ml['Year'] != -1]['Year'].median() if len(df_ml[df_ml['Year'] != -1]) > 0 else 2000)
df_ml['User_Id'] = df_ml['User_Id'].replace(-1, df_ml[df_ml['User_Id'] != -1]['User_Id'].median() if len(df_ml[df_ml['User_Id'] != -1]) > 0 else 0)
# Feature engineering: derive new features
# How often each movie was rated
df_ml['movie_rating_count'] = df_ml.groupby('Movie_Name')['Rating'].transform('count')
# How often each user rated
df_ml['user_rating_count'] = df_ml.groupby('User_Id')['Rating'].transform('count')
# Each user's mean rating
df_ml['user_avg_rating'] = df_ml.groupby('User_Id')['Rating'].transform('mean')
# Genre feature: keep only the primary genre
def get_main_genre(genre_str):
    if pd.isna(genre_str) or genre_str == "Unknown":
        return "Unknown"
    genres = genre_str.split('|')
    return genres[0] if genres else "Unknown"
df_ml['main_genre'] = df_ml['Genre'].apply(get_main_genre)
# Encode the primary genre
from sklearn.preprocessing import LabelEncoder
le_genre = LabelEncoder()
df_ml['genre_encoded'] = le_genre.fit_transform(df_ml['main_genre'])
print("Feature engineering done; new features:")
print("- movie_rating_count: how many ratings each movie received")
print("- user_rating_count: how many ratings each user gave")
print("- user_avg_rating: each user's mean rating")
print("- genre_encoded: encoded primary genre")
print(f"\nShape after processing: {df_ml.shape}")
print("First 5 rows after processing:")
print(df_ml[['Rating', 'Year', 'movie_rating_count', 'user_rating_count', 'user_avg_rating', 'genre_encoded']].head())
# In[21]:
# 3. Clustering analysis (K-means)
print("\n3. Clustering analysis (K-means)")
print("-"*40)
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
# Clustering features
cluster_features = ['Rating', 'Year', 'movie_rating_count', 'user_rating_count', 'user_avg_rating', 'genre_encoded']
X_cluster = df_ml[cluster_features].dropna()
# Standardize
scaler = StandardScaler()
X_cluster_scaled = scaler.fit_transform(X_cluster)
# Use the elbow method to pick the number of clusters
print("Picking the number of clusters with the elbow method...")
wcss = []  # within-cluster sum of squares
silhouette_scores = []
K_range = range(2, 8)
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_cluster_scaled)
    wcss.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_cluster_scaled, kmeans.labels_))
# Plot the elbow and silhouette curves
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(K_range, wcss, 'bo-')
plt.xlabel('Number of clusters K')
plt.ylabel('Within-cluster sum of squares (WCSS)')
plt.title('Elbow method for choosing K')
plt.grid(True)
plt.subplot(1, 2, 2)
plt.plot(K_range, silhouette_scores, 'ro-')
plt.xlabel('Number of clusters K')
plt.ylabel('Silhouette score')
plt.title('Silhouette score for choosing K')
plt.grid(True)
plt.tight_layout()
plt.savefig('kmeans_elbow_silhouette.png', dpi=300, bbox_inches='tight')
plt.close()
# Pick the best K (by silhouette score)
best_k = K_range[np.argmax(silhouette_scores)]
print(f"Best K: {best_k} (silhouette score: {max(silhouette_scores):.4f})")
# Cluster with the best K
kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_cluster_scaled)
# Attach the cluster labels to the dataset
df_ml['cluster'] = -1  # initialise the label column
df_ml.loc[X_cluster.index, 'cluster'] = cluster_labels
# Profile the clusters
print(f"\nCluster distribution:")
cluster_counts = df_ml['cluster'].value_counts().sort_index()
for cluster_id, count in cluster_counts.items():
    if cluster_id != -1:  # skip unclustered rows
        cluster_data = df_ml[df_ml['cluster'] == cluster_id]
        print(f"Cluster {cluster_id}: {count} samples")
        print(f"  mean rating: {cluster_data['Rating'].mean():.2f}")
        print(f"  mean year: {cluster_data['Year'].mean():.0f}")
        print(f"  dominant genre: {cluster_data['main_genre'].mode().iloc[0] if len(cluster_data['main_genre'].mode()) > 0 else 'Unknown'}")
# Visualise the clusters (PCA to 2 components)
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_cluster_scaled)
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis', alpha=0.6)
plt.colorbar(scatter, label='Cluster')
plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%})')
plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')
plt.title('K-means clusters (PCA projection)')
plt.grid(True, alpha=0.3)
plt.savefig('kmeans_clustering_pca.png', dpi=300, bbox_inches='tight')
plt.close()
print("Clustering analysis done!")
# In[22]:
# 4. Regression analysis (rating prediction)
print("\n4. Regression analysis (rating prediction)")
print("-"*40)
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Regression features and target
regression_features = ['Year', 'movie_rating_count', 'user_rating_count', 'user_avg_rating', 'genre_encoded']
X_reg = df_ml[regression_features]
y_reg = df_ml['Rating']
# Train/test split
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
# Try several regressors
regressors = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}
# Train each regressor and evaluate it
regression_results = {}
for reg_name, reg in regressors.items():
    # Train
    reg.fit(X_train_reg, y_train_reg)
    # Predict
    y_pred_reg = reg.predict(X_test_reg)
    # Metrics
    mse = mean_squared_error(y_test_reg, y_pred_reg)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test_reg, y_pred_reg)
    r2 = r2_score(y_test_reg, y_pred_reg)
    # Store the results
    regression_results[reg_name] = {
        'MSE': mse,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'predictions': y_pred_reg
    }
    # Print the metrics
    print(f"\n{reg_name} regression results:")
    print(f"  Mean squared error (MSE): {mse:.4f}")
    print(f"  Root mean squared error (RMSE): {rmse:.4f}")
    print(f"  Mean absolute error (MAE): {mae:.4f}")
    print(f"  Coefficient of determination (R²): {r2:.4f}")
# Visualise the regression results
plt.figure(figsize=(15, 10))
# True vs predicted scatter plots
for i, (reg_name, metrics) in enumerate(regression_results.items()):
    plt.subplot(2, 3, i+1)
    plt.scatter(y_test_reg, metrics['predictions'], alpha=0.5)
    plt.plot([y_test_reg.min(), y_test_reg.max()], [y_test_reg.min(), y_test_reg.max()], 'r--', lw=2)
    plt.xlabel('True rating')
    plt.ylabel('Predicted rating')
    plt.title(f'{reg_name}\nR² = {metrics["R2"]:.3f}')
    plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('regression_results.png', dpi=300, bbox_inches='tight')
plt.close()
# Compare model performance
plt.figure(figsize=(12, 6))
models = list(regression_results.keys())
r2_scores = [metrics['R2'] for metrics in regression_results.values()]
rmse_scores = [metrics['RMSE'] for metrics in regression_results.values()]
plt.subplot(1, 2, 1)
plt.bar(models, r2_scores, color='skyblue')
plt.title('R² by regression model')
plt.ylabel('R²')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)
plt.subplot(1, 2, 2)
plt.bar(models, rmse_scores, color='lightcoral')
plt.title('RMSE by regression model')
plt.ylabel('RMSE')
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('regression_model_comparison.png', dpi=300, bbox_inches='tight')
plt.close()
print("\nRegression analysis done!")
# In[23]:
# 5. Model assessment and improvement suggestions
print("\n5. Model assessment and improvement suggestions")
print("-"*40)
# Find the best-performing model
best_reg_model = max(regression_results.items(), key=lambda x: x[1]['R2'])
print(f"Best regression model: {best_reg_model[0]} (R² = {best_reg_model[1]['R2']:.4f})")
# Rules of thumb for judging fit quality
print("\nFit-quality rules of thumb:")
print("- R² > 0.8: excellent fit")
print("- R² 0.6-0.8: good fit")
print("- R² 0.4-0.6: moderate fit")
print("- R² < 0.4: poor fit")
# Diagnose likely problems and suggest improvements
print("\nLikely problems and suggested improvements:")
if best_reg_model[1]['R2'] < 0.4:
    print("❌ The model fits poorly; likely causes:")
    print("  1. Features correlate weakly with the target")
    print("  2. The data is very noisy")
    print("  3. Insufficient feature engineering")
    print("  4. A more complex model or hyperparameter tuning is needed")
    print("\n💡 Suggested improvements:")
    print("  1. Explore more features (e.g. text or temporal features)")
    print("  2. Run feature selection to drop redundant features")
    print("  3. Try deep-learning models")
    print("  4. Tune hyperparameters")
    print("  5. Increase data volume or quality")
elif best_reg_model[1]['R2'] < 0.6:
    print("⚠️ The model fit is moderate; there is room to improve")
    print("💡 Suggested improvements:")
    print("  1. Refine the feature engineering")
    print("  2. Try ensemble methods")
    print("  3. Tune with cross-validation")
elif best_reg_model[1]['R2'] < 0.8:
    print("✅ The model fits well")
    print("💡 Possible refinements:")
    print("  1. Fine-grained hyperparameter tuning")
    print("  2. Feature-importance analysis")
    print("  3. Model ensembling/blending")
else:
    print("🎉 Excellent model fit!")
# Feature-importance analysis (random forest as an example)
if 'Random Forest' in regression_results:
    rf_reg = regressors['Random Forest']
    feature_importance = pd.DataFrame({
        'feature': regression_features,
        'importance': rf_reg.feature_importances_
    }).sort_values('importance', ascending=False)
    print("\nRandom forest feature importances:")
    print(feature_importance)
    # Plot the importances
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance['feature'], feature_importance['importance'])
    plt.xlabel('Feature importance')
    plt.title('Random forest feature importances')
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    plt.close()
print("\n" + "="*60)
print("数据统计与机器学习建模完成!")
print("="*60)
# 保存处理后的数据集
df_ml.to_csv('movie_data_ml_processed.csv', index=False, encoding='utf-8')
print("\n处理后的数据集已保存为:movie_data_ml_processed.csv")