import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
import chardet
import gc
import joblib
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
# 配置Dask进度条显示
ProgressBar().register()
def detect_encoding(file_path):
    """Guess a file's text encoding from a leading byte sample.

    Reads at most the first 10 KB of the file and runs chardet over it.

    Returns:
        tuple: (encoding_name, confidence) as reported by chardet.
    """
    with open(file_path, 'rb') as fh:
        sample = fh.read(10000)
    guess = chardet.detect(sample)
    return guess['encoding'], guess['confidence']
def load_data_with_dask(days=32):
    """Load exposure/click/play logs for the first `days` days with Dask.

    Expects per-day CSV files named see_DD.csv, click_DD.csv and
    playplus_DD.csv in the working directory.

    Returns:
        tuple: (all_see, all_click, all_play) dask DataFrames; exposures and
        clicks are de-duplicated on (did, vid).
    """
    see_parts, click_parts, play_parts = [], [], []
    for day_idx in range(1, days + 1):
        tag = f"{day_idx:02d}"
        # Exposure log, tagged with its day for later per-day statistics.
        see_day = dd.read_csv(
            f'see_{tag}.csv',
            dtype={'did': 'str', 'vid': 'str'},
            blocksize='128MB'
        ).assign(day=tag)
        see_parts.append(see_day)
        # Click log; keep only the join keys plus the click timestamp.
        click_day = dd.read_csv(
            f'click_{tag}.csv',
            dtype={'did': 'str', 'vid': 'str', 'item_cid': 'str'},
            blocksize='128MB'
        )[['did', 'vid', 'click_time']]
        click_parts.append(click_day)
        # Play log; keep only the join keys plus the play duration.
        play_day = dd.read_csv(
            f'playplus_{tag}.csv',
            dtype={'did': 'str', 'vid': 'str', 'item_cid': 'str'},
            blocksize='128MB'
        )[['did', 'vid', 'play_time']]
        play_parts.append(play_day)
    # Stack the per-day frames; keep one exposure/click per (did, vid) pair.
    all_see = dd.concat(see_parts).drop_duplicates(['did', 'vid'])
    all_click = dd.concat(click_parts).drop_duplicates(['did', 'vid'])
    all_play = dd.concat(play_parts)
    # Basic cardinality stats (forces a dask computation).
    total_users = all_see['did'].nunique().compute()
    total_videos = all_see['vid'].nunique().compute()
    print(f"Total unique users: {total_users}, Total unique videos: {total_videos}")
    return all_see, all_click, all_play
def prepare_user_features(all_see, all_click, all_play, video_info):
    """Build per-user behavior features for users that appear in the logs.

    Aggregates exposure count, click count, total play time, smoothed CTR,
    average play time and number of active days per device id (did).
    `video_info` is accepted for signature parity but not used here.

    Returns:
        pandas.DataFrame: one row per did with the feature columns above.
    """
    print("Preparing user behavior features for users with history...")
    exposure_counts = all_see.groupby('did').size().rename('user_exposure_count').compute().astype('int32')
    click_counts = all_click.groupby('did').size().rename('user_click_count').compute().astype('int32')
    play_totals = all_play.groupby('did')['play_time'].sum().rename('total_play_time').compute().astype('float32')
    feats = pd.concat([exposure_counts, click_counts, play_totals], axis=1).fillna(0)
    # Smoothed ratios: +1e-6 guards against division by zero.
    feats['user_ctr'] = feats['user_click_count'] / (feats['user_exposure_count'] + 1e-6)
    feats['avg_play_time'] = feats['total_play_time'] / (feats['user_click_count'] + 1e-6)
    # Number of distinct days each user was exposed on.
    active_days = all_see.groupby('did')['day'].nunique().compute().rename('active_days').astype('int8')
    feats = feats.merge(active_days, left_index=True, right_index=True, how='left').fillna(0)
    return feats.reset_index()
def prepare_video_features(all_see, all_click, all_play, video_info):
    """Build per-video popularity features and attach static metadata.

    Aggregates exposure count, click count, total play time, smoothed CTR
    and average play time per video id (vid), then left-joins `video_info`.

    Returns:
        pandas.DataFrame: per-vid features plus metadata columns, with
        item_type / item_assetSource / item_classify cast to category dtype.
    """
    print("Preparing video popularity features...")
    exposure_counts = all_see.groupby('vid').size().rename('video_exposure_count').compute().astype('int32')
    click_counts = all_click.groupby('vid').size().rename('video_click_count').compute().astype('int32')
    play_totals = all_play.groupby('vid')['play_time'].sum().rename('total_play_time').compute().astype('float32')
    feats = pd.concat([exposure_counts, click_counts, play_totals], axis=1).fillna(0)
    # Smoothed ratios: +1e-6 guards against division by zero.
    feats['video_ctr'] = feats['video_click_count'] / (feats['video_exposure_count'] + 1e-6)
    feats['avg_play_time'] = feats['total_play_time'] / (feats['video_click_count'] + 1e-6)
    # The left side is indexed by vid; match it against video_info's vid column.
    feats = feats.merge(video_info, left_index=True, right_on='vid', how='left')
    # Categorical metadata columns, encoded for downstream LightGBM use.
    for cat_col in ['item_type', 'item_assetSource', 'item_classify']:
        feats[cat_col] = feats[cat_col].astype('category')
    return feats
def prepare_cold_start_cluster(user_features_table, history_users):
    """Fit and persist the MiniBatchKMeans model used for cold-start users.

    Trains the clustering pipeline (impute -> scale -> cluster) on the
    static features of users that have behavioral history, and saves it
    to 'cold_start_cluster_model.pkl'.

    Args:
        user_features_table: pandas DataFrame with a 'did' column and
            static feature columns f0..f86. A 'has_history' flag column
            is added/overwritten in place (side effect kept from original).
        history_users: pandas DataFrame with a 'did' column of users that
            appear in the logs.

    Returns:
        sklearn Pipeline: the fitted imputer/scaler/KMeans pipeline.
    """
    print("Preparing clustering model for cold-start users...")
    # Set membership lookup is O(1) per row vs O(n) with a list.
    trained_ids = set(history_users['did'])
    user_features_table['has_history'] = user_features_table['did'].isin(trained_ids)
    # .copy() so the 'cluster' assignment below does not write into a view
    # of user_features_table (SettingWithCopyWarning / dropped writes).
    trained = user_features_table[user_features_table['has_history']].copy()
    feature_cols = [f'f{i}' for i in range(87)]
    X = trained[feature_cols].values
    pipe = make_pipeline(
        SimpleImputer(strategy='mean'),
        StandardScaler(),
        MiniBatchKMeans(n_clusters=100, batch_size=5000, n_init=3)
    )
    cluster_model = pipe.fit(X)
    # BUG FIX: a fitted Pipeline has no `labels_` attribute; the training
    # labels live on the final MiniBatchKMeans step.
    trained['cluster'] = cluster_model[-1].labels_
    joblib.dump(cluster_model, 'cold_start_cluster_model.pkl')
    return cluster_model
def prepare_samples(all_see, all_click, all_play, user_features_table):
    """Build the labelled training set and the cold-start cluster model.

    Args:
        all_see, all_click, all_play: dask DataFrames of exposure, click
            and play logs keyed by (did, vid).
        user_features_table: pandas DataFrame of static per-user features
            (did plus f0..f86 — presumably; confirm against the CSV schema).

    Returns:
        tuple: (train_samples, cluster_model, video_features) where
        train_samples is a pandas DataFrame with a binary 'label' column
        (1 = the exposure was clicked).
    """
    print("Preparing training samples...")
    # Video metadata export is GBK-encoded.
    video_info = pd.read_csv('vid_info_table.csv', encoding='gbk', dtype={'vid': 'str'})
    # Per-user and per-video aggregate features.
    user_behavior_features = prepare_user_features(all_see, all_click, all_play, video_info)
    video_features = prepare_video_features(all_see, all_click, all_play, video_info)
    # Flag users that appear in the exposure logs (mutates the input table).
    history_users = all_see['did'].unique().compute().to_frame(name='did')
    user_features_table['has_history'] = user_features_table['did'].isin(history_users['did'])
    # Fit the clustering model used later for cold-start recommendations.
    cluster_model = prepare_cold_start_cluster(user_features_table, history_users)
    # One row per exposure; left-join clicks/plays so a missing click_time
    # marks a non-click.
    train_samples = dd.merge(all_see, all_click, on=['did', 'vid'], how='left', suffixes=('', '_click'))
    train_samples = dd.merge(train_samples, all_play, on=['did', 'vid'], how='left')
    train_samples = dd.merge(train_samples, user_behavior_features, on='did', how='left')
    train_samples = dd.merge(train_samples, video_features, on='vid', how='left')
    # Label: 1 if the exposure led to a click, else 0.
    train_samples['label'] = (~train_samples['click_time'].isnull()).astype('int8')
    # Materialize, then downcast float64 -> float32 to halve memory usage.
    train_samples = train_samples.compute()
    for col in train_samples.select_dtypes(include='float64').columns:
        train_samples[col] = train_samples[col].astype('float32')
    print(f"Training samples shape: {train_samples.shape}")
    return train_samples, cluster_model, video_features
def train_behavior_model(samples, feature_columns):
    """Train a LightGBM binary click model on users with history.

    Uses a time-based split: the last 3 days are held out for validation,
    the remaining days are used for training.

    Args:
        samples: pandas DataFrame with a 'day' column, a binary 'label'
            column and all columns named in feature_columns.
        feature_columns: list of feature column names fed to the model.

    Returns:
        lightgbm.Booster: the trained model (also saved to
        'behavior_model.txt').
    """
    print("Training behavior prediction model...")
    # FIX: unique() preserves encounter order, not chronological order, so
    # slicing it directly does not guarantee the holdout is the LAST 3 days.
    # Sort first ('01'..'32' sorts correctly as strings).
    days = np.sort(samples['day'].unique())
    train_days = days[:-3]  # earlier days for training
    test_days = days[-3:]   # final 3 days for validation
    train_mask = samples['day'].isin(train_days)
    val_mask = samples['day'].isin(test_days)
    X_train = samples.loc[train_mask, feature_columns]
    y_train = samples.loc[train_mask, 'label']
    X_val = samples.loc[val_mask, feature_columns]
    y_val = samples.loc[val_mask, 'label']
    # LightGBM hyperparameters.
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.05,
        'num_leaves': 63,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'seed': 42,
        'max_depth': 7,
        'min_child_samples': 500,
        'n_jobs': 8
    }
    # Train with early stopping on the validation AUC.
    behavior_model = lgb.train(
        params,
        lgb.Dataset(X_train, label=y_train),
        num_boost_round=1000,
        valid_sets=[lgb.Dataset(X_val, label=y_val)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=30, verbose=False),
            lgb.log_evaluation(period=50)
        ]
    )
    behavior_model.save_model('behavior_model.txt')
    return behavior_model
def predict_behavior(model, test_data, feature_columns):
    """Score click probabilities for users that have behavioral history.

    Args:
        model: any object exposing predict(X) (e.g. a LightGBM Booster).
        test_data: pandas DataFrame containing feature_columns.
        feature_columns: column names to feed to the model, in order.

    Returns:
        The model's predictions for the selected feature matrix.
    """
    print("Predicting behavior for users with history...")
    feature_matrix = test_data[feature_columns]
    return model.predict(feature_matrix)
def predict_cold_start(cluster_model, video_features, user_features_table):
    """Recommend videos to users without history via cluster membership.

    Assigns each cold-start user to a cluster with `cluster_model`, then
    maps the cluster to its precomputed top-video list (loaded from
    'cluster_top_videos.pkl'), capped at 100 recommendations per user.
    `video_features` is accepted for signature parity but not used here.

    Returns:
        pandas.DataFrame: columns ['did', 'recommended_vid'].
    """
    print("Predicting preferences for cold-start users...")
    # FIX: .copy() so the column assignments below do not write into a view
    # of user_features_table (SettingWithCopyWarning / silently lost writes).
    cold_start_users = user_features_table[~user_features_table['has_history']].copy()
    feature_cols = [f'f{i}' for i in range(87)]
    # Predict each user's cluster from the static features.
    cold_start_users['cluster'] = cluster_model.predict(cold_start_users[feature_cols].values)
    # Precomputed per-cluster top videos (see save_cluster_top_videos).
    cluster_top_videos = joblib.load('cluster_top_videos.pkl')
    # Map cluster -> top list; slicing already returns a fresh list, and
    # [:100] handles lists shorter than 100 without an explicit min().
    cold_start_users['recommended_vid'] = cold_start_users['cluster'].map(
        lambda c: cluster_top_videos.get(c, [])[:100]
    )
    return cold_start_users[['did', 'recommended_vid']]
def save_cluster_top_videos(video_features, cluster_model, behavior_data):
    """Compute and persist the top-100 videos for every user cluster.

    Scores each video by CTR * average play time, assigns each behavior row
    to a cluster, and stores a {cluster_id: [vid, ...]} dict in
    'cluster_top_videos.pkl'.

    Args:
        video_features: pandas DataFrame with vid, video_ctr, avg_play_time.
        cluster_model: fitted clustering pipeline; if None, the model is
            loaded from 'cold_start_cluster_model.pkl'.
        behavior_data: training samples; a 'cluster' column is added in place.

    Returns:
        dict: cluster_id -> list of up to 100 top vids.
    """
    print("Saving top videos for each cluster...")
    video_cluster_score = video_features[['vid', 'video_ctr', 'avg_play_time']].copy()
    # Composite popularity score; *1000 keeps values in a readable range.
    video_cluster_score['popularity_score'] = (video_cluster_score['video_ctr'] *
                                               video_cluster_score['avg_play_time'] * 1000)
    # BUG FIX: the original unconditionally re-loaded the model from disk,
    # silently discarding the `cluster_model` argument. Only fall back to
    # disk when no model was passed in.
    if cluster_model is None:
        cluster_model = joblib.load('cold_start_cluster_model.pkl')
    # NOTE(review): assumes columns 5:92 of behavior_data are the f0..f86
    # user features the cluster model was trained on — confirm against
    # prepare_samples' merge order.
    behavior_data['cluster'] = cluster_model.predict(behavior_data.iloc[:, 5:92])
    # Aggregate play time per (cluster, vid) and attach popularity scores.
    cluster_video_pref = behavior_data.groupby(['cluster', 'vid'])['play_time'].sum().reset_index()
    cluster_video_pref = cluster_video_pref.merge(video_cluster_score, on='vid')
    # One groupby pass instead of re-filtering the frame per cluster.
    cluster_top_videos = {}
    for cluster_id, cluster_vids in cluster_video_pref.groupby('cluster'):
        top_vids = cluster_vids.sort_values('popularity_score', ascending=False)['vid'].head(100).tolist()
        cluster_top_videos[cluster_id] = top_vids
    joblib.dump(cluster_top_videos, 'cluster_top_videos.pkl')
    return cluster_top_videos
def main():
    """Run the full pipeline: load logs, build features, train, predict.

    Writes 'history_user_predictions.csv' for users with history and
    'cold_start_predictions.csv' for cold-start users.
    """
    # 1. Detect the encoding of a sample input file (informational only;
    # the detected value is printed but not used for loading below).
    encoding, confidence = detect_encoding('see_01.csv')
    print(f"Detected encoding: {encoding} (confidence: {confidence:.2f})")
    # 2. Load the 32 days of exposure/click/play logs.
    print("Loading base data...")
    all_see, all_click, all_play = load_data_with_dask(days=32)
    # 3. Load the static per-user feature table (GBK-encoded export).
    user_features_table = pd.read_csv('did_features_table.csv', encoding='gbk', dtype={'did': 'str'})
    # 4. Build training samples and the cold-start clustering model.
    train_samples, cluster_model, video_features = prepare_samples(all_see, all_click, all_play, user_features_table)
    # 5. Persist per-cluster top videos for cold-start recommendation.
    save_cluster_top_videos(video_features, cluster_model, train_samples)
    # 6. Feature columns fed to the behavior model.
    feature_columns = [
        'user_exposure_count', 'user_click_count', 'user_ctr',
        'video_exposure_count', 'video_click_count', 'video_ctr',
        'item_duration', 'item_serialno', 'item_classify',
        'item_type', 'item_assetSource'
    ]
    # 7. Train the LightGBM click-prediction model.
    behavior_model = train_behavior_model(train_samples, feature_columns)
    # 8. Load the test exposure set.
    print("Loading test data...")
    test_data = dd.read_csv('testA_did_show.csv', dtype={'did': 'str', 'vid': 'str'})
    test_data = test_data.compute()
    # 9. Split test users by whether they appear in the training samples.
    history_users = train_samples['did'].unique()
    test_data['has_history'] = test_data['did'].isin(history_users)
    # 10. Users WITH history: score with the trained model.
    if test_data['has_history'].any():
        print("Processing users with history...")
        history_users_test = test_data[test_data['has_history']].copy()
        # Attach static user features and video features before scoring.
        history_users_test = history_users_test.merge(
            user_features_table, on='did', how='left', suffixes=('', '_feat')
        )
        history_users_test = history_users_test.merge(
            video_features, on='vid', how='left'
        )
        # Predict click probabilities and persist them.
        history_preds = predict_behavior(behavior_model, history_users_test, feature_columns)
        history_users_test['click_prob'] = history_preds
        history_users_test[['did', 'vid', 'click_prob']].to_csv('history_user_predictions.csv', index=False)
    # 11. Users WITHOUT history: recommend via cluster membership.
    if not test_data['has_history'].all():
        print("Processing cold-start users...")
        cold_start_preds = predict_cold_start(cluster_model, video_features, user_features_table)
        cold_start_preds.to_csv('cold_start_predictions.csv', index=False)
    print("Prediction completed!")
# Script entry point: run the full pipeline when executed directly.
if __name__ == '__main__':
    main()