Please help me review and optimize this code, and output the complete revised version:

import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import chardet
def detect_encoding(file_path):
    # Sample the first 10 KB of the file to guess its character encoding.
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read(10000))
    return result['encoding'], result['confidence']
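# Suggestion (not part of the original flow): the encoding detected above could be passed
# straight to pd.read_csv instead of hard-coding 'latin1' / 'ISO-8859-1' below, e.g.
#   enc, _ = detect_encoding('see_01.csv')
#   see = pd.read_csv('see_01.csv', encoding=enc, dtype={'did': 'category', 'vid': 'category'})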
def load_all_data(days=32):
    see_list, click_list, play_list = [], [], []
    dtypes = {'did': 'category', 'vid': 'category'}
    for i in range(1, days + 1):
        day = f"{i:02d}"
        # Load the exposure (see) data
        see = pd.read_csv(f'see_{day}.csv', encoding='latin1', dtype=dtypes)
        if 'did' not in see.columns or 'vid' not in see.columns:
            raise ValueError(f"see_{day}.csv is missing required columns")
        see['day'] = day
        see_list.append(see)
        # Load the click data
        click = pd.read_csv(
            f'click_{day}.csv',
            encoding='ISO-8859-1',
            on_bad_lines='skip',
            dtype=dtypes
        )
        if 'click_time' not in click.columns:
            raise ValueError(f"click_{day}.csv is missing the click_time column")
        click['date'] = pd.to_datetime(click['click_time']).dt.date
        click_list.append(click[['did', 'vid', 'date']])
        # Load the play data
        play = pd.read_csv(
            f'playplus_{day}.csv',
            engine='python',
            encoding_errors='ignore',
            dtype=dtypes
        )
        if 'play_time' not in play.columns:
            raise ValueError(f"playplus_{day}.csv is missing the play_time column")
        play_list.append(play[['did', 'vid', 'play_time']])
    all_see = pd.concat(see_list).drop_duplicates(['did', 'vid'])
    all_click = pd.concat(click_list).drop_duplicates(['did', 'vid'])
    all_play = pd.concat(play_list).groupby(['did', 'vid'], observed=True).sum().reset_index()
    return all_see, all_click, all_play
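# Optimization note (an assumption about the file layout): if the daily CSVs carry extra
# columns, passing usecols to pd.read_csv (e.g. usecols=['did', 'vid'] for the see files)
# would reduce memory during loading; the logic above only needs the columns selected here.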
def prepare_samples(all_see, all_click, all_play):
    video_info = pd.read_csv('vid_info_table.csv', encoding='gbk', dtype={'vid': 'category'})
    # Merge the base tables
    samples = all_see.merge(all_play, on=['did', 'vid'], how='left').fillna({'play_time': 0})
    samples = samples.merge(video_info, on='vid', how='left')
    # Completion rate (used for labelling/analysis only, not as a model feature);
    # guard against zero or missing item_duration
    samples['completion_rate'] = (
        samples['play_time'] / samples['item_duration'].replace(0, np.nan)
    ).clip(0, 1).fillna(0).astype(np.float32)
    # Click flag (drop_duplicates + assign avoids an expensive groupby over categorical keys)
    click_flag = all_click[['did', 'vid']].drop_duplicates().assign(clicked=1)
    samples = samples.merge(click_flag, on=['did', 'vid'], how='left').fillna({'clicked': 0})
    samples['clicked'] = samples['clicked'].astype(np.int8)
    # Label definition
    samples['label'] = np.select(
        [
            (samples['completion_rate'] > 0.9),
            (samples['clicked'] == 1)
        ],
        [2, 1],    # 2 = completed, 1 = clicked
        default=0  # 0 = exposed but not clicked
    )
    # Binary target (clicked or completed counts as positive)
    samples['binary_label'] = (samples['label'] >= 1).astype(int)
    # User click-through rate (corrected version)
    user_exposure = all_see.groupby('did', observed=True).size().rename('exposure_count')
    user_click_count = all_click.groupby('did', observed=True).size().rename('click_count')
    user_click_rate = (user_click_count / user_exposure).fillna(0).astype(np.float32)
    # Video popularity
    video_popularity = all_click.groupby('vid', observed=True).size().rename('video_popularity')
    # Map the features onto the samples
    samples['user_click_rate'] = samples['did'].map(user_click_rate).fillna(0)
    samples['video_popularity'] = samples['vid'].map(video_popularity).fillna(0)
    # Fix: persist the per-user click rate with unique keys (key fix)
    user_click_rate_df = pd.DataFrame({
        'did': user_click_rate.index,
        'user_click_rate': user_click_rate.values
    }).drop_duplicates('did')
    # Fix: persist the per-video popularity with unique keys
    video_popularity_df = pd.DataFrame({
        'vid': video_popularity.index,
        'video_popularity': video_popularity.values
    }).drop_duplicates('vid')
    # Save the feature tables for later prediction
    user_click_rate_df.to_csv('user_click_rate.csv', index=False)
    video_popularity_df.to_csv('video_popularity.csv', index=False)
    return samples, user_click_rate, video_popularity
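# Note on the persisted CSVs (this just describes what the code above already does): the
# per-user click rate and per-video popularity are written to disk so that predict_new_data
# can rebuild exactly the same features at inference time; unseen did/vid values fall back
# to the global means computed there.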
def train_model(samples):
    # Only use features that can be reproduced at prediction time
    features = ['user_click_rate', 'video_popularity']
    X = samples[features]
    y = samples['binary_label']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1
    }
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=100,
        valid_sets=[lgb_train, lgb_eval],
        callbacks=[
            early_stopping(stopping_rounds=20),
            log_evaluation(period=50)
        ]
    )
    # Evaluate with the best iteration found by early stopping
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    auc_score = roc_auc_score(y_test, y_pred)
    print(f"Validation AUC: {auc_score:.4f}")
    return model, features, auc_score
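# Suggestion (not in the original code): the trained booster could be persisted so that
# prediction can run in a separate session, e.g.
#   model.save_model('lgb_model.txt')
#   model = lgb.Booster(model_file='lgb_model.txt')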
def predict_new_data(model, feature_columns, test_file):
    # Read the test data
    test_data = pd.read_csv(test_file, dtype={'did': 'category', 'vid': 'category'})
    # Fix: reload the persisted feature mappings (keep keys as strings so they match
    # the categorical did/vid values in the test data)
    user_click_rate_df = pd.read_csv('user_click_rate.csv', dtype={'did': str})
    video_popularity_df = pd.read_csv('video_popularity.csv', dtype={'vid': str})
    # Global means used to back-fill new users / new videos
    global_user_rate = user_click_rate_df['user_click_rate'].mean()
    global_video_pop = video_popularity_df['video_popularity'].mean()
    # Build the lookup dictionaries
    user_click_map = user_click_rate_df.set_index('did')['user_click_rate'].to_dict()
    video_pop_map = video_popularity_df.set_index('vid')['video_popularity'].to_dict()
    # Map the features
    test_data['user_click_rate'] = test_data['did'].map(user_click_map).fillna(global_user_rate)
    test_data['video_popularity'] = test_data['vid'].map(video_pop_map).fillna(global_video_pop)
    # Predict
    test_data['click_prob'] = model.predict(test_data[feature_columns])
    # Keep the highest-probability video per user
    top_predictions = test_data.sort_values('click_prob', ascending=False).groupby('did', observed=True).head(1)
    result = top_predictions[['did', 'vid', 'click_prob']].copy()
    result.to_csv('prediction_result.csv', index=False)
    return result
if __name__ == '__main__':
    encoding, confidence = detect_encoding('see_01.csv')
    print(f"Detected encoding: {encoding}, confidence: {confidence:.2f}")
    all_see, all_click, all_play = load_all_data()
    samples, _, _ = prepare_samples(all_see, all_click, all_play)
    model, features, auc_score = train_model(samples)
    result = predict_new_data(model, features, 'testA_did_show.csv')
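    # Preview the final per-user recommendations (added for convenience).
    print(result.head())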