import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
import gc
import os
import chardet
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import joblib
from datetime import datetime
from sklearn.metrics import roc_auc_score, mean_absolute_error
# Global feature-name lists: populated during training and reused at
# prediction time to align model input columns with the training schema.
click_features = []
play_features = []
base_categorical_features = []
# Memory optimization: smaller chunksize plus extra garbage collection.
def load_data_safely(file_path, usecols=None, dtype=None, chunksize=50000):
    """Safely load a large CSV file in chunks with encoding detection.

    Args:
        file_path: path of the CSV file to read.
        usecols: optional list of columns to load.
        dtype: optional column->dtype mapping; 'category' entries are
            applied once after concatenation (see note below).
        chunksize: rows per chunk; smaller values reduce peak memory.

    Returns:
        The loaded DataFrame, or an empty DataFrame on any failure.

    Note: the previous version converted each chunk to 'category'
    individually; pd.concat of categoricals with *different* category
    sets silently upcasts to object, losing the memory savings. We now
    read those columns as plain values and categorize once at the end.
    """
    try:
        if not os.path.exists(file_path):
            print(f"⚠️ 文件不存在: {file_path}")
            return pd.DataFrame()
        # Detect encoding from a 100 KB sample; latin1 is the fallback
        # because it can decode any byte sequence.
        with open(file_path, 'rb') as f:
            detected = chardet.detect(f.read(100000))
        encoding = detected['encoding'] if detected['confidence'] > 0.7 else 'latin1'
        # Defer 'category' conversion until after concatenation.
        category_cols = [c for c, t in (dtype or {}).items() if t == 'category']
        read_dtype = {c: t for c, t in (dtype or {}).items() if t != 'category'} or None
        reader = pd.read_csv(
            file_path,
            encoding=encoding,
            usecols=usecols,
            dtype=read_dtype,
            chunksize=chunksize,
            low_memory=False
        )
        chunks = list(tqdm(reader, desc=f"加载 {os.path.basename(file_path)}"))
        if not chunks:
            return pd.DataFrame()
        result_df = pd.concat(chunks, ignore_index=True)
        del chunks
        gc.collect()
        # Single categorization pass over each full column -> one
        # consistent category set per column.
        for col in category_cols:
            if col in result_df.columns:
                result_df[col] = result_df[col].astype('category').cat.as_ordered()
        return result_df
    except Exception as e:
        print(f"⚠️ 加载 {file_path} 失败: {str(e)}")
        return pd.DataFrame()
# Fix for play-data loading (multiple candidate file-name formats).
def load_historical_data(days=30):
    """Load up to `days` days of exposure, click and play logs.

    Returns:
        A 3-tuple (exposure, clicks, plays) of DataFrames, each
        de-duplicated on (did, vid); empty DataFrames when nothing loaded.
    """
    id_dtype = {'did': 'category', 'vid': 'category'}
    see_frames, click_frames, play_frames = [], [], []
    for day in tqdm(range(1, days + 1), desc="加载历史数据"):
        tag = f"{day:02d}"
        # --- exposure log for the day ---
        see_file = f'see_{tag}.csv'
        if not os.path.exists(see_file):
            print(f"⚠️ 曝光数据文件不存在: {see_file}")
        else:
            frame = load_data_safely(see_file, usecols=['did', 'vid'], dtype=id_dtype)
            if not frame.empty and 'did' in frame.columns and 'vid' in frame.columns:
                see_frames.append(frame)
            del frame
            gc.collect()
        # --- click log for the day ---
        click_file = f'click_{tag}.csv'
        if not os.path.exists(click_file):
            print(f"⚠️ 点击数据文件不存在: {click_file}")
        else:
            frame = load_data_safely(click_file, usecols=['did', 'vid'], dtype=id_dtype)
            if not frame.empty and 'did' in frame.columns and 'vid' in frame.columns:
                click_frames.append(frame[['did', 'vid']])
            del frame
            gc.collect()
        # --- play log: several historical file-name formats exist ---
        candidates = [
            f'playplus_{tag}.csv',  # original name
            f'play_{tag}.csv',      # alternative format
            f'playplus_{day}.csv',  # no leading zero
            f'play_{day}.csv'       # no leading zero
        ]
        for play_file in candidates:
            if not os.path.exists(play_file):
                continue
            frame = load_data_safely(
                play_file,
                usecols=['did', 'vid', 'play_time'],
                dtype=id_dtype
            )
            if not frame.empty and 'play_time' in frame.columns and 'did' in frame.columns and 'vid' in frame.columns:
                play_frames.append(frame)
                del frame
                print(f"✅ 成功加载播放数据: {play_file}")
                break
        else:
            # No candidate produced a usable frame.
            print(f"⚠️ 播放数据文件不存在: 尝试了 {candidates}")
        # Periodic cleanup every three days of data.
        if day % 3 == 0:
            gc.collect()

    def _merged(frames):
        # Concatenate and de-duplicate on the (did, vid) pair.
        return pd.concat(frames).drop_duplicates(['did', 'vid']) if frames else pd.DataFrame()

    return _merged(see_frames), _merged(click_frames), _merged(play_frames)
# Memory optimization: efficient negative-sample construction.
def build_click_dataset(hist_exposure, hist_click, sample_ratio=0.1):
    """Build the click training set: clicks are positives and a sampled
    fraction of exposed-but-unclicked pairs are negatives.

    Args:
        hist_exposure: DataFrame with at least ['did', 'vid'] exposure pairs.
        hist_click: DataFrame with at least ['did', 'vid'] click pairs.
        sample_ratio: fraction of negative pairs to keep (1.0 keeps all).

    Returns:
        DataFrame with columns ['did', 'vid', 'label'] (label 1 = clicked),
        or an empty DataFrame when either input is empty.
    """
    if hist_exposure.empty or hist_click.empty:
        print("⚠️ 历史曝光或点击数据为空,无法构建数据集")
        return pd.DataFrame()
    # Positives: every historical click.
    hist_click = hist_click.copy()
    hist_click['label'] = 1
    # Negatives: exposures with no matching click, via fast index
    # set-difference on the (did, vid) pair.
    exposure_index = hist_exposure.set_index(['did', 'vid']).index
    click_index = hist_click.set_index(['did', 'vid']).index
    negative_index = exposure_index.difference(click_index)
    if not negative_index.empty:
        # to_frame(index=False) converts the MultiIndex directly instead
        # of materializing a Python list of tuples (much faster and
        # lighter for large data).
        negative_samples = negative_index.to_frame(index=False)
        negative_samples.columns = ['did', 'vid']
        negative_samples['label'] = 0
        # Down-sample negatives to keep the classes balanced.
        if sample_ratio < 1.0:
            negative_samples = negative_samples.sample(frac=sample_ratio, random_state=42)
    else:
        negative_samples = pd.DataFrame(columns=['did', 'vid', 'label'])
    # Positives first, then sampled negatives.
    click_data = pd.concat([
        hist_click[['did', 'vid', 'label']],
        negative_samples
    ], ignore_index=True)
    # Release intermediates eagerly.
    del exposure_index, click_index, negative_index, negative_samples
    gc.collect()
    return click_data
# Memory optimization: fewer merges, cheaper feature attachment.
def add_click_features(df, did_features, vid_info, hist_click, hist_play):
    """Attach user/video statistics and profile features to `df`.

    Args:
        df: DataFrame with at least ['did', 'vid'].
        did_features: per-user features; columns starting with 'f' are merged.
        vid_info: per-video metadata; selected categorical columns are merged.
        hist_click: historical clicks used for user/video click counts.
        hist_play: historical plays used for play-time statistics.

    Returns:
        `df` with the statistic columns always present (0 defaults when
        the corresponding history is empty or a key is unseen).

    Bug fixed: the previous version pre-created the statistic columns
    and then called DataFrame.join on an identically named Series, which
    raises "columns overlap but no suffix specified" whenever history is
    non-empty. Series.map avoids the overlap and any intermediate merge.
    """
    if df.empty:
        return df
    # Per-user profile features (f0, f1, ...).
    if not did_features.empty and 'did' in did_features.columns:
        did_cols = ['did'] + [col for col in did_features.columns if col.startswith('f')]
        df = df.merge(did_features[did_cols], on='did', how='left')
    # Per-video categorical metadata.
    if not vid_info.empty and 'vid' in vid_info.columns:
        vid_cols = ['vid'] + [col for col in vid_info.columns
                              if col in ['item_cid', 'item_type', 'item_assetSource',
                                         'item_classify', 'item_isIntact']]
        df = df.merge(vid_info[vid_cols], on='vid', how='left')
    # User behaviour statistics; map() keys each row by its id and
    # fillna(0) covers users/videos absent from the history.
    if not hist_click.empty and 'did' in hist_click.columns:
        df['user_click_count'] = df['did'].map(hist_click.groupby('did').size()).fillna(0)
    else:
        df['user_click_count'] = 0
    if not hist_play.empty and 'did' in hist_play.columns and 'play_time' in hist_play.columns:
        df['user_total_play'] = df['did'].map(hist_play.groupby('did')['play_time'].sum()).fillna(0)
    else:
        df['user_total_play'] = 0
    # Video popularity statistics.
    if not hist_click.empty and 'vid' in hist_click.columns:
        df['video_click_count'] = df['vid'].map(hist_click.groupby('vid').size()).fillna(0)
    else:
        df['video_click_count'] = 0
    if not hist_play.empty and 'vid' in hist_play.columns and 'play_time' in hist_play.columns:
        df['avg_play_time'] = df['vid'].map(hist_play.groupby('vid')['play_time'].mean()).fillna(0)
    else:
        df['avg_play_time'] = 0
    # Date features are deliberately excluded from the model.
    if 'date' in df:
        df = df.drop(columns=['date'], errors='ignore')
    return df
# Memory optimization: use smaller data types.
def get_categorical_features(df, base_features):
    """Return the non-numeric members of `base_features` present in `df`,
    converting each matched column to an ordered categorical in place.

    Args:
        df: DataFrame to inspect (matched columns are modified in place).
        base_features: candidate categorical column names.

    Returns:
        List of column names to treat as categorical.
    """
    existing_features = []
    for feature in base_features:
        if feature not in df.columns:
            continue
        try:
            # Columns convertible to numeric are not categorical — skip.
            pd.to_numeric(df[feature], errors='raise')
        except (ValueError, TypeError):
            # Conversion failed -> genuinely categorical content.
            existing_features.append(feature)
            df[feature] = df[feature].astype('category').cat.as_ordered()
    return existing_features
# Memory optimization: transform() avoids large temporary DataFrames.
def build_play_dataset(hist_play, vid_info, did_features, hist_click):
    """Build the completion-rate training set.

    Args:
        hist_play: DataFrame with ['did', 'vid', 'play_time'].
        vid_info: per-video metadata (duration + categorical columns).
        did_features: per-user features; 'f*' columns are merged.
        hist_click: historical clicks for interaction statistics.

    Returns:
        Feature DataFrame including 'completion_rate' as the target,
        or an empty DataFrame when `hist_play` is empty.

    Bugs fixed vs. the previous version:
    - 'user_click_count' / 'video_click_count' were referenced for the
      interaction feature but never created here (KeyError); they are
      now derived from `hist_click` (0 when unavailable).
    - chained `.fillna(..., inplace=True)` replaced with assignment
      (deprecated chained-assignment pattern in modern pandas).
    - zero/missing durations now fall back to 1.0 to avoid inf/NaN rates.
    """
    if hist_play.empty:
        print("⚠️ 历史播放数据为空,无法构建完播率数据集")
        return pd.DataFrame()
    # Base records — only the needed columns.
    play_data = hist_play[['did', 'vid', 'play_time']].copy()
    # Video duration (defaults to 1.0 when metadata is missing).
    if not vid_info.empty and 'vid' in vid_info.columns and 'item_duration' in vid_info.columns:
        play_data = play_data.merge(
            vid_info[['vid', 'item_duration']],
            on='vid',
            how='left'
        )
    else:
        play_data['item_duration'] = 1.0
    # Guard against zero/NaN durations before dividing.
    play_data['item_duration'] = play_data['item_duration'].replace(0, np.nan).fillna(1.0)
    play_data['completion_rate'] = (play_data['play_time'] / play_data['item_duration']).clip(upper=1.0)
    # Per-user profile features.
    if not did_features.empty and 'did' in did_features.columns:
        did_cols = ['did'] + [col for col in did_features.columns if col.startswith('f')]
        play_data = play_data.merge(did_features[did_cols], on='did', how='left')
    # Per-video categorical metadata.
    if not vid_info.empty and 'vid' in vid_info.columns:
        vid_cols = ['vid'] + [col for col in vid_info.columns
                              if col in ['item_cid', 'item_type', 'item_assetSource',
                                         'item_classify', 'item_isIntact']]
        play_data = play_data.merge(vid_info[vid_cols], on='vid', how='left')
    # User/video completion statistics — transform avoids building a
    # separate aggregated frame and merging it back.
    play_data['user_avg_completion'] = play_data.groupby('did')['completion_rate'].transform('mean')
    play_data['user_play_count'] = play_data.groupby('did')['completion_rate'].transform('count')
    play_data['video_avg_completion'] = play_data.groupby('vid')['completion_rate'].transform('mean')
    play_data['video_completion_std'] = play_data.groupby('vid')['completion_rate'].transform('std')
    # Click-derived counts, also needed by the interaction feature below
    # (previously referenced but never created here). observed=True keeps
    # the (did, vid) groupby from exploding on categorical keys.
    if not hist_click.empty and 'did' in hist_click.columns and 'vid' in hist_click.columns:
        user_vid_clicks = hist_click.groupby(['did', 'vid'], observed=True).size().reset_index(name='user_vid_clicks')
        play_data = play_data.merge(user_vid_clicks, on=['did', 'vid'], how='left')
        play_data['user_click_count'] = play_data['did'].map(hist_click.groupby('did').size()).fillna(0)
        play_data['video_click_count'] = play_data['vid'].map(hist_click.groupby('vid').size()).fillna(0)
    else:
        play_data['user_vid_clicks'] = 0
        play_data['user_click_count'] = 0
        play_data['video_click_count'] = 0
    # Interaction feature — must match prediction-time construction.
    play_data['interaction_feature'] = (play_data['user_click_count'] * play_data['video_click_count']).astype('float32')
    # Fill remaining gaps with sensible defaults.
    play_data['user_avg_completion'] = play_data['user_avg_completion'].fillna(play_data['completion_rate'].mean())
    play_data['user_play_count'] = play_data['user_play_count'].fillna(1)
    play_data['video_avg_completion'] = play_data['video_avg_completion'].fillna(play_data['completion_rate'].median())
    play_data['video_completion_std'] = play_data['video_completion_std'].fillna(0)
    play_data['user_vid_clicks'] = play_data['user_vid_clicks'].fillna(0)
    return play_data
# Memory optimization: predict in batches instead of loading all test data at once.
def predict_for_test_data(test_users, test_exposure, did_features, vid_info):
    """Produce one (did, vid, predicted_completion_rate) row per test user.

    Uses the module-level `model_click` / `model_play` boosters and the
    saved `click_features` / `play_features` lists to align columns.

    Args:
        test_users: DataFrame of test 'did' values.
        test_exposure: DataFrame of candidate (did, vid) exposures.
        did_features: per-user profile features.
        vid_info: per-video metadata.

    Returns:
        DataFrame ['did', 'vid', 'predicted_completion_rate'] with the
        same number of rows as `test_users`.

    Bug fixed: `most_common_vid` was only assigned on the non-empty
    exposure branch but is read when padding missing users, which raised
    NameError whenever `test_exposure` was empty. It is now computed
    up-front with sensible fallbacks.
    """
    if test_users.empty:
        print("⚠️ 测试用户数据为空,无法进行预测")
        return pd.DataFrame()
    # Fallback video, always defined so the padding step below is safe.
    if not test_exposure.empty:
        most_common_vid = test_exposure['vid'].mode()[0]
    elif not vid_info.empty:
        most_common_vid = vid_info['vid'].iloc[0]
    else:
        most_common_vid = 'default_vid'
    if test_exposure.empty:
        # No exposures: every user gets the fallback video.
        print("⚠️ 测试曝光数据为空,使用默认视频")
        test_data = test_users.copy()
        test_data['vid'] = most_common_vid
    else:
        # Left-merge keeps every test user, even without exposures.
        test_data = test_users.merge(test_exposure, on='did', how='left')
        test_data['vid'] = test_data['vid'].fillna(most_common_vid)
    # Predict in batches of 50k rows to bound peak memory.
    chunk_size = 50000
    results = []
    for i in tqdm(range(0, len(test_data), chunk_size), desc="分批预测"):
        chunk = test_data.iloc[i:i + chunk_size].copy()
        # Attach features (no history is available at prediction time).
        chunk = add_click_features(
            chunk,
            did_features,
            vid_info,
            pd.DataFrame(),  # no historical clicks
            pd.DataFrame()   # no historical plays
        )
        # Side effect: converts present categorical columns to 'category'.
        get_categorical_features(chunk, base_categorical_features)
        # --- click-probability prediction ---
        X_chunk = chunk.drop(columns=['did', 'vid'], errors='ignore')
        if model_click and not X_chunk.empty:
            if len(X_chunk.columns) != len(click_features):
                print(f"⚠️ 点击模型特征数量不一致: 训练时 {len(click_features)}, 预测时 {len(X_chunk.columns)}")
                # Align: add missing features as 0, drop extras, reorder.
                for feature in set(click_features) - set(X_chunk.columns):
                    X_chunk[feature] = 0
                X_chunk = X_chunk[click_features]
            click_probs = model_click.predict(X_chunk)
        else:
            click_probs = [0.5] * len(chunk)  # neutral default
        # --- completion-rate prediction ---
        if model_play and not X_chunk.empty:
            # Duration is part of the play-model feature set.
            if not vid_info.empty and 'vid' in vid_info.columns and 'item_duration' in vid_info.columns:
                chunk = chunk.merge(vid_info[['vid', 'item_duration']], on='vid', how='left')
            else:
                chunk['item_duration'] = 1.0
            # Interaction feature — must match training-time construction.
            chunk['interaction_feature'] = (chunk['user_click_count'] * chunk['video_click_count']).astype('float32')
            X_play_chunk = chunk.drop(columns=['did', 'vid'], errors='ignore')
            if len(X_play_chunk.columns) != len(play_features):
                print(f"⚠️ 完播率模型特征数量不一致: 训练时 {len(play_features)}, 预测时 {len(X_play_chunk.columns)}")
                # Align: add missing features as 0, drop extras, reorder.
                for feature in set(play_features) - set(X_play_chunk.columns):
                    X_play_chunk[feature] = 0
                X_play_chunk = X_play_chunk[play_features]
            completion_rates = model_play.predict(X_play_chunk)
        else:
            completion_rates = [0.7] * len(chunk)  # neutral default
        chunk['click_prob'] = click_probs
        chunk['completion_rate'] = completion_rates
        # Keep one row per user: the vid with the highest click probability.
        chunk_result = (chunk.sort_values('click_prob', ascending=False)
                        .groupby('did').head(1))[['did', 'vid', 'completion_rate']].copy()
        results.append(chunk_result)
        # Release batch intermediates.
        del chunk, X_chunk, click_probs, completion_rates, chunk_result
        gc.collect()
    # Combine all batches.
    if results:
        result = pd.concat(results, ignore_index=True)
    else:
        result = pd.DataFrame(columns=['did', 'vid', 'completion_rate'])
    result.columns = ['did', 'vid', 'predicted_completion_rate']
    # Pad users that ended up without a prediction row.
    if len(result) != len(test_users):
        missing_dids = set(test_users['did']) - set(result['did'])
        print(f"⚠️ 警告: {len(missing_dids)} 个用户缺失预测结果,使用默认值填充")
        default_df = pd.DataFrame({
            'did': list(missing_dids),
            'vid': most_common_vid,
            'predicted_completion_rate': np.mean(result['predicted_completion_rate']) if not result.empty else 0.7
        })
        result = pd.concat([result, default_df], ignore_index=True)
    return result
# Main program flow.
if __name__ == "__main__":
    # Memory-friendly dtypes for the core ID / metric columns.
    dtypes = {
        'did': 'category',
        'vid': 'category',
        'play_time': 'float32'
    }
    # Optional columns — registered into dtypes only when present in the data.
    optional_features = {
        'item_cid': 'category',
        'item_type': 'category',
        'item_assetSource': 'category',
        'item_classify': 'category',
        'item_isIntact': 'category',
        'sid': 'category',
        'stype': 'category'
    }
    # User profile feature columns f0..f87.
    for i in range(88):
        dtypes[f'f{i}'] = 'float32'
    # Load the core tables (chunked inside load_data_safely).
    print("开始加载核心数据...")
    did_features = load_data_safely('did_features_table.csv', dtype=dtypes)
    vid_info = load_data_safely('vid_info_table.csv', dtype=dtypes)
    # Register optional dtypes for the columns that actually exist.
    for feature, dtype in optional_features.items():
        if not vid_info.empty and feature in vid_info.columns:
            dtypes[feature] = dtype
    # Reload so every column is parsed with its final dtype.
    # NOTE(review): both CSVs are read twice; could be avoided by
    # inspecting only the header on the first pass.
    if os.path.exists('did_features_table.csv'):
        did_features = load_data_safely('did_features_table.csv', dtype=dtypes)
    else:
        print("⚠️ did_features_table.csv 不存在")
        did_features = pd.DataFrame()
    if os.path.exists('vid_info_table.csv'):
        vid_info = load_data_safely('vid_info_table.csv', dtype=dtypes)
    else:
        print("⚠️ vid_info_table.csv 不存在")
        vid_info = pd.DataFrame()
    # Load 30 days of exposure / click / play history.
    print("开始加载历史数据...")
    hist_exposure, hist_click, hist_play = load_historical_data(days=30)
    # Report what was loaded.
    print(f"历史曝光数据形状: {hist_exposure.shape if not hist_exposure.empty else '空'}")
    print(f"历史点击数据形状: {hist_click.shape if not hist_click.empty else '空'}")
    print(f"历史播放数据形状: {hist_play.shape if not hist_play.empty else '空'}")
    # Fallback: if no play data exists, reuse click pairs as pseudo-plays.
    if hist_play.empty:
        print("⚠️ 警告: 历史播放数据为空,尝试使用点击数据作为替代")
        # Substitute click records for play records.
        hist_play = hist_click.copy()
        hist_play['play_time'] = 1.0  # default play time
        print(f"使用替代播放数据形状: {hist_play.shape}")
    # Build the click training set (positives + sampled negatives).
    if not hist_exposure.empty and not hist_click.empty:
        print("构建点击数据集...")
        click_train_data = build_click_dataset(hist_exposure, hist_click, sample_ratio=0.1)
    else:
        print("⚠️ 无法构建点击数据集,因为历史曝光或点击数据为空")
        click_train_data = pd.DataFrame()
    # Attach user / video features to the click set.
    if not click_train_data.empty:
        print("开始构建点击特征...")
        click_train_data = add_click_features(
            click_train_data,
            did_features,
            vid_info,
            hist_click,  # historical clicks
            hist_play  # historical plays
        )
    else:
        print("⚠️ 点击数据集为空,跳过特征构建")
    # Base categorical feature candidates (date features excluded).
    base_categorical_features = [
        'item_cid', 'item_type', 'item_assetSource',
        'item_classify', 'item_isIntact', 'sid', 'stype'
    ]
    # Keep only the categorical features actually present in the data.
    categorical_features = []
    if not click_train_data.empty:
        categorical_features = get_categorical_features(click_train_data, base_categorical_features)
        print(f"使用的分类特征: {categorical_features}")
    else:
        print("⚠️ 点击训练数据为空,无法获取分类特征")
    # Assemble training matrix and labels.
    if not click_train_data.empty:
        # Identifier and label columns are not model features.
        X = click_train_data.drop(columns=['did', 'vid', 'label'], errors='ignore')
        y = click_train_data['label']
    else:
        X, y = pd.DataFrame(), pd.Series()
        print("⚠️ 点击训练数据为空")
    # Train / validation split, stratified on the click label.
    if not X.empty and not y.empty:
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
    else:
        print("⚠️ 训练数据为空,无法进行模型训练")
        X_train, X_val, y_train, y_val = pd.DataFrame(), pd.DataFrame(), pd.Series(), pd.Series()
    # LightGBM parameters for the click (binary) model.
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 63,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'min_child_samples': 100,
        'verbosity': -1,
        'max_bin': 255  # fewer bins to reduce memory
    }
    model_click = None
    if not X_train.empty:
        train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
        val_data = lgb.Dataset(X_val, label=y_val, categorical_feature=categorical_features)
        print("开始训练点击预测模型...")
        model_click = lgb.train(
            params,
            train_data,
            num_boost_round=1500,
            valid_sets=[val_data],
            callbacks=[
                early_stopping(stopping_rounds=100, verbose=True),
                log_evaluation(period=50)
            ]
        )
        # Persist the training feature list for prediction-time alignment.
        # NOTE(review): 'global' at module level is a no-op — harmless here.
        global click_features
        click_features = list(X_train.columns)
        joblib.dump(click_features, 'click_features.pkl')
        # Validation AUC.
        if not X_val.empty and not y_val.empty and model_click:
            y_val_pred = model_click.predict(X_val)
            auc_score = roc_auc_score(y_val, y_val_pred)
            print(f"📊 点击率模型在验证集上的AUC: {auc_score:.6f}")
            with open('model_metrics.txt', 'w') as f:
                f.write(f"点击率模型AUC: {auc_score:.6f}\n")
        # Free training memory.
        del X_train, X_val, y_train, y_val, train_data, val_data
        gc.collect()
    else:
        print("⚠️ 训练数据为空,跳过点击预测模型训练")
    # Build the completion-rate dataset.
    print("开始构建完播率数据集...")
    if not hist_play.empty:
        play_train_data = build_play_dataset(hist_play, vid_info, did_features, hist_click)
    else:
        print("⚠️ 无法构建完播率数据集,因为播放数据为空")
        play_train_data = pd.DataFrame()
    # Train the completion-rate regression model.
    model_play = None
    if not play_train_data.empty:
        X_play = play_train_data.drop(columns=['did', 'vid', 'play_time', 'item_duration', 'completion_rate'], errors='ignore')
        y_play = play_train_data['completion_rate']
    else:
        X_play, y_play = pd.DataFrame(), pd.Series()
        print("⚠️ 完播率训练数据为空")
    if not X_play.empty and not y_play.empty:
        X_train_play, X_val_play, y_train_play, y_val_play = train_test_split(
            X_play, y_play, test_size=0.2, random_state=42
        )
    else:
        print("⚠️ 完播率训练数据为空,无法进行模型训练")
        X_train_play, X_val_play, y_train_play, y_val_play = pd.DataFrame(), pd.DataFrame(), pd.Series(), pd.Series()
    # Categorical features for the completion-rate model.
    play_categorical_features = []
    if not play_train_data.empty:
        play_categorical_features = get_categorical_features(play_train_data, base_categorical_features)
        print(f"完播率模型使用的分类特征: {play_categorical_features}")
    else:
        print("⚠️ 完播率训练数据为空,无法获取分类特征")
    # LightGBM parameters for the regression model (memory-conscious).
    params_reg = {
        'objective': 'regression',
        'metric': 'mae',
        'boosting_type': 'gbdt',
        'num_leaves': 63,  # reduced tree complexity
        'learning_rate': 0.03,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,
        'lambda_l1': 0.1,
        'lambda_l2': 0.1,
        'min_data_in_leaf': 100,
        'verbosity': -1,
        'max_bin': 255  # fewer bins to reduce memory
    }
    if not X_train_play.empty:
        train_data_play = lgb.Dataset(X_train_play, label=y_train_play, categorical_feature=play_categorical_features)
        val_data_play = lgb.Dataset(X_val_play, label=y_val_play, categorical_feature=play_categorical_features)
        print("开始训练完播率模型...")
        model_play = lgb.train(
            params_reg,
            train_data_play,
            num_boost_round=1000,  # fewer boosting rounds
            valid_sets=[val_data_play],
            callbacks=[
                early_stopping(stopping_rounds=100, verbose=True),
                log_evaluation(period=50)
            ]
        )
        # Persist the play-model feature list for prediction-time alignment.
        # NOTE(review): 'global' at module level is a no-op — harmless here.
        global play_features
        play_features = list(X_train_play.columns)
        joblib.dump(play_features, 'play_features.pkl')
        # Validation MAE.
        y_pred_val = model_play.predict(X_val_play)
        mae = mean_absolute_error(y_val_play, y_pred_val)
        print(f"📊 完播率模型在验证集上的MAE: {mae:.6f}")
        with open('model_metrics.txt', 'a') as f:
            f.write(f"完播率模型MAE: {mae:.6f}\n")
        # Free training memory.
        del X_train_play, X_val_play, y_train_play, y_val_play, train_data_play, val_data_play
        gc.collect()
    else:
        print("⚠️ 训练数据为空,跳过完播率模型训练")
    # Persist the trained models and the categorical feature list.
    if model_click:
        model_click.save_model('click_model.txt')
    if model_play:
        model_play.save_model('play_model.txt')
    joblib.dump(base_categorical_features, 'categorical_features.pkl')
    # If training was skipped, fall back to models saved by a prior run.
    if not model_click:
        try:
            model_click = lgb.Booster(model_file='click_model.txt')
            click_features = joblib.load('click_features.pkl')
            print("✅ 从文件加载点击模型和特征")
        except:
            print("⚠️ 无法加载点击模型")
    if not model_play:
        try:
            model_play = lgb.Booster(model_file='play_model.txt')
            play_features = joblib.load('play_features.pkl')
            print("✅ 从文件加载完播率模型和特征")
        except:
            print("⚠️ 无法加载完播率模型")
    # Load the test users and their candidate exposures.
    print("开始加载预测数据...")
    to_predict_users = load_data_safely('testA_pred_did.csv', dtype={'did': 'category'})
    to_predict_exposure = load_data_safely('testA_did_show.csv', dtype={'did': 'category', 'vid': 'category'})
    # Run prediction and write the submission file.
    if not to_predict_users.empty:
        print("开始生成预测结果...")
        submission = predict_for_test_data(to_predict_users, to_predict_exposure, did_features, vid_info)
        # Row-count consistency check against the test-user list.
        if len(submission) != len(to_predict_users):
            print(f"⚠️ 行数不一致: 预测结果 {len(submission)} 行, 测试用户 {len(to_predict_users)} 行")
            # Pad users missing from the prediction output.
            missing_dids = set(to_predict_users['did']) - set(submission['did'])
            if missing_dids:
                print(f"添加缺失的 {len(missing_dids)} 个用户")
                default_vid = vid_info['vid'].iloc[0] if not vid_info.empty else 'default_vid'
                missing_df = pd.DataFrame({
                    'did': list(missing_dids),
                    'vid': default_vid,
                    'predicted_completion_rate': submission['predicted_completion_rate'].mean()
                })
                submission = pd.concat([submission, missing_df], ignore_index=True)
        # Save the results.
        if not submission.empty:
            print(f"预测结果行数: {len(submission)} (应与测试用户行数一致)")
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = f'submission_{timestamp}.csv'
            # Headerless CSV, per submission format.
            submission.to_csv(output_file, index=False, header=False)
            print(f"预测结果已保存至: {output_file}")
            print(f"结果格式: 共 {len(submission)} 行")
            print(f"列信息: [did, vid, predicted_completion_rate]")
            # Sanity-check the prediction distribution.
            print(f"完播率预测值分布: min={submission['predicted_completion_rate'].min():.4f}, "
                  f"max={submission['predicted_completion_rate'].max():.4f}, "
                  f"mean={submission['predicted_completion_rate'].mean():.4f}")
        else:
            print("⚠️ 预测结果为空,未保存文件")
    else:
        print("⚠️ 预测数据加载失败,无法生成结果")
# (removed stray paste artifact)