# What is this model doing?
#
# Pipeline summary (from the code below): per-day exposure ("see"), click and
# play logs are aggregated into user features (click rate, active days) and a
# video feature (popularity); a LightGBM binary classifier is trained on the
# label "clicked or watched more than 40%", then used to score a test file and
# keep the top-1 vid per did.
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import chardet
import os
import gc
import joblib
from tqdm import tqdm
import warnings

warnings.filterwarnings('ignore')
# Memory-optimisation helper
def reduce_mem_usage(df, use_float16=False):
    """Iteratively downcast numeric columns to shrink the DataFrame's memory footprint."""
    start_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory before optimisation: {start_mem:.2f} MB")
    for col in df.columns:
        col_type = df[col].dtype
        # Only downcast plain numeric columns; object/category columns (e.g.
        # 'did'/'vid' read as category) would raise on .min()/.max()
        if not pd.api.types.is_numeric_dtype(col_type):
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == "int":
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
            else:
                df[col] = df[col].astype(np.int64)
        else:
            if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print(f"Memory after optimisation: {end_mem:.2f} MB ({100*(start_mem-end_mem)/start_mem:.1f}% reduction)")
    return df
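
# A minimal usage sketch for reduce_mem_usage (hypothetical demo, not part of
# the original pipeline): a 0..99 int64 column fits inside int8 and is downcast.
def _demo_reduce_mem_usage():
    demo = pd.DataFrame({'uid': np.arange(100, dtype=np.int64)})
    demo = reduce_mem_usage(demo)
    assert demo['uid'].dtype == np.int8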
def detect_encoding(file_path):
    """Sniff a file's encoding from its first 10 kB (diagnostic helper; not called by the pipeline)."""
    with open(file_path, 'rb') as f:
        result = chardet.detect(f.read(10000))
    return result['encoding'], result['confidence']
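
# A minimal usage sketch for detect_encoding (hypothetical demo; assumes
# vid_info_table.csv is present in the working directory):
def _demo_detect_encoding():
    enc, conf = detect_encoding('vid_info_table.csv')
    print(f"detected encoding={enc} (confidence={conf:.2f})")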
def load_data_for_day(day):
    """Load one day's exposure/click/play files, lightly validate them, and return memory-optimised DataFrames."""
    dtypes = {'did': 'category', 'vid': 'category'}
    day_str = f"{day:02d}"
    try:
        # Exposure log
        see_path = f'see_{day_str}.csv'
        if not os.path.exists(see_path):
            print(f"⚠️ Warning: file {see_path} does not exist, skipping this day")
            return None, None, None
        see = pd.read_csv(see_path, encoding='latin1', dtype=dtypes)
        if 'did' not in see.columns or 'vid' not in see.columns:
            print(f"⚠️ Warning: see_{day_str}.csv is missing required columns")
            return None, None, None
        see['day'] = day_str
        see = reduce_mem_usage(see)
        # Click log
        click_path = f'click_{day_str}.csv'
        if os.path.exists(click_path):
            click = pd.read_csv(
                click_path,
                encoding='ISO-8859-1',
                on_bad_lines='skip',
                dtype=dtypes
            )
            if 'click_time' not in click.columns:
                print(f"⚠️ Warning: click_{day_str}.csv is missing the click_time column")
                click = None
            else:
                # .copy() avoids SettingWithCopyWarning when 'clicked' is added later
                click = click[['did', 'vid']].copy()
                click = reduce_mem_usage(click)
        else:
            click = None
            print(f"⚠️ Warning: click_{day_str}.csv does not exist")
        # Play log
        play_path = f'playplus_{day_str}.csv'
        if os.path.exists(play_path):
            play = pd.read_csv(
                play_path,
                engine='python',
                encoding_errors='ignore',
                dtype=dtypes
            )
            if 'play_time' not in play.columns:
                print(f"⚠️ Warning: playplus_{day_str}.csv is missing the play_time column")
                play = None
            else:
                play = play[['did', 'vid', 'play_time']].copy()
                play = reduce_mem_usage(play)
        else:
            play = None
            print(f"⚠️ Warning: playplus_{day_str}.csv does not exist")
        return see, click, play
    except Exception as e:
        print(f"⚠️ Error loading data for day {day_str}: {e}")
        return None, None, None
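
# A minimal usage sketch for load_data_for_day (hypothetical demo; assumes the
# day-01 CSVs are present). Missing click/play files come back as None.
def _demo_load_data_for_day():
    see, click, play = load_data_for_day(1)
    if see is not None:
        print(f"day 01: {len(see)} exposures, "
              f"{0 if click is None else len(click)} click rows, "
              f"{0 if play is None else len(play)} play rows")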
def process_data_in_chunks(days, feature_builder=None):
    """Process the data day by day to avoid exhausting memory (feature_builder is currently unused)."""
    # Load the video metadata once up front
    if not os.path.exists('vid_info_table.csv'):
        raise FileNotFoundError("Error: vid_info_table.csv does not exist")
    video_info = pd.read_csv('vid_info_table.csv', encoding='gbk', dtype={'vid': 'category'})
    if 'item_duration' not in video_info.columns:
        raise ValueError("vid_info_table.csv is missing the item_duration column")
    video_info = reduce_mem_usage(video_info)
    # Global accumulators
    user_stats = {}
    video_stats = {}
    # Process one day at a time
    for day in tqdm(range(1, days + 1), desc="Processing daily data"):
        see, click, play = load_data_for_day(day)
        if see is None:
            continue
        # Merge play data (if present)
        if play is not None:
            see = pd.merge(see, play, on=['did', 'vid'], how='left')
            see['play_time'] = see['play_time'].fillna(0).astype(np.float32)
        else:
            see['play_time'] = 0.0
        # Merge click data (if present); de-duplicate first so the left join
        # cannot multiply exposure rows
        if click is not None:
            click = click.drop_duplicates(['did', 'vid'])
            click['clicked'] = 1
            see = pd.merge(see, click, on=['did', 'vid'], how='left')
            see['clicked'] = see['clicked'].fillna(0).astype(np.int8)
        else:
            see['clicked'] = 0
        # Merge video metadata
        see = pd.merge(see, video_info[['vid', 'item_duration']], on='vid', how='left')
        see['item_duration'] = see['item_duration'].fillna(1.0)
        see.loc[see['item_duration'] <= 0, 'item_duration'] = 1.0
        # Completion rate
        see['completion_rate'] = (see['play_time'] / see['item_duration']).clip(0, 1).astype(np.float16)
        # Labels: np.select takes the first matching condition
        see['label'] = np.select(
            [(see['completion_rate'] > 0.4), (see['clicked'] == 1)],
            [2, 1],      # 2 = completed, 1 = clicked
            default=0    # 0 = exposed but not clicked
        ).astype(np.int8)
        see['binary_label'] = (see['label'] >= 1).astype(np.int8)
        # Update the per-user / per-video accumulators with groupby aggregates
        # instead of row-by-row iterrows (same counts, far faster)
        exposures = see.groupby('did', observed=True).size()
        clicks = see.loc[see['clicked'] == 1]
        click_counts = clicks.groupby('did', observed=True).size()
        for did, n in exposures.items():
            stats = user_stats.setdefault(did, {'exposure_count': 0, 'click_count': 0, 'active_days': set()})
            stats['exposure_count'] += int(n)
            stats['active_days'].add(day)
        for did, n in click_counts.items():
            user_stats[did]['click_count'] += int(n)
        for vid, group in clicks.groupby('vid', observed=True):
            video_stats.setdefault(vid, {'click_users': set()})['click_users'].update(group['did'])
        # Free memory
        del see
        gc.collect()
    # Derive the global features
    print("Computing global features...")
    user_features = []
    for did, stats in user_stats.items():
        exposure_count = max(stats['exposure_count'], 1)  # guard against division by zero
        user_features.append({
            'did': did,
            'user_click_rate': stats['click_count'] / exposure_count,
            'user_active_days': len(stats['active_days'])
        })
    video_features = [
        {'vid': vid, 'video_popularity': len(stats['click_users'])}
        for vid, stats in video_stats.items()
    ]
    user_df = pd.DataFrame(user_features)
    video_df = pd.DataFrame(video_features)
    # Free memory
    del user_stats, video_stats
    gc.collect()
    # Persist the features
    user_df = reduce_mem_usage(user_df)
    video_df = reduce_mem_usage(video_df)
    user_df.to_csv('user_click_rate.csv', index=False)
    video_df.to_csv('video_popularity.csv', index=False)
    return user_df, video_df
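
# A minimal sketch (hypothetical demo): the aggregates persisted above can be
# reloaded later without re-scanning the daily logs.
def _demo_reload_features():
    user_df = pd.read_csv('user_click_rate.csv')    # did, user_click_rate, user_active_days
    video_df = pd.read_csv('video_popularity.csv')  # vid, video_popularity
    print(user_df.head())
    print(video_df.head())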
def prepare_samples(days=7):
    """Build the training sample (memory-optimised)."""
    # Aggregate all days and obtain the global features
    user_df, video_df = process_data_in_chunks(days)
    # Reload the most recent day as the sample set
    see, click, play = load_data_for_day(days)
    if see is None:
        raise ValueError("Unable to load the sample data")
    # Merge play and click data; without these, completion_rate and 'clicked'
    # (both needed to build the label below) would be undefined
    if play is not None:
        see = pd.merge(see, play, on=['did', 'vid'], how='left')
        see['play_time'] = see['play_time'].fillna(0).astype(np.float32)
    else:
        see['play_time'] = 0.0
    if click is not None:
        click = click.drop_duplicates(['did', 'vid'])
        click['clicked'] = 1
        see = pd.merge(see, click, on=['did', 'vid'], how='left')
        see['clicked'] = see['clicked'].fillna(0).astype(np.int8)
    else:
        see['clicked'] = 0
    # Merge user features
    see = pd.merge(see, user_df, on='did', how='left')
    see['user_click_rate'] = see['user_click_rate'].fillna(0).astype(np.float32)
    see['user_active_days'] = see['user_active_days'].fillna(1).astype(np.int16)
    # Merge video features
    see = pd.merge(see, video_df, on='vid', how='left')
    see['video_popularity'] = see['video_popularity'].fillna(0).astype(np.float32)
    # Feature crosses
    see['user_video_interaction'] = (see['user_active_days'] * np.log1p(see['video_popularity'])).astype(np.float32)
    see['user_video_affinity'] = (see['user_click_rate'] * see['video_popularity']).astype(np.float32)
    # Merge video metadata
    video_info = pd.read_csv('vid_info_table.csv', encoding='gbk', dtype={'vid': 'category'})
    see = pd.merge(see, video_info[['vid', 'item_duration']], on='vid', how='left')
    see['item_duration'] = see['item_duration'].fillna(1.0)
    see.loc[see['item_duration'] <= 0, 'item_duration'] = 1.0
    # Completion rate
    see['completion_rate'] = (see['play_time'] / see['item_duration']).clip(0, 1).astype(np.float16)
    # Labels (same rule as in process_data_in_chunks)
    see['label'] = np.select(
        [(see['completion_rate'] > 0.4), (see['clicked'] == 1)],
        [2, 1],      # 2 = completed, 1 = clicked
        default=0    # 0 = exposed but not clicked
    ).astype(np.int8)
    see['binary_label'] = (see['label'] >= 1).astype(np.int8)
    # Optimise memory
    see = reduce_mem_usage(see)
    return see, user_df, video_df
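
# A minimal sketch of the labelling rule used above (hypothetical demo):
# np.select applies the FIRST matching condition, so a view completed beyond
# 40% is labelled 2 even when it was also clicked; a click alone gives 1.
def _demo_label_rule():
    demo = pd.DataFrame({'completion_rate': [0.9, 0.1, 0.0],
                         'clicked':         [1,   1,   0]})
    demo['label'] = np.select(
        [demo['completion_rate'] > 0.4, demo['clicked'] == 1], [2, 1], default=0)
    print(demo)  # labels: 2, 1, 0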
def train_model(samples):
    """Train the model (memory-optimised)."""
    print("Preparing training data...")
    features = ['user_click_rate', 'video_popularity', 'user_active_days',
                'user_video_interaction', 'user_video_affinity']
    # Keep only the features that are actually present
    available_features = [f for f in features if f in samples.columns]
    print(f"Features used: {available_features}")
    X = samples[available_features]
    y = samples['binary_label']
    # Both classes must be present to train a binary classifier
    if y.nunique() < 2:
        raise ValueError("Labels contain only one class; both positive and negative samples are required")
    # Train/validation split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    # Optimise memory
    X_train = reduce_mem_usage(X_train)
    X_test = reduce_mem_usage(X_test)
    # Build the datasets
    lgb_train = lgb.Dataset(X_train, y_train, free_raw_data=True)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train, free_raw_data=True)
    # Deliberately conservative parameters to keep memory and complexity down
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 31,          # fewer leaves
        'max_depth': 7,            # shallower trees
        'learning_rate': 0.05,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'min_child_samples': 100,  # larger leaves reduce memory
        'verbosity': -1,
        'seed': 42
    }
    # Train
    print("Training model...")
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=500,       # capped; early stopping usually ends sooner
        valid_sets=[lgb_train, lgb_eval],
        callbacks=[
            early_stopping(stopping_rounds=50, verbose=True),
            log_evaluation(period=100)
        ]
    )
    # Evaluate on the held-out split
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    auc_score = roc_auc_score(y_test, y_pred)
    print(f"✅ Training finished, validation AUC: {auc_score:.4f}")
    # Persist the model
    joblib.dump(model, 'lightgbm_model.pkl')
    print("💾 Model saved")
    # Persist the feature list
    with open('feature_columns.txt', 'w') as f:
        f.write('\n'.join(available_features))
    return model, available_features, auc_score
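
# A minimal sketch (hypothetical demo) for inspecting which of the five
# features the trained booster actually relies on, via LightGBM's standard
# Booster.feature_importance API.
def _demo_feature_importance(model, feature_names):
    gains = model.feature_importance(importance_type='gain')
    for name, gain in sorted(zip(feature_names, gains), key=lambda t: -t[1]):
        print(f"{name:25s} {gain:.1f}")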
def predict_new_data(model, feature_columns, test_file):
    """Score new data (memory-optimised)."""
    print("Loading test data...")
    test_data = pd.read_csv(test_file, dtype={'did': 'category', 'vid': 'category'})
    test_data = reduce_mem_usage(test_data)
    # Load the persisted feature tables
    user_df = pd.read_csv('user_click_rate.csv') if os.path.exists('user_click_rate.csv') else pd.DataFrame()
    video_df = pd.read_csv('video_popularity.csv') if os.path.exists('video_popularity.csv') else pd.DataFrame()
    # Global means as fallbacks for unseen users/videos
    global_user_rate = user_df['user_click_rate'].mean() if not user_df.empty else 0
    global_video_pop = video_df['video_popularity'].mean() if not video_df.empty else 0
    global_active_days = user_df['user_active_days'].mean() if not user_df.empty else 1
    # Lookup dictionaries (cheaper than merges here)
    user_click_map = user_df.set_index('did')['user_click_rate'].to_dict() if not user_df.empty else {}
    video_pop_map = video_df.set_index('vid')['video_popularity'].to_dict() if not video_df.empty else {}
    user_active_map = user_df.set_index('did')['user_active_days'].to_dict() if not user_df.empty else {}
    # Attach features
    print("Adding features...")
    test_data['user_click_rate'] = test_data['did'].map(user_click_map).fillna(global_user_rate).astype(np.float32)
    test_data['video_popularity'] = test_data['vid'].map(video_pop_map).fillna(global_video_pop).astype(np.float32)
    test_data['user_active_days'] = test_data['did'].map(user_active_map).fillna(global_active_days).astype(np.int16)
    # Feature crosses
    test_data['user_video_interaction'] = (test_data['user_active_days'] * np.log1p(test_data['video_popularity'])).astype(np.float32)
    test_data['user_video_affinity'] = (test_data['user_click_rate'] * test_data['video_popularity']).astype(np.float32)
    # Keep only the model's feature columns
    print("Preparing prediction data...")
    test_features = test_data[feature_columns].copy()
    # Free memory
    del test_data
    gc.collect()
    # Predict in batches to bound peak memory
    print("Predicting...")
    batch_size = 100000
    predictions = []
    for i in tqdm(range(0, len(test_features), batch_size), desc="Prediction batches"):
        batch = test_features.iloc[i:i+batch_size]
        preds = model.predict(batch)
        predictions.extend(preds.tolist())
        del batch
        gc.collect()
    # Reload did/vid only (they were dropped above to save memory)
    test_data = pd.read_csv(test_file, dtype={'did': 'category', 'vid': 'category'},
                            usecols=['did', 'vid'])
    test_data['click_prob'] = predictions
    # Build and save the final result: the highest-probability vid per did
    print("Generating final result...")
    top_predictions = test_data.sort_values('click_prob', ascending=False).groupby('did', observed=True).head(1)
    result = top_predictions[['did', 'vid', 'click_prob']].copy()
    result.to_csv('prediction_result.csv', index=False)
    print("✅ Prediction finished, result saved to prediction_result.csv")
    print(f"Number of predictions: {len(result)}")
    # Free memory
    del test_features, predictions, top_predictions
    gc.collect()
    return result
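
# A minimal sketch (hypothetical demo): prediction_result.csv holds exactly one
# row per did - the highest-probability vid for that device.
def _demo_inspect_predictions():
    result = pd.read_csv('prediction_result.csv')
    assert result['did'].is_unique  # one recommendation per device id
    print(result.head())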
if __name__ == '__main__':
    try:
        print("🚀 Starting the video recommendation training + prediction pipeline (memory-optimised)")
        # Keep the training window small
        TRAIN_DAYS = 7  # use only 7 days of data
        print(f"⚙️ Config: training on {TRAIN_DAYS} days of data")
        # Build the samples
        print("🔧 Preparing training samples...")
        samples, _, _ = prepare_samples(days=TRAIN_DAYS)
        if samples is None:
            raise ValueError("Sample preparation failed")
        print(f"✅ Samples ready - total: {len(samples)}")
        # Label distribution
        label_dist = samples['binary_label'].value_counts(normalize=True)
        print(f"📊 Label distribution - positive: {label_dist.get(1, 0):.2%}, negative: {label_dist.get(0, 0):.2%}")
        # Train
        print("🤖 Training the LightGBM model...")
        model, features, auc_score = train_model(samples)
        print(f"🎯 Best model AUC: {auc_score:.4f}")
        # Free memory
        del samples
        gc.collect()
        # Predict on new data
        print("🔮 Predicting on new data...")
        test_file = 'testA_did_show.csv'
        # Reload the saved model (drops the in-memory training state)
        if not os.path.exists('lightgbm_model.pkl'):
            raise FileNotFoundError("Model file does not exist")
        model = joblib.load('lightgbm_model.pkl')
        # Reload the feature list
        if not os.path.exists('feature_columns.txt'):
            raise FileNotFoundError("Feature list file does not exist")
        with open('feature_columns.txt', 'r') as f:
            features = f.read().splitlines()
        result = predict_new_data(model, features, test_file)
        print("✅ Pipeline finished successfully!")
    except Exception as e:
        print(f"❌ Pipeline failed: {e}")
        import traceback
        traceback.print_exc()
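
# Expected inputs, as referenced above: see_XX.csv / click_XX.csv /
# playplus_XX.csv for days 01..07, vid_info_table.csv (GBK-encoded), and
# testA_did_show.csv. Outputs written: user_click_rate.csv,
# video_popularity.csv, lightgbm_model.pkl, feature_columns.txt,
# prediction_result.csv.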