Please review and optimize this code, especially to reduce memory usage:

import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
import gc
import os
import chardet
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import joblib
from datetime import datetime
from scipy.sparse import hstack, csr_matrix, save_npz, load_npz
import sys
import psutil
from sklearn.metrics import log_loss, mean_absolute_error
# Memory optimization helper
def optimize_dtypes(df):
    """Downcast DataFrame dtypes to reduce memory usage."""
    if df.empty:
        return df
    # Downcast integer columns to the smallest viable integer type
    int_cols = df.select_dtypes(include=['int']).columns
    if not int_cols.empty:
        df[int_cols] = df[int_cols].apply(pd.to_numeric, downcast='integer')
    # Downcast float columns to the smallest viable float type
    float_cols = df.select_dtypes(include=['float']).columns
    if not float_cols.empty:
        df[float_cols] = df[float_cols].apply(pd.to_numeric, downcast='float')
    # Convert low-cardinality object columns to the category dtype
    obj_cols = df.select_dtypes(include=['object']).columns
    for col in obj_cols:
        num_unique = df[col].nunique()
        num_total = len(df)
        if num_unique / num_total < 0.5:  # only when unique values are under 50% of rows
            df[col] = df[col].astype('category')
    return df
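
# Usage sketch (synthetic data, not part of the pipeline): compare memory
# before and after optimize_dtypes via pandas' own memory_usage(deep=True).
def _demo_optimize_dtypes():
    demo = pd.DataFrame({
        'big_int': np.arange(1000, dtype='int64'),           # values fit in int16
        'small_float': np.random.rand(1000).astype('float64'),
        'city': np.random.choice(['bj', 'sh', 'gz'], 1000)   # low cardinality -> category
    })
    before = demo.memory_usage(deep=True).sum() / 1024
    demo = optimize_dtypes(demo)
    after = demo.memory_usage(deep=True).sum() / 1024
    print(f"demo memory: {before:.1f} KB -> {after:.1f} KB")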
# Memory monitoring helper
def memory_monitor(step_name=""):
    """Report the resident memory usage of the current process."""
    process = psutil.Process(os.getpid())
    mem_info = process.memory_info()
    print(f"{step_name} memory usage: {mem_info.rss / (1024 ** 2):.2f} MB")
    return mem_info.rss / (1024 ** 2)  # returns MB
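
# Optional helper (a sketch, not in the original code): a context manager that
# reports the memory delta across a whole step, built on memory_monitor above.
from contextlib import contextmanager

@contextmanager
def memory_step(step_name):
    start = memory_monitor(f"{step_name} (start)")
    yield
    end = memory_monitor(f"{step_name} (end)")
    print(f"{step_name} delta: {end - start:+.2f} MB")
# Usage: with memory_step("load core data"): did_features = load_data_safely(...)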
# Robust data-loading helper
def load_data_safely(file_path, usecols=None, dtype=None, chunksize=100000):
    """Load a large CSV safely in chunks, optimizing memory as it goes."""
    try:
        if not os.path.exists(file_path):
            print(f"⚠️ File not found: {file_path}")
            return pd.DataFrame()
        # Detect the encoding automatically; fall back to latin1 when unsure
        with open(file_path, 'rb') as f:
            result = chardet.detect(f.read(100000))
        encoding = result['encoding'] if result['encoding'] and result['confidence'] > 0.7 else 'latin1'
        # Read in chunks, optimizing memory chunk by chunk
        chunks = []
        reader = pd.read_csv(
            file_path,
            encoding=encoding,
            usecols=usecols,
            dtype=dtype,
            chunksize=chunksize,
            low_memory=False
        )
        for chunk in tqdm(reader, desc=f"Loading {os.path.basename(file_path)}"):
            # Normalize declared categorical columns
            for col in chunk.columns:
                if dtype and col in dtype and dtype[col] == 'category':
                    chunk[col] = chunk[col].astype('category')
            # Downcast numeric types
            chunk = optimize_dtypes(chunk)
            chunks.append(chunk)
        if chunks:
            result = pd.concat(chunks, ignore_index=True)
            # One more pass over the concatenated frame
            result = optimize_dtypes(result)
            return result
        return pd.DataFrame()
    except Exception as e:
        print(f"⚠️ Failed to load {file_path}: {str(e)}")
        return pd.DataFrame()
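
# Caveat worth knowing (a sketch, assuming a recent pandas): pd.concat silently
# upcasts category columns to object when chunks carry different category sets,
# which can blow up memory. union_categoricals keeps them categorical.
from pandas.api.types import union_categoricals

def concat_preserving_categories(chunks, cat_cols):
    # Align category sets across all chunks before concatenating
    for col in cat_cols:
        union = union_categoricals([c[col] for c in chunks])
        for c in chunks:
            c[col] = pd.Categorical(c[col], categories=union.categories)
    return pd.concat(chunks, ignore_index=True)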
# Sparse-matrix conversion helper
def to_sparse_matrix(df, columns):
    """One-hot encode categorical features into a sparse matrix."""
    sparse_matrices = []
    for col in columns:
        if col in df.columns:
            # Handle NaNs; category columns must register the fill value first
            if isinstance(df[col].dtype, pd.CategoricalDtype):
                df[col] = df[col].cat.add_categories(['MISSING']).fillna('MISSING')
            else:
                df[col] = df[col].fillna('MISSING')
            # Build the one-hot block without densifying: .values on a sparse
            # frame would materialize a dense array and defeat the purpose
            sparse_mat = csr_matrix(pd.get_dummies(df[col], sparse=True).sparse.to_coo())
            sparse_matrices.append(sparse_mat)
    # Stack all blocks horizontally
    if sparse_matrices:
        return hstack(sparse_matrices)
    return None
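
# Usage sketch (hypothetical cache file name): the save_npz/load_npz imports at
# the top can persist this matrix so later runs skip the encoding step entirely.
def cache_sparse_features(df, columns, path='sparse_features.npz'):
    if os.path.exists(path):
        return load_npz(path)
    mat = to_sparse_matrix(df, columns)
    if mat is not None:
        save_npz(path, csr_matrix(mat))  # hstack returns COO; CSR slices faster downstream
    return mat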
# Incremental training helper
def train_incremental(X, y, categorical_features, params, num_rounds=1000, chunk_size=100000):
    """Train a LightGBM model chunk by chunk to bound peak memory.

    Note: each chunk adds num_rounds more trees via init_model, and separate
    chunks may bin categorical features differently.
    """
    model = None
    for i in tqdm(range(0, len(X), chunk_size), desc="Incremental training"):
        chunk_end = min(i + chunk_size, len(X))
        X_chunk = X.iloc[i:chunk_end]
        y_chunk = y.iloc[i:chunk_end]
        train_data = lgb.Dataset(
            X_chunk,
            label=y_chunk,
            categorical_feature=categorical_features
        )
        if model is None:
            model = lgb.train(
                params,
                train_data,
                num_boost_round=num_rounds,
                keep_training_booster=True
            )
        else:
            # Continue training from the existing booster
            model = lgb.train(
                params,
                train_data,
                num_boost_round=num_rounds,
                init_model=model,
                keep_training_booster=True
            )
    return model
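
# Alternative sketch (assumes the data fits in RAM once as a Dataset): LightGBM
# stores its own compact binned representation, so a single Dataset with
# free_raw_data=True plus the imported early_stopping/log_evaluation callbacks
# is often lighter and better-calibrated than per-chunk continued training.
def train_single_pass(X_train, y_train, X_val, y_val, categorical_features, params, num_rounds=1000):
    dtrain = lgb.Dataset(X_train, label=y_train,
                         categorical_feature=categorical_features,
                         free_raw_data=True)
    dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)
    return lgb.train(
        params, dtrain,
        num_boost_round=num_rounds,
        valid_sets=[dval],
        callbacks=[early_stopping(stopping_rounds=50), log_evaluation(100)]
    )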
# Main pipeline
def main():
    """Main pipeline with the full set of memory optimizations."""
    # Baseline memory reading
    start_mem = memory_monitor("Initial")
    # Memory-friendly dtypes for the core tables
    dtypes = {
        'did': 'category',
        'vid': 'category',
        'play_time': 'float32'
    }
    # Optional features (only applied when the column exists in vid_info)
optional_features = {
'item_cid': 'category',
'item_type': 'category',
'item_assetSource': 'category',
'item_classify': 'category',
'item_isIntact': 'category',
'sid': 'category',
'stype': 'category'
}
    # Dense feature columns f0..f87
    for i in range(88):
        dtypes[f'f{i}'] = 'float32'
    # Load the core tables
    print("Loading core data...")
    did_features = load_data_safely('did_features_table.csv', dtype=dtypes)
    vid_info = load_data_safely('vid_info_table.csv', dtype=dtypes)
    memory_monitor("After loading core data")
    # Register optional features in dtypes when present
    for feature, dtype in optional_features.items():
        if not vid_info.empty and feature in vid_info.columns:
            dtypes[feature] = dtype
    # Reload so every column is read with the right dtype
    if os.path.exists('did_features_table.csv'):
        did_features = load_data_safely('did_features_table.csv', dtype=dtypes)
    else:
        print("⚠️ did_features_table.csv not found")
        did_features = pd.DataFrame()
    if os.path.exists('vid_info_table.csv'):
        vid_info = load_data_safely('vid_info_table.csv', dtype=dtypes)
    else:
        print("⚠️ vid_info_table.csv not found")
        vid_info = pd.DataFrame()
    memory_monitor("After reloading data")
    # Load historical logs
    print("Loading historical data...")
    hist_exposure, hist_click, hist_play = load_historical_data(days=32)
    memory_monitor("After loading historical data")
    # Build the click dataset
    if not hist_exposure.empty and not hist_click.empty:
        print("Building click dataset...")
        click_train_data = build_click_dataset(hist_exposure, hist_click, sample_ratio=0.1)
    else:
        print("⚠️ Cannot build click dataset")
        click_train_data = pd.DataFrame()
    memory_monitor("After building click dataset")
    # Feature engineering
    if not click_train_data.empty:
        print("Building click features...")
        click_train_data = add_click_features(
            click_train_data,
            did_features,
            vid_info,
            hist_click,
            hist_play
        )
    else:
        print("⚠️ Click dataset is empty; skipping feature engineering")
    memory_monitor("After adding features")
    # Prepare training data; errors='ignore' already covers a missing 'date'
    # column, so the original if/else around the drop was redundant
    if not click_train_data.empty:
        X = click_train_data.drop(columns=['did', 'vid', 'label', 'date'], errors='ignore')
        y = click_train_data['label']
    else:
        X, y = pd.DataFrame(), pd.Series(dtype='float32')
        print("⚠️ Click training data is empty")
    # Train/validation split
    if not X.empty and not y.empty:
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
    else:
        print("⚠️ Training data is empty; cannot train the model")
        X_train, X_val = pd.DataFrame(), pd.DataFrame()
        y_train, y_val = pd.Series(dtype='float32'), pd.Series(dtype='float32')
    memory_monitor("After train/validation split")
    # Click-model training parameters
params = {
'objective': 'binary',
'metric': 'binary_logloss',
'boosting_type': 'gbdt',
'num_leaves': 63,
'learning_rate': 0.05,
'feature_fraction': 0.8,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'min_child_samples': 100,
'verbosity': -1
}
    # Incremental training of the click model
    if not X_train.empty:
        print("Training click prediction model...")
        # Derive categorical feature names from the frame itself;
        # categorical_features was referenced here but never defined
        categorical_features = [c for c in X_train.columns if str(X_train[c].dtype) == 'category']
        model_click = train_incremental(X_train, y_train, categorical_features, params,
                                        num_rounds=1500, chunk_size=100000)
        # Evaluate on the validation split
        val_preds = model_click.predict(X_val)
        val_logloss = log_loss(y_val, val_preds)
        print(f"Validation LogLoss: {val_logloss:.4f}")
    else:
        model_click = None
        print("⚠️ Training data is empty; skipping click model training")
    memory_monitor("After training click model")
    # Build the completion-rate dataset
    print("Building completion-rate dataset...")
    play_train_data = build_play_dataset(hist_play, vid_info, did_features, hist_click)
    memory_monitor("After building completion-rate dataset")
    # Prepare completion-rate training data
    if not play_train_data.empty:
        X_play = play_train_data.drop(columns=['did', 'vid', 'play_time', 'item_duration', 'completion_rate'], errors='ignore')
        y_play = play_train_data['completion_rate']
    else:
        X_play, y_play = pd.DataFrame(), pd.Series(dtype='float32')
        print("⚠️ Completion-rate training data is empty")
    if not X_play.empty and not y_play.empty:
        X_train_play, X_val_play, y_train_play, y_val_play = train_test_split(
            X_play, y_play, test_size=0.2, random_state=42
        )
    else:
        print("⚠️ Completion-rate training data is empty; cannot train the model")
        X_train_play, X_val_play = pd.DataFrame(), pd.DataFrame()
        y_train_play, y_val_play = pd.Series(dtype='float32'), pd.Series(dtype='float32')
    # Regression training parameters
params_reg = {
'objective': 'regression',
'metric': 'mae',
'boosting_type': 'gbdt',
'num_leaves': 63,
'learning_rate': 0.03,
'feature_fraction': 0.8,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'lambda_l1': 0.1,
'lambda_l2': 0.1,
'min_data_in_leaf': 50,
'verbosity': -1
}
    # Incremental training of the completion-rate model
    if not X_train_play.empty:
        print("Training completion-rate model...")
        # Derive categorical feature names; play_categorical_features was
        # referenced here but never defined
        play_categorical_features = [c for c in X_train_play.columns if str(X_train_play[c].dtype) == 'category']
        model_play = train_incremental(X_train_play, y_train_play, play_categorical_features, params_reg,
                                       num_rounds=2000, chunk_size=100000)
        # Evaluate on the validation split
        val_preds = model_play.predict(X_val_play)
        val_mae = mean_absolute_error(y_val_play, val_preds)
        print(f"Validation MAE: {val_mae:.4f}")
    else:
        model_play = None
        print("⚠️ Training data is empty; skipping completion-rate model training")
    memory_monitor("After training completion-rate model")
    # Persist the models
    if model_click is not None:
        model_click.save_model('click_model.txt')
        print("Click prediction model saved")
    if model_play is not None:
        model_play.save_model('play_model.txt')
        print("Completion-rate model saved")
    # Prediction stage
    print("Loading prediction data...")
    to_predict_users = load_data_safely('testA_pred_did.csv', dtype={'did': 'category'})
    to_predict_exposure = load_data_safely('testA_did_show.csv', dtype={'did': 'category', 'vid': 'category'})
    # Generate predictions; the models are passed in explicitly (they were
    # previously read as undefined globals inside predict_for_test_data)
    if not to_predict_users.empty and not to_predict_exposure.empty:
        print("Generating predictions...")
        submission = predict_for_test_data(to_predict_users, to_predict_exposure,
                                           did_features, vid_info, model_click, model_play)
        # Save the result
        if not submission.empty:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = f'submission_{timestamp}.csv'
            submission.to_csv(output_file, index=False)
            print(f"Predictions saved to: {output_file}")
        else:
            print("⚠️ Empty predictions; nothing saved")
    else:
        print("⚠️ Failed to load prediction data; cannot generate results")
    # Final memory report
    end_mem = memory_monitor("Done")
    print(f"Net memory growth: {end_mem - start_mem:.2f} MB")
# Historical-data loader
def load_historical_data(days=32):
    """Load daily history files efficiently, one day at a time."""
    see_list, click_list, play_list = [], [], []
    for day in tqdm(range(1, days + 1), desc="Loading historical data"):
        day_str = f"{day:02d}"
        # Exposure log
        see_path = f'see_{day_str}.csv'
        if os.path.exists(see_path):
            see = load_data_safely(see_path, usecols=['did', 'vid'], dtype={'did': 'category', 'vid': 'category'})
            if not see.empty and 'did' in see.columns and 'vid' in see.columns:
                see_list.append(see)
            del see
            gc.collect()
        # Click log
        click_path = f'click_{day_str}.csv'
        if os.path.exists(click_path):
            click = load_data_safely(click_path, usecols=['did', 'vid', 'click_time'], dtype={'did': 'category', 'vid': 'category'})
            if not click.empty and 'click_time' in click.columns and 'did' in click.columns and 'vid' in click.columns:
                # Keep only the date part of the click timestamp
                click['date'] = pd.to_datetime(click['click_time'], errors='coerce').dt.date
                click = click.drop(columns=['click_time'], errors='ignore')
                click_list.append(click[['did', 'vid', 'date']])
            del click
            gc.collect()
        # Play log
        play_path = f'playplus_{day_str}.csv'
        if os.path.exists(play_path):
            play = load_data_safely(play_path, usecols=['did', 'vid', 'play_time'], dtype={'did': 'category', 'vid': 'category'})
            if not play.empty and 'play_time' in play.columns and 'did' in play.columns and 'vid' in play.columns:
                play_list.append(play)
            del play
            gc.collect()
        gc.collect()
    # Always return three DataFrames; drop_duplicates keeps the first row per
    # (did, vid) pair, so repeated plays are not summed here
    return (
        pd.concat(see_list, ignore_index=True).drop_duplicates(['did', 'vid']) if see_list else pd.DataFrame(),
        pd.concat(click_list, ignore_index=True).drop_duplicates(['did', 'vid']) if click_list else pd.DataFrame(),
        pd.concat(play_list, ignore_index=True).drop_duplicates(['did', 'vid']) if play_list else pd.DataFrame()
    )
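
# Alternative sketch (an assumption about intent, not the original behavior):
# if repeated plays per (did, vid) should accumulate rather than keep the
# first row, aggregate play_time instead of calling drop_duplicates.
def aggregate_play(play_df):
    if play_df.empty:
        return play_df
    # observed=True keeps category groupers from materializing the full cross-product
    return (play_df.groupby(['did', 'vid'], observed=True)['play_time']
                   .sum()
                   .reset_index())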
# Click-dataset construction
def build_click_dataset(hist_exposure, hist_click, sample_ratio=0.1):
    """Build the click dataset, with negative sampling."""
    if hist_exposure.empty or hist_click.empty:
        print("⚠️ Exposure or click history is empty; cannot build dataset")
        return pd.DataFrame()
    # Positive samples
    hist_click = hist_click.copy()
    hist_click['label'] = 1
    # Negative samples via set difference on (did, vid) pairs
    exposure_set = set(zip(hist_exposure['did'], hist_exposure['vid']))
    click_set = set(zip(hist_click['did'], hist_click['vid']))
    # Exposures that were never clicked
    negative_set = exposure_set - click_set
    # Build the negative-sample frame
    if negative_set:
        negative_dids, negative_vids = zip(*negative_set)
        negative_samples = pd.DataFrame({
            'did': list(negative_dids),
            'vid': list(negative_vids),
            'label': 0
        })
        # Downsample negatives
        if sample_ratio < 1.0:
            negative_samples = negative_samples.sample(frac=sample_ratio, random_state=42)
    else:
        negative_samples = pd.DataFrame(columns=['did', 'vid', 'label'])
    # Combine positives and negatives
    click_data = pd.concat([
        hist_click[['did', 'vid', 'label']],
        negative_samples
    ], ignore_index=True)
    # Free intermediates
    del exposure_set, click_set, negative_set, negative_samples
    gc.collect()
    return click_data
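
# build_play_dataset is called in main() but was missing from the script. Below
# is a minimal sketch of what it plausibly does (the exact feature set is an
# assumption): join play logs with video duration and derive completion_rate.
def build_play_dataset(hist_play, vid_info, did_features, hist_click):
    """Minimal sketch: completion_rate = play_time / item_duration, clipped to [0, 1]."""
    if hist_play.empty:
        print("⚠️ Play history is empty; cannot build completion-rate dataset")
        return pd.DataFrame()
    play_data = hist_play.copy()
    # Attach video duration when available; otherwise use 1.0 as a placeholder
    if not vid_info.empty and 'item_duration' in vid_info.columns:
        play_data = play_data.merge(vid_info[['vid', 'item_duration']], on='vid', how='left')
    else:
        play_data['item_duration'] = 1.0
    play_data['item_duration'] = play_data['item_duration'].fillna(1.0).clip(lower=1e-6)
    play_data['completion_rate'] = (play_data['play_time'] / play_data['item_duration']).clip(0, 1)
    # Reuse the click feature builder so train and predict share one feature space
    play_data = add_click_features(play_data, did_features, vid_info, hist_click, hist_play)
    return play_data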
# Feature engineering
def add_click_features(df, did_features, vid_info, hist_click, hist_play):
    """Add the key features while keeping memory in check."""
    if df.empty:
        return df
    # Profile features (the original column filters were tautologies that kept
    # every column anyway, so merge the tables directly)
    if not did_features.empty and 'did' in did_features.columns:
        df = df.merge(did_features, on='did', how='left')
    if not vid_info.empty and 'vid' in vid_info.columns:
        df = df.merge(vid_info, on='vid', how='left')
    # User behavior statistics
    if not hist_click.empty and 'did' in hist_click.columns:
        user_click_count = hist_click.groupby('did').size().rename('user_click_count')
        df = df.merge(user_click_count, on='did', how='left')
    else:
        df['user_click_count'] = 0
    if not hist_play.empty and 'did' in hist_play.columns and 'play_time' in hist_play.columns:
        user_total_play = hist_play.groupby('did')['play_time'].sum().rename('user_total_play')
        df = df.merge(user_total_play, on='did', how='left')
    else:
        df['user_total_play'] = 0
    if not hist_click.empty and 'vid' in hist_click.columns:
        video_click_count = hist_click.groupby('vid').size().rename('video_click_count')
        df = df.merge(video_click_count, on='vid', how='left')
    else:
        df['video_click_count'] = 0
    if not hist_play.empty and 'vid' in hist_play.columns and 'play_time' in hist_play.columns:
        avg_play_time = hist_play.groupby('vid')['play_time'].mean().rename('avg_play_time')
        df = df.merge(avg_play_time, on='vid', how='left')
    else:
        df['avg_play_time'] = 0
    # Fill missing values
    fill_values = {
        'user_click_count': 0,
        'user_total_play': 0,
        'video_click_count': df['video_click_count'].median() if 'video_click_count' in df else 0,
        'avg_play_time': df['avg_play_time'].median() if 'avg_play_time' in df else 0
    }
    for col, value in fill_values.items():
        if col in df:
            df[col] = df[col].fillna(value)
    # Time features; 'date' holds date objects only, so the original hour-of-day
    # feature was constant zero and is omitted
    if 'date' in df:
        df['day_of_week'] = pd.to_datetime(df['date'], errors='coerce').dt.dayofweek.fillna(-1).astype('int8')
    return df
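
# Follow-up sketch (optional): the merged statistic columns come back as
# float64 because NaNs appear during the left joins; downcasting after the
# fills trims the frame further. Reuses optimize_dtypes from the top.
def shrink_after_merge(df):
    for col in ['user_click_count', 'user_total_play', 'video_click_count', 'avg_play_time']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], downcast='float')
    return optimize_dtypes(df)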
# Prediction
def predict_for_test_data(test_users, test_exposure, did_features, vid_info, model_click, model_play):
    """Generate predictions for the test data (models are passed in explicitly)."""
    if test_users.empty or test_exposure.empty:
        print("⚠️ Test data is empty; cannot predict")
        return pd.DataFrame()
    # Combine the test tables
    test_data = test_exposure.merge(test_users, on='did', how='left')
    # Feature engineering (no history available at prediction time)
    test_data = add_click_features(
        test_data,
        did_features,
        vid_info,
        pd.DataFrame(),  # no click history
        pd.DataFrame()   # no play history
    )
    # Click probability
    X_test = test_data.drop(columns=['did', 'vid', 'date'], errors='ignore')
    chunk_size = 100000  # defined up front so both prediction branches can use it
    if model_click is not None and not X_test.empty:
        # Predict in chunks to cap memory
        click_probs = []
        for i in range(0, len(X_test), chunk_size):
            chunk = X_test.iloc[i:i+chunk_size]
            click_probs.extend(model_click.predict(chunk))
    else:
        click_probs = [0.5] * len(test_data)  # fallback default
    # Completion rate
    if model_play is not None and not X_test.empty:
        # Attach video duration unless the feature merge already provided it
        if 'item_duration' not in test_data.columns:
            if not vid_info.empty and 'vid' in vid_info.columns and 'item_duration' in vid_info.columns:
                test_data = test_data.merge(vid_info[['vid', 'item_duration']], on='vid', how='left')
            else:
                test_data['item_duration'] = 1.0
        # Chunked prediction
        completion_rates = []
        for i in range(0, len(X_test), chunk_size):
            chunk = X_test.iloc[i:i+chunk_size]
            completion_rates.extend(model_play.predict(chunk))
    else:
        completion_rates = [0.7] * len(test_data)  # fallback default
    # Combined score
    test_data['click_prob'] = click_probs
    test_data['completion_rate'] = completion_rates
    test_data['score'] = test_data['click_prob'] * test_data['completion_rate']
    # Keep the top-scoring video per user
    submission = test_data.sort_values('score', ascending=False).groupby('did').head(1)
    # Keep the submission columns (the original rename to identical names was a no-op)
    submission = submission[['did', 'vid', 'completion_rate']].copy()
    # Normalize the output formats
    submission['did'] = submission['did'].astype(str)
    submission['vid'] = submission['vid'].astype(str)
    submission['completion_rate'] = submission['completion_rate'].round(4)
    return submission
# Entry point
if __name__ == "__main__":
main()