On GC.Collect behaving inconsistently across different machines

A look at how GC.Collect behaves inconsistently across machines, and the performance problem this caused in a game project: after an update, the game began stuttering on some devices, which was traced to fluctuating garbage-collection costs.

2019-01-17. Yesterday, after the project update, a game designer reported that the game had become choppy: it hitched every few steps. The day before, it had been fine.

On my own machine, though, I couldn't feel any stutter at all.

The QA tester's machine stuttered too. I went over, opened the Profiler (without Deep Profile enabled), and saw that GC.Collect was producing huge spikes.

So garbage collection really can behave quite differently from one machine to another.
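This is consistent with how tracing collectors generally behave: the cost of a collection scales with the number of live objects, so two machines with different heap states pay very different prices for the same call. The sketch below shows the scaling effect using CPython's gc module, purely as an analogy — Unity's Mono heap is a different collector, and the absolute numbers vary per machine, which is exactly the point.

```python
import gc
import time

def time_collect(n_objects):
    """Allocate n_objects GC-tracked lists, then time one full collection."""
    junk = [[i] for i in range(n_objects)]  # plain lists are tracked by the GC
    gc.collect()                            # settle the heap first
    start = time.perf_counter()
    gc.collect()                            # timed pass over the full heap
    elapsed = time.perf_counter() - start
    del junk
    return elapsed

for n in (10_000, 100_000, 500_000):
    print(f"{n:>7} live objects -> gc.collect() took {time_collect(n) * 1000:.2f} ms")
```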

posted on 2019-01-17 16:41 by 时空观察者9号

Please review the code below and point out anything that can be optimized to speed up the computation. Imports and data loading:

```python
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
import gc
import os
import chardet
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import joblib
from datetime import datetime
from sklearn.metrics import roc_auc_score, mean_absolute_error

# Global lists of the feature names each model was trained on
click_features = []
play_features = []
base_categorical_features = []

def load_data_safely(file_path, usecols=None, dtype=None, chunksize=50000):
    """Safely load a large CSV file, keeping memory usage low."""
    try:
        if not os.path.exists(file_path):
            print(f"⚠️ File does not exist: {file_path}")
            return pd.DataFrame()
        # Auto-detect the encoding from the first 100 KB
        with open(file_path, 'rb') as f:
            result = chardet.detect(f.read(100000))
        encoding = result['encoding'] if result['confidence'] > 0.7 else 'latin1'
        # Read in chunks - a smaller chunksize lowers the peak memory usage
        chunks = []
        reader = pd.read_csv(
            file_path,
            encoding=encoding,
            usecols=usecols,
            dtype=dtype,
            chunksize=chunksize,
            low_memory=False
        )
        for chunk in tqdm(reader, desc=f"Loading {os.path.basename(file_path)}"):
            # Shrink categorical columns if a dtype mapping was given
            if dtype:
                for col in chunk.columns:
                    if col in dtype and dtype[col] == 'category':
                        chunk[col] = chunk[col].astype('category').cat.as_ordered()
            chunks.append(chunk)
        if chunks:
            result_df = pd.concat(chunks, ignore_index=True)
            del chunks
            gc.collect()
            return result_df
        return pd.DataFrame()
    except Exception as e:
        print(f"⚠️ Failed to load {file_path}: {str(e)}")
        return pd.DataFrame()

def load_historical_data(days=30):
    """Load the historical data efficiently, one day at a time."""
    see_list, click_list, play_list = [], [], []
    for day in tqdm(range(1, days + 1), desc="Loading historical data"):
        day_str = f"{day:02d}"

        # Exposure data
        see_path = f'see_{day_str}.csv'
        if os.path.exists(see_path):
            see = load_data_safely(see_path, usecols=['did', 'vid'],
                                   dtype={'did': 'category', 'vid': 'category'})
            if not see.empty and 'did' in see.columns and 'vid' in see.columns:
                see_list.append(see)
            del see
            gc.collect()
        else:
            print(f"⚠️ Exposure data file does not exist: {see_path}")

        # Click data
        click_path = f'click_{day_str}.csv'
        if os.path.exists(click_path):
            click = load_data_safely(click_path, usecols=['did', 'vid'],
                                     dtype={'did': 'category', 'vid': 'category'})
            if not click.empty and 'did' in click.columns and 'vid' in click.columns:
                click_list.append(click[['did', 'vid']])
            del click
            gc.collect()
        else:
            print(f"⚠️ Click data file does not exist: {click_path}")

        # Play data - try several possible file name formats
        play_paths = [
            f'playplus_{day_str}.csv',  # original file name
            f'play_{day_str}.csv',      # alternative prefix
            f'playplus_{day}.csv',      # no leading zero
            f'play_{day}.csv'           # no leading zero
        ]
        play_loaded = False
        for play_path in play_paths:
            if os.path.exists(play_path):
                play = load_data_safely(
                    play_path,
                    usecols=['did', 'vid', 'play_time'],
                    dtype={'did': 'category', 'vid': 'category'}
                )
                if not play.empty and 'play_time' in play.columns and 'did' in play.columns and 'vid' in play.columns:
                    play_list.append(play)
                    del play
                    play_loaded = True
                    print(f"✅ Loaded play data: {play_path}")
                    break
        if not play_loaded:
            print(f"⚠️ No play data file exists; tried {play_paths}")

        # Force a collection every three days of data
        if day % 3 == 0:
            gc.collect()

    # Always return three DataFrames, even if some are empty
    return (
        pd.concat(see_list).drop_duplicates(['did', 'vid']) if see_list else pd.DataFrame(),
        pd.concat(click_list).drop_duplicates(['did', 'vid']) if click_list else pd.DataFrame(),
        pd.concat(play_list).drop_duplicates(['did', 'vid']) if play_list else pd.DataFrame()
    )
```
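One possible speed-up in the loader: chardet.detect over 100 KB runs for every file, and it is slow. A hedged sketch of a cheaper path (detect_encoding is a hypothetical helper, not part of the script above): try the encoding you expect first, and fall back to chardet only when decoding actually fails.

```python
def detect_encoding(file_path, default='utf-8'):
    """Try the cheap path first; fall back to chardet only on failure."""
    try:
        with open(file_path, encoding=default) as f:
            f.read(100000)
        return default
    except UnicodeDecodeError:
        import chardet
        with open(file_path, 'rb') as f:
            return chardet.detect(f.read(100000))['encoding'] or 'latin1'
```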
Building the click dataset with negative sampling, and the shared feature-engineering helpers:

```python
def build_click_dataset(hist_exposure, hist_click, sample_ratio=0.1):
    """Build the click dataset, with negative sampling."""
    if hist_exposure.empty or hist_click.empty:
        print("⚠️ Historical exposure or click data is empty; cannot build the dataset")
        return pd.DataFrame()

    # Mark positive samples
    hist_click = hist_click.copy()
    hist_click['label'] = 1

    # Mark negative samples efficiently (index operations are fast here)
    exposure_index = hist_exposure.set_index(['did', 'vid']).index
    click_index = hist_click.set_index(['did', 'vid']).index
    # Exposures that were never clicked
    negative_index = exposure_index.difference(click_index)

    # Build the negative-sample DataFrame
    if not negative_index.empty:
        negative_samples = pd.DataFrame(list(negative_index), columns=['did', 'vid'])
        negative_samples['label'] = 0
        # Downsample the negatives
        if sample_ratio < 1.0:
            negative_samples = negative_samples.sample(frac=sample_ratio, random_state=42)
    else:
        negative_samples = pd.DataFrame(columns=['did', 'vid', 'label'])

    # Combine positives and negatives
    click_data = pd.concat([
        hist_click[['did', 'vid', 'label']],
        negative_samples
    ], ignore_index=True)

    # Release memory
    del exposure_index, click_index, negative_index, negative_samples
    gc.collect()
    return click_data

def add_click_features(df, did_features, vid_info, hist_click, hist_play):
    """Add the key features without blowing up memory."""
    if df.empty:
        return df

    # Base user features - keep only the columns we need
    if not did_features.empty and 'did' in did_features.columns:
        did_cols = ['did'] + [col for col in did_features.columns if col.startswith('f')]
        df = df.merge(did_features[did_cols], on='did', how='left')

    # Base video features - categorical columns only
    if not vid_info.empty and 'vid' in vid_info.columns:
        vid_cols = ['vid'] + [col for col in vid_info.columns if col in
                              ['item_cid', 'item_type', 'item_assetSource',
                               'item_classify', 'item_isIntact']]
        df = df.merge(vid_info[vid_cols], on='vid', how='left')

    # User click count. Note the join happens before the column exists:
    # pre-creating 'user_click_count' and then joining a series of the same
    # name raises a column-overlap error, so the default goes in an else branch.
    if not hist_click.empty and 'did' in hist_click.columns:
        user_click_count = hist_click.groupby('did').size().rename('user_click_count')
        df = df.join(user_click_count, on='did', how='left')
        df['user_click_count'] = df['user_click_count'].fillna(0)
    else:
        df['user_click_count'] = 0

    # User total play time
    if not hist_play.empty and 'did' in hist_play.columns and 'play_time' in hist_play.columns:
        user_total_play = hist_play.groupby('did')['play_time'].sum().rename('user_total_play')
        df = df.join(user_total_play, on='did', how='left')
        df['user_total_play'] = df['user_total_play'].fillna(0)
    else:
        df['user_total_play'] = 0

    # Video popularity
    if not hist_click.empty and 'vid' in hist_click.columns:
        video_click_count = hist_click.groupby('vid').size().rename('video_click_count')
        df = df.join(video_click_count, on='vid', how='left')
        df['video_click_count'] = df['video_click_count'].fillna(0)
    else:
        df['video_click_count'] = 0

    # Average play time per video
    if not hist_play.empty and 'vid' in hist_play.columns and 'play_time' in hist_play.columns:
        avg_play_time = hist_play.groupby('vid')['play_time'].mean().rename('avg_play_time')
        df = df.join(avg_play_time, on='vid', how='left')
        df['avg_play_time'] = df['avg_play_time'].fillna(0)
    else:
        df['avg_play_time'] = 0

    # Fill any remaining missing values
    fill_values = {
        'user_click_count': 0,
        'user_total_play': 0,
        'video_click_count': df['video_click_count'].median() if 'video_click_count' in df else 0,
        'avg_play_time': df['avg_play_time'].median() if 'avg_play_time' in df else 0
    }
    for col, value in fill_values.items():
        if col in df:
            df[col] = df[col].fillna(value)

    # Drop date-related features
    if 'date' in df:
        df = df.drop(columns=['date'], errors='ignore')
    return df

def get_categorical_features(df, base_features):
    """Return the categorical features that actually exist in df."""
    existing_features = []
    for feature in base_features:
        if feature in df.columns:
            try:
                # If the column converts cleanly to numeric, skip it
                pd.to_numeric(df[feature], errors='raise')
            except (ValueError, TypeError):
                existing_features.append(feature)
                # Make sure the column is stored as an ordered category
                df[feature] = df[feature].astype('category').cat.as_ordered()
    return existing_features
```
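One concrete speed-up in build_click_dataset, sketched under the assumption that (did, vid) pairs are unique per row: `list(negative_index)` materializes millions of Python tuples before the sample is even drawn. Sampling positions on the MultiIndex first, and converting only the sampled rows, avoids that; sample_negatives below is a hypothetical drop-in for the negative-sampling block.

```python
import numpy as np
import pandas as pd

def sample_negatives(negative_index, sample_ratio, seed=42):
    """Sample from a MultiIndex of (did, vid) pairs before materializing rows."""
    n = len(negative_index)
    k = max(1, int(n * sample_ratio))
    take = np.random.default_rng(seed).choice(n, size=k, replace=False)
    sampled = negative_index[take].to_frame(index=False)  # only k rows built
    sampled.columns = ['did', 'vid']
    sampled['label'] = 0
    return sampled

# Tiny demo with made-up ids
idx = pd.MultiIndex.from_product([['u1', 'u2'], ['v1', 'v2', 'v3']])
print(sample_negatives(idx, sample_ratio=0.5))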
Building the completion-rate dataset. Note that the interaction feature needs user_click_count and video_click_count, which this function must compute itself (the prediction path gets them from add_click_features, and the two paths have to produce the same columns):

```python
def build_play_dataset(hist_play, vid_info, did_features, hist_click):
    """Build the completion-rate dataset, keeping memory usage low."""
    if hist_play.empty:
        print("⚠️ Historical play data is empty; cannot build the completion-rate dataset")
        return pd.DataFrame()

    # Base data - only the columns we need
    play_data = hist_play[['did', 'vid', 'play_time']].copy()

    # Attach video durations
    if not vid_info.empty and 'vid' in vid_info.columns and 'item_duration' in vid_info.columns:
        play_data = play_data.merge(vid_info[['vid', 'item_duration']], on='vid', how='left')
        play_data['item_duration'] = play_data['item_duration'].fillna(1.0)
    else:
        play_data['item_duration'] = 1.0  # default

    # Completion rate, capped at 1.0
    play_data['completion_rate'] = play_data['play_time'] / play_data['item_duration']
    play_data['completion_rate'] = play_data['completion_rate'].clip(upper=1.0)

    # User features - numeric columns only
    if not did_features.empty and 'did' in did_features.columns:
        did_cols = ['did'] + [col for col in did_features.columns if col.startswith('f')]
        play_data = play_data.merge(did_features[did_cols], on='did', how='left')

    # Video features - categorical columns only
    if not vid_info.empty and 'vid' in vid_info.columns:
        vid_cols = ['vid'] + [col for col in vid_info.columns if col in
                              ['item_cid', 'item_type', 'item_assetSource',
                               'item_classify', 'item_isIntact']]
        play_data = play_data.merge(vid_info[vid_cols], on='vid', how='left')

    # Per-user completion statistics - transform avoids a large temporary frame
    play_data['user_avg_completion'] = play_data.groupby('did')['completion_rate'].transform('mean')
    play_data['user_play_count'] = play_data.groupby('did')['completion_rate'].transform('count')

    # Per-video completion statistics
    play_data['video_avg_completion'] = play_data.groupby('vid')['completion_rate'].transform('mean')
    play_data['video_completion_std'] = play_data.groupby('vid')['completion_rate'].transform('std')

    # User-video interaction counts
    if not hist_click.empty and 'did' in hist_click.columns and 'vid' in hist_click.columns:
        user_vid_clicks = hist_click.groupby(['did', 'vid']).size().reset_index(name='user_vid_clicks')
        play_data = play_data.merge(user_vid_clicks, on=['did', 'vid'], how='left')
        # Click counts per user and per video, needed by interaction_feature below
        play_data['user_click_count'] = play_data['did'].map(hist_click.groupby('did').size()).fillna(0)
        play_data['video_click_count'] = play_data['vid'].map(hist_click.groupby('vid').size()).fillna(0)
    else:
        play_data['user_vid_clicks'] = 0
        play_data['user_click_count'] = 0
        play_data['video_click_count'] = 0

    # Interaction feature - must match what the prediction path builds
    play_data['interaction_feature'] = (
        play_data['user_click_count'] * play_data['video_click_count']
    ).astype('float32')

    # Fill missing values
    play_data['user_avg_completion'] = play_data['user_avg_completion'].fillna(play_data['completion_rate'].mean())
    play_data['user_play_count'] = play_data['user_play_count'].fillna(1)
    play_data['video_avg_completion'] = play_data['video_avg_completion'].fillna(play_data['completion_rate'].median())
    play_data['video_completion_std'] = play_data['video_completion_std'].fillna(0)
    play_data['user_vid_clicks'] = play_data['user_vid_clicks'].fillna(0)
    return play_data
```
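A speed note on all of these groupbys: did and vid are category columns, and pandas' groupby has historically enumerated every declared category by default, observed or not. Passing observed=True (a standard groupby parameter) restricts the result to categories that actually occur, which can shrink these aggregations dramatically when the category sets are large. A small sketch:

```python
import pandas as pd

# 'did' declares 10000 categories but the data only uses two of them
df = pd.DataFrame({
    'did': pd.Categorical(['u1', 'u1', 'u2'],
                          categories=[f'u{i}' for i in range(10000)]),
    'play_time': [1.0, 2.0, 3.0],
})
print(len(df.groupby('did')['play_time'].sum()))                 # 10000 rows
print(len(df.groupby('did', observed=True)['play_time'].sum()))  # 2 rows
```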
Batched prediction. The fallback vid is computed up front (the original fill-in step at the bottom referenced it before it was guaranteed to exist), and because a user's candidate rows can straddle a chunk boundary, the one-row-per-did rule is enforced once at the end rather than per chunk only:

```python
def predict_for_test_data(test_users, test_exposure, did_features, vid_info):
    """Generate predictions - the result must have one row per test user."""
    if test_users.empty:
        print("⚠️ Test user data is empty; cannot predict")
        return pd.DataFrame()

    # Fallback vid, defined up front so the fill-in step at the end can use it
    most_common_vid = test_exposure['vid'].mode()[0] if not test_exposure.empty else 'default_vid'

    # Make sure every test user has a record
    if test_exposure.empty:
        print("⚠️ Test exposure data is empty; using a default video")
        test_data = test_users.copy()
        test_data['vid'] = vid_info['vid'].iloc[0] if not vid_info.empty else 'default_vid'
    else:
        # Left-merge so that every test user is kept
        test_data = test_users.merge(test_exposure, on='did', how='left')
        test_data['vid'] = test_data['vid'].fillna(most_common_vid)

    # Predict in batches of 50k rows to avoid running out of memory
    chunk_size = 50000
    results = []
    for i in tqdm(range(0, len(test_data), chunk_size), desc="Batched prediction"):
        chunk = test_data.iloc[i:i + chunk_size].copy()

        # Add features (no click/play history is available at prediction time)
        chunk = add_click_features(chunk, did_features, vid_info,
                                   pd.DataFrame(), pd.DataFrame())

        # Cast the categorical feature columns in place
        get_categorical_features(chunk, base_categorical_features)

        # Click probability
        X_chunk = chunk.drop(columns=['did', 'vid'], errors='ignore')
        if model_click and not X_chunk.empty:
            if len(X_chunk.columns) != len(click_features):
                print(f"⚠️ Click model feature count mismatch: "
                      f"{len(click_features)} at training time, {len(X_chunk.columns)} at prediction time")
            # Align features: add the missing ones as zeros, drop the extras
            for feature in set(click_features) - set(X_chunk.columns):
                X_chunk[feature] = 0
            X_chunk = X_chunk[click_features]
            click_probs = model_click.predict(X_chunk)
        else:
            click_probs = [0.5] * len(chunk)  # default

        # Completion rate
        if model_play and not X_chunk.empty:
            # Attach video durations
            if not vid_info.empty and 'vid' in vid_info.columns and 'item_duration' in vid_info.columns:
                chunk = chunk.merge(vid_info[['vid', 'item_duration']], on='vid', how='left')
            else:
                chunk['item_duration'] = 1.0
            # Interaction feature - must match training
            chunk['interaction_feature'] = (chunk['user_click_count'] * chunk['video_click_count']).astype('float32')

            X_play_chunk = chunk.drop(columns=['did', 'vid'], errors='ignore')
            if len(X_play_chunk.columns) != len(play_features):
                print(f"⚠️ Completion model feature count mismatch: "
                      f"{len(play_features)} at training time, {len(X_play_chunk.columns)} at prediction time")
            for feature in set(play_features) - set(X_play_chunk.columns):
                X_play_chunk[feature] = 0
            X_play_chunk = X_play_chunk[play_features]
            completion_rates = model_play.predict(X_play_chunk)
        else:
            completion_rates = [0.7] * len(chunk)  # default

        # Within the chunk, keep the highest-probability vid per did
        chunk['click_prob'] = click_probs
        chunk['completion_rate'] = completion_rates
        chunk_result = chunk.sort_values('click_prob', ascending=False).groupby('did').head(1)
        chunk_result = chunk_result[['did', 'vid', 'click_prob', 'completion_rate']].copy()
        results.append(chunk_result)

        # Release memory
        del chunk, X_chunk, click_probs, completion_rates, chunk_result
        gc.collect()

    # Combine the batches
    if results:
        result = pd.concat(results, ignore_index=True)
        # Enforce one row per did across chunks, keeping the best click probability
        result = result.sort_values('click_prob', ascending=False).drop_duplicates('did')
        result = result.drop(columns=['click_prob'])
    else:
        result = pd.DataFrame(columns=['did', 'vid', 'completion_rate'])
    result = result.rename(columns={'completion_rate': 'predicted_completion_rate'})

    # Fill in any users that got no prediction
    if len(result) != len(test_users):
        missing_dids = set(test_users['did']) - set(result['did'])
        print(f"⚠️ Warning: {len(missing_dids)} users have no prediction; filling with defaults")
        default_df = pd.DataFrame({
            'did': list(missing_dids),
            'vid': most_common_vid,
            'predicted_completion_rate': np.mean(result['predicted_completion_rate']) if not result.empty else 0.7
        })
        result = pd.concat([result, default_df], ignore_index=True)
    return result
```
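The manual feature-alignment loop above can be collapsed into a single reindex call, which adds missing columns with a fill value, drops extras, and fixes the column order in one vectorized step. A minimal sketch with made-up column names:

```python
import pandas as pd

click_features = ['f0', 'f1', 'user_click_count']      # training-time columns
X_chunk = pd.DataFrame({'f0': [0.1], 'extra': [9.9]})  # prediction-time columns
X_chunk = X_chunk.reindex(columns=click_features, fill_value=0)
print(X_chunk.columns.tolist())  # ['f0', 'f1', 'user_click_count']
```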
The main pipeline, part 1 - loading the tables and training the click model:

```python
if __name__ == "__main__":
    # Memory-friendly dtypes
    dtypes = {
        'did': 'category',
        'vid': 'category',
        'play_time': 'float32'
    }

    # Optional features - only added when present in the data
    optional_features = {
        'item_cid': 'category',
        'item_type': 'category',
        'item_assetSource': 'category',
        'item_classify': 'category',
        'item_isIntact': 'category',
        'sid': 'category',
        'stype': 'category'
    }

    # The f0..f87 feature columns
    for i in range(88):
        dtypes[f'f{i}'] = 'float32'

    # Load the core tables
    print("Loading core data...")
    did_features = load_data_safely('did_features_table.csv', dtype=dtypes)
    vid_info = load_data_safely('vid_info_table.csv', dtype=dtypes)

    # Register optional dtypes for the columns that actually exist
    for feature, dtype in optional_features.items():
        if not vid_info.empty and feature in vid_info.columns:
            dtypes[feature] = dtype

    # Reload so every column gets the right dtype
    if os.path.exists('did_features_table.csv'):
        did_features = load_data_safely('did_features_table.csv', dtype=dtypes)
    else:
        print("⚠️ did_features_table.csv does not exist")
        did_features = pd.DataFrame()
    if os.path.exists('vid_info_table.csv'):
        vid_info = load_data_safely('vid_info_table.csv', dtype=dtypes)
    else:
        print("⚠️ vid_info_table.csv does not exist")
        vid_info = pd.DataFrame()

    # Load the historical data
    print("Loading historical data...")
    hist_exposure, hist_click, hist_play = load_historical_data(days=30)
    print(f"Exposure data shape: {hist_exposure.shape if not hist_exposure.empty else 'empty'}")
    print(f"Click data shape: {hist_click.shape if not hist_click.empty else 'empty'}")
    print(f"Play data shape: {hist_play.shape if not hist_play.empty else 'empty'}")

    # Fall back to click data if there is no play data
    if hist_play.empty:
        print("⚠️ Warning: play data is empty; substituting click data")
        hist_play = hist_click.copy()
        hist_play['play_time'] = 1.0  # default play time
        print(f"Substitute play data shape: {hist_play.shape}")

    # Build the click dataset
    if not hist_exposure.empty and not hist_click.empty:
        print("Building the click dataset...")
        click_train_data = build_click_dataset(hist_exposure, hist_click, sample_ratio=0.1)
    else:
        print("⚠️ Cannot build the click dataset: exposure or click data is empty")
        click_train_data = pd.DataFrame()

    # Add features
    if not click_train_data.empty:
        print("Building click features...")
        click_train_data = add_click_features(click_train_data, did_features, vid_info,
                                              hist_click, hist_play)
    else:
        print("⚠️ Click dataset is empty; skipping feature construction")

    # Base categorical feature list (no date features)
    base_categorical_features = [
        'item_cid', 'item_type', 'item_assetSource',
        'item_classify', 'item_isIntact', 'sid', 'stype'
    ]

    # Categorical features that actually exist
    categorical_features = []
    if not click_train_data.empty:
        categorical_features = get_categorical_features(click_train_data, base_categorical_features)
        print(f"Categorical features in use: {categorical_features}")
    else:
        print("⚠️ Click training data is empty; no categorical features")

    # Training matrices
    if not click_train_data.empty:
        X = click_train_data.drop(columns=['did', 'vid', 'label'], errors='ignore')
        y = click_train_data['label']
    else:
        X, y = pd.DataFrame(), pd.Series(dtype='float64')
        print("⚠️ Click training data is empty")

    # Train/validation split
    if not X.empty and not y.empty:
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y
        )
    else:
        print("⚠️ No training data; skipping model training")
        X_train, X_val = pd.DataFrame(), pd.DataFrame()
        y_train, y_val = pd.Series(dtype='float64'), pd.Series(dtype='float64')

    # Click model parameters
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 63,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'min_child_samples': 100,
        'verbosity': -1,
        'max_bin': 255  # fewer bins -> less memory
    }

    model_click = None
    if not X_train.empty:
        train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
        val_data = lgb.Dataset(X_val, label=y_val, categorical_feature=categorical_features)
        print("Training the click model...")
        model_click = lgb.train(
            params,
            train_data,
            num_boost_round=1500,
            valid_sets=[val_data],
            callbacks=[
                early_stopping(stopping_rounds=100, verbose=True),
                log_evaluation(period=50)
            ]
        )
        # Remember which features the click model was trained on
        click_features = list(X_train.columns)
        joblib.dump(click_features, 'click_features.pkl')

        # Validation AUC
        y_val_pred = model_click.predict(X_val)
        auc_score = roc_auc_score(y_val, y_val_pred)
        print(f"📊 Click model validation AUC: {auc_score:.6f}")
        with open('model_metrics.txt', 'w') as f:
            f.write(f"Click model AUC: {auc_score:.6f}\n")

        # Release memory
        del X_train, X_val, y_train, y_val, train_data, val_data
        gc.collect()
    else:
        print("⚠️ No training data; skipping the click model")
```
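If memory during training is the bottleneck, one more hedged option is to downcast any float64 feature columns to float32 before constructing the lgb.Dataset; LightGBM bins feature values anyway, so the precision loss is normally irrelevant. downcast_floats below is a hypothetical helper, not part of the script:

```python
import pandas as pd

def downcast_floats(df: pd.DataFrame) -> pd.DataFrame:
    """Halve the memory of float64 columns by casting them to float32."""
    for col in df.select_dtypes(include='float64').columns:
        df[col] = df[col].astype('float32')
    return df

demo = pd.DataFrame({'f0': [0.1, 0.2], 'f1': [1.0, 2.0]})
print(downcast_floats(demo).dtypes)
```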
The main pipeline, part 2 - training the completion-rate model, persisting both models, and running the prediction:

```python
    # Build the completion-rate dataset
    print("Building the completion-rate dataset...")
    if not hist_play.empty:
        play_train_data = build_play_dataset(hist_play, vid_info, did_features, hist_click)
    else:
        print("⚠️ Cannot build the completion-rate dataset: play data is empty")
        play_train_data = pd.DataFrame()

    # Train the completion-rate model
    model_play = None
    if not play_train_data.empty:
        X_play = play_train_data.drop(
            columns=['did', 'vid', 'play_time', 'item_duration', 'completion_rate'],
            errors='ignore'
        )
        y_play = play_train_data['completion_rate']
    else:
        X_play, y_play = pd.DataFrame(), pd.Series(dtype='float64')
        print("⚠️ Completion-rate training data is empty")

    if not X_play.empty and not y_play.empty:
        X_train_play, X_val_play, y_train_play, y_val_play = train_test_split(
            X_play, y_play, test_size=0.2, random_state=42
        )
    else:
        print("⚠️ No completion-rate training data; skipping model training")
        X_train_play, X_val_play = pd.DataFrame(), pd.DataFrame()
        y_train_play, y_val_play = pd.Series(dtype='float64'), pd.Series(dtype='float64')

    # Categorical features for the completion-rate model
    play_categorical_features = []
    if not play_train_data.empty:
        play_categorical_features = get_categorical_features(play_train_data, base_categorical_features)
        print(f"Completion-rate model categorical features: {play_categorical_features}")
    else:
        print("⚠️ Completion-rate training data is empty; no categorical features")

    # Regression parameters, tuned for memory
    params_reg = {
        'objective': 'regression',
        'metric': 'mae',
        'boosting_type': 'gbdt',
        'num_leaves': 63,       # limit tree complexity
        'learning_rate': 0.03,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'bagging_freq': 5,
        'lambda_l1': 0.1,
        'lambda_l2': 0.1,
        'min_data_in_leaf': 100,
        'verbosity': -1,
        'max_bin': 255          # fewer bins -> less memory
    }

    if not X_train_play.empty:
        train_data_play = lgb.Dataset(X_train_play, label=y_train_play,
                                      categorical_feature=play_categorical_features)
        val_data_play = lgb.Dataset(X_val_play, label=y_val_play,
                                    categorical_feature=play_categorical_features)
        print("Training the completion-rate model...")
        model_play = lgb.train(
            params_reg,
            train_data_play,
            num_boost_round=1000,  # fewer rounds
            valid_sets=[val_data_play],
            callbacks=[
                early_stopping(stopping_rounds=100, verbose=True),
                log_evaluation(period=50)
            ]
        )
        # Remember which features the completion-rate model was trained on
        play_features = list(X_train_play.columns)
        joblib.dump(play_features, 'play_features.pkl')

        # Validation MAE
        y_pred_val = model_play.predict(X_val_play)
        mae = mean_absolute_error(y_val_play, y_pred_val)
        print(f"📊 Completion-rate model validation MAE: {mae:.6f}")
        with open('model_metrics.txt', 'a') as f:
            f.write(f"Completion-rate model MAE: {mae:.6f}\n")

        # Release memory
        del X_train_play, X_val_play, y_train_play, y_val_play, train_data_play, val_data_play
        gc.collect()
    else:
        print("⚠️ No training data; skipping the completion-rate model")

    # Persist the models
    if model_click:
        model_click.save_model('click_model.txt')
    if model_play:
        model_play.save_model('play_model.txt')
    joblib.dump(base_categorical_features, 'categorical_features.pkl')

    # If the models were not trained this run, load them from disk
    if not model_click:
        try:
            model_click = lgb.Booster(model_file='click_model.txt')
            click_features = joblib.load('click_features.pkl')
            print("✅ Loaded the click model and its feature list from disk")
        except Exception:
            print("⚠️ Could not load the click model")
    if not model_play:
        try:
            model_play = lgb.Booster(model_file='play_model.txt')
            play_features = joblib.load('play_features.pkl')
            print("✅ Loaded the completion-rate model and its feature list from disk")
        except Exception:
            print("⚠️ Could not load the completion-rate model")

    # Load the prediction inputs
    print("Loading prediction data...")
    to_predict_users = load_data_safely('testA_pred_did.csv', dtype={'did': 'category'})
    to_predict_exposure = load_data_safely('testA_did_show.csv',
                                           dtype={'did': 'category', 'vid': 'category'})

    # Run the prediction
    if not to_predict_users.empty:
        print("Generating predictions...")
        submission = predict_for_test_data(to_predict_users, to_predict_exposure,
                                           did_features, vid_info)

        # Row-count sanity check
        if len(submission) != len(to_predict_users):
            print(f"⚠️ Row count mismatch: {len(submission)} prediction rows vs "
                  f"{len(to_predict_users)} test users")
            missing_dids = set(to_predict_users['did']) - set(submission['did'])
            if missing_dids:
                print(f"Adding the {len(missing_dids)} missing users")
                default_vid = vid_info['vid'].iloc[0] if not vid_info.empty else 'default_vid'
```
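One training-related aside before the final save step, hedged: when a model is trained with early stopping, the booster records best_iteration, and passing it explicitly at predict time guards against LightGBM versions that would otherwise score with every tree. A tiny self-contained demonstration on synthetic data:

```python
import numpy as np
import lightgbm as lgb

# Synthetic dataset, just to show the call pattern
X = np.random.rand(200, 3)
y = (X[:, 0] > 0.5).astype(int)
train = lgb.Dataset(X[:150], label=y[:150])
valid = lgb.Dataset(X[150:], label=y[150:])
bst = lgb.train({'objective': 'binary', 'verbosity': -1}, train,
                num_boost_round=200, valid_sets=[valid],
                callbacks=[lgb.early_stopping(10, verbose=False)])
preds = bst.predict(X[150:], num_iteration=bst.best_iteration)
print(bst.best_iteration, preds[:3])
```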
The tail of the main block - filling in the missing users and saving the submission:

```python
                missing_df = pd.DataFrame({
                    'did': list(missing_dids),
                    'vid': default_vid,
                    'predicted_completion_rate': submission['predicted_completion_rate'].mean()
                })
                submission = pd.concat([submission, missing_df], ignore_index=True)

        # Save the results
        if not submission.empty:
            print(f"Prediction rows: {len(submission)} (should match the test user count)")
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            output_file = f'submission_{timestamp}.csv'
            # Save as a headerless CSV
            submission.to_csv(output_file, index=False, header=False)
            print(f"Predictions saved to: {output_file}")
            print(f"Result format: {len(submission)} rows")
            print("Columns: [did, vid, predicted_completion_rate]")
            # Sanity-check the distribution of the predictions
            print(f"Completion-rate predictions: min={submission['predicted_completion_rate'].min():.4f}, "
                  f"max={submission['predicted_completion_rate'].max():.4f}, "
                  f"mean={submission['predicted_completion_rate'].mean():.4f}")
        else:
            print("⚠️ Prediction result is empty; nothing saved")
    else:
        print("⚠️ Failed to load prediction data; cannot generate results")
```
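Back to the original request for speed-ups: this script calls gc.collect() after nearly every file and every chunk, and each call is a full collection whose cost grows with the size of the live heap (the same effect as in the game story above). With large DataFrames alive, dozens of calls can add noticeable wall time. Dropping most of the manual calls, or collecting only the youngest generation, is often a cheap win; a hedged micro-benchmark:

```python
import gc
import time

data = [list(range(50)) for _ in range(100_000)]  # a large live heap

start = time.perf_counter()
for _ in range(20):
    gc.collect()          # full collection: scans the whole heap every time
print(f"20 full collections:  {time.perf_counter() - start:.3f}s")

start = time.perf_counter()
for _ in range(20):
    gc.collect(0)         # generation-0 only: much cheaper
print(f"20 gen-0 collections: {time.perf_counter() - start:.3f}s")
del data
```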