Daily Practice 165: Counting Days

This post shows how to use Java to find the maximum length of a strictly increasing contiguous subarray in a given integer array: traverse the array once, maintain the length of the current increasing run, and print the maximum length seen.
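For example, given the array 1 2 3 2 4 5 6, the answer is 4: the longest strictly increasing run is 2 4 5 6.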

package cn.itcast.girl.TheBlueCup_02;

import java.util.Scanner;

public class TheStatisticalNumbersOfDays {

    public static void main(String[] args) {
        Scanner sc = new Scanner(System.in);
        // Read the array length m, then the m elements.
        int m = sc.nextInt();
        int[] a = new int[m];
        for (int i = 0; i < m; i++) {
            a[i] = sc.nextInt();
        }
        // max: longest strictly increasing run seen so far;
        // sum: length of the run ending at the current element.
        int max = 1;
        int sum = 1;
        for (int i = 1; i < m; i++) {
            if (a[i] > a[i - 1]) {
                sum++;
                max = Math.max(max, sum);
            } else {
                sum = 1; // run broken, restart from this element
            }
        }
        System.out.println(max);
        sc.close();
    }

}
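If you want to exercise the scan without typing input into Scanner, here is a minimal standalone sketch (the MaxRunSketch class and maxRun method are my own names, not part of the original solution) that factors the same loop into a pure method and runs it on the example above:

import java.util.Arrays;

public class MaxRunSketch {

    // Length of the longest strictly increasing contiguous run.
    // Returns 0 for an empty array, an edge case the contest main
    // never reaches because it assumes m >= 1.
    static int maxRun(int[] a) {
        if (a.length == 0) return 0;
        int max = 1, run = 1;
        for (int i = 1; i < a.length; i++) {
            if (a[i] > a[i - 1]) {
                run++;
                max = Math.max(max, run);
            } else {
                run = 1; // run broken, restart from this element
            }
        }
        return max;
    }

    public static void main(String[] args) {
        int[] sample = {1, 2, 3, 2, 4, 5, 6};
        // Longest run is 2, 4, 5, 6 -> length 4
        System.out.println(Arrays.toString(sample) + " -> " + maxRun(sample));
    }
}

It prints [1, 2, 3, 2, 4, 5, 6] -> 4. Extracting the loop into a pure method also makes the empty-array case explicit; the contest version assumes m >= 1, since it would print 1 even for an empty array.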
 
