本次参加的比赛为科大讯飞的新增用户预测挑战赛
一、背景
科大讯飞主办的一项数据科学竞赛,旨在通过机器学习方法预测用户是否为新增用户。
比赛属于二分类任务,评价指标采用F1分数,分数越高表示模型性能越好。
行业价值:
精准预测用户增长趋势,优化产品迭代方向
降低用户获取成本,提高营销转化率
为AI能力落地提供量化评估依据
技术价值:
解决实际业务场景中的用户增长预测问题
验证AI在用户行为分析领域的有效性
建立可复用的用户增长预测方法论
二、数据介绍
参与算法赛事,一定要仔细理解赛事的输入与输出究竟是什么——模型能拿到哪些数据、需要预测什么,尤其要确认提交文件的格式要求。
三、baseline以及优化baseline
import pandas as pd
import numpy as np
import json
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
# 1. Load the raw competition data.
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./testA_data.csv')
# Submission skeleton: one row per test device id ('did').
submit = test_df.loc[:, ['did']]
# Row-wise stack of train + test for any preprocessing shared across splits.
full_df = pd.concat([train_df, test_df], axis=0)
# 2. Time feature engineering: derive calendar parts from the millisecond
# epoch column 'common_ts' on each frame. The intermediate datetime series
# is kept local instead of being added and then dropped as a column.
for frame in (train_df, test_df, full_df):
    parsed = pd.to_datetime(frame['common_ts'], unit='ms')
    frame['day'] = parsed.dt.day
    frame['dayofweek'] = parsed.dt.dayofweek
    frame['hour'] = parsed.dt.hour
# =================== everything below stays outside the loop ===================
############################### quick sanity check
# How many device ids appear in both the train and the test split?
train_dids = set(train_df['did'].unique())
test_dids = set(test_df['did'].unique())
overlap_dids = train_dids.intersection(test_dids)

num_overlap = len(overlap_dids)
num_train = len(train_dids)
num_test = len(test_dids)

# Guard against an empty split before dividing.
ratio_in_train = num_overlap / num_train if num_train else 0
ratio_in_test = num_overlap / num_test if num_test else 0

print(f"重叠 did 数量: {num_overlap}")
print(f"占 train 比例: {ratio_in_train:.4f} ({num_overlap}/{num_train})")
print(f"占 test 比例: {ratio_in_test:.4f} ({num_overlap}/{num_test})")
# Categorical columns that must be integer-encoded before training.
cat_features = [
    'device_brand', 'ntt', 'operator', 'common_country',
    'common_province', 'common_city', 'appver', 'channel',
    'os_type', 'udmap'
]

# Fit one LabelEncoder per column on the union of train + test values so
# categories that only occur in the test split still receive a code.
# Fitted encoders are kept for potential reuse/inverse-transform.
label_encoders = {}
for feature in cat_features:
    combined = pd.concat([train_df[feature], test_df[feature]]).astype(str)
    encoder = LabelEncoder().fit(combined)
    label_encoders[feature] = encoder
    train_df[feature] = encoder.transform(train_df[feature].astype(str))
    test_df[feature] = encoder.transform(test_df[feature].astype(str))
# Model input columns: raw id/categorical columns plus the derived time parts.
features = [
    # raw columns
    'mid', 'eid', 'device_brand', 'ntt', 'operator',
    'common_country', 'common_province', 'common_city',
    'appver', 'channel', 'os_type', 'udmap',
    # time-derived columns
    'hour', 'dayofweek', 'day', 'common_ts'
]

# Design matrices and the binary new-user target.
X_train = train_df[features]
y_train = train_df['is_new_did']
X_test = test_df[features]
# 6. F1 threshold optimization
def find_optimal_threshold(y_true, y_pred_proba, thresholds=None):
    """Return the decision threshold that maximizes the F1 score.

    Parameters
    ----------
    y_true : array-like of 0/1 ground-truth labels.
    y_pred_proba : array-like of predicted positive-class probabilities
        (anything supporting ``>=`` broadcasting and ``.astype``).
    thresholds : optional iterable of candidate cut-offs. Defaults to the
        original coarse grid 0.10–0.40 in steps of 0.05; pass a finer grid
        (e.g. ``np.arange(0.05, 0.95, 0.01)``) to search more widely.

    Returns
    -------
    tuple (best_threshold, best_f1). If no candidate achieves F1 > 0 the
    fallback (0.5, 0) is returned, matching the original behavior.
    """
    if thresholds is None:
        # Original hard-coded grid, kept as the default for compatibility.
        thresholds = (0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4)
    best_threshold = 0.5
    best_f1 = 0
    y_pred_proba = np.asarray(y_pred_proba)
    for threshold in thresholds:
        y_pred = (y_pred_proba >= threshold).astype(int)
        f1 = f1_score(y_true, y_pred)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
    return best_threshold, best_f1
# 7. Model training & cross-validation setup
import time

# Time-derived seed: deliberately different on every run so repeated runs
# produce different ensembles. NOTE(review): this sacrifices exact
# reproducibility — pin a constant if reruns must match.
seed = int(time.time()) % 1000000  # modulo keeps the value in a small range

# LightGBM binary-classification parameters.
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'max_depth': 12,            # fixed: was the string '12'; the parameter is an int
    'num_leaves': 63,           # < 2**max_depth; caps per-tree complexity
    'learning_rate': 0.1,
    'feature_fraction': 0.7,    # column subsampling per tree
    'bagging_fraction': 0.8,    # row subsampling
    'bagging_freq': 5,          # re-draw the bagging sample every 5 rounds
    'min_child_samples': 10,
    'verbose': -1,
    'n_jobs': 8,
    'seed': seed                # dynamic seed from above
}
# 5-fold stratified cross-validation. The fixed random_state keeps the fold
# split identical across runs (unlike the time-based LightGBM seed above).
n_folds = 5
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
test_preds = np.zeros(len(X_test))   # fold-averaged test-set probabilities
fold_thresholds = []                 # per-fold F1-optimal threshold
fold_f1_scores = []                  # per-fold F1 at that threshold
models = []                          # fitted boosters, one per fold
oof_preds = np.zeros(len(X_train))   # out-of-fold hard labels
oof_probas = np.zeros(len(X_train))  # out-of-fold probabilities
print("\n开始模型训练...")
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    print(f"\n======= Fold {fold + 1}/{n_folds} =======")
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    # Build LightGBM datasets. NOTE(review): the original comment claimed
    # categorical features are specified here, but categorical_feature is
    # never passed — the label-encoded columns are treated as numeric.
    # Confirm whether that was intended.
    train_set = lgb.Dataset(X_tr, label=y_tr)
    val_set = lgb.Dataset(X_val, label=y_val)
    # Train with early stopping on the validation fold (patience 50 rounds),
    # logging metrics every 100 rounds.
    model = lgb.train(
        params, train_set,
        num_boost_round=1000,
        valid_sets=[train_set, val_set],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=100)
        ]
    )
    models.append(model)
    # Predict probabilities on the held-out fold and store them out-of-fold.
    val_pred_proba = model.predict(X_val)
    oof_probas[val_idx] = val_pred_proba
    # Grid-search the F1-maximizing decision threshold for this fold.
    best_threshold, best_f1 = find_optimal_threshold(y_val, val_pred_proba)
    fold_thresholds.append(best_threshold)
    # Recompute F1 at the chosen threshold (same value as best_f1 above).
    val_pred_labels = (val_pred_proba >= best_threshold).astype(int)
    fold_f1 = f1_score(y_val, val_pred_labels)
    fold_f1_scores.append(fold_f1)
    oof_preds[val_idx] = val_pred_labels
    print(f"Fold {fold + 1} Optimal Threshold: {best_threshold:.4f}")
    print(f"Fold {fold + 1} F1 Score: {fold_f1:.5f}")
    # Accumulate an equal-weight average of fold predictions on the test set.
    test_preds += model.predict(X_test) / n_folds
# 8. Overall evaluation on the out-of-fold predictions.
# Final decision cut-off: the mean of the per-fold optimal thresholds.
avg_threshold = np.mean(fold_thresholds)
final_oof_preds = (oof_probas >= avg_threshold).astype(int)
final_f1 = f1_score(y_train, final_oof_preds)

mean_fold_f1 = np.mean(fold_f1_scores)
print("\n===== Final Results =====")
print(f"Average Optimal Threshold: {avg_threshold:.4f}")
print(f"Fold F1 Scores: {[f'{s:.5f}' for s in fold_f1_scores]}")
print(f"Average Fold F1: {mean_fold_f1:.5f}")
print(f"OOF F1 Score: {final_f1:.5f}")

# 9. Binarize the averaged test probabilities and write the submission file.
test_pred_labels = (test_preds >= avg_threshold).astype(int)
submit['is_new_did'] = test_pred_labels
# NOTE(review): only the label column is written — the 'did' column kept in
# `submit` is dropped here; confirm against the required submission format.
submit[['is_new_did']].to_csv('submit.csv', index=False)

print("\nSubmission file saved: submit.csv")
print(f"Predicted new user ratio: {test_pred_labels.mean():.4f}")
print(f"Test set size: {len(test_pred_labels)}")
# 10. Feature-importance analysis.
# Improvement: average the gain importance over all CV fold models instead of
# reading only models[0] — a single fold's ranking is noisier.
feature_importance = pd.DataFrame({
    'Feature': features,
    'Importance': np.mean(
        [m.feature_importance(importance_type='gain') for m in models],
        axis=0
    )
}).sort_values('Importance', ascending=False)
print("\nTop 10 Features:")
print(feature_importance.head(10))
优化基本按照 Datawhale 学习手册中 Task2 的思路进行,评分只提升了 0.04;感觉还没有真正调明白,变成了 dirty work。
后续再补充