Kaggle - Binary Prediction with a Rainfall Dataset (regression + feature engineering + XGBoost)

Binary Prediction with a Rainfall Dataset

Task:

Given each day's weather observations, predict that day's rainfall.

Data processing:

1. Construct new rainfall-related features from the existing weather columns
2. Derive month and season features from the time (day) column
3. Handle missing values (a minimal imputation sketch follows this list)
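
Step 3 is not spelled out in the full script further down; this is only a minimal sketch of one common approach (median imputation of the numeric columns), and the helper name impute_median is mine, not part of the original code:

import pandas as pd

def impute_median(df: pd.DataFrame) -> pd.DataFrame:
    # Median-impute every numeric column; non-numeric columns are left untouched.
    out = df.copy()
    num_cols = out.select_dtypes(include='number').columns
    out[num_cols] = out[num_cols].fillna(out[num_cols].median())
    return out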

Model building:

1. Train a LightGBM model
2. Train an XGBoost model and grid-search for its best hyperparameters
3. Blend the two models (a weight-selection sketch follows this list)
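
In the script below the blend is just a plain average of the two predictions; here is a hedged sketch of choosing the blend weight on the hold-out split instead (best_blend_weight and the commented usage lines are illustrative, not part of the original script):

import numpy as np
from sklearn.metrics import mean_absolute_error

def best_blend_weight(pred_a, pred_b, y_true):
    # Try weights 0.0, 0.1, ..., 1.0 for pred_a and keep the one with the lowest MAE.
    weights = np.linspace(0, 1, 11)
    scores = [mean_absolute_error(y_true, w * pred_a + (1 - w) * pred_b) for w in weights]
    return weights[int(np.argmin(scores))]

# Usage after both models are fitted:
# w = best_blend_weight(model_lgb.predict(x_val), model_xgb.predict(x_val), y_val)
# pred_all = w * pred_lgb + (1 - w) * pred_xgb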

Code:
import os
import sys
import warnings
import numpy as np
import pandas as pd
import seaborn
from matplotlib import pyplot as plt
import lightgbm
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from xgboost import XGBRegressor


def init():
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # only emit error-level logs
    warnings.simplefilter('ignore')  # suppress warnings
    pd.set_option('display.width', 1000)
    pd.set_option('display.max_colwidth', 1000)
    pd.set_option("display.max_rows", 1000)
    pd.set_option("display.max_columns", 1000)

def show_dataframe():
    print("查看特征值和特征值类型\n" + str(df_train.dtypes) + "\n" + "-"*100)
    print("查看前10行信息\n" + str(df_train.head()) + "\n" + "-"*100)
    print("查看每个特征值的各种数据统计信息\n" + str(df_train.describe()) + "\n" + "-" * 100)
    print("输出重复行的个数\n" + str(df_train.duplicated().sum()) + "\n" + "-" * 100)
    print("查看每列的缺失值个数\n" + str(df_train.isnull().sum()) + "\n" + "-" * 100)
    print("查看缺失值的具体信息\n" + str(df_train.info()) + "\n" + "-" * 100)
    print("输出X所有值出现的是什么,还有对应出现的次数\n" + str(df_train['X'].value_count()) + "\n" + "-" * 100)

def show_relation(data, colx, coly):  # plot the relationship between one feature and the target
    if data[colx].dtype == 'object' or data[colx].dtype == 'category' or len(data[colx].unique()) < 20:
        seaborn.boxplot(x=colx, y=coly, data=data)
    else:
        plt.scatter(data[colx], data[coly])
    plt.xlabel(colx)
    plt.ylabel(coly)
    plt.show()

def show_score(model_name,pred):
    mse = mean_squared_error(y_train, pred)
    mae = mean_absolute_error(y_train, pred)
    score = r2_score(y_train, pred)
    print(model_name)
    print(f"{'MSE':<10}{mse:<15.4f}")
    print(f"{'MAE':<10}{mae:<15.4f}")
    print(f"{'R²':<10}{score:<15.4f}")
    print("-"*100)

if __name__ == '__main__':
    df_train = pd.read_csv('/kaggle/input/playground-series-s5e3/train.csv')
    df_test = pd.read_csv('/kaggle/input/playground-series-s5e3/test.csv')

    init()
    #show_dataframe()

    df_all = pd.concat([df_train.drop(['id', 'rainfall'], axis=1), df_test.drop(['id'], axis=1)], axis=0)
    df_all['cha_temp'] = df_all['maxtemp'] - df_all['mintemp']  # diurnal temperature range
    df_all['cha_temp_dew'] = df_all['temparature'] - df_all['dewpoint']  # temperature-dewpoint spread ('temparature' is the dataset's own spelling)
    df_all['humidity_norm'] = df_all['humidity'] / 100  # scale humidity to the 0-1 range
    df_all['precip_potential'] = (df_all['humidity_norm'] * df_all['cloud']) / (df_all['windspeed'] + 1e-5)  # precipitation-potential index
    df_all['solar_energy'] = df_all['sunshine'] * (df_all['maxtemp'] - df_all['mintemp'])  # sunshine energy proxy
    df_all['month'] = (((df_all['day'] - 1) % 365) // 30.44).astype(int) + 1  # approximate month from the running day index ('day' is an integer, not a date string)
    df_all['season'] = df_all['month'] % 12 // 3 + 1  # season feature: Dec-Feb=1, Mar-May=2, Jun-Aug=3, Sep-Nov=4
    
    X_train = df_all[:df_train.shape[0]]
    Y_train = df_train['rainfall']
    x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)  # hold-out split for validation

    x_test = df_all[df_train.shape[0]:]

    model_lgb = lightgbm.LGBMRegressor(
        n_estimators=3000,  # many trees, intended to be paired with early stopping
        learning_rate=0.03,  # lower learning rate
        num_leaves=15,  # limit model complexity
        min_child_samples=25,  # larger minimum number of samples per leaf
        reg_alpha=0.1,  # L1 regularization
        reg_lambda=0.1,  # L2 regularization
        objective='regression_l1',  # use MAE (L1) loss
        verbose=100
    )
    model_lgb.fit(x_train, y_train)
    pred_lgb = model_lgb.predict(x_test)
    show_score(model_name='lgb', pred=model_lgb.predict(x_train))

    xgb_param_grid = {
        'n_estimators': [50, 100, 200, 250],  # number of trees
        'max_depth': [3, 4, 5, 6],  # maximum tree depth
        'learning_rate': [0.01, 0.05, 0.1, 0.3],  # learning rate
        'subsample': [0.8, 1.0],  # fraction of rows sampled per tree (must be in (0, 1], so 1.3 is not a valid value)
        #'colsample_bytree': [0.8, 1.0],  # fraction of features sampled per tree
        #'gamma': [0, 0.1, 0.2],  # minimum loss reduction required to make a split
        #'min_child_weight': [1, 3, 5],  # minimum sum of instance weight needed in a child
        #'reg_alpha': [0, 0.1, 1],  # L1 regularization strength
        #'reg_lambda': [0, 0.1, 1]  # L2 regularization strength
    }
    model_xgb = GridSearchCV(
        estimator=XGBRegressor(random_state=42),  # base model whose hyperparameters are searched
        param_grid=xgb_param_grid,  # candidate hyperparameter values
        cv=4,  # 4-fold cross-validation
    )
    model_xgb.fit(x_train, y_train)
    pred_xgb = model_xgb.predict(x_test)
    show_score(model_name='xgb', pred=model_xgb.predict(x_train))

    pred_all = 0.5 * pred_lgb + 0.5 * pred_xgb  # simple average blend of the two models (step 3 above)
    submission = pd.DataFrame({
        'id': df_test['id'],
        'rainfall': np.clip(pred_all, 0, 1)  # submission target column; clamp predictions into the valid [0, 1] range
    })
    submission.to_csv('/kaggle/working/submission.csv', index=False)
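
One closing note: the n_estimators=3000 comment above mentions early stopping, but none is configured and the x_val/y_val hold-out split is never used. Below is a minimal sketch of wiring them together via LightGBM's callback API (an assumption about how one could do it, reusing the variables from the script above; lightgbm.early_stopping is available in recent LightGBM releases):

model_lgb = lightgbm.LGBMRegressor(
    n_estimators=3000,
    learning_rate=0.03,
    num_leaves=15,
    objective='regression_l1',
)
model_lgb.fit(
    x_train, y_train,
    eval_set=[(x_val, y_val)],                                  # evaluate on the hold-out split
    eval_metric='l1',                                           # match the L1 training objective
    callbacks=[lightgbm.early_stopping(stopping_rounds=100)],   # stop when validation L1 stops improving
)
print("best iteration:", model_lgb.best_iteration_)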