Kaggle - Bike Sharing Prediction

本研究深入分析了自行车共享需求的预测模型,利用历史数据探讨了季节、天气、工作日等因素对需求的影响,并通过多种机器学习算法进行模型训练,包括线性回归、岭回归、Lasso回归和梯度提升等,旨在提高预测精度。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

import pylab
import calendar
import numpy as np
import pandas as pd
import seaborn as sn
from scipy import stats
import missingno as msno
from datetime import datetime
import matplotlib.pyplot as plt
import warnings
pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline

In [2]:

# Load the training CSV from a hard-coded, machine-specific Windows path.
dailyData = pd.read_csv("C:/Users/_lc/BikeSharingDemand/train.csv")
# Preview the first two rows.
dailyData.head(2)

Out[2]:

 datetimeseasonholidayworkingdayweathertempatemphumiditywindspeedcasualregisteredcount
02011-01-01 00:00:0010019.8414.395810.031316
12011-01-01 01:00:0010019.0213.635800.083240

 

Out[58]:

 timeday
02011-01-01 00:00:001
12011-01-01 01:00:001
22011-01-01 02:00:001
32011-01-01 03:00:001
42011-01-01 04:00:001

In [5]:

datetime - hourly date + timestamp
season - 1 = spring, 2 = summer, 3 = fall, 4 = winter
holiday - whether the day is considered a holiday
workingday - whether the day is neither a weekend nor holiday
weather -
1: Clear, Few clouds, Partly cloudy
2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
temp - temperature in Celsius
atemp - "feels like" temperature in Celsius
humidity - relative humidity
windspeed - wind speed
casual - number of non-registered user rentals initiated
registered - number of registered user rentals initiated
count - number of total rentals (Dependent Variable)

dailyData.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
datetime      10886 non-null object
season        10886 non-null int64
holiday       10886 non-null int64
workingday    10886 non-null int64
weather       10886 non-null int64
temp          10886 non-null float64
atemp         10886 non-null float64
humidity      10886 non-null int64
windspeed     10886 non-null float64
casual        10886 non-null int64
registered    10886 non-null int64
count         10886 non-null int64
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.6+ KB

In [3]:

# Keep the date part of the raw timestamp string (the text before the first
# whitespace). This must run while 'datetime' still holds strings.
dailyData['date'] = dailyData['datetime'].map(lambda raw: raw.split()[0])

# Parse the strings into timestamps, then read the calendar fields off them.
dailyData['datetime'] = pd.to_datetime(dailyData['datetime'])
for field in ('month', 'day', 'hour'):
    dailyData[field] = dailyData['datetime'].map(
        lambda stamp, field=field: getattr(stamp, field)
    )
dailyData['weekday'] = dailyData['datetime'].map(lambda stamp: stamp.weekday())
dailyData.head(2)

Out[3]:

 datetimeseasonholidayworkingdayweathertempatemphumiditywindspeedcasualregisteredcountdatemonthdayhourweekday
02011-01-01 00:00:0010019.8414.395810.0313162011-01-011105
12011-01-01 01:00:0010019.0213.635800.0832402011-01-011115

In [4]:

# Replace the numeric season code with its name.
seasonNames = {1: "Spring", 2: "Summer", 3: "Fall", 4: "Winter"}
dailyData["season"] = dailyData.season.map(seasonNames)

# Replace the numeric weather code with its descriptive label.
weatherDescriptions = {
    1: " Clear + Few clouds + Partly cloudy + Partly cloudy",
    2: " Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist ",
    3: " Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds",
    4: " Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog ",
}
dailyData["weather"] = dailyData.weather.map(weatherDescriptions)
dailyData.head(2)

Out[4]:

 datetimeseasonholidayworkingdayweathertempatemphumiditywindspeedcasualregisteredcountdatemonthdayhourweekday
02011-01-01 00:00:00Spring00Clear + Few clouds + Partly cloudy + Partly c...9.8414.395810.0313162011-01-011105
12011-01-01 01:00:00Spring00Clear + Few clouds + Partly cloudy + Partly c...9.0213.635800.0832402011-01-011115

In [5]:

# Columns that hold discrete labels rather than measurements.
categoryVariableList = [
    "hour",
    "weekday",
    "month",
    "season",
    "weather",
    "holiday",
    "workingday",
]
# Cast all of them to pandas' categorical dtype in a single call.
dailyData = dailyData.astype({name: "category" for name in categoryVariableList})
dailyData.head(2)

Out[5]:

 datetimeseasonholidayworkingdayweathertempatemphumiditywindspeedcasualregisteredcountdatemonthdayhourweekday
02011-01-01 00:00:00Spring00Clear + Few clouds + Partly cloudy + Partly c...9.8414.395810.0313162011-01-011105
12011-01-01 01:00:00Spring00Clear + Few clouds + Partly cloudy + Partly c...9.0213.635800.0832402011-01-011115

In [6]:

# The derived date/month/day/hour/weekday columns replace the raw timestamp.
dailyData = dailyData.drop(columns=['datetime'])
dailyData.head(2)

Out[6]:

 seasonholidayworkingdayweathertempatemphumiditywindspeedcasualregisteredcountdatemonthdayhourweekday
0Spring00Clear + Few clouds + Partly cloudy + Partly c...9.8414.395810.0313162011-01-011105
1Spring00Clear + Few clouds + Partly cloudy + Partly c...9.0213.635800.0832402011-01-011115

In [7]:

# Count how many columns share each dtype and turn the result into a two-column
# frame (variableType, count) for plotting.
dataTypeDf = dailyData.dtypes.value_counts().to_frame().reset_index().rename(columns={'index':'variableType', 0:'count'})
sn.set_style('darkgrid')
# Create a 10x4 inch figure with a single axes and draw the bar chart on it.
ax = plt.figure(figsize=(10,4)).add_subplot(111)
dataTypeDf.plot(kind='bar',x="variableType",y="count",ax=ax)
# NOTE(review): the xlabel text looks like a typo for 'Variable Type'.
ax.set(xlabel='variableTypeariable Type', ylabel='Count',title="Variables DataType Count")

Out[7]:

[Text(0, 0.5, 'Count'),
 Text(0.5, 0, 'variableTypeariable Type'),
 Text(0.5, 1.0, 'Variables DataType Count')]

In [8]:

# Draw missingno's matrix plot for dailyData on a 12x5 inch figure.
msno.matrix(dailyData,figsize=(12,5))

Out[8]:

In [8]:

# 2x2 grid of box plots of the rental count: overall, by season, by hour and
# by working-day flag.
fig, axes = plt.subplots(nrows=2,ncols=2)
fig.set_size_inches(12, 10)
sn.boxplot(data=dailyData,y="count",orient="v",ax=axes[0][0])
sn.boxplot(data=dailyData,y="count",x="season",orient="v",ax=axes[0][1])
sn.boxplot(data=dailyData,y="count",x="hour",orient="v",ax=axes[1][0])
sn.boxplot(data=dailyData,y="count",x="workingday",orient="v",ax=axes[1][1])

# Axis labels and titles, one call per panel in the same order as above.
axes[0][0].set(ylabel='Count',title="Box Plot On Count")
axes[0][1].set(xlabel='Season', ylabel='Count',title="Box Plot On Count Across Season")
axes[1][0].set(xlabel='Hour Of The Day', ylabel='Count',title="Box Plot On Count Across Hour Of The Day")
axes[1][1].set(xlabel='Working Day', ylabel='Count',title="Box Plot On Count Across Working Day")

Out[8]:

[Text(0, 0.5, 'Count'),
 Text(0.5, 0, 'Working Day'),
 Text(0.5, 1.0, 'Box Plot On Count Across Working Day')]

In [9]:

# Keep only rows whose count lies within three standard deviations of the mean.
countValues = dailyData["count"]
distanceFromMean = np.abs(countValues - countValues.mean())
dailyDataWithoutOutliers = dailyData[distanceFromMean <= 3 * countValues.std()]
len(dailyDataWithoutOutliers)

Out[9]:

10739

In [10]:

# Pairwise correlation between the numeric columns and the rental counts.
corrMatt = dailyData[["temp","atemp","casual","registered","humidity","windspeed","count"]].corr()
# Build the heatmap mask from a copy of the correlation values: the lower
# triangle (diagonal included) is overwritten with False (0.0); the upper
# triangle keeps its correlation values.
# NOTE(review): presumably seaborn hides cells whose mask entry is truthy, so
# only the lower triangle is drawn — confirm against the seaborn heatmap docs.
mask = np.array(corrMatt)
mask[np.tril_indices_from(mask)] = False
fig,ax= plt.subplots()
fig.set_size_inches(20,10)
sn.heatmap(corrMatt, mask=mask,vmax=.8, square=True,annot=True)

Out[10]:

<matplotlib.axes._subplots.AxesSubplot at 0xc355dd8>

In [11]:

# Three side-by-side regression plots of count against temperature, wind speed
# and humidity.
fig,(ax1,ax2,ax3) = plt.subplots(ncols=3)
fig.set_size_inches(12, 5)
sn.regplot(x="temp", y="count", data=dailyData,ax=ax1)
sn.regplot(x="windspeed", y="count", data=dailyData,ax=ax2)
sn.regplot(x="humidity", y="count", data=dailyData,ax=ax3)

Out[11]:

<matplotlib.axes._subplots.AxesSubplot at 0xa761cf8>

In [12]:

# 2x2 grid comparing the count distribution before and after a log transform.
fig,axes = plt.subplots(ncols=2,nrows=2)
fig.set_size_inches(12, 10)

# Top row: raw count, histogram (left) and normal Q-Q plot (right).
sn.distplot(dailyData["count"],ax=axes[0][0])
stats.probplot(dailyData["count"], dist='norm', fit=True, plot=axes[0][1])

# Bottom row: outlier-free count after log1p. Both panels use the same
# transform so the histogram and the Q-Q plot describe the same variable
# (the histogram previously used np.log while the Q-Q plot used np.log1p).
logCountWithoutOutliers = np.log1p(dailyDataWithoutOutliers["count"])
sn.distplot(logCountWithoutOutliers,ax=axes[1][0])
stats.probplot(logCountWithoutOutliers, dist='norm', fit=True, plot=axes[1][1])

Out[12]:

((array([-3.82819677, -3.60401975, -3.48099008, ...,  3.48099008,
          3.60401975,  3.82819677]),
  array([0.69314718, 0.69314718, 0.69314718, ..., 6.5971457 , 6.59850903,
         6.5998705 ])),
 (1.3486990121229765, 4.562423868087808, 0.9581176780909608))

In [13]:

# Four stacked panels: mean count by month, then mean hourly count split by
# season, by weekday, and by user type (casual vs. registered).
fig,(ax1,ax2,ax3,ax4)= plt.subplots(nrows=4)
fig.set_size_inches(12,20)
# Month and weekday display names; not referenced by the plots in this cell.
sortOrder = ["January","February","March","April","May","June","July","August","September","October","November","December"]
hueOrder = ["Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"]

# Panel 1: mean count per month.
monthAggregated = pd.DataFrame(dailyData.groupby("month")["count"].mean()).reset_index()
sn.barplot(data=monthAggregated,x="month",y="count",ax=ax1)
# Fixed the y-axis label typo ('Avearage' -> 'Average').
ax1.set(xlabel='Month', ylabel='Average Count',title="Average Count By Month")

# Panel 2: mean count per hour, one line per season.
hourAggregated = pd.DataFrame(dailyData.groupby(["hour","season"],sort=True)["count"].mean()).reset_index()
sn.pointplot(x=hourAggregated["hour"], y=hourAggregated["count"],hue=hourAggregated["season"], data=hourAggregated, join=True,ax=ax2)
ax2.set(xlabel='Hour Of The Day', ylabel='Users Count',title="Average Users Count By Hour Of The Day Across Season",label='big')

# Panel 3: mean count per hour, one line per weekday.
hourAggregated = pd.DataFrame(dailyData.groupby(["hour","weekday"],sort=True)["count"].mean()).reset_index()
sn.pointplot(x=hourAggregated["hour"], y=hourAggregated["count"],hue=hourAggregated["weekday"], data=hourAggregated, join=True,ax=ax3)
ax3.set(xlabel='Hour Of The Day', ylabel='Users Count',title="Average Users Count By Hour Of The Day Across Weekdays",label='big')

# Panel 4: reshape casual/registered into long form (hour, variable, value),
# then plot the mean value per hour with one line per user type.
hourTransformed = pd.melt(dailyData[["hour","casual","registered"]], id_vars=['hour'], value_vars=['casual', 'registered'])
hourAggregated = pd.DataFrame(hourTransformed.groupby(["hour","variable"],sort=True)["value"].mean()).reset_index()
sn.pointplot(x=hourAggregated["hour"], y=hourAggregated["value"],hue=hourAggregated["variable"],hue_order=["casual","registered"], data=hourAggregated, join=True,ax=ax4)
ax4.set(xlabel='Hour Of The Day', ylabel='Users Count',title="Average Users Count By Hour Of The Day Across User Type",label='big')

Out[13]:

[Text(0, 0.5, 'Users Count'),
 Text(0.5, 0, 'Hour Of The Day'),
 Text(0.5, 1.0, 'Average Users Count By Hour Of The Day Across User Type'),
 None]

In [14]:

# Reload the raw training file and load the test file from hard-coded,
# machine-specific Windows paths.
dataTrain = pd.read_csv("C:/Users/_lc/BikeSharingDemand/train.csv")
dataTest = pd.read_csv("C:/Users/_lc/BikeSharingDemand/test.csv")
# Preview the first two test rows.
dataTest.head(2)

Out[14]:

 datetimeseasonholidayworkingdayweathertempatemphumiditywindspeed
02011-01-20 00:00:00101110.6611.3655626.0027
12011-01-20 01:00:00101110.6613.635560.0000

In [35]:

# Stack train and test rows into one frame so feature engineering is applied
# to both. DataFrame.append was removed in pandas 2.0; pd.concat is the
# supported replacement.
#   ignore_index=True - renumber rows 0..n-1 (replaces reset_index + drop).
#   sort=True         - order the columns alphabetically, since the two frames
#                       do not share the same column set.
# Columns that exist only in the training frame are NaN for the test rows.
data = pd.concat([dataTrain, dataTest], ignore_index=True, sort=True)
data.head()

Out[35]:

 atempcasualcountdatetimeholidayhumidityregisteredseasontempweatherwindspeedworkingday
014.3953.016.02011-01-01 00:00:0008113.019.8410.00
113.6358.040.02011-01-01 01:00:0008032.019.0210.00
213.6355.032.02011-01-01 02:00:0008027.019.0210.00
314.3953.013.02011-01-01 03:00:0007510.019.8410.00
414.3950.01.02011-01-01 04:00:000751.019.8410.00

In [36]:

# Parse the timestamp strings, then derive one column per calendar field.
data["datetime"] = pd.to_datetime(data["datetime"])
for part in ("year", "month", "day", "hour"):
    data[part] = data["datetime"].map(
        lambda stamp, part=part: getattr(stamp, part)
    )
# weekday() is a method rather than an attribute, so it is handled separately.
data["weekday"] = data["datetime"].map(lambda stamp: stamp.weekday())
data.head()

Out[36]:

 atempcasualcountdatetimeholidayhumidityregisteredseasontempweatherwindspeedworkingdayyearmonthdayhourweekday
014.3953.016.02011-01-01 00:00:0008113.019.8410.0020111105
113.6358.040.02011-01-01 01:00:0008032.019.0210.0020111115
213.6355.032.02011-01-01 02:00:0008027.019.0210.0020111125
314.3953.013.02011-01-01 03:00:0007510.019.8410.0020111135
414.3950.01.02011-01-01 04:00:000751.019.8410.0020111145

In [37]:

# --------- Predict wind speed for rows where it is recorded as zero ---------
from sklearn.ensemble import RandomForestRegressor

# Split on whether a non-zero wind speed was recorded. The zero-wind subset is
# copied explicitly because its 'windspeed' column is overwritten below.
dataWind0 = data[data["windspeed"]==0].copy()
dataWindNot0 = data[data["windspeed"]!=0]

# Fit a random forest on the rows with a non-zero reading ...
rfModel_wind = RandomForestRegressor()
windColumns = ["season","weather","humidity","month","temp","year","atemp"]
rfModel_wind.fit(dataWindNot0[windColumns], dataWindNot0["windspeed"])

# ... and use it to fill in the zero readings.
wind0Values = rfModel_wind.predict(X= dataWind0[windColumns])
dataWind0["windspeed"] = wind0Values

# Recombine: non-zero rows first, imputed rows after, with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data = pd.concat([dataWindNot0, dataWind0], ignore_index=True)
data.head()
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\ensemble\forest.py:246: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)

Out[37]:

 atempcasualcountdatetimeholidayhumidityregisteredseasontempweatherwindspeedworkingdayyearmonthdayhourweekday
012.8800.01.02011-01-01 05:00:000751.019.8426.0032020111155
119.69512.036.02011-01-01 10:00:0007624.0115.58116.99790201111105
216.66526.056.02011-01-01 11:00:0008130.0114.76119.00120201111115
321.21029.084.02011-01-01 12:00:0007755.0117.22119.00120201111125
422.72547.094.02011-01-01 13:00:0007247.0118.86219.99950201111135

In [39]:

# Feature groups used by the modelling cells.
categoricalFeatureNames = [
    "season",
    "holiday",
    "workingday",
    "weather",
    "weekday",
    "month",
    "year",
    "hour",
]
numericalFeatureNames = ["temp", "humidity", "windspeed", "atemp"]
# Columns removed from the model input further down.
dropFeatures = ['casual', "count", "datetime", "day", "registered"]

# Cast every categorical feature to pandas' categorical dtype in one call.
data = data.astype({name: "category" for name in categoricalFeatureNames})

In [40]:

# Rows with a non-null 'count' are training rows; the rest are test rows.
hasLabel = pd.notnull(data['count'])
dataTrain = data[hasLabel].sort_values(by=["datetime"])
dataTest = data[~hasLabel].sort_values(by=["datetime"])

# Keep the test timestamps and the three training targets before they are
# dropped from the feature frames.
datetimecol = dataTest["datetime"]
yLabels = dataTrain["count"]
yLablesRegistered = dataTrain["registered"]
yLablesCasual = dataTrain["casual"]

# Remove the targets and the unused columns from both feature frames.
dataTrain = dataTrain.drop(columns=dropFeatures)
dataTest = dataTest.drop(columns=dropFeatures)

In [47]:

# Root mean squared logarithmic error (RMSLE)
def rmsle(y, y_,convertExp=True):
    """Return the root mean squared logarithmic error between two value sets.

    Args:
        y: Actual values (array-like).
        y_: Predicted values (array-like) with the same shape as ``y``.
        convertExp: When true, both inputs are passed through ``np.exp``
            before the error is computed. Note that ``np.exp`` inverts
            ``np.log``, not ``np.log1p``.

    Returns:
        ``sqrt(mean((log(y + 1) - log(y_ + 1)) ** 2))``. Logarithms that
        evaluate to NaN are replaced by 0 via ``np.nan_to_num``.
    """
    y = np.asarray(y, dtype=float)
    y_ = np.asarray(y_, dtype=float)
    if convertExp:
        # The original had a stray trailing comma here that wrapped ``y`` in a
        # 1-tuple; the result was only correct through accidental broadcasting.
        y = np.exp(y)
        y_ = np.exp(y_)
    log1 = np.nan_to_num(np.log1p(y))
    log2 = np.nan_to_num(np.log1p(y_))
    calc = (log1 - log2) ** 2
    return np.sqrt(np.mean(calc))

In [48]:

from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
import warnings
pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Fit a linear regression on the log1p-transformed rental count.
lModel = LinearRegression()

yLabelsLog = np.log1p(yLabels)
lModel.fit(X = dataTrain,y = yLabelsLog)

# Predict on the same rows the model was fitted on, so the score below is a
# training error rather than a hold-out error.
preds = lModel.predict(X= dataTrain)
# NOTE(review): the labels were transformed with log1p but are mapped back with
# np.exp, so both arguments are offset by +1 relative to the raw counts —
# confirm whether np.expm1 was intended.
print ("RMSLE Value For Linear Regression: ",rmsle(np.exp(yLabelsLog),np.exp(preds),False))
RMSLE Value For Linear Regression:  0.9779688131592883

In [53]:

# Grid-search the Ridge regularisation strength (alpha) with 5-fold CV.
ridge_m_ = Ridge()
ridge_params_ = { 'max_iter':[3000],'alpha':[0.1, 1, 2, 3, 4, 10, 30,100,200,300,400,800,900,1000]}
# rmsle is an error, so greater_is_better=False is passed; the stored test
# scores are negated again on the "rmsle" line below before plotting.
rmsle_scorer = metrics.make_scorer(rmsle, greater_is_better=False)
grid_ridge_m = GridSearchCV( ridge_m_,
                          ridge_params_,
                          scoring = rmsle_scorer,
                          cv=5)
yLabelsLog = np.log1p(yLabels)
grid_ridge_m.fit( dataTrain, yLabelsLog )
# Predict on the training rows themselves (training error, no hold-out set).
preds = grid_ridge_m.predict(X= dataTrain)
print (grid_ridge_m.best_params_)
# NOTE(review): np.exp is applied to log1p-transformed values here — confirm
# whether np.expm1 was intended.
print ("RMSLE Value For Ridge Regression: ",rmsle(np.exp(yLabelsLog),np.exp(preds),False))

# Plot the mean cross-validated RMSLE for each alpha that was tried.
fig,ax= plt.subplots()
fig.set_size_inches(12,5)
df = pd.DataFrame(grid_ridge_m.cv_results_)
df["alpha"] = df["params"].apply(lambda x:x["alpha"])
df["rmsle"] = df["mean_test_score"].apply(lambda x:-x)
sn.pointplot(data=df,x="alpha",y="rmsle",ax=ax)
{'alpha': 0.1, 'max_iter': 3000}
RMSLE Value For Ridge Regression:  0.9779687981095982

Out[53]:

<matplotlib.axes._subplots.AxesSubplot at 0x12c49cc0>

In [55]:

# Grid-search the Lasso regularisation strength (alpha) with 5-fold CV.
lasso_m_ = Lasso()

# Candidate alphas are the reciprocals of the listed values (10 down to 0.001).
alpha  = 1/np.array([0.1, 1, 2, 3, 4, 10, 30,100,200,300,400,800,900,1000])
lasso_params_ = { 'max_iter':[3000],'alpha':alpha}

# rmsle_scorer is not defined in this cell; it must already exist in the session.
grid_lasso_m = GridSearchCV( lasso_m_,lasso_params_,scoring = rmsle_scorer,cv=5)
yLabelsLog = np.log1p(yLabels)
grid_lasso_m.fit( dataTrain, yLabelsLog )
# Predict on the training rows themselves (training error, no hold-out set).
preds = grid_lasso_m.predict(X= dataTrain)
print (grid_lasso_m.best_params_)
# NOTE(review): np.exp is applied to log1p-transformed values here — confirm
# whether np.expm1 was intended.
print ("RMSLE Value For Lasso Regression: ",rmsle(np.exp(yLabelsLog),np.exp(preds),False))

# Plot the mean cross-validated score, negated with -x, for each alpha tried.
fig,ax= plt.subplots()
fig.set_size_inches(12,5)
df = pd.DataFrame(grid_lasso_m.cv_results_)
df["alpha"] = df["params"].apply(lambda x:x["alpha"])
df["rmsle"] = df["mean_test_score"].apply(lambda x:-x)
sn.pointplot(data=df,x="alpha",y="rmsle",ax=ax)
{'alpha': 0.005, 'max_iter': 3000}
RMSLE Value For Lasso Regression:  0.9781068303163186

Out[55]:

<matplotlib.axes._subplots.AxesSubplot at 0x12dbd080>

In [56]:

# Fit a gradient-boosting regressor on the log1p-transformed rental count and
# compare the training and predicted distributions.
from sklearn.ensemble import GradientBoostingRegressor
# NOTE(review): per the scikit-learn docs, `alpha` only affects the huber and
# quantile losses — confirm whether learning_rate was intended.
gbm = GradientBoostingRegressor(n_estimators=4000,alpha=0.01); ### Test 0.41
yLabelsLog = np.log1p(yLabels)
gbm.fit(dataTrain,yLabelsLog)
# Predict on the training rows themselves (training error, no hold-out set).
preds = gbm.predict(X= dataTrain)
# NOTE(review): np.exp is applied to log1p-transformed values here and on the
# test predictions below — confirm whether np.expm1 was intended.
print ("RMSLE Value For Gradient Boost: ",rmsle(np.exp(yLabelsLog),np.exp(preds),False))

# Left: distribution of the training counts. Right: distribution of the
# back-transformed predictions for the test rows.
predsTest = gbm.predict(X= dataTest)
fig,(ax1,ax2)= plt.subplots(ncols=2)
fig.set_size_inches(12,5)
sn.distplot(yLabels,ax=ax1,bins=50)
sn.distplot(np.exp(predsTest),ax=ax2,bins=50)
RMSLE Value For Gradient Boost:  0.1896091619650034

Out[56]:

<matplotlib.axes._subplots.AxesSubplot at 0x14104278>

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值