回归分析:二手车价格预测

项目说明:数据来源于阿里天池的一个数据挖掘比赛——预测二手车交易价格。

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LassoCV,RidgeCV
# from sklearn.ensemble import GradientBoostingRegressor  # 集成方法回归库
# from sklearn.model_selection import GridSearchCV

# Load the space-separated training set.
filename = r'C:\Users\liuhao\Desktop\新建文件夹\二手车\used_car_train_20200313.csv'
train = pd.read_csv(filename, sep=' ')

# Drop price outliers. NOTE: despite its name, `iqr` holds the upper fence
# Q3 + 1.5 * (Q3 - Q1), not the inter-quartile range itself.
q1 = train['price'].quantile(0.25)
q3 = train['price'].quantile(0.75)
iqr = q3 + (q3 - q1) * 1.5
train.drop(train.index[train['price'] > iqr], inplace=True)

# Model log(1 + price); predictions are mapped back with expm1 at evaluation.
train['price'] = np.log1p(train['price'])

# Mean (log) price of each brand, attached to every row as feature `b_p`.
# The lookup is label-based: `bra_p` is indexed by brand code, so positional
# `.iloc[x]` is only correct when the codes happen to be exactly 0..n-1.
bra_p = train['price'].groupby(train['brand']).mean()
train['b_p'] = train['brand'].map(bra_p)

# Vehicle age in (30-day) months between registration and listing creation.
# Unparseable dates become NaT, giving NaN ages that are filled with the mean.
created = pd.to_datetime(train['creatDate'], format='%Y%m%d', errors='coerce')
registered = pd.to_datetime(train['regDate'], format='%Y%m%d', errors='coerce')
train['used_months'] = (created - registered).dt.days / 30
# Assign the result back instead of fillna(..., inplace=True) on a column
# selection, which is not guaranteed to update `train` under Copy-on-Write.
train['used_months'] = train['used_months'].fillna(train['used_months'].mean())

def fill_missing(df, reference=None):
    """Fill missing categorical codes with each column's most frequent value.

    The columns ``fuelType``, ``gearbox``, ``bodyType``, ``model`` and
    ``brand`` are imputed. Fill values are taken from ``reference`` so that a
    different frame can be filled with the statistics of the training data.

    Args:
        df: DataFrame to impute; it is modified in place.
        reference: DataFrame from which the most frequent values are
            computed. Defaults to the module-level ``train`` frame, which is
            what this function used before the parameter existed.

    Returns:
        The same ``df`` object that was passed in.
    """
    if reference is None:
        reference = train
    for column in ('fuelType', 'gearbox', 'bodyType', 'model', 'brand'):
        counts = reference[column].value_counts()
        if counts.empty:
            # Nothing observed to impute with; leave the column untouched
            # instead of failing on counts.index[0].
            continue
        df[column] = df[column].fillna(counts.index[0])
    return df

# Impute the categorical columns. fill_missing returns the frame it was given,
# so `ndata` and `train` refer to the same DataFrame from here on.
ndata = fill_missing(train)

# '-' is treated as a missing marker in notRepairedDamage and is replaced by
# the column's most frequent value. The result is assigned back rather than
# using replace(..., inplace=True) on a column selection, which is not
# guaranteed to update `ndata` under pandas Copy-on-Write.
most_common_damage = ndata['notRepairedDamage'].value_counts().index[0]
ndata['notRepairedDamage'] = ndata['notRepairedDamage'].replace('-', most_common_damage)

# Cap power at 600 (vectorised equivalent of the element-wise lambda).
ndata['power'] = ndata['power'].clip(upper=600)

# Drop the columns that are not used as model inputs.
all_features = ndata.drop(
    columns=['SaleID', 'name', 'regDate', 'model', 'seller',
             'offerType', 'creatDate', 'regionCode'],
)


def data_astype(df):
    """Cast the categorical code columns of ``df`` to ``str``.

    Converts ``brand``, ``bodyType``, ``fuelType``, ``gearbox`` and
    ``notRepairedDamage`` in place.

    Args:
        df: DataFrame containing the five columns listed above.

    Returns:
        The same ``df`` object, with those columns converted.
    """
    categorical = ('brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage')
    for name in categorical:
        df[name] = df[name].astype(str)
    return df

# Numeric features that are standardised to zero mean and unit variance.
columns = ['b_p', 'used_months', 'power', 'kilometer'] + ['v_{}'.format(n) for n in range(15)]
for feature in columns:
    values = all_features[feature]
    all_features[feature] = (values - values.mean()) / values.std()

# Turn the categorical codes into strings so get_dummies expands them into
# indicator columns, then renumber the rows from 0.
all_features = data_astype(all_features)
all_features = pd.get_dummies(all_features).reset_index(drop=True)

# Keep only rows with a known target and separate inputs from the target.
labelled = all_features[all_features['price'].notnull()]
X = labelled.drop(['price'], axis=1)
y = labelled['price']
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=15, shuffle=True)

# Ridge regression; RidgeCV selects the regularisation strength with 5-fold CV.
model = RidgeCV(cv=5)

# Alternative that was tried: grid search over a gradient boosting regressor.
# model_gbr = GradientBoostingRegressor()  # build the GBR estimator
# parameters = {'loss': ['ls', 'lad', 'huber', 'quantile'],
#               'learning_rate': [0.05, 0.1, 0.15],
#               'max_depth': [2, 3, 4],
#               'min_samples_split': [2, 3],
#               'min_samples_leaf': [1, 2, 4]}  # hyper-parameters to search
# model = GridSearchCV(estimator=model_gbr, param_grid=parameters, cv=5, n_jobs=-1)

model.fit(X_train, y_train)


def _price_mae(features, log_price):
    """Return the MAE of ``model`` on ``features``, measured in price units.

    Both the targets and the predictions are on the log1p scale, so they are
    mapped back with expm1 before the error is computed.
    """
    predicted_price = np.expm1(model.predict(features))
    return mean_absolute_error(np.expm1(log_price), predicted_price)


mae_train = _price_mae(X_train, y_train)
mae_valid = _price_mae(X_valid, y_valid)

print('训练集MAE: {}'.format(mae_train))
print('验证集MAE: {}'.format(mae_valid))


# Print every feature whose fitted coefficient is non-zero.
c = dict(zip(X.columns.values, model.coef_))
for name, weight in c.items():
    if weight == 0:
        continue
    print(name, weight)
训练集MAE: 716.1868040375566
验证集MAE: 718.820363265542
power 0.052122587581387714
kilometer -0.03193013183220526
v_0 0.38228811355042663
v_1 4.558832354583367
v_2 0.08293099366726327
v_3 -0.5048855281691552
v_4 0.11524750030241149
v_5 -2.1300547170679587
v_6 -9.290451991109782
v_7 -1.7258869116610935
v_8 -1.2923321301431097
v_9 -0.18154290052862374
v_10 -4.497229173748067
v_11 1.0198586068731832
v_12 1.227659570720378
v_13 -0.2645348897326188
v_14 0.11129048287003387
b_p 0.006604455657593905
used_months 0.015219686514453932
brand_0 0.05292687430882881
brand_1 0.04575110664354303
brand_10 0.0722468605832188
brand_11 0.010055176597968252
brand_12 0.05199725275729203
brand_13 -0.03972812033405547
brand_14 0.010467904061676693
brand_15 -0.009915577881234297
brand_16 -0.03703759209615248
brand_17 -0.0053859944729151895
brand_18 0.10144654689267484
brand_19 0.06007720315684757
brand_2 0.05157665152837241
brand_20 -0.009430799126492022
brand_21 -0.13956525030839328
brand_22 -0.09669271749775515
brand_23 0.02855133658616288
brand_24 -0.11768907151105801
brand_25 -0.06071317794087798
brand_26 0.01632228846152352
brand_27 0.047039694412737275
brand_28 -0.07697190664035312
brand_29 -0.17151723203242997
brand_3 -0.028244006173499956
brand_30 -0.0335614380714301
brand_31 -0.044441364281496774
brand_32 0.031515977103858006
brand_33 0.14316135714985093
brand_34 -0.03942328303452528
brand_35 0.04233062291162051
brand_36 0.04962154881806041
brand_37 0.18677443115899794
brand_38 -0.15175325537240178
brand_39 -0.015775154215465794
brand_4 0.06724640178952512
brand_5 -0.03941058254698612
brand_6 -0.023032960236029205
brand_7 0.018980519868415147
brand_8 0.040473314656164806
brand_9 0.011726417193814356
bodyType_0.0 -0.017355460429348785
bodyType_1.0 -0.004216524436321329
bodyType_2.0 -0.041867591705686644
bodyType_3.0 0.005792701118562396
bodyType_4.0 0.06369450765574143
bodyType_5.0 -0.023472833990457995
bodyType_6.0 -0.00044665077139483764
bodyType_7.0 0.017871844217337573
fuelType_0.0 0.03421980511350734
fuelType_1.0 0.03293748982363767
fuelType_2.0 0.07679801069422088
fuelType_3.0 -0.008773899133628988
fuelType_4.0 -0.035047366805926156
fuelType_5.0 0.001823478738499934
fuelType_6.0 -0.10195751367725783
gearbox_0.0 -0.011315881809687038
gearbox_1.0 0.011315879154786226
notRepairedDamage_0.0 0.11539945680861217
notRepairedDamage_1.0 -0.11539945679215818

最初的 MAE 在 1000 以上,岭回归最终最好的结果是 MAE 700 多。
没有上传预测结果,因为排行榜榜首的 MAE 只有 300 多,入榜也要 400 多。
由于数据量太大,GridSearchCV 在我的电脑上跑了很久都没有出结果,
所以改为手动设定 GBR(GradientBoostingRegressor)的参数,最终结果比岭回归好一些:

# Gradient boosting with hand-picked hyper-parameters. The loss is left at its
# default (least squares): the old spelling "ls" is rejected by current
# scikit-learn releases, which name the same loss "squared_error".
from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor(learning_rate=0.15, max_depth=4)

训练集MAE: 561.901109523035
验证集MAE: 579.8975947651696
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值