1 Lasso与多重共线性
1.1 Lasso 强大的特征选择能力
import numpy as np
import pandas as pd
from sklearn. linear_model import Ridge, LinearRegression, Lasso
from sklearn. model_selection import train_test_split as TTS
from sklearn. datasets import fetch_california_housing as fch
import matplotlib. pyplot as plt
# Load the California housing dataset via fetch_california_housing (aliased fch).
housevalue = fch()

# Regression target and feature matrix, taken straight from the fetched bunch.
y = housevalue.target
X = pd.DataFrame(housevalue.data)

# Give the eight feature columns readable (Chinese) names.
X.columns = [
    "住户收入中位数",
    "房屋使用年代中位数",
    "平均房间数目",
    "平均卧室数目",
    "街区人口",
    "平均入住率",
    "街区的纬度",
    "街区的经度",
]

# Preview the first rows of the feature table.
X.head()
住户收入中位数
房屋使用年代中位数
平均房间数目
平均卧室数目
街区人口
平均入住率
街区的纬度
街区的经度
0
8.3252
41.0
6.984127
1.023810
322.0
2.555556
37.88
-122.23
1
8.3014
21.0
6.238137
0.971880
2401.0
2.109842
37.86
-122.22
2
7.2574
52.0
8.288136
1.073446
496.0
2.802260
37.85
-122.24
3
5.6431
52.0
5.817352
1.073059
558.0
2.547945
37.85
-122.25
4
3.8462
52.0
6.281853
1.081081
565.0
2.181467
37.85
-122.25
# Hold out 30% of the samples as a test set; the fixed random_state keeps the
# split reproducible between runs.
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

# Relabel the rows of both feature frames as 0..n-1 so that label-based and
# positional indexing agree after the split.
for i in [Xtrain, Xtest]:
    i.index = range(i.shape[0])
# Baseline: ordinary least squares fitted on the training split.
reg = LinearRegression().fit(Xtrain, Ytrain)

# Fitted coefficients multiplied by 100, shown as a plain Python list.
(reg.coef_ * 100).tolist()
[43.735893059684074,
1.0211268294494147,
-10.780721617317637,
62.64338275363747,
5.21612535296645e-05,
-0.33485096463334924,
-41.30959378947715,
-42.62109536208473]
# Ridge with alpha=0, i.e. no L2 penalty; the coefficients printed for this
# cell agree with the LinearRegression ones to many decimal places.
Ridge_ = Ridge(alpha=0).fit(Xtrain, Ytrain)

# Fitted coefficients multiplied by 100, shown as a plain Python list.
(Ridge_.coef_ * 100).tolist()
[43.73589305968356,
1.0211268294493694,
-10.780721617316962,
62.6433827536353,
5.2161253532548055e-05,
-0.3348509646333529,
-41.30959378947995,
-42.62109536208777]
# Lasso with alpha=0, i.e. no L1 penalty. This is kept on purpose as a
# demonstration: scikit-learn emits UserWarning/ConvergenceWarning for this
# setting (see the captured output of this cell).
lasso_ = Lasso(alpha=0).fit(Xtrain, Ytrain)

# Fitted coefficients multiplied by 100, shown as a plain Python list.
(lasso_.coef_ * 100).tolist()
C:\Users\chen'bu'rong\AppData\Local\Temp\ipykernel_17824\3627946873.py:2: UserWarning: With alpha=0, this algorithm does not converge well. You are advised to use the LinearRegression estimator
lasso_ = Lasso(alpha=0).fit(Xtrain,Ytrain)
D:\py1.1\lib\site-packages\sklearn\linear_model\_coordinate_descent.py:648: UserWarning: Coordinate descent with no regularization may lead to unexpected results and is discouraged.
model = cd_fast.enet_coordinate_descent(
D:\py1.1\lib\site-packages\sklearn\linear_model\_coordinate_descent.py:648: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 3.770e+03, tolerance: 1.917e+00 Linear regression models with null weight for the l1 regularization term are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
model = cd_fast.enet_coordinate_descent(
[43.73589305968407,
1.0211268294494082,
-10.780721617317662,
62.6433827536377,
5.21612535327013e-05,
-0.33485096463335806,
-41.30959378947693,
-42.621095362084496]
# Ridge with a small penalty (alpha=0.1); the printed coefficients barely move
# compared with the unpenalised fit.
Ridge_ = Ridge(alpha=0.1).fit(Xtrain, Ytrain)

# Fitted coefficients multiplied by 100, shown as a plain Python list.
(Ridge_.coef_ * 100).tolist()
[43.734534807869196,
1.0211508518425259,
-10.778109335481327,
62.62978997580269,
5.2255520319229976e-05,
-0.3348478363544321,
-41.3093700653905,
-42.62068050768773]
# Lasso with the same small penalty (alpha=0.1); in the printed result two of
# the eight coefficients are already exactly zero.
lasso_ = Lasso(alpha=0.1).fit(Xtrain, Ytrain)

# Fitted coefficients multiplied by 100, shown as a plain Python list.
(lasso_.coef_ * 100).tolist()
[39.08851438329682,
1.6054695654279867,
-0.0,
0.0,
0.0023777014839091335,
-0.3050186895638112,
-10.771509301655538,
-9.294344477958074]
[ 43.73589305968403 ,
1.0211268294494038 ,
- 10.780721617317715 ,
62.64338275363783 ,
5.216125353178735e-05 ,
- 0.33485096463336095 ,
- 41.30959378947711 ,
- 42.621095362084674 ]
[43.73589305968403,
1.0211268294494038,
-10.780721617317715,
62.64338275363783,
5.216125353178735e-05,
-0.33485096463336095,
-41.30959378947711,
-42.621095362084674]
# Ridge with an extreme penalty (alpha=10**10); the printed coefficients are
# tiny but none of them is exactly zero.
Ridge_ = Ridge(alpha=10**10).fit(Xtrain, Ytrain)

# Fitted coefficients multiplied by 100, shown as a plain Python list.
(Ridge_.coef_ * 100).tolist()
[0.00021838533330206371,
0.00021344956264503437,
6.213673042878628e-05,
-3.828084920732722e-06,
-0.001498408728695283,
-4.175243714653839e-05,
-5.295061194474971e-05,
-1.3268982521957738e-05]
# Ridge with a large penalty (alpha=10**4); coefficients shrink but stay non-zero.
Ridge_ = Ridge(alpha=10**4).fit(Xtrain, Ytrain)

# Fitted coefficients multiplied by 100, shown as a plain Python list.
(Ridge_.coef_ * 100).tolist()
[34.62081517607707,
1.5196170869238759,
0.3968610529209999,
0.915181251035547,
0.0021739238012248533,
-0.34768660148101127,
-14.73696347421548,
-13.435576102527182]
# Lasso with the same large penalty (alpha=10**4); every printed coefficient
# is zero.
lasso_ = Lasso(alpha=10**4).fit(Xtrain, Ytrain)

# Fitted coefficients multiplied by 100, shown as a plain Python list.
(lasso_.coef_ * 100).tolist()
[0.0, 0.0, 0.0, -0.0, -0.0, -0.0, -0.0, -0.0]
# Lasso with alpha=1; in the printed result only three coefficients remain
# non-zero.
lasso_ = Lasso(alpha=1).fit(Xtrain, Ytrain)

# Fitted coefficients multiplied by 100, shown as a plain Python list.
(lasso_.coef_ * 100).tolist()
[14.581141247629409,
0.6209347344423877,
0.0,
-0.0,
-0.0002806598632900984,
-0.0,
-0.0,
-0.0]
# Compare the eight coefficients (x100) of the three fitted models on one
# chart. The x axis is the coefficient position 1..8. Ridge_ and lasso_ are
# whichever estimators were fitted last under those names.
plt.plot(range(1, 9), (reg.coef_ * 100).tolist(), color="red", label="LR")
plt.plot(range(1, 9), (Ridge_.coef_ * 100).tolist(), color="orange", label="Ridge")
plt.plot(range(1, 9), (lasso_.coef_ * 100).tolist(), color="k", label="Lasso")

# Dashed grey reference line at zero, left out of the legend (no label).
plt.plot(range(1, 9), [0] * 8, color="grey", linestyle="--")

plt.xlabel("w")
plt.legend()
plt.show()
1.2 选取最佳正则化参数
from sklearn. linear_model import LassoCV
# Candidate regularisation strengths: 200 values spaced evenly on a log10
# scale from 1e-10 up to 1e-2 (both endpoints included).
alpharange = np.logspace(start=-10, stop=-2, num=200, base=10)

# Display the grid.
alpharange
array([1.00000000e-10, 1.09698580e-10, 1.20337784e-10, 1.32008840e-10,
1.44811823e-10, 1.58856513e-10, 1.74263339e-10, 1.91164408e-10,
2.09704640e-10, 2.30043012e-10, 2.52353917e-10, 2.76828663e-10,
3.03677112e-10, 3.33129479e-10, 3.65438307e-10, 4.00880633e-10,
4.39760361e-10, 4.82410870e-10, 5.29197874e-10, 5.80522552e-10,
6.36824994e-10, 6.98587975e-10, 7.66341087e-10, 8.40665289e-10,
9.22197882e-10, 1.01163798e-09, 1.10975250e-09, 1.21738273e-09,
1.33545156e-09, 1.46497140e-09, 1.60705282e-09, 1.76291412e-09,
1.93389175e-09, 2.12145178e-09, 2.32720248e-09, 2.55290807e-09,
2.80050389e-09, 3.07211300e-09, 3.37006433e-09, 3.69691271e-09,
4.05546074e-09, 4.44878283e-09, 4.88025158e-09, 5.35356668e-09,
5.87278661e-09, 6.44236351e-09, 7.06718127e-09, 7.75259749e-09,
8.50448934e-09, 9.32930403e-09, 1.02341140e-08, 1.12266777e-08,
1.23155060e-08, 1.35099352e-08, 1.48202071e-08, 1.62575567e-08,
1.78343088e-08, 1.95639834e-08, 2.14614120e-08, 2.35428641e-08,
2.58261876e-08, 2.83309610e-08, 3.10786619e-08, 3.40928507e-08,
3.73993730e-08, 4.10265811e-08, 4.50055768e-08, 4.93704785e-08,
5.41587138e-08, 5.94113398e-08, 6.51733960e-08, 7.14942899e-08,
7.84282206e-08, 8.60346442e-08, 9.43787828e-08, 1.03532184e-07,
1.13573336e-07, 1.24588336e-07, 1.36671636e-07, 1.49926843e-07,
1.64467618e-07, 1.80418641e-07, 1.97916687e-07, 2.17111795e-07,
2.38168555e-07, 2.61267523e-07, 2.86606762e-07, 3.14403547e-07,
3.44896226e-07, 3.78346262e-07, 4.15040476e-07, 4.55293507e-07,
4.99450512e-07, 5.47890118e-07, 6.01027678e-07, 6.59318827e-07,
7.23263390e-07, 7.93409667e-07, 8.70359136e-07, 9.54771611e-07,
1.04737090e-06, 1.14895100e-06, 1.26038293e-06, 1.38262217e-06,
1.51671689e-06, 1.66381689e-06, 1.82518349e-06, 2.00220037e-06,
2.19638537e-06, 2.40940356e-06, 2.64308149e-06, 2.89942285e-06,
3.18062569e-06, 3.48910121e-06, 3.82749448e-06, 4.19870708e-06,
4.60592204e-06, 5.05263107e-06, 5.54266452e-06, 6.08022426e-06,
6.66991966e-06, 7.31680714e-06, 8.02643352e-06, 8.80488358e-06,
9.65883224e-06, 1.05956018e-05, 1.16232247e-05, 1.27505124e-05,
1.39871310e-05, 1.53436841e-05, 1.68318035e-05, 1.84642494e-05,
2.02550194e-05, 2.22194686e-05, 2.43744415e-05, 2.67384162e-05,
2.93316628e-05, 3.21764175e-05, 3.52970730e-05, 3.87203878e-05,
4.24757155e-05, 4.65952567e-05, 5.11143348e-05, 5.60716994e-05,
6.15098579e-05, 6.74754405e-05, 7.40196000e-05, 8.11984499e-05,
8.90735464e-05, 9.77124154e-05, 1.07189132e-04, 1.17584955e-04,
1.28989026e-04, 1.41499130e-04, 1.55222536e-04, 1.70276917e-04,
1.86791360e-04, 2.04907469e-04, 2.24780583e-04, 2.46581108e-04,
2.70495973e-04, 2.96730241e-04, 3.25508860e-04, 3.57078596e-04,
3.91710149e-04, 4.29700470e-04, 4.71375313e-04, 5.17092024e-04,
5.67242607e-04, 6.22257084e-04, 6.82607183e-04, 7.48810386e-04,
8.21434358e-04, 9.01101825e-04, 9.88495905e-04, 1.08436597e-03,
1.18953407e-03, 1.30490198e-03, 1.43145894e-03, 1.57029012e-03,
1.72258597e-03, 1.88965234e-03, 2.07292178e-03, 2.27396575e-03,
2.49450814e-03, 2.73644000e-03, 3.00183581e-03, 3.29297126e-03,
3.61234270e-03, 3.96268864e-03, 4.34701316e-03, 4.76861170e-03,
5.23109931e-03, 5.73844165e-03, 6.29498899e-03, 6.90551352e-03,
7.57525026e-03, 8.30994195e-03, 9.11588830e-03, 1.00000000e-02])
# Confirm the size of the alpha grid (a 1-D array of 200 candidates).
alpharange.shape
(200,)
# Preview the first rows of the training features.
Xtrain.head()
住户收入中位数
房屋使用年代中位数
平均房间数目
平均卧室数目
街区人口
平均入住率
街区的纬度
街区的经度
0
4.1776
35.0
4.425172
1.030683
5380.0
3.368817
37.48
-122.19
1
5.3261
38.0
6.267516
1.089172
429.0
2.732484
37.53
-122.30
2
1.9439
26.0
5.768977
1.141914
891.0
2.940594
36.02
-119.08
3
2.5000
22.0
4.916000
1.012000
733.0
2.932000
38.57
-121.31
4
3.8250
34.0
5.036765
1.098039
1134.0
2.779412
33.91
-118.35
# Cross-validated Lasso: try every alpha in alpharange with 5-fold CV on the
# training split and keep the fitted estimator.
lasso_ = LassoCV(alphas=alpharange, cv=5).fit(Xtrain, Ytrain)