1. Principle
In pandas, the corr() method computes the correlation between columns, measuring the direction and strength of the relationship between two variables. The coefficient ranges from -1 to +1: 0 means the two variables are uncorrelated, positive values indicate positive correlation, negative values indicate negative correlation, and the larger the absolute value, the stronger the correlation.
In linear regression with many features, we look at the relationship between each feature and the target value. Some features have little or no relationship with the target; these weakly related features can be identified and dropped before training.
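A minimal illustration of how corr() reads, using a small synthetic DataFrame (not the steam dataset):

# Toy example: corr() returns a symmetric matrix of pairwise correlation coefficients.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
x = rng.normal(size=200)
df = pd.DataFrame({
    'x': x,
    'pos': 2 * x + rng.normal(scale=0.1, size=200),   # strongly positively correlated with x
    'neg': -x + rng.normal(scale=0.1, size=200),      # strongly negatively correlated with x
    'noise': rng.normal(size=200),                    # roughly uncorrelated with x
})
print(df.corr().round(2))  # 'pos' ≈ +1, 'neg' ≈ -1, 'noise' ≈ 0 against 'x'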
2. Code Example
# Tianchi industrial steam volume prediction
# The dataset has 38 features; some of them are bound to be unimportant!
import numpy as np
import pandas as pd
import seaborn as sns
data = pd.read_csv('./zhengqi_train.txt',sep = '\t')
data.shape
(2888, 39)
# V0–V37: features
# target: the target value
# Use an algorithm to find the relationship between V0–V37 and target
# Not all 38 features are necessarily related to the target; drop those with very weak relationships
# The feature-selection methods below can be used for this
data.head()
X = data.iloc[:,:-1]
y = data['target']
from sklearn.linear_model import Ridge
from sklearn.ensemble import AdaBoostRegressor
from sklearn.feature_selection import SelectFromModel
model1 = SelectFromModel(estimator=Ridge(),threshold='median',max_features=25)
model1.fit(X,y)
X2 = model1.transform(X)
display(X2.shape,X2[:5])
(2888, 19)
array([[ 0.566, 0.016, -0.143, 0.407, 0.452, -0.901, -1.812, -2.36 ,
-0.436, -0.94 , -0.307, -0.073, -0.484, -1.162, 0.8 , 0.168,
0.136, -2.608, -3.508],
[ 0.968, 0.437, 0.066, 0.566, 0.194, -0.893, -1.566, -2.36 ,
0.332, 0.188, -0.455, -0.134, -0.488, -1.162, 0.801, 0.338,
-0.128, -0.335, -0.73 ],
[ 1.013, 0.568, 0.235, 0.37 , 0.112, -0.797, -1.367, -2.36 ,
0.396, 0.874, -0.051, -0.072, -0.493, -0.897, 0.961, 0.326,
-0.009, 0.765, -0.589],
[ 0.733, 0.368, 0.283, 0.165, 0.599, -0.679, -1.2 , -2.086,
0.403, 0.011, 0.102, -0.014, -0.371, -0.897, 1.435, 0.277,
0.015, 0.333, -0.112],
[ 0.684, 0.638, 0.26 , 0.209, 0.337, -0.454, -1.073, -2.086,
0.314, -0.251, 0.57 , 0.199, -0.342, -0.897, 0.881, 0.332,
0.183, -0.28 , -0.028]])
model1.get_support()
array([ True, True, True, True, True, True, True, True, True,
False, True, True, True, False, True, False, False, True,
False, False, False, False, False, False, True, False, False,
True, False, True, False, False, False, False, False, False,
True, True])
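To see which columns the Ridge-based selector kept, the boolean mask from get_support() can be mapped back onto X.columns; the selector's threshold_ and the coefficients of the fitted estimator_ show how threshold='median' was resolved (a quick check, not part of the original notebook output):

# Map the boolean mask back to feature names, and inspect how the
# 'median' threshold was computed from the absolute Ridge coefficients.
import numpy as np

selected_ridge = X.columns[model1.get_support()]
print(selected_ridge.tolist())            # names of the 19 kept features
print(model1.threshold_)                  # the median of |coef_|
print(np.abs(model1.estimator_.coef_))    # importances compared against that threshold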
model2 = SelectFromModel(estimator=AdaBoostRegressor(),threshold='median')
model2.fit(X,y)
X2 = model2.transform(X)
display(X2.shape,X2[:5])
(2888, 19)
array([[ 0.566, 0.016, -0.143, 0.407, -0.436, -0.94 , -0.073, -1.707,
-1.162, -0.573, -0.991, 0.61 , -0.223, 0.796, 0.168, -0.45 ,
0.136, -0.615, -3.508],
[ 0.968, 0.437, 0.066, 0.566, 0.332, 0.188, -0.134, -0.977,
-1.162, -0.571, -0.836, 0.588, -0.144, 1.057, 0.338, 0.671,
-0.128, 0.032, -0.73 ],
[ 1.013, 0.568, 0.235, 0.37 , 0.396, 0.874, -0.072, -0.618,
-0.897, -0.564, -0.558, 0.576, -0.067, 0.915, 0.326, 1.287,
-0.009, 0.277, -0.589],
[ 0.733, 0.368, 0.283, 0.165, 0.403, 0.011, -0.014, -0.429,
-0.897, -0.574, -0.564, 0.272, 0.113, 0.898, 0.277, 1.298,
0.015, 0.279, -0.112],
[ 0.684, 0.638, 0.26 , 0.209, 0.314, -0.251, 0.199, -0.391,
-0.897, -0.572, -0.394, 0.106, 0.221, 0.386, 0.332, 1.289,
0.183, 0.328, -0.028]])
model2.get_support()
array([ True, True, True, True, False, False, False, False, True,
False, True, False, True, False, False, False, True, True,
True, True, True, False, False, False, False, True, True,
True, True, True, False, True, False, False, False, False,
False, True])
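The Ridge-based and AdaBoost-based selectors keep different feature subsets; a small comparison of the two masks (an added illustration, not from the original notebook):

# Compare the feature subsets chosen by the two selectors.
ridge_feats = set(X.columns[model1.get_support()])
ada_feats = set(X.columns[model2.get_support()])
print(sorted(ridge_feats & ada_feats))   # features both methods keep
print(sorted(ridge_feats ^ ada_feats))   # features the two methods disagree on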
Correlation coefficients
corr = data.corr()
# Feature V32 has correlation 0.066606 with target
# Feature V34 has correlation -0.006034 with target
# Features like these can be dropped!
import matplotlib.pyplot as plt
plt.figure(figsize = (16,16))
sns.heatmap(corr,annot = True)
[Figure output_10_1.png: heatmap of the correlation matrix for all features and target]
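Finally, a sketch of dropping features by their correlation with target, consistent with the V32/V34 comments above (the 0.1 cutoff is an arbitrary example, not a value from the original notebook):

# Rank features by |correlation with target| and drop the weak ones.
# The 0.1 cutoff is only an example; V32 (0.0666) and V34 (-0.0060) fall below it.
corr_with_target = corr['target'].drop('target')
weak = corr_with_target[corr_with_target.abs() < 0.1].index
data_reduced = data.drop(columns=weak)
print(weak.tolist())
print(data_reduced.shape)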