import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.datasets import load_iris
plt.rcParams['font.sans-serif'] = ['STFangsong']
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'
# y_hat = a0 + a1*x1
# L = (a0 + a1*1.91 - 86)^2 + (a0 + a1*2.03 - 113)^2 + (a0 + a1*2.03 - 112)^2
# L'(a0) = 2(a0 + a1*1.91 - 86) + 2(a0 + a1*2.03 - 113) + 2(a0 + a1*2.03 - 112) = 0
# L'(a1) = 2(a0 + a1*1.91 - 86)*1.91 + 2(a0 + a1*2.03 - 113)*2.03 + 2(a0 + a1*2.03 - 112)*2.03 = 0
# Solving these two normal equations gives:
# a0 ≈ -335.79
# a1 ≈ 220.83
# y_hat = -335.79 + 220.83*x1   (fits 86 at x1=1.91 and 112.5 at x1=2.03)
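A quick numerical check of the hand derivation above (a minimal sketch: the three toy points are read off the loss L, and np.linalg.lstsq solves the same normal equations):
# Verify the closed-form fit numerically; the points (x1, y) come from L above.
x1_toy = np.array([1.91, 2.03, 2.03])
y_toy = np.array([86.0, 113.0, 112.0])
X_toy = np.column_stack([np.ones_like(x1_toy), x1_toy])   # design matrix [1, x1]
(a0, a1), *_ = np.linalg.lstsq(X_toy, y_toy, rcond=None)
print(a0, a1)   # approx -335.79, 220.83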
# intercept term, features X1 X2 X3 X4, target Y
# Feature scaling: 0-1 (min-max) scaling vs. standardization
# MinMaxScaler # StandardScaler
# MinMaxScaler : (Xi - Xmin) / (Xmax - Xmin)
# StandardScaler : (Xi - Xmean) / Xstd
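A minimal sketch of the two formulas on a toy single-feature column (values are made up just to show the difference; note StandardScaler uses the population standard deviation):
from sklearn.preprocessing import MinMaxScaler, StandardScaler
col = np.array([[1.0], [2.0], [3.0], [4.0]])          # toy column, not from the dataset
print(MinMaxScaler().fit_transform(col).ravel())      # (Xi - Xmin)/(Xmax - Xmin) -> [0, 1/3, 2/3, 1]
print(StandardScaler().fit_transform(col).ravel())    # (Xi - Xmean)/Xstd -> [-1.34, -0.45, 0.45, 1.34]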
# Model training workflow (a Pipeline sketch follows this list)
# 1. Prepare X, Y
# 2. Split the data into xtrain, xtest, ytrain, ytest; check feature correlations
# 3. Standardize xtrain and xtest: the mean/std computed on xtrain are used for both transforms
# 4. Train the model
# 5. Evaluate the model: R^2
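The same five steps can be packaged with a scikit-learn Pipeline so the scaler is only ever fit on the training split (a sketch; fit_and_score and seed are illustrative names, and the cells below do the same thing step by step):
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

def fit_and_score(x, y, test_size=0.3, seed=0):
    # step 2: split; step 3: scaling statistics come from xtrain only, via the pipeline
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=test_size, random_state=seed)
    model = make_pipeline(StandardScaler(), LinearRegression())
    model.fit(xtrain, ytrain)          # step 4: train
    return model.score(xtest, ytest)   # step 5: R^2 on the test split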
from sklearn.datasets import load_boston
# Boston house-price dataset
boston = load_boston()
x = boston.data
y = boston.target
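Note that load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so the import above fails on newer versions. One fallback (assuming network access, following the snippet from the deprecation notice) loads the raw data directly:
# Fallback for scikit-learn >= 1.2, where load_boston no longer exists.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
x = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])   # 13 feature columns
y = raw_df.values[1::2, 2]                                        # MEDV, the target
feature_names = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
                 "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"]     # stands in for boston.feature_names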
# data = pd.DataFrame(boston.data, columns = boston.feature_names)
data = pd.DataFrame(x, columns = boston.feature_names)
data
  | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 |
1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 |
2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 |
3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 |
4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
501 | 0.06263 | 0.0 | 11.93 | 0.0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1.0 | 273.0 | 21.0 | 391.99 | 9.67 |
502 | 0.04527 | 0.0 | 11.93 | 0.0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1.0 | 273.0 | 21.0 | 396.90 | 9.08 |
503 | 0.06076 | 0.0 | 11.93 | 0.0 | 0.573 | 6.976 | 91.0 | 2.1675 | 1.0 | 273.0 | 21.0 | 396.90 | 5.64 |
504 | 0.10959 | 0.0 | 11.93 | 0.0 | 0.573 | 6.794 | 89.3 | 2.3889 | 1.0 | 273.0 | 21.0 | 393.45 | 6.48 |
505 | 0.04741 | 0.0 | 11.93 | 0.0 | 0.573 | 6.030 | 80.8 | 2.5050 | 1.0 | 273.0 | 21.0 | 396.90 | 7.88 |
506 rows × 13 columns
# Inspect feature correlations
import seaborn as sns
sns.heatmap(data.corr(), center=0)
<AxesSubplot:>
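The heatmap above only covers feature-feature correlations; a quick sketch (not in the original notebook) of how strongly each feature correlates with the target:
# Rank features by absolute correlation with the target (MEDV).
corr_with_target = data.assign(MEDV=y).corr()["MEDV"].drop("MEDV")
print(corr_with_target.abs().sort_values(ascending=False))   # LSTAT and RM usually rank highest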
# Split the dataset
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3)
# Scaling
from sklearn.preprocessing import MinMaxScaler # 0-1 (min-max) scaling
from sklearn.preprocessing import StandardScaler # standard (z-score) scaling
std = StandardScaler().fit(xtrain)
# instantiate and fit: std stores the mean and std of xtrain
xtrain_ = std.transform(xtrain)
xtest_ = std.transform(xtest)
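A quick sanity check of the scaling (a sketch): columns of xtrain_ should have mean ≈ 0 and std ≈ 1, while xtest_ only approximately so, because it reuses the training statistics:
print(xtrain_.mean(axis=0).round(2), xtrain_.std(axis=0).round(2))   # ~0 and ~1 per column
print(xtest_.mean(axis=0).round(2), xtest_.std(axis=0).round(2))     # close to 0/1, not exact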
from sklearn.linear_model import LinearRegression
# Train the model
lr = LinearRegression().fit(xtrain_,ytrain)
lr.coef_ # coefficients [a1, a2, ..., am], one per feature (m features)
array([-1.12267664, 0.97439663, -0.3459865 , 0.41380091, -2.10474456,
2.43778686, -0.22264589, -3.56271303, 3.34309993, -2.51830569,
-1.93298784, 0.7963353 , -3.7752077 ])
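Pairing each coefficient with its feature name makes the output easier to read; because the inputs were standardized, the magnitudes are roughly comparable (a sketch, not part of the original cell):
coefs = pd.Series(lr.coef_, index=data.columns)   # one coefficient per feature
print(coefs.sort_values())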
lr.intercept_ # intercept term a0
22.541525423728853
lr.score(xtest_,ytest)
0.7705065219692282
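lr.score returns R^2 on the test split; the same value can be reproduced from predictions, and an RMSE in the target's units ($1000s of median home value) is often reported alongside it (a sketch):
from sklearn.metrics import r2_score, mean_squared_error
ypred = lr.predict(xtest_)
print(r2_score(ytest, ypred))                      # identical to lr.score(xtest_, ytest)
print(np.sqrt(mean_squared_error(ytest, ypred)))   # RMSE, in $1000s of MEDV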