import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.datasets import load_iris
plt.rcParams['font.sans-serif'] = ['STFangsong']
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'
# y_hat = a0 + a1*x1
# L = (a0 + a1*1.91 - 86)^2 + (a0 + a1*2.03 - 113)^2 + (a0 + a1*2.03 - 112)^2
# L'(a0) = 2(a0 + a1*1.91 - 86) + 2(a0 + a1*2.03 - 113) + 2(a0 + a1*2.03 - 112) = 0
# L'(a1) = 2(a0 + a1*1.91 - 86)*1.91 + 2(a0 + a1*2.03 - 113)*2.03 + 2(a0 + a1*2.03 - 112)*2.03 = 0
# Solving these two normal equations gives:
# a0 ≈ -335.79
# a1 ≈ 220.83
# y_hat = -335.79 + 220.83*x1   (fits 86 at x1=1.91 and 112.5 at x1=2.03)
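A quick numerical check of the hand derivation above (a minimal sketch: the three toy points are read off the loss L, and np.linalg.lstsq solves the same normal equations):
# Verify the closed-form fit numerically; the points (x1, y) come from L above.
x1_toy = np.array([1.91, 2.03, 2.03])
y_toy = np.array([86.0, 113.0, 112.0])
X_toy = np.column_stack([np.ones_like(x1_toy), x1_toy])   # design matrix [1, x1]
(a0, a1), *_ = np.linalg.lstsq(X_toy, y_toy, rcond=None)
print(a0, a1)   # approx -335.79, 220.83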
# intercept term, features X1 X2 X3 X4, target Y
# Feature scaling: 0-1 (min-max) scaling vs. standardization
# MinMaxScaler # StandardScaler
# MinMaxScaler : (Xi - Xmin) / (Xmax - Xmin)
# StandardScaler : (Xi - Xmean) / Xstd
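A minimal sketch of the two formulas on a toy single-feature column (values are made up just to show the difference; note StandardScaler uses the population standard deviation):
from sklearn.preprocessing import MinMaxScaler, StandardScaler
col = np.array([[1.0], [2.0], [3.0], [4.0]])          # toy column, not from the dataset
print(MinMaxScaler().fit_transform(col).ravel())      # (Xi - Xmin)/(Xmax - Xmin) -> [0, 1/3, 2/3, 1]
print(StandardScaler().fit_transform(col).ravel())    # (Xi - Xmean)/Xstd -> [-1.34, -0.45, 0.45, 1.34]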
# Model training workflow (a Pipeline sketch follows this list)
# 1. Prepare X, Y
# 2. Split the data into xtrain, xtest, ytrain, ytest; check feature correlations
# 3. Standardize xtrain and xtest: the mean/std computed on xtrain are used for both transforms
# 4. Train the model
# 5. Evaluate the model: R^2
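The same five steps can be packaged with a scikit-learn Pipeline so the scaler is only ever fit on the training split (a sketch; fit_and_score and seed are illustrative names, and the cells below do the same thing step by step):
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

def fit_and_score(x, y, test_size=0.3, seed=0):
    # step 2: split; step 3: scaling statistics come from xtrain only, via the pipeline
    xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=test_size, random_state=seed)
    model = make_pipeline(StandardScaler(), LinearRegression())
    model.fit(xtrain, ytrain)          # step 4: train
    return model.score(xtest, ytest)   # step 5: R^2 on the test split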
from sklearn.datasets import load_boston
# Boston house-price dataset
boston = load_boston()
x = boston.data
y = boston.target
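Note that load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so the import above fails on newer versions. One fallback (assuming network access, following the snippet from the deprecation notice) loads the raw data directly:
# Fallback for scikit-learn >= 1.2, where load_boston no longer exists.
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
x = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])   # 13 feature columns
y = raw_df.values[1::2, 2]                                        # MEDV, the target
feature_names = ["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
                 "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"]     # stands in for boston.feature_names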
# data = pd.DataFrame(boston.data, columns = boston.feature_names)
data = pd.DataFrame(x, columns = boston.feature_names)
data
  | CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT
---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 |
1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 |
2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 |
3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 |
4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
501 | 0.06263 | 0.0 | 11.93 | 0.0 | 0.573 | 6.593 | 69.1 | 2.4786 | 1.0 | 273.0 | 21.0 | 391.99 | 9.67 |
502 | 0.04527 | 0.0 | 11.93 | 0.0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1.0 | 273.0 | 21.0 | 396.90 | 9.08 |
503 | 0.06076 | 0.0 | 11.93 | 0.0 | 0.573 | 6.976 | 91.0 | 2.1675 | 1.0 | 273.0 | 21.0 | 396.90 | 5.64 |
504 | 0.10959 | 0.0 | 11.93 | 0.0 | 0.573 | 6.794 | 89.3 | 2.3889 | 1.0 | 273.0 | 21.0 | 393.45 | 6.48 |
505 | 0.04741 | 0.0 | 11.93 | 0.0 | 0.573 | 6.030 | 80.8 | 2.5050 | 1.0 | 273.0 | 21.0 | 396.90 | 7.88 |
506 rows × 13 columns
# Inspect feature correlations
import seaborn as sns
sns.heatmap(data.corr(), center=0)
<AxesSubplot:>
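The heatmap above only covers feature-feature correlations; a quick sketch (not in the original notebook) of how strongly each feature correlates with the target:
# Rank features by absolute correlation with the target (MEDV).
corr_with_target = data.assign(MEDV=y).corr()["MEDV"].drop("MEDV")
print(corr_with_target.abs().sort_values(ascending=False))   # LSTAT and RM usually rank highest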
# Split the dataset
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3)
# Scaling
from sklearn.preprocessing import MinMaxScaler # 0-1 (min-max) scaling
from sklearn.preprocessing import StandardScaler # standard (z-score) scaling
std = StandardScaler().fit(xtrain)
# instantiate and fit: std stores the mean and std of xtrain
xtrain_ = std.transform(xtrain)
xtest_ = std.transform(xtest)
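A quick sanity check of the scaling (a sketch): columns of xtrain_ should have mean ≈ 0 and std ≈ 1, while xtest_ only approximately so, because it reuses the training statistics:
print(xtrain_.mean(axis=0).round(2), xtrain_.std(axis=0).round(2))   # ~0 and ~1 per column
print(xtest_.mean(axis=0).round(2), xtest_.std(axis=0).round(2))     # close to 0/1, not exact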
from sklearn.linear_model import LinearRegression
# Train the model
lr = LinearRegression().fit(xtrain_,ytrain)
lr.coef_ # coefficients [a1, a2, ..., am], one per feature (m features)
array([-1.12267664, 0.97439663, -0.3459865 , 0.41380091, -2.10474456,
2.43778686, -0.22264589, -3.56271303, 3.34309993, -2.51830569,
-1.93298784, 0.7963353 , -3.7752077 ])
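Pairing each coefficient with its feature name makes the output easier to read; because the inputs were standardized, the magnitudes are roughly comparable (a sketch, not part of the original cell):
coefs = pd.Series(lr.coef_, index=data.columns)   # one coefficient per feature
print(coefs.sort_values())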
lr.intercept_ # intercept term a0
22.541525423728853
lr.score(xtest_,ytest)
0.7705065219692282
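lr.score returns R^2 on the test split; the same value can be reproduced from predictions, and an RMSE in the target's units ($1000s of median home value) is often reported alongside it (a sketch):
from sklearn.metrics import r2_score, mean_squared_error
ypred = lr.predict(xtest_)
print(r2_score(ytest, ypred))                      # identical to lr.score(xtest_, ytest)
print(np.sqrt(mean_squared_error(ytest, ypred)))   # RMSE, in $1000s of MEDV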