随书代码,阅读笔记。
- 线性回归拟合正弦函数
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# Synthetic data set: a sine wave sampled at n_dots evenly spaced points on
# [-2*pi, 2*pi], with uniform noise from [-0.1, 0.1) added to every sample.
n_dots = 200
X = np.linspace(-2 * np.pi, 2 * np.pi, num=n_dots)
Y = np.sin(X) + 0.2 * np.random.rand(n_dots) - 0.1
# Turn both vectors into single-column matrices of shape (n_dots, 1).
X, Y = X[:, np.newaxis], Y[:, np.newaxis]
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
def polynomial_model(degree=1):
    """Build a polynomial-regression pipeline.

    The pipeline expands the inputs into polynomial features (without the
    constant bias column), rescales every feature to unit variance, and
    fits an ordinary least-squares linear regression on the result.

    Args:
        degree: Degree of the polynomial feature expansion.

    Returns:
        An unfitted ``Pipeline`` with the steps ``polynomial_features``,
        ``standard_scaler`` and ``linear_regression``.
    """
    # Imported locally so this definition does not depend on edits to the
    # surrounding import lines.
    from sklearn.preprocessing import StandardScaler

    polynomial_features = PolynomialFeatures(degree=degree,
                                             include_bias=False)
    # LinearRegression(normalize=True) is no longer accepted by current
    # scikit-learn releases; an explicit scaling step replaces it.
    # with_mean=False is enough because LinearRegression fits an intercept.
    standard_scaler = StandardScaler(with_mean=False)
    linear_regression = LinearRegression()
    return Pipeline([
        ("polynomial_features", polynomial_features),
        ("standard_scaler", standard_scaler),
        ("linear_regression", linear_regression),
    ])
from sklearn.metrics import mean_squared_error
# Fit one pipeline per polynomial degree and record, for each, the fitted
# model together with its score and mean squared error on the training data.
degrees = [2, 3, 5, 10]
results = []
for d in degrees:
    model = polynomial_model(degree=d)
    model.fit(X, Y)
    train_score = model.score(X, Y)
    mse = mean_squared_error(Y, model.predict(X))
    results.append(dict(model=model, degree=d, score=train_score, mse=mse))

# Print one summary line per fitted degree.
for r in results:
    print(
        "degree: {}; train score: {}; mean squared error: {}".format(
            r["degree"], r["score"], r["mse"]
        )
    )
# Plot the fitted curve for each polynomial degree.
from matplotlib.figure import SubplotParams
# Draw a 2x2 grid with one panel per fitted model: the noisy samples as a
# blue scatter plot and the model's prediction as a red line.
plt.figure(figsize=(12, 6), dpi=200, subplotpars=SubplotParams(hspace=0.3))
for position, r in enumerate(results, start=1):
    # plt.subplot returns an Axes (not a Figure); draw through it explicitly
    # instead of binding it to an unused, misleadingly named variable.
    ax = plt.subplot(2, 2, position)
    ax.set_xlim(-8, 8)
    ax.set_title("LinearRegression degree={}".format(r["degree"]))
    ax.scatter(X, Y, s=5, c='b', alpha=0.5)
    ax.plot(X, r["model"].predict(X), 'r-')
- 预测房价
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_boston
# Load the Boston data set and split it into the feature matrix X and the
# target vector y.
# NOTE(review): load_boston appears to be deprecated/removed in recent
# scikit-learn releases (1.2+) - confirm the pinned version before running.
boston = load_boston()
X, y = boston.data, boston.target
# Bare expressions: shown as cell output in a notebook, no effect in a script.
X.shape
boston.feature_names
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for evaluation; the fixed random_state makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=3,
)
import time
from sklearn.linear_model import LinearRegression
# Baseline: plain linear regression on the raw features.
# Note carried over from the original (which suggested
# LinearRegression(normalize=True) here): normalising the inputs can speed up
# convergence and training, but it does not improve accuracy.
model = LinearRegression()
# time.clock() was removed in Python 3.8; time.perf_counter() replaces it.
start = time.perf_counter()
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
cv_score = model.score(X_test, y_test)
# The reported time covers fitting plus both scoring calls.
print('elaspe: {0:.6f}; train_score: {1:0.6f}; cv_score: {2:.6f}'.format(time.perf_counter()-start, train_score, cv_score))
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
def polynomial_model(degree=1):
    """Build a polynomial-regression pipeline.

    The pipeline expands the inputs into polynomial features (without the
    constant bias column), rescales every feature to unit variance, and
    fits an ordinary least-squares linear regression on the result.

    Args:
        degree: Degree of the polynomial feature expansion.

    Returns:
        An unfitted ``Pipeline`` with the steps ``polynomial_features``,
        ``standard_scaler`` and ``linear_regression``.
    """
    # Imported locally so this definition does not depend on edits to the
    # surrounding import lines.
    from sklearn.preprocessing import StandardScaler

    polynomial_features = PolynomialFeatures(degree=degree,
                                             include_bias=False)
    # LinearRegression(normalize=True) is no longer accepted by current
    # scikit-learn releases; an explicit scaling step replaces it.
    # with_mean=False is enough because LinearRegression fits an intercept.
    standard_scaler = StandardScaler(with_mean=False)
    linear_regression = LinearRegression()
    return Pipeline([
        ("polynomial_features", polynomial_features),
        ("standard_scaler", standard_scaler),
        ("linear_regression", linear_regression),
    ])
# Second-degree polynomial model, timed and scored on the same train/test
# split as the linear baseline.
model = polynomial_model(degree=2)
# time.clock() was removed in Python 3.8; time.perf_counter() replaces it.
start = time.perf_counter()
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
cv_score = model.score(X_test, y_test)
# The reported time covers fitting plus both scoring calls.
print('elaspe: {0:.6f}; train_score: {1:0.6f}; cv_score: {2:.6f}'.format(time.perf_counter()-start, train_score, cv_score))
# Recorded output from an earlier run:
# elaspe: 0.016412; train_score: 0.930547; cv_score: 0.860465
#画出学习曲线
from common.utils import plot_learning_curve
from sklearn.model_selection import ShuffleSplit
# Plot learning curves for polynomial degrees 1, 2 and 3 side by side.
# Cross-validation uses 10 random shuffles, each holding out 20% of the data.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
title = 'Learning Curves (degree={0})'
degrees = [1, 2, 3]
# time.clock() was removed in Python 3.8; time.perf_counter() replaces it.
start = time.perf_counter()
# Create the figure once; the original created it twice, which left an
# empty extra figure behind.
plt.figure(figsize=(18, 4), dpi=200)
for i, degree in enumerate(degrees):
    plt.subplot(1, 3, i + 1)
    plot_learning_curve(plt, polynomial_model(degree), title.format(degree), X, y, ylim=(0.01, 1.01), cv=cv)
print('elaspe: {0:.6f}'.format(time.perf_counter()-start))
多项式的阶数对模型性能影响很大：阶数过低容易欠拟合，阶数过高容易过拟合。