线性回归及RANSAC异常值清除算法案例

最新推荐文章于 2025-01-24 09:56:23 发布

原创 · 最新推荐文章于 2025-01-24 09:56:23 发布 · 6.2k 阅读

25 ·

CC 4.0 BY-SA版权

文章标签：

#RANSAC #线性回归

机器学习实战系列专栏收录该内容

28 篇文章

订阅专栏

本文介绍了一种使用线性回归进行数据预测的方法，并通过引入RANSAC算法来清除数据集中的异常值，提高模型的预测准确性。通过对波士顿房价数据集的应用，展示了线性回归模型在异常值存在时的表现，以及RANSAC算法如何帮助模型更加鲁棒。

线性回归及RANSAC异常值清除算法案例

1、常规线性回归

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor

def lin_regplot(X, y, model, save_path='result/Linear.png'):
    """Scatter the samples, overlay the model's predictions, then save and show.

    Args:
        X: Feature values; used as the x-coordinates and passed to
            ``model.predict``.
        y: Target values plotted against ``X``.
        model: Fitted estimator exposing ``predict(X)``.
        save_path: File the figure is written to. Defaults to the path the
            script has always used.

    Returns:
        None.
    """
    import os  # local import keeps this snippet self-contained

    plt.scatter(X, y, c='blue')
    plt.plot(X, model.predict(X), color='red')

    # savefig does not create missing directories; without this the call
    # fails with FileNotFoundError on a fresh checkout.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    plt.savefig(save_path)
    plt.show()

### Linear regression model
# Load the data
df = pd.read_csv('dataset/boston.csv', sep=',')
# NOTE(review): the classic Boston housing table has 14 columns (it also
# contains 'B' between 'PTRATIO' and 'LSTAT'); this list names 13. Confirm it
# matches the column count of dataset/boston.csv, otherwise the assignment
# below raises a length-mismatch error.
df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS',
              'NOX', 'RM', 'AGE', 'DIS', 'RAD',
              'TAX', 'PTRATIO', 'LSTAT', 'MEDV']

# Single feature (average rooms) and target (median value), both kept 2-D.
X = df[['RM']].values
y = df[['MEDV']].values

slr = LinearRegression()
slr.fit(X, y)

# Because y is 2-D, coef_[0] and intercept_ are size-1 arrays rather than
# scalars. Formatting such arrays with %f relies on implicit array-to-scalar
# conversion, which recent NumPy deprecates, so pull the scalars out first.
slope = np.ravel(slr.coef_)[0]
intercept = np.ravel(slr.intercept_)[0]
print("Slope: %.3f" % slope)
print("intercept: %.3f" % intercept)

lin_regplot(X, y, slr)

可视化：

2、RANSAC异常值清除后线性回归

### Robust linear regression: RANSAC fits on inliers and flags outliers
import os  # local import: only needed to create the output directory

# The old `residual_metric` argument is no longer accepted by scikit-learn;
# residuals are controlled through the `loss` parameter, left at its default.
ransac = RANSACRegressor(LinearRegression(),
                         max_trials=100,          # upper bound on random sampling rounds
                         min_samples=50,          # samples drawn per round
                         residual_threshold=5.0,  # max residual for a sample to count as an inlier
                         random_state=0)          # reproducible sampling
ransac.fit(X, y)

# Visualisation
inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)
line_X = np.arange(3, 10, 1)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])

# Start from a fresh figure: with a non-interactive backend plt.show() does
# not discard the previous plot, so this one would be drawn on top of it.
plt.figure()
plt.scatter(X[inlier_mask], y[inlier_mask],
            c='blue', marker='o', label='Inliers')
plt.scatter(X[outlier_mask], y[outlier_mask],
            c='lightgreen', marker='s', label='Outliers')

plt.plot(line_X, line_y_ransac, color='red')
plt.xlabel('Average number of rooms [RM]')
plt.ylabel('Price in $ 1000 [MEDV]')
# The scatter labels are only rendered when a legend is requested.
plt.legend(loc='upper left')
# savefig does not create missing directories.
os.makedirs('result', exist_ok=True)
plt.savefig('result/ransac.png')
plt.show()

可视化

3、所有代码

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor

def lin_regplot(X, y, model, save_path='result/Linear.png'):
    """Scatter the samples, overlay the model's predictions, then save and show.

    Args:
        X: Feature values; used as the x-coordinates and passed to
            ``model.predict``.
        y: Target values plotted against ``X``.
        model: Fitted estimator exposing ``predict(X)``.
        save_path: File the figure is written to. Defaults to the path the
            script has always used.

    Returns:
        None.
    """
    import os  # local import keeps this snippet self-contained

    plt.scatter(X, y, c='blue')
    plt.plot(X, model.predict(X), color='red')

    # savefig does not create missing directories; without this the call
    # fails with FileNotFoundError on a fresh checkout.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    plt.savefig(save_path)
    plt.show()

### Linear regression model
# Load the data
df = pd.read_csv('dataset/boston.csv', sep=',')
# NOTE(review): the classic Boston housing table has 14 columns (it also
# contains 'B' between 'PTRATIO' and 'LSTAT'); this list names 13. Confirm it
# matches the column count of dataset/boston.csv, otherwise the assignment
# below raises a length-mismatch error.
df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS',
              'NOX', 'RM', 'AGE', 'DIS', 'RAD',
              'TAX', 'PTRATIO', 'LSTAT', 'MEDV']

# Single feature (average rooms) and target (median value), both kept 2-D.
X = df[['RM']].values
y = df[['MEDV']].values

slr = LinearRegression()
slr.fit(X, y)

# Because y is 2-D, coef_[0] and intercept_ are size-1 arrays rather than
# scalars. Formatting such arrays with %f relies on implicit array-to-scalar
# conversion, which recent NumPy deprecates, so pull the scalars out first.
slope = np.ravel(slr.coef_)[0]
intercept = np.ravel(slr.intercept_)[0]
print("Slope: %.3f" % slope)
print("intercept: %.3f" % intercept)

lin_regplot(X, y, slr)

### Robust linear regression: RANSAC fits on inliers and flags outliers
import os  # local import: only needed to create the output directory

# The old `residual_metric` argument is no longer accepted by scikit-learn;
# residuals are controlled through the `loss` parameter, left at its default.
ransac = RANSACRegressor(LinearRegression(),
                         max_trials=100,          # upper bound on random sampling rounds
                         min_samples=50,          # samples drawn per round
                         residual_threshold=5.0,  # max residual for a sample to count as an inlier
                         random_state=0)          # reproducible sampling
ransac.fit(X, y)

# Visualisation
inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)
line_X = np.arange(3, 10, 1)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])

# Start from a fresh figure: with a non-interactive backend plt.show() does
# not discard the previous plot, so this one would be drawn on top of it.
plt.figure()
plt.scatter(X[inlier_mask], y[inlier_mask],
            c='blue', marker='o', label='Inliers')
plt.scatter(X[outlier_mask], y[outlier_mask],
            c='lightgreen', marker='s', label='Outliers')

plt.plot(line_X, line_y_ransac, color='red')
plt.xlabel('Average number of rooms [RM]')
plt.ylabel('Price in $ 1000 [MEDV]')
# The scatter labels are only rendered when a legend is requested.
plt.legend(loc='upper left')
# savefig does not create missing directories.
os.makedirs('result', exist_ok=True)
plt.savefig('result/ransac.png')
plt.show()