House Price

Kaggle house price prediction

Link: link

For personal study and review.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, tree, linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import shuffle

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

train_data = pd.read_csv('F:/跨媒体计算实验组/NLP/数据集/house price/house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('F:/跨媒体计算实验组/NLP/数据集/house price/house-prices-advanced-regression-techniques/test.csv')
train = train_data.copy()
test = test_data.copy()


train.shape, test.shape


# Check for duplicate Ids
idsUnique = len(set(train.Id))  # a set keeps only unique values
idsTotal = train.shape[0]
# Total row count minus unique-Id count gives the number of duplicates
idsdupe = idsTotal - idsUnique
print(idsdupe)  # prints 0
# Drop the Id column, since it carries no predictive information
train.drop(['Id'], axis=1, inplace=True)

Now for some visualization.

# Correlation matrix
corrmat = train.corr()
f,ax = plt.subplots(figsize=(20,9))
sns.heatmap(corrmat,vmax=.8,annot=True)


# Most correlated features
corrmat = train.corr()
top_corr_features = corrmat.index[abs(corrmat['SalePrice']) > 0.5]  # features whose |correlation| with SalePrice exceeds 0.5
plt.figure(figsize=(10,10))
g = sns.heatmap(train[top_corr_features].corr(), annot=True, cmap='RdYlGn')  # correlations among the selected features themselves


# The plot below shows how OverallQual affects the sale price (it is highly correlated with SalePrice).
sns.barplot(x=train.OverallQual, y=train.SalePrice)


# Pairwise relationships between each of these features and the sale price
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
sns.pairplot(train[cols], height=2.5)  # `size` was renamed to `height` in newer seaborn
plt.show()


Since the final goal is to predict the sale price, we now analyze that variable itself.

from scipy import stats
from scipy.stats import norm, skew  # norm models the normal distribution; skew measures how asymmetric a density is about its mean
# Intuitively, skewness reflects the relative length of the density curve's tails.
sns.distplot(train['SalePrice'], fit=norm)  # histogram with a fitted normal curve

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])  # mu is the mean, sigma the standard deviation
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))  # {:.2f} keeps two decimal places
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)  # Q-Q plot: check the sample against a theoretical distribution (normal by default)
plt.show()


train.SalePrice = np.log1p(train.SalePrice)  # log(1 + x) transform to reduce the skew of the sale price; the inverse is np.expm1
y = train.SalePrice
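Because the target is now trained in log space, predictions must be mapped back with the inverse transform before they can be read as prices. A quick round-trip check (illustrative values, not taken from the dataset):

import numpy as np
prices = np.array([100000.0, 250000.0, 755000.0])
logged = np.log1p(prices)      # log(1 + x) compresses the long right tail
restored = np.expm1(logged)    # exp(x) - 1 is the exact inverse of log1p
assert np.allclose(prices, restored)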


# Preprocessing: scatter plot of GrLivArea against SalePrice
plt.scatter(y=train.SalePrice, x=train.GrLivArea, c='black')
plt.show()
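The point of this scatter plot is to spot outliers: a few very large houses that sold cheaply sit at the bottom right. The original post does not show a removal step; a minimal sketch of the common fix (the 4000 sq ft cutoff is an assumption borrowed from general practice on this dataset, and note that SalePrice is already on the log1p scale here):

outlier_idx = train[(train.GrLivArea > 4000) & (train.SalePrice < np.log1p(300000))].index  # hypothetical cutoffs; tune to the plot
train = train.drop(outlier_idx)
y = train.SalePrice  # refresh the target after dropping rows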


train_nas = train.isnull().sum()  # count the missing values in each feature
train_nas = train_nas[train_nas>0]  # keep only the features that have missing values
train_nas.sort_values(ascending = False)  # sort by missing-value count


# Do the same for the test set
test_nas = test.isnull().sum()
test_nas = test_nas[test_nas>0]
test_nas.sort_values(ascending = False)


print("Find most important features relative to target")
corr = train.corr()#得到特征之间的相关性矩阵
corr.sort_values(['SalePrice'],ascending=False,inplace=True)#按照列(特征)SalePrice进行排序
print(corr.SalePrice)


# Differentiate numerical features (minus the target) and categorical features
categorical_features = train.select_dtypes(include=['object']).columns  # categorical (object-dtype) features only
categorical_features


numerical_features = train.select_dtypes(exclude = ["object"]).columns  # numerical (non-object) features
numerical_features
numerical_features = numerical_features.drop("SalePrice")  # remove the target (sale price) from the numerical features
print("Numerical features : " + str(len(numerical_features)))
print("Categorical features : " + str(len(categorical_features)))
train_num = train[numerical_features]
train_cat = train[categorical_features]


# NOTE: we simply use median() to fill NA values; there is actually a lot to explore when doing feature
# engineering, but this notebook aims to keep things simple (no heavy code).

## Handle remaining missing values for numerical features by using the median as replacement
print('NAs for numerical features in train: ' + str(train_num.isnull().values.sum()))
train_num = train_num.fillna(train_num.median())
print('Remaining NAs for numerical features in train: ' + str(train_num.isnull().values.sum()))
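The test set will eventually need the same treatment before prediction, which the original does not show. A minimal sketch, assuming the training medians are reused so that no information leaks from the test set:

test_num = test[numerical_features].fillna(train_num.median())  # fill test NAs with the *training* medians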


from scipy.stats import skew
skewness = train_num.apply(lambda x: skew(x))  # apply skew() to every column
skewness.sort_values(ascending=False)
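To see what skew() reports, a tiny check on synthetic data (illustrative only): an exponential sample has a long right tail and a clearly positive skew, while a normal sample sits near zero.

from scipy.stats import skew
import numpy as np
rng = np.random.RandomState(0)
print(skew(rng.exponential(size=10000)))  # long right tail -> positive skew (around 2)
print(skew(rng.normal(size=10000)))       # roughly symmetric -> skew near 0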


skewness = skewness[abs(skewness)>0.5]  # keep only notably skewed features
skewness.index  # the names of those features

skew_features = train[skewness.index]  # select those columns from the training set (numerical features with |skewness| > 0.5)
skew_features.columns


# We can treat the skewness of a feature with a log transformation, so we apply that here.
skew_features = np.log1p(skew_features)  # take log1p of every value in the selected skewed columns
train_num[skewness.index] = skew_features  # write the transformed columns back so the transform actually takes effect downstream
train_cat.head()


str(train_cat.isnull().values.sum())  # check the categorical features for missing values -- 0

Now on to the models.

import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, make_scorer  # make_scorer builds a scorer from a metric or loss function
import matplotlib.pyplot as plt
import seaborn as sns
train_cat = pd.get_dummies(train_cat)  # one-hot encode the categoricals; scikit-learn estimators need numeric input
train = pd.concat([train_cat, train_num], axis=1)  # merge the preprocessed categorical and numerical frames back together
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.3, random_state=0)
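StandardScaler is imported above but never used. If you wanted to standardize the features for the linear models, a minimal sketch (fit on the training split only, to avoid leakage):

scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)  # learn mean/std from the training split
X_test_sc = scaler.transform(X_test)        # apply the same statistics to the test split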

# Evaluate the training and test splits separately with K-fold cross-validation
n_folds = 5
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold  # K-fold cross-validation
scorer = make_scorer(mean_squared_error, greater_is_better=False)
def rmse_CV_train(model):
    kf = KFold(n_folds, shuffle=True, random_state=42)  # keep the KFold object itself; .get_n_splits() would reduce it to a plain integer and lose the shuffling
    rmse = np.sqrt(-cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv=kf))
    return (rmse)
def rmse_CV_test(model):
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X_test, y_test, scoring="neg_mean_squared_error", cv=kf))
    return (rmse)
# Linear model without regularization
lr = LinearRegression()
lr.fit(X_train, y_train)
test_pre = lr.predict(X_test)
train_pre = lr.predict(X_train)
print('rmse on train', rmse_CV_train(lr).mean())
print('rmse on test', rmse_CV_test(lr).mean())


# Plot predicted values against residuals (residual = predicted value minus true value)
plt.scatter(train_pre, train_pre - y_train, c = "blue",  label = "Training data")
plt.scatter(test_pre,test_pre - y_test, c = "black",  label = "Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc = "upper left")
plt.hlines(y = 0, xmin = 10.5, xmax = 13.5, color = "red")
plt.show()


# Plot predictions against real values
plt.scatter(train_pre, y_train, c = "blue",  label = "Training data")
plt.scatter(test_pre, y_test, c = "black",  label = "Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc = "upper left")
plt.plot([10.5, 13.5], [10.5, 13.5], c = "red")
plt.show()


Regularization is a very useful method to handle collinearity, filter noise out of the data, and ultimately prevent overfitting. The concept behind regularization is to introduce additional information (a bias) that penalizes extreme parameter weights.
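To make the penalty concrete, here is a small sketch on synthetic data (illustrative alphas) showing how a larger ridge alpha shrinks the coefficient norm:

from sklearn.linear_model import Ridge
import numpy as np
rng = np.random.RandomState(0)
X_demo = rng.randn(100, 5)
y_demo = X_demo @ np.array([3.0, -2.0, 0.5, 0.0, 1.0]) + rng.randn(100) * 0.1
for a in [0.01, 1.0, 100.0]:
    r = Ridge(alpha=a).fit(X_demo, y_demo)
    print(a, np.linalg.norm(r.coef_))  # the coefficient norm shrinks as alpha grows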

# RidgeCV is ridge regression with built-in cross-validation; by default it uses an efficient form of
# leave-one-out cross-validation. alpha is the regularization strength.
# Ridge: for a fixed alpha, solve for the best w; a larger alpha yields a smaller norm of w.
# RidgeCV: try several alphas, get the best w for each, then keep the overall best w and its alpha.
ridge = RidgeCV(alphas = [0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60])

ridge.fit(X_train,y_train)
alpha = ridge.alpha_  # the best alpha found in this first pass
print('best alpha',alpha)

print("Try again for more precision with alphas centered around " + str(alpha))
ridge = RidgeCV(alphas = [alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, alpha * .85, 
                          alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15,
                          alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4],cv = 5)
ridge.fit(X_train, y_train)
alpha = ridge.alpha_
print("Best alpha :", alpha)
print("Ridge RMSE on Training set :", rmse_CV_train(ridge).mean())#K折交叉验证结果的均值
print("Ridge RMSE on Test set :", rmse_CV_test(ridge).mean())
y_train_rdg = ridge.predict(X_train)#岭回归的返回分数
y_test_rdg = ridge.predict(X_test)


print("Kcv RMSE on Training set :", y_train_rdg.mean())#K折交叉验证结果的均值
print("Kcv RMSE on Test set :", y_test_rdg.mean())


coef = pd.Series(ridge.coef_, index = X_train.columns)

print("Ridge picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")


# Plot residuals
plt.scatter(y_train_rdg, y_train_rdg - y_train, c = "blue",  label = "Training data")
plt.scatter(y_test_rdg, y_test_rdg - y_test, c = "black", marker = "v", label = "Validation data")
plt.title("Linear regression with Ridge regularization")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc = "upper left")
plt.hlines(y = 0, xmin = 10.5, xmax = 13.5, color = "red")
plt.show()


# Plot predictions - Real values
plt.scatter(y_train_rdg, y_train, c = "blue",  label = "Training data")
plt.scatter(y_test_rdg, y_test, c = "black",  label = "Validation data")
plt.title("Linear regression with Ridge regularization")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc = "upper left")
plt.plot([10.5, 13.5], [10.5, 13.5], c = "red")
plt.show()

