K 近邻算法及其模型的评估 Sklearn / KNN

在这里插入图片描述

在这里插入图片描述

import os

import numpy as np
import pandas as pd

# Work from the directory that holds the data file.
os.chdir('C:/Users/Liu/Desktop')

# The raw file has many columns; only these are kept for the analysis.
features = [
    'accommodates',
    'bedrooms',
    'bathrooms',
    'beds',
    'price',
    'minimum_nights',
    'maximum_nights',
    'number_of_reviews',
]

# Read the CSV and narrow it to the selected columns in one step.
dc_listings = pd.read_csv('listings.csv')[features]
print(dc_listings.shape)

# Bare expression: previews the first rows when run interactively.
dc_listings.head()
'''
数据特征:
accommodates: 可以容纳的旅客
bedrooms: 卧室的数量
bathrooms: 厕所的数量
beds: 床的数量
price: 每晚的费用
minimum_nights: 客人最少租了几天
maximum_nights: 客人最多租了几天
number_of_reviews: 评论的数量
'''
# Worked example: how should a listing whose `accommodates` value is 3 be priced?
our_acc_value = 3

# Absolute gap between every listing's `accommodates` and ours; the listings
# with the smallest gap are the nearest neighbours whose prices we borrow.
dc_listings['distance'] = (dc_listings['accommodates'] - our_acc_value).abs()

# Tally the listings per gap, ordered from the closest gap to the farthest.
dc_listings['distance'].value_counts().sort_index()
# Only the absolute difference is used as the distance; in the original
# author's run, 461 listings were at distance 0.

基于单变量预测价格

# Shuffle the rows with a fixed seed, then order them by distance so that the
# nearest listings come first.
dc_listings = dc_listings.sample(frac=1, random_state=0)
dc_listings = dc_listings.sort_values('distance')

# `price` is stored as text: strip the "$" and "," characters before casting
# to float. regex=True is passed explicitly because pandas >= 2.0 treats the
# pattern as a literal string by default, and the raw string avoids the
# invalid "\$" escape sequence.
dc_listings['price'] = (
    dc_listings['price'].str.replace(r'[$,]', '', regex=True).astype(float)
)

# Predicted price = mean price of the 5 nearest listings.
mean_price = dc_listings['price'].iloc[:5].mean()
mean_price
def predict_price(new_listing_value, feature_column, k=5, train_data=None):
    """Predict a price as the mean price of the ``k`` nearest training listings.

    Distance is the absolute difference on a single feature column.

    Args:
        new_listing_value: Value of ``feature_column`` for the listing to price.
        feature_column: Name of the column used to measure distance.
        k: Number of nearest neighbours to average (default 5).
        train_data: DataFrame containing ``feature_column`` and ``price``.
            Defaults to the module-level ``train_df``.

    Returns:
        Mean ``price`` of the ``k`` nearest rows (NaN when there are no rows).
    """
    if train_data is None:
        train_data = train_df
    # Distances are computed from the training rows only and kept in a local
    # array, so the shared training frame is never mutated by a prediction.
    gaps = np.abs(train_data[feature_column].to_numpy(dtype=float) - new_listing_value)
    # Stable sort: ties keep their row order and NaN distances sort last.
    nearest = np.argsort(gaps, kind='stable')[:k]
    return train_data['price'].iloc[nearest].mean()

# Do not evaluate the model on the same rows it was built from: that hides
# over-fitting. Hold out a test set instead; 75% train / 25% test is a common
# choice, but the right ratio depends on the dataset.
# Remove the helper 'distance' column added earlier. drop() returns a new
# frame, so the result must be assigned back for the column to actually go.
dc_listings = dc_listings.drop('distance', axis=1)
# NOTE(review): if the frame is still ordered by an earlier
# sort_values('distance'), this positional split is not random; consider
# reshuffling before splitting.
train_df = dc_listings.iloc[:2792].copy()  # training rows (75% per the original note)
test_df = dc_listings.iloc[2792:].copy()  # held-out test rows (25% per the original note)


#  基于单变量预测价格
def predict_price(new_listing_value, feature_column, k=5, train_data=None):
    """Predict a price as the mean price of the ``k`` nearest training listings.

    Distance is the absolute difference on a single feature column.

    Args:
        new_listing_value: Value of ``feature_column`` for the listing to price.
        feature_column: Name of the column used to measure distance.
        k: Number of nearest neighbours to average (default 5).
        train_data: DataFrame containing ``feature_column`` and ``price``.
            Defaults to the module-level ``train_df``.

    Returns:
        Mean ``price`` of the ``k`` nearest rows (NaN when there are no rows).
    """
    if train_data is None:
        train_data = train_df
    # Distances are computed from the training rows only and kept in a local
    # array, so the shared training frame is never mutated by a prediction.
    gaps = np.abs(train_data[feature_column].to_numpy(dtype=float) - new_listing_value)
    # Stable sort: ties keep their row order and NaN distances sort last.
    nearest = np.argsort(gaps, kind='stable')[:k]
    return train_data['price'].iloc[nearest].mean()

# Price every listing in the test set from its `accommodates` value.
test_df['predicted_price'] = test_df['accommodates'].apply(
    predict_price, feature_column='accommodates'
)

# Root-mean-squared error of the predictions against the actual prices.
test_df['squared_error'] = test_df['predicted_price'].sub(test_df['price']).pow(2)
mse = test_df['squared_error'].mean()
rmse = mse ** 0.5
rmse

分别基于不同的单个变量预测价格

测试结果:

# Score a separate single-feature model for each candidate column.
for feature in ('accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews'):
    # Predict from this feature alone, then compute the RMSE on the test set.
    test_df['predicted_price'] = test_df[feature].apply(
        predict_price, feature_column=feature
    )
    test_df['squared_error'] = test_df['predicted_price'].sub(test_df['price']).pow(2)
    mse = test_df['squared_error'].mean()
    rmse = mse ** 0.5
    print("RMSE for the {} column: {}".format(feature, rmse))

输出结果:

RMSE for the accommodates column: 212.98927967051543
RMSE for the bedrooms column: 199.80935328065033
RMSE for the bathrooms column: 230.24716705684227
RMSE for the number_of_reviews column: 235.91327066995507

不同特征的量纲(取值范围)差异很大,各单变量模型的误差也相差明显,所以在组合多个特征计算距离之前,需要先进行标准化或者归一化

import pandas as pd
from sklearn.preprocessing import StandardScaler
# Columns kept for the analysis. `price` (the target) is in this list, so it
# is standardized together with the other columns below.
features = ['accommodates','bedrooms','bathrooms','beds','price','minimum_nights','maximum_nights','number_of_reviews']

dc_listings = pd.read_csv('listings.csv')

dc_listings = dc_listings[features]

# Strip the "$" and "," characters from the textual price before casting to
# float. regex=True is passed explicitly because pandas >= 2.0 treats the
# pattern as a literal string by default, and the raw string avoids the
# invalid "\$" escape sequence.
dc_listings['price'] = dc_listings['price'].str.replace(r'[$,]', '', regex=True).astype(float)

# Discard rows that have a missing value in any of the selected columns.
dc_listings = dc_listings.dropna()

# Standardize every selected column so that no column dominates the distance
# computation merely because of its scale.
# NOTE(review): the scaler is fitted on all rows before the train/test split.
dc_listings[features] = StandardScaler().fit_transform(dc_listings[features])

# Second name for the same object: both now refer to the standardized data.
normalized_listings = dc_listings

print(dc_listings.shape)

normalized_listings.head()

输出结果:
在这里插入图片描述

多变量距离的计算

# Positional split of the standardized data: the first 2792 rows are used for
# training, the remaining rows for testing.
norm_train_df = normalized_listings.iloc[:2792].copy()
norm_test_df = normalized_listings.iloc[2792:].copy()

from scipy.spatial import distance

# Euclidean distance between two listings measured on two features.
# NOTE(review): despite its name, `fifth_listing` is the row at position 20.
first_listing = normalized_listings[['accommodates', 'bathrooms']].iloc[0]
fifth_listing = normalized_listings[['accommodates', 'bathrooms']].iloc[20]
first_fifth_distance = distance.euclidean(first_listing, fifth_listing)
first_fifth_distance

多变量KNN模型

def predict_price_multivariate(new_listing_value, feature_columns, k=5, train_data=None):
    """Predict a price as the mean price of the ``k`` nearest training listings.

    Distance is the Euclidean distance over ``feature_columns``.

    Args:
        new_listing_value: Row (label-indexed, e.g. a pandas Series) holding
            the feature values of the listing to price.
        feature_columns: List of column names used to measure distance.
        k: Number of nearest neighbours to average (default 5).
        train_data: DataFrame containing ``feature_columns`` and ``price``.
            Defaults to the module-level ``norm_train_df``.

    Returns:
        Mean ``price`` of the ``k`` nearest rows (NaN when there are no rows).
    """
    if train_data is None:
        train_data = norm_train_df
    # cdist expects two 2-D arrays, so the query row is reshaped to (1, n_features).
    query = np.asarray(new_listing_value[feature_columns], dtype=float).reshape(1, -1)
    # Distances stay in a local array so the shared training frame is never
    # mutated by a prediction.
    gaps = distance.cdist(train_data[feature_columns].to_numpy(dtype=float), query).ravel()
    # Stable sort: ties keep their row order.
    nearest = np.argsort(gaps, kind='stable')[:k]
    return train_data['price'].iloc[nearest].mean()

# Predict every test listing from two features and report the RMSE.
cols = ['accommodates', 'bathrooms']
norm_test_df['predicted_price'] = norm_test_df[cols].apply(
    predict_price_multivariate, axis=1, feature_columns=cols
)
norm_test_df['squared_error'] = norm_test_df['predicted_price'].sub(norm_test_df['price']).pow(2)
mse = norm_test_df['squared_error'].mean()
rmse = mse ** 0.5
print(rmse)

使用Sklearn来完成KNN

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# Fit scikit-learn's k-nearest-neighbours regressor (default settings) on two
# features, then score it on the held-out rows.
cols = ['accommodates', 'bedrooms']
knn = KNeighborsRegressor().fit(norm_train_df[cols], norm_train_df['price'])
two_features_predictions = knn.predict(norm_test_df[cols])

two_features_mse = mean_squared_error(
    y_true=norm_test_df['price'], y_pred=two_features_predictions
)
two_features_rmse = two_features_mse ** 0.5
print(two_features_rmse)

加入更多的特征

# Repeat the experiment with a wider set of feature columns.
# NOTE(review): the `four_features_*` names are kept as-is although seven
# columns are used here.
cols = [
    'accommodates',
    'bedrooms',
    'bathrooms',
    'beds',
    'minimum_nights',
    'maximum_nights',
    'number_of_reviews',
]

knn = KNeighborsRegressor().fit(norm_train_df[cols], norm_train_df['price'])
four_features_predictions = knn.predict(norm_test_df[cols])
four_features_mse = mean_squared_error(
    y_true=norm_test_df['price'], y_pred=four_features_predictions
)
four_features_rmse = four_features_mse ** 0.5
four_features_rmse

sample() 可以用来进行洗牌

DataFrame.sample(n=None, # 要抽取的行数
				 frac=None, # 抽取行的比例
				 replace=False,# 是否为有放回抽样,True:有放回抽样,False:无放回抽样
 				 weights=None, # 字符索引或概率数组
							   # axis=0:为行字符索引或概率数组
							   # axis=1:为列字符索引或概率数组
 				 random_state=None, # 随机数发生器种子
 				 axis=None)  # 选择抽取数据的行还是列
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值