飞桨KNN模型练习-CSDN博客

本文链接：https://blog.csdn.net/WuWuWu_bug/article/details/135873070

本文介绍了在Facebook举办的一场比赛中，通过使用KNN算法和特征工程对用户签到位置进行预测的过程，包括数据预处理、特征选择、模型训练及交叉验证。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

1.项目描述：

本次比赛的目的是预测一个人将要签到的地方。为了本次比赛，Facebook 创建了一个虚拟世界，其范围为 10 公里 × 10 公里（共 100 平方公里），其中约有 10 万个地点。对于给定的坐标集，您的任务是：根据用户的位置、定位精度和时间戳等信息，预测用户下一次的签到位置。这些数据被构造成类似于来自移动设备的位置数据。

2.代码展示：

import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier


# The Paddle environment does not ship a Chinese font, so the default font
# configuration is overridden to use SimHei.
# `rcParams` lives on the top-level package, so it must be imported by name;
# importing only `matplotlib.font_manager` does not bind `matplotlib`.
import matplotlib
from matplotlib.font_manager import FontProperties

# Font object built from the TTF file under work/ (14 pt).
# NOTE(review): presumably meant to be passed to plotting calls via
# `fontproperties=`; it is not used in the visible part of the script.
myfont = FontProperties(fname=r"work/simhei.ttf", size=14)
matplotlib.rcParams['font.sans-serif'] = ['simhei']
matplotlib.rcParams['font.family'] = 'sans-serif'
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
matplotlib.rcParams['axes.unicode_minus'] = False


# Load the check-in training data.
facebook = pd.read_csv("../data/FacebookLocation/train.csv")
print(facebook.head())
"""
   row_id       x       y  accuracy    time    place_id
0       0  0.7941  9.0809        54  470702  8523065625
1       1  5.9567  4.7968        13  186555  1757726713
2       2  8.3078  7.0407        74  322648  1137537235
3       3  7.3665  2.5165        65  704587  6567393236
4       4  4.0961  1.1307        31  472130  7440663949
"""

# This is only an exercise, so the data set is shrunk to simplify training.
# First, restrict the spatial range to the square 2.0 < x < 2.5, 2.0 < y < 2.5.
# The query runs on `facebook`, the frame loaded above. `.copy()` makes the
# subset an independent frame, so the columns added to it later do not write
# into a slice of `facebook`.
partial_data = facebook.query("x>2.0 & x<2.5 & y>2.0 & y<2.5").copy()

# Derive time-based features.
# Convert the integer `time` column to datetimes; unit="s" treats the values
# as seconds, so they land in early January 1970.
time = pd.to_datetime(partial_data["time"], unit="s")
print(time.head())
"""
163    1970-01-08 18:02:17
310    1970-01-03 17:11:59
658    1970-01-06 19:32:23
1368   1970-01-04 16:50:22
1627   1970-01-07 21:18:04
Name: time, dtype: datetime64[ns]
"""
# Wrap the converted series in a one-column DataFrame named `time`.
time = pd.DataFrame(time)
# Extract day of month, hour and weekday through the `.dt` accessor
# (the datetime counterpart of the `.str` accessor).
# `assign` returns a new frame instead of writing into `partial_data` in
# place, which avoids pandas' SettingWithCopyWarning when `partial_data` is
# itself a subset of another frame.
partial_data = partial_data.assign(
    day=time['time'].dt.day,
    hour=time['time'].dt.hour,
    weekday=time['time'].dt.weekday,
)
print(partial_data.head())
"""
      row_id       x       y  accuracy    time    place_id  day  hour  weekday
163      163  2.1663  2.3755        84  669737  3869813743    8    18        3
310      310  2.3695  2.2034         3  234719  2636621520    3    17        5
658      658  2.3236  2.1768        66  502343  7877745055    6    19        1
1368    1368  2.2613  2.3392        73  319822  9775192577    4    16        6
1627    1627  2.3331  2.0011        66  595084  6731326909    7    21        2
"""

# Drop places that have only a few check-ins.
# Group the rows by place_id and count the entries in every other column;
# the result is indexed by place_id.
place_count = partial_data.groupby(by="place_id").count()
print(place_count.head())
"""
            row_id    x    y  accuracy  time  day  hour  weekday
place_id                                                        
1006234733       1    1    1         1     1    1     1        1
1008823061       4    4    4         4     4    4     4        4
1012580558       3    3    3         3     3    3     3        3
1025585791      21   21   21        21    21   21    21       21
1026507711     220  220  220       220   220  220   220      220
"""
# Keep only the places whose row count is greater than 3.
has_enough_rows = place_count["row_id"] > 3
place_count = place_count.loc[has_enough_rows]
print(place_count["row_id"] > 3)
"""
place_id
1006234733    False
1008823061     True
1012580558    False
1025585791     True
1026507711     True

Name: row_id, Length: 2524, dtype: bool
"""
# `isin` marks each check-in whose place_id survived the filter above.
in_kept_places = partial_data["place_id"].isin(place_count.index)
print(in_kept_places)
"""
163         True
310         True
658         True
1368        True
1627        True

Name: place_id, Length: 71664, dtype: bool
"""
# Keep only the check-ins that belong to one of the retained places.
partial_data = partial_data.loc[in_kept_places]
print(partial_data.head())
"""
      row_id       x       y  accuracy    time    place_id  day  hour  weekday
163      163  2.1663  2.3755        84  669737  3869813743    8    18        3
310      310  2.3695  2.2034         3  234719  2636621520    3    17        5
658      658  2.3236  2.1768        66  502343  7877745055    6    19        1
1368    1368  2.2613  2.3392        73  319822  9775192577    4    16        6
1627    1627  2.3331  2.0011        66  595084  6731326909    7    21        2
"""


## Choose the feature columns and the target column.
feature_columns = ['x', 'y', "accuracy", "day", "hour", "weekday"]
x = partial_data.loc[:, feature_columns]  # a list of labels selects a DataFrame
y = partial_data.loc[:, "place_id"]

# Split into training and test sets: 25% is held out for testing, and
# random_state=2 seeds the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=2
)

# Feature engineering: standardization.
transfer = StandardScaler()  # instantiate the transformer
# Learn the per-feature mean and standard deviation from the training set only.
x_train = transfer.fit_transform(x_train)
# Reuse the training statistics on the test set. Re-fitting here would scale
# the test data with its own statistics, putting it on a different scale from
# the data the model is trained on.
x_test = transfer.transform(x_test)

# Machine learning: KNN + cross-validated grid search.
# Instantiate the base classifier.
estimator = KNeighborsClassifier()

# Cross-validation with grid search.
param_grid = {"n_neighbors": [3, 5, 7, 9]}  # candidate values of k
# With cv=3 (3-fold cross-validation) and 4 candidate values of n_neighbors,
# the search runs 12 fits (plus a final refit of the best model on the whole
# training set, which is GridSearchCV's default).
# n_jobs is the number of CPUs to use; -1 means all of them.
# The search object must be kept and fitted: best_score_, best_estimator_ and
# cv_results_ exist only on a fitted GridSearchCV, not on the bare classifier.
estimator = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=3, n_jobs=-1)

# Train the model (runs the whole grid search).
estimator.fit(x_train, y_train)

# Model evaluation.
# Accuracy of the best model on the held-out test set.
score_ret = estimator.score(x_test, y_test)
print("准确率：\n", score_ret)

# Predictions on the test set.
y_pre = estimator.predict(x_test)
print("预测值是：\n", y_pre)
# Best mean cross-validation score, the model that achieved it, and the full
# per-candidate results.
print("最好的准确率为：\n", estimator.best_score_)
print("最好的模型参数为：\n", estimator.best_estimator_)
print("所有的结果为：\n", estimator.cv_results_)