【kaggle】Airbnb新用户的民宿预订结果预测

本文通过分析Airbnb用户数据,提取特征并利用多种机器学习模型进行目的地预测,对比不同模型的NDCG评分,揭示用户行为与旅行选择的关系。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
%matplotlib inline
import datetime
import os
import seaborn as sns#数据可视化
from datetime import date
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
import pickle #用于存储模型
import seaborn as sns
from sklearn.metrics import *
from sklearn.model_selection import *
# Load the labelled training users and the test users from the local airbnb/ folder.
train = pd.read_csv("airbnb/train_users_2.csv")
test = pd.read_csv("airbnb/test_users.csv")
# Print the column index of each frame so the two schemas can be compared.
print('the columns name of training dataset:\n',train.columns)
print('the columns name of test dataset:\n',test.columns)
the columns name of training dataset:
 Index(['id', 'date_account_created', 'timestamp_first_active',
       'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow',
       'language', 'affiliate_channel', 'affiliate_provider',
       'first_affiliate_tracked', 'signup_app', 'first_device_type',
       'first_browser', 'country_destination'],
      dtype='object')
the columns name of test dataset:
 Index(['id', 'date_account_created', 'timestamp_first_active',
       'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow',
       'language', 'affiliate_channel', 'affiliate_provider',
       'first_affiliate_tracked', 'signup_app', 'first_device_type',
       'first_browser'],
      dtype='object')
# 分析:
# train文件比test文件多了特征-country_destination
# country_destination是需要预测的目标变量
# 数据探索时着重分析train文件,test文件类似
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213451 entries, 0 to 213450
Data columns (total 16 columns):
id                         213451 non-null object
date_account_created       213451 non-null object
timestamp_first_active     213451 non-null int64
date_first_booking         88908 non-null object
gender                     213451 non-null object
age                        125461 non-null float64
signup_method              213451 non-null object
signup_flow                213451 non-null int64
language                   213451 non-null object
affiliate_channel          213451 non-null object
affiliate_provider         213451 non-null object
first_affiliate_tracked    207386 non-null object
signup_app                 213451 non-null object
first_device_type          213451 non-null object
first_browser              213451 non-null object
country_destination        213451 non-null object
dtypes: float64(1), int64(2), object(13)
memory usage: 26.1+ MB
None
#分析:
# train文件包含213451行数据,16个特征
# 每个特征的数据类型和非空数值
# date_first_booking空值较多,在特征提取时可以考虑删除

特征分析:

#1.date_account_created
#查看前几行数据
print(train.date_account_created.head())
0    2010-06-28
1    2011-05-25
2    2010-09-28
3    2011-12-05
4    2010-09-14
Name: date_account_created, dtype: object
#对数据进行统计
print(train.date_account_created.value_counts().head())
print(train.date_account_created.value_counts().tail())
2014-05-13    674
2014-06-24    670
2014-06-25    636
2014-05-20    632
2014-05-14    622
Name: date_account_created, dtype: int64
2010-04-24    1
2010-03-09    1
2010-01-01    1
2010-06-18    1
2010-01-02    1
Name: date_account_created, dtype: int64
#获取信息
print(train.date_account_created.describe())
count         213451
unique          1634
top       2014-05-13
freq             674
Name: date_account_created, dtype: object
# Observe user growth: count how many accounts were created on each date.
dac_train = train.date_account_created.value_counts()
dac_test = test.date_account_created.value_counts()
# Convert the date strings (the index of each count series) to datetime.
dac_train_date = pd.to_datetime(dac_train.index)
dac_test_date = pd.to_datetime(dac_test.index)
# Day offsets from the earliest registration date in the training set;
# the test set is measured from that same origin.
first_signup = dac_train_date.min()
dac_train_day = dac_train_date - first_signup
dac_test_day = dac_test_date - first_signup

# Plot with matplotlib: x = days since the first registration,
# y = number of accounts created on that day.
plt.scatter(dac_train_day.days, dac_train.values, label='train dataset', color='r')
plt.scatter(dac_test_day.days, dac_test.values, label='test dataset', color='b')
plt.title("Accounts created vs day")
plt.xlabel("Days")
plt.ylabel("Accounts created")
plt.legend(loc='upper left')

在这里插入图片描述

# 分析:
# x轴:离首次注册时间相差的天数
# y轴:当天注册的用户数量
# 随着时间的增长,用户注册的数量在急剧上升
#2.timestamp_first_active
#查看头几行数据
print(train.timestamp_first_active.head())
0    20090319043255
1    20090523174809
2    20090609231247
3    20091031060129
4    20091208061105
Name: timestamp_first_active, dtype: int64
#对数据进行统计看非重复值的数量
print(train.timestamp_first_active.value_counts().unique())
[1]
#分析: 结果[1]表明timestamp_first_active没有重复数据
# Convert the YYYYMMDDHHMMSS integer timestamps to datetimes and summarise them.
# pd.to_datetime with an explicit format parses the whole column in one
# vectorised call instead of slicing every string inside a Python lambda.
tfa_train_dt = pd.to_datetime(train.timestamp_first_active.astype(str),
                              format='%Y%m%d%H%M%S')
print(tfa_train_dt.describe())
count                  213451
unique                 213451
top       2013-07-01 05:26:34
freq                        1
first     2009-03-19 04:32:55
last      2014-06-30 23:58:24
Name: timestamp_first_active, dtype: object
#3.date_first_booking
#获取数据信息
print(train.date_first_booking.describe())
print(test.date_first_booking.describe())
count          88908
unique          1976
top       2014-05-22
freq             248
Name: date_first_booking, dtype: object
count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: date_first_booking, dtype: float64
# 分析:
# train文件中date_first_booking有大量缺失值
# test文件中date_first_booking全是缺失值
# 可以删除特征date_first_booking
#4.age
#对数据进行统计
print(train.age.value_counts().head())
30.0    6124
31.0    6016
29.0    5963
28.0    5939
32.0    5855
Name: age, dtype: int64
#分析:用户年龄主要集中在30左右
# Bar charts of the age column in the train and test sets.
# Ages are split into four groups: missing values, too small (< 15),
# reasonable (15 to 90 inclusive) and too large (> 90).
def _age_group_counts(df):
    """Return [missing, age < 15, 15 <= age <= 90, age > 90] row counts of df.age."""
    age = df.age
    return [int(age.isnull().sum()),
            int((age < 15).sum()),
            int(age.between(15, 90).sum()),  # between() is inclusive on both ends
            int((age > 90).sum())]


age_train = _age_group_counts(train)
age_test = _age_group_counts(test)

columns = ['Null', 'age < 15', 'age', 'age > 90']

# Two side-by-side panels that share both axes.
fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(10, 5))

# seaborn >= 0.12 no longer accepts x/y as positional arguments, so pass
# them by keyword.
sns.barplot(x=columns, y=age_train, ax=ax1)
sns.barplot(x=columns, y=age_test, ax=ax2)

ax1.set_title('training dataset')
ax2.set_title('test dataset')
ax1.set_ylabel('counts')

在这里插入图片描述

#分析:异常年龄较少,且有一定数量的缺失值
#其他特征
#train文件中其他特征由于labels较少,我们可以在特征工程中直接进行one hot encoding即可
#统一使用柱状图进行统计
def feature_barplot(feature, df_train = train, df_test = test, figsize=(10,5), rot = 90, saveimg = False):
    """Plot per-category counts of one column for two datasets side by side.

    Args:
        feature: Name of the column whose values are counted.
        df_train: DataFrame drawn in the left panel (defaults to ``train``).
        df_test: DataFrame drawn in the right panel (defaults to ``test``).
        figsize: Figure size forwarded to ``plt.subplots``.
        rot: Rotation of the x tick labels, in degrees.
        saveimg: If true, also save the figure as ``<feature>.png`` at 75 dpi.
    """
    feat_train = df_train[feature].value_counts()
    feat_test = df_test[feature].value_counts()

    # The two panels share one x-axis, so both must draw their bars in the
    # same category order; otherwise a bar can end up under another
    # category's label. Use the training order, then append any category
    # that only occurs in the test data, filling absent categories with 0.
    categories = list(feat_train.index)
    categories += [c for c in feat_test.index if c not in feat_train.index]
    feat_train = feat_train.reindex(categories, fill_value=0)
    feat_test = feat_test.reindex(categories, fill_value=0)

    fig_feature, (axis1, axis2) = plt.subplots(1, 2, sharex=True, sharey=True, figsize=figsize)
    # seaborn >= 0.12 requires x/y to be passed by keyword.
    sns.barplot(x=categories, y=feat_train.values, order=categories, ax=axis1)
    sns.barplot(x=categories, y=feat_test.values, order=categories, ax=axis2)

    # Rotate the tick labels of each panel directly instead of copying
    # axis1's label objects onto axis2.
    for axis in (axis1, axis2):
        axis.tick_params(axis='x', labelrotation=rot)

    axis1.set_title(feature + ' of training dataset')
    axis2.set_title(feature + ' of test dataset')
    axis1.set_ylabel('Counts')
    plt.tight_layout()
    if saveimg:
        figname = feature + ".png"
        fig_feature.savefig(figname, dpi = 75)
#gender
feature_barplot('gender', saveimg = True)

在这里插入图片描述

#signup_method
feature_barplot('signup_method')

在这里插入图片描述

#signup_flow
feature_barplot('signup_flow')

在这里插入图片描述

#language
feature_barplot('language')

在这里插入图片描述

#affiliate_channel
feature_barplot('affiliate_channel')

在这里插入图片描述

#first_affiliate_tracked
feature_barplot('first_affiliate_tracked')

在这里插入图片描述

#signup_app
feature_barplot('signup_app')

在这里插入图片描述

#first_device_type
feature_barplot('first_device_type')

在这里插入图片描述

#first_browser
feature_barplot('first_browser')

在这里插入图片描述

##sessions文件
#获取数据并查看头10行数据
df_sessions = pd.read_csv('airbnb/sessions.csv')
df_sessions.head(10)
user_id action action_type action_detail device_type secs_elapsed
0 d1mm9tcy42 lookup NaN NaN Windows Desktop 319.0
1 d1mm9tcy42 search_results click view_search_results Windows Desktop 67753.0
2 d1mm9tcy42 lookup NaN NaN Windows Desktop 301.0
3 d1mm9tcy42 search_results click view_search_results Windows Desktop 22141.0
4 d1mm9tcy42 lookup NaN NaN Windows Desktop 435.0
5 d1mm9tcy42 search_results click view_search_results Windows Desktop 7703.0
6 d1mm9tcy42 lookup NaN NaN Windows Desktop 115.0
7 d1mm9tcy42 personalize data wishlist_content_update Windows Desktop 831.0
8 d1mm9tcy42 index view view_search_results Windows Desktop 20842.0
9 d1mm9tcy42 lookup NaN NaN Windows Desktop 683.0
# Rename user_id to id in preparation for the later data merge.
# The id column is appended as the last column, then the original user_id
# column is removed (this drops a column, not rows).
df_sessions = df_sessions.assign(id=df_sessions['user_id']).drop(columns=['user_id'])
df_sessions.shape
(10567737, 6)
# (10567737, 6)
# 分析:session文件有10567737行数据,6个特征
#查看缺失值
df_sessions.isnull().sum()
<
评论
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值