import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import sklearn as sk
% matplotlib inline
import datetime
import os
import seaborn as sns
from datetime import date
from sklearn. preprocessing import LabelEncoder
from sklearn. preprocessing import StandardScaler
from sklearn. preprocessing import LabelBinarizer
import pickle
import seaborn as sns
from sklearn. metrics import *
from sklearn. model_selection import *
# Load the training and test user tables from the local airbnb/ directory.
train = pd.read_csv("airbnb/train_users_2.csv")
test = pd.read_csv("airbnb/test_users.csv")

# Show the column index of each table so the two schemas can be compared.
print('the columns name of training dataset:\n', train.columns)
print('the columns name of test dataset:\n', test.columns)
the columns name of training dataset:
Index(['id', 'date_account_created', 'timestamp_first_active',
'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow',
'language', 'affiliate_channel', 'affiliate_provider',
'first_affiliate_tracked', 'signup_app', 'first_device_type',
'first_browser', 'country_destination'],
dtype='object')
the columns name of test dataset:
Index(['id', 'date_account_created', 'timestamp_first_active',
'date_first_booking', 'gender', 'age', 'signup_method', 'signup_flow',
'language', 'affiliate_channel', 'affiliate_provider',
'first_affiliate_tracked', 'signup_app', 'first_device_type',
'first_browser'],
dtype='object')
# DataFrame.info() writes its summary to stdout itself and returns None,
# so the surrounding print() also emits a trailing "None" line.
print(train.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 213451 entries, 0 to 213450
Data columns (total 16 columns):
id 213451 non-null object
date_account_created 213451 non-null object
timestamp_first_active 213451 non-null int64
date_first_booking 88908 non-null object
gender 213451 non-null object
age 125461 non-null float64
signup_method 213451 non-null object
signup_flow 213451 non-null int64
language 213451 non-null object
affiliate_channel 213451 non-null object
affiliate_provider 213451 non-null object
first_affiliate_tracked 207386 non-null object
signup_app 213451 non-null object
first_device_type 213451 non-null object
first_browser 213451 non-null object
country_destination 213451 non-null object
dtypes: float64(1), int64(2), object(13)
memory usage: 26.1+ MB
None
特征分析:
# Peek at the first rows of the account-creation date column.
print(train["date_account_created"].head())
0 2010-06-28
1 2011-05-25
2 2010-09-28
3 2011-12-05
4 2010-09-14
Name: date_account_created, dtype: object
# Accounts created per date; show the busiest dates and then the quietest.
dac_counts = train["date_account_created"].value_counts()
print(dac_counts.head())
print(dac_counts.tail())
2014-05-13 674
2014-06-24 670
2014-06-25 636
2014-05-20 632
2014-05-14 622
Name: date_account_created, dtype: int64
2010-04-24 1
2010-03-09 1
2010-01-01 1
2010-06-18 1
2010-01-02 1
Name: date_account_created, dtype: int64
# Summary statistics (count / unique / top / freq) of the creation dates.
print(train["date_account_created"].describe())
count 213451
unique 1634
top 2014-05-13
freq 674
Name: date_account_created, dtype: object
# Accounts created per calendar date in each dataset.
dac_train = train.date_account_created.value_counts()
dac_test = test.date_account_created.value_counts()

# Parse the date labels of those counts into datetime indexes.
dac_train_date = pd.to_datetime(dac_train.index)
dac_test_date = pd.to_datetime(dac_test.index)

# Express every date as an offset from the earliest *training* date, so both
# datasets are measured from the same origin on the x axis.
dac_origin = dac_train_date.min()
dac_train_day = dac_train_date - dac_origin
dac_test_day = dac_test_date - dac_origin

# One point per date: x = days since the origin, y = accounts created.
plt.scatter(dac_train_day.days, dac_train.values, color='r', label='train dataset')
plt.scatter(dac_test_day.days, dac_test.values, color='b', label='test dataset')

plt.title("Accounts created vs day")
plt.xlabel("Days")
plt.ylabel("Accounts created")
plt.legend(loc='upper left')
# Peek at the raw first-activity stamps (integers shaped like YYYYMMDDHHMMSS).
print(train["timestamp_first_active"].head())
0 20090319043255
1 20090523174809
2 20090609231247
3 20091031060129
4 20091208061105
Name: timestamp_first_active, dtype: int64
# Distinct occurrence counts of the stamps; a lone 1 means no stamp repeats.
print(train["timestamp_first_active"].value_counts().unique())
[1]
# Convert the YYYYMMDDHHMMSS integer stamps to datetimes in one vectorised
# call rather than slicing every value apart inside a Python-level lambda.
tfa_train_dt = pd.to_datetime(
    train.timestamp_first_active.astype(str),
    format="%Y%m%d%H%M%S",
)
print(tfa_train_dt.describe())
count 213451
unique 213451
top 2013-07-01 05:26:34
freq 1
first 2009-03-19 04:32:55
last 2014-06-30 23:58:24
Name: timestamp_first_active, dtype: object
# Compare the booking-date column across datasets: the training summary has
# real dates, while the test summary reports a count of 0.
print(train["date_first_booking"].describe())
print(test["date_first_booking"].describe())
count 88908
unique 1976
top 2014-05-22
freq 248
Name: date_first_booking, dtype: object
count 0.0
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN
Name: date_first_booking, dtype: float64
# Most common ages in the training data.
print(train["age"].value_counts().head())
30.0 6124
31.0 6016
29.0 5963
28.0 5939
32.0 5855
Name: age, dtype: int64
def _age_bucket_counts(df):
    """Return counts of missing, < 15, 15-90 (inclusive) and > 90 ages in df."""
    age = df["age"]
    return [
        int(age.isnull().sum()),
        int((age < 15).sum()),
        int(age.between(15, 90).sum()),  # between() includes both endpoints
        int((age > 90).sum()),
    ]


# The same four buckets for both datasets, in the order of `columns` below.
age_train = _age_bucket_counts(train)
age_test = _age_bucket_counts(test)
columns = ['Null', 'age < 15', 'age', 'age > 90']

# Side-by-side panels with shared axes so the two datasets are comparable.
fig, (ax1, ax2) = plt.subplots(1, 2, sharex=True, sharey=True, figsize=(10, 5))

# Pass x/y by keyword: current seaborn releases reject positional data vectors.
sns.barplot(x=columns, y=age_train, ax=ax1)
sns.barplot(x=columns, y=age_test, ax=ax2)

ax1.set_title('training dataset')
ax2.set_title('test dataset')
ax1.set_ylabel('counts')
def feature_barplot(feature, df_train=train, df_test=test, figsize=(10, 5), rot=90, saveimg=False):
    """Plot the value counts of one column for two frames side by side.

    Args:
        feature: Name of the column to count in both frames.
        df_train: Frame drawn in the left panel.
        df_test: Frame drawn in the right panel.
        figsize: Figure size handed to ``plt.subplots``.
        rot: Rotation, in degrees, of the x tick labels.
        saveimg: When truthy, also write the figure to ``<feature>.png``
            at 75 dpi in the current working directory.
    """
    feat_train = df_train[feature].value_counts()
    feat_test = df_test[feature].value_counts()

    # The panels share one x axis, so both must be drawn over the same
    # category list; otherwise the right-hand bars can end up under labels
    # that belong to the left-hand ordering. Categories missing from one
    # frame are drawn with a count of 0.
    seen = set(feat_train.index)
    categories = list(feat_train.index) + [c for c in feat_test.index if c not in seen]
    train_counts = feat_train.reindex(categories, fill_value=0)
    test_counts = feat_test.reindex(categories, fill_value=0)

    fig_feature, (axis1, axis2) = plt.subplots(1, 2, sharex=True, sharey=True, figsize=figsize)

    # Pass x/y by keyword: current seaborn releases reject positional data vectors.
    sns.barplot(x=categories, y=train_counts.values, ax=axis1)
    sns.barplot(x=categories, y=test_counts.values, ax=axis2)

    # Rotate each panel's own tick labels instead of copying one panel's
    # label objects onto the other.
    for axis in (axis1, axis2):
        axis.tick_params(axis="x", labelrotation=rot)

    axis1.set_title(feature + ' of training dataset')
    axis2.set_title(feature + ' of test dataset')
    axis1.set_ylabel('Counts')
    plt.tight_layout()

    if saveimg:
        figname = feature + ".png"
        fig_feature.savefig(figname, dpi=75)
# Draw the train/test comparison for every categorical feature; only the
# gender figure is additionally written to disk.
feature_barplot('gender', saveimg=True)
for categorical_feature in (
    'signup_method',
    'signup_flow',
    'language',
    'affiliate_channel',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
):
    feature_barplot(categorical_feature)
# Load the session log and preview its first ten rows.
df_sessions = pd.read_csv('airbnb/sessions.csv')
df_sessions.head(10)
user_id
action
action_type
action_detail
device_type
secs_elapsed
0
d1mm9tcy42
lookup
NaN
NaN
Windows Desktop
319.0
1
d1mm9tcy42
search_results
click
view_search_results
Windows Desktop
67753.0
2
d1mm9tcy42
lookup
NaN
NaN
Windows Desktop
301.0
3
d1mm9tcy42
search_results
click
view_search_results
Windows Desktop
22141.0
4
d1mm9tcy42
lookup
NaN
NaN
Windows Desktop
435.0
5
d1mm9tcy42
search_results
click
view_search_results
Windows Desktop
7703.0
6
d1mm9tcy42
lookup
NaN
NaN
Windows Desktop
115.0
7
d1mm9tcy42
personalize
data
wishlist_content_update
Windows Desktop
831.0
8
d1mm9tcy42
index
view
view_search_results
Windows Desktop
20842.0
9
d1mm9tcy42
lookup
NaN
NaN
Windows Desktop
683.0
# Expose the user key under the name `id` (the key column name used by the
# user tables) and remove the old `user_id` column. The new column is
# appended as the last column, so the column count is unchanged.
df_sessions = df_sessions.assign(id=df_sessions['user_id']).drop('user_id', axis='columns')
df_sessions.shape
(10567737, 6)
# Number of missing values in each session column.
df_sessions.isnull().sum()
<