数据导入
import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import seaborn as sns
% matplotlib inline
import os
# Show which files are present in the input directory, then load the
# training set and preview its first rows.
print(os.listdir("./input"))
train = pd.read_csv("./input/train.csv")
train.head()
['test.csv', 'train.csv', 'gender_submission.csv']
---------------------------------------------------------------------------
PermissionError Traceback (most recent call last)
<ipython-input-4-6eab5534ffe4> in <module>
7 import os
8 print(os.listdir("./input"))
----> 9 train = pd.read_csv("./input/train.csv")
10 train.head()
~/anaconda3/lib/python3.7/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
312
313 return wrapper
~/anaconda3/lib/python3.7/site-packages/pandas/io/parsers/readers.py in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
584 kwds.update(kwds_defaults)
585
--> 586 return _read(filepath_or_buffer, kwds)
587
588
~/anaconda3/lib/python3.7/site-packages/pandas/io/parsers/readers.py in _read(filepath_or_buffer, kwds)
480
481 # Create the parser.
--> 482 parser = TextFileReader(filepath_or_buffer, **kwds)
483
484 if chunksize or iterator:
~/anaconda3/lib/python3.7/site-packages/pandas/io/parsers/readers.py in __init__(self, f, engine, **kwds)
809 self.options["has_index_names"] = kwds["has_index_names"]
810
--> 811 self._engine = self._make_engine(self.engine)
812
813 def close(self):
~/anaconda3/lib/python3.7/site-packages/pandas/io/parsers/readers.py in _make_engine(self, engine)
1038 )
1039 # error: Too many arguments for "ParserBase"
-> 1040 return mapping[engine](self.f, **self.options) # type: ignore[call-arg]
1041
1042 def _failover_to_python(self):
~/anaconda3/lib/python3.7/site-packages/pandas/io/parsers/c_parser_wrapper.py in __init__(self, src, **kwds)
49
50 # open handles
---> 51 self._open_handles(src, kwds)
52 assert self.handles is not None
53
~/anaconda3/lib/python3.7/site-packages/pandas/io/parsers/base_parser.py in _open_handles(self, src, kwds)
227 memory_map=kwds.get("memory_map", False),
228 storage_options=kwds.get("storage_options", None),
--> 229 errors=kwds.get("encoding_errors", "strict"),
230 )
231
~/anaconda3/lib/python3.7/site-packages/pandas/io/common.py in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
704 encoding=ioargs.encoding,
705 errors=errors,
--> 706 newline="",
707 )
708 else:
PermissionError: [Errno 13] Permission denied: './input/train.csv'
# Load the test set and preview its first rows.
test = pd.read_csv("./input/test.csv")
test.head()
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q 1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S 2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q 3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S 4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
# Print column dtypes and non-null counts of the training set.
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
合并 train 和 test
# Stack train and test so that the same preprocessing is applied to both.
# NOTE: the name `all` shadows the Python builtin; it is kept because the
# rest of the notebook refers to the combined frame by this name.
all = pd.concat([train, test], sort=False)

# Fill missing numeric values with the median of the combined data.
for numeric_col in ('Age', 'Fare'):
    all[numeric_col] = all[numeric_col].fillna(value=all[numeric_col].median())

all.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 1309 non-null int64
1 Survived 891 non-null float64
2 Pclass 1309 non-null int64
3 Name 1309 non-null object
4 Sex 1309 non-null object
5 Age 1309 non-null float64
6 SibSp 1309 non-null int64
7 Parch 1309 non-null int64
8 Ticket 1309 non-null object
9 Fare 1309 non-null float64
10 Cabin 295 non-null object
11 Embarked 1307 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB
# Count plot of the embarkation port over the combined data.
sns.catplot(data=all, x='Embarked', kind='count')
<seaborn.axisgrid.FacetGrid at 0x127f25d60>
![Embarked 计数图](titanic-adaboost_files/titanic-adaboost_6_1.png)
# Fill the missing embarkation ports with 'S'
# (presumably the most frequent port in the count plot above - confirm).
embarked_filled = all['Embarked'].fillna('S')
all['Embarked'] = embarked_filled
all.info()
all.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 1309 non-null int64
1 Survived 891 non-null float64
2 Pclass 1309 non-null int64
3 Name 1309 non-null object
4 Sex 1309 non-null object
5 Age 1309 non-null float64
6 SibSp 1309 non-null int64
7 Parch 1309 non-null int64
8 Ticket 1309 non-null object
9 Fare 1309 non-null float64
10 Cabin 295 non-null object
11 Embarked 1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 165.2+ KB
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked 0 1 0.0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 1 2 1.0 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 2 3 1.0 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 3 4 1.0 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 4 5 0.0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# Replace the raw age with an ordinal bin code:
#   (-inf, 16] -> 0, (16, 32] -> 1, (32, 48] -> 2, (48, 64] -> 3, (64, inf) -> 4
# NOTE: this cell is not idempotent. Re-running it on already-binned values
# sends every row to bin 0, because all bin codes are <= 16.
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
all['Age'] = pd.cut(all['Age'], bins=age_edges, labels=False).astype(float)
all.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Title 0 1 0.0 3 Braund, Mr. Owen Harris male 0.0 1 0 A/5 21171 7.2500 M S Mr. 1 2 1.0 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 0.0 1 0 PC 17599 71.2833 C C Mrs. 2 3 1.0 3 Heikkinen, Miss. Laina female 0.0 0 0 STON/O2. 3101282 7.9250 M S Miss. 3 4 1.0 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 0.0 1 0 113803 53.1000 C S Mrs. 4 5 0.0 3 Allen, Mr. William Henry male 0.0 0 0 373450 8.0500 M S Mr.
import re
# A space, then a run of ASCII letters ending in a period (e.g. " Mr.").
# Written as a raw string so that `\.` is a regex escape and not an invalid
# string-literal escape sequence.
_TITLE_PATTERN = re.compile(r' ([A-Za-z]+\.)')


def get_title(name):
    """Return the first title-like token found in *name*.

    A title is a run of ASCII letters preceded by a space and followed by a
    period; the returned value includes the trailing period (e.g. ``"Miss."``).

    Args:
        name: The full passenger name, e.g. ``"Heikkinen, Miss. Laina"``.

    Returns:
        The matched title, or ``""`` when the name contains no such token.
    """
    title_search = _TITLE_PATTERN.search(name)
    if title_search:
        return title_search.group(1)
    return ""
# Quick sanity check of the title extraction on a sample name.
sample_title = get_title('Heikkinen, Miss. Laina')
print(sample_title)
Miss.
# Derive a 'Title' column by applying get_title to every passenger name.
all['Title'] = all['Name'].map(get_title)
all['Title'].value_counts()
all.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Title 0 1 0.0 3 Braund, Mr. Owen Harris male 1.0 1 0 A/5 21171 7.2500 NaN S Mr. 1 2 1.0 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 2.0 1 0 PC 17599 71.2833 C85 C Mrs. 2 3 1.0 3 Heikkinen, Miss. Laina female 1.0 0 0 STON/O2. 3101282 7.9250 NaN S Miss. 3 4 1.0 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 2.0 1 0 113803 53.1000 C123 S Mrs. 4 5 0.0 3 Allen, Mr. William Henry male 2.0 0 0 373450 8.0500 NaN S Mr.
# Collapse rare or equivalent titles into a few groups.
# 'Col.' is grouped with the other military ranks ('Capt.', 'Major.');
# previously it was left out and survived as its own 4-row category.
title_groups = {
    'Officer.': ['Capt.', 'Col.', 'Dr.', 'Major.', 'Rev.'],
    'Royal.': ['Lady.', 'Countess.', 'Don.', 'Sir.', 'Jonkheer.', 'Dona.'],
    'Miss.': ['Mlle.', 'Ms.'],
    'Mrs.': ['Mme.'],
}
for grouped_title, raw_titles in title_groups.items():
    all['Title'] = all['Title'].replace(raw_titles, grouped_title)
all['Title'].value_counts()
Mr. 757
Miss. 264
Mrs. 198
Master. 61
Officer. 19
Royal. 6
Col. 4
Name: Title, dtype: int64
# Reduce 'Cabin' to its first character; missing cabins are filled with
# 'Missing' first and therefore become 'M'.
all['Cabin'] = all['Cabin'].fillna('Missing').str[0]
all['Cabin'].value_counts()
all.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Title 0 1 0.0 3 Braund, Mr. Owen Harris male 1.0 1 0 A/5 21171 7.2500 M S Mr. 1 2 1.0 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 2.0 1 0 PC 17599 71.2833 C C Mrs. 2 3 1.0 3 Heikkinen, Miss. Laina female 1.0 0 0 STON/O2. 3101282 7.9250 M S Miss. 3 4 1.0 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 2.0 1 0 113803 53.1000 C S Mrs. 4 5 0.0 3 Allen, Mr. William Henry male 2.0 0 0 373450 8.0500 M S Mr.
# Family size = siblings/spouses + parents/children + the passenger.
all['Family_Size'] = all['SibSp'] + all['Parch'] + 1
# IsAlone is 1 when the passenger is the only family member, else 0.
all['IsAlone'] = (all['Family_Size'] == 1).astype('int64')
all.head()
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Title Family_Size IsAlone 0 1 0.0 3 Braund, Mr. Owen Harris male 1.0 1 0 A/5 21171 7.2500 M S Mr. 2 0 1 2 1.0 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 2.0 1 0 PC 17599 71.2833 C C Mrs. 2 0 2 3 1.0 3 Heikkinen, Miss. Laina female 1.0 0 0 STON/O2. 3101282 7.9250 M S Miss. 1 1 3 4 1.0 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 2.0 1 0 113803 53.1000 C S Mrs. 2 0 4 5 0.0 3 Allen, Mr. William Henry male 2.0 0 0 373450 8.0500 M S Mr. 1 1
# Drop the 'Name' and 'Ticket' columns before encoding.
all_1 = all.drop(columns=['Name', 'Ticket'])
all_1.head()
PassengerId Survived Pclass Sex Age SibSp Parch Fare Cabin Embarked Title 0 1 0.0 3 male 1.0 1 0 7.2500 M S Mr. 1 2 1.0 1 female 2.0 1 0 71.2833 C C Mrs. 2 3 1.0 3 female 1.0 0 0 7.9250 M S Miss. 3 4 1.0 1 female 2.0 1 0 53.1000 C S Mrs. 4 5 0.0 3 male 2.0 0 0 8.0500 M S Mr.
# One-hot encode the categorical columns; drop_first removes the first
# level of each one.
all_dummies = pd.get_dummies(data=all_1, drop_first=True)
all_dummies.head()
PassengerId Survived Pclass Age SibSp Parch Fare Sex_male Cabin_B Cabin_C ... Cabin_M Cabin_T Embarked_Q Embarked_S Title_Master. Title_Miss. Title_Mr. Title_Mrs. Title_Officer. Title_Royal. 0 1 0.0 3 1.0 1 0 7.2500 1 0 0 ... 1 0 0 1 0 0 1 0 0 0 1 2 1.0 1 2.0 1 0 71.2833 0 0 1 ... 0 0 0 0 0 0 0 1 0 0 2 3 1.0 3 1.0 0 0 7.9250 0 0 0 ... 1 0 0 1 0 1 0 0 0 0 3 4 1.0 1 2.0 1 0 53.1000 0 0 1 ... 0 0 0 1 0 0 0 1 0 0 4 5 0.0 3 2.0 0 0 8.0500 1 0 0 ... 1 0 0 1 0 0 1 0 0 0
5 rows × 24 columns
# Rows with a 'Survived' value form the training data; rows where it is
# missing form the data to predict.
has_label = all_dummies['Survived'].notna()
all_train = all_dummies[has_label]
all_test = all_dummies[~has_label]
all_test.head()
PassengerId Survived Pclass Age SibSp Parch Fare Sex_male Cabin_B Cabin_C ... Cabin_M Cabin_T Embarked_Q Embarked_S Title_Master. Title_Miss. Title_Mr. Title_Mrs. Title_Officer. Title_Royal. 0 892 NaN 3 2.0 0 0 7.8292 1 0 0 ... 1 0 1 0 0 0 1 0 0 0 1 893 NaN 3 2.0 1 0 7.0000 0 0 0 ... 1 0 0 1 0 0 0 1 0 0 2 894 NaN 2 3.0 0 0 9.6875 1 0 0 ... 1 0 1 0 0 0 1 0 0 0 3 895 NaN 3 1.0 0 0 8.6625 1 0 0 ... 1 0 0 1 0 0 1 0 0 0 4 896 NaN 3 1.0 1 1 12.2875 0 0 0 ... 1 0 0 1 0 0 0 1 0 0
5 rows × 24 columns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

# Hold out 30% of the labelled rows, stratified on the target.
features = all_train.drop(['PassengerId', 'Survived'], axis=1)
target = all_train['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.30,
    random_state=101,
    stratify=target,
)

# AdaBoost over decision trees with default settings.
# NOTE(review): the recorded run shows train accuracy 0.961 against test
# accuracy 0.769, so the model looks overfit; limiting the tree depth may
# help - confirm by experiment.
ada = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=100, random_state=0)
ada.fit(X_train, y_train)
predictions = ada.predict(X_test)

print(classification_report(y_test, predictions))
# The format spec must be exactly ".3f": a trailing space inside the braces
# raises "ValueError: Invalid format specifier".
print(f'Train Accuracy - : {ada.score(X_train, y_train):.3f}')
print(f'Test Accuracy - : {ada.score(X_test, y_test):.3f}')
precision recall f1-score support
0.0 0.80 0.83 0.82 165
1.0 0.71 0.67 0.69 103
accuracy 0.77 268
macro avg 0.76 0.75 0.75 268
weighted avg 0.77 0.77 0.77 268
Train Accuracy - : 0.961
Test Accuracy - : 0.769
最终预测
# Predict on the unlabelled rows and assemble the submission table.
PassengerId = all_test['PassengerId']
TestForPred = all_test.drop(columns=['PassengerId', 'Survived'])
t_pred = ada.predict(TestForPred).astype(int)
adaSub = pd.DataFrame({'PassengerId': PassengerId, 'Survived': t_pred})
adaSub.head()
PassengerId Survived 0 892 0 1 893 0 2 894 1 3 895 0 4 896 0
# Write the submission file without the DataFrame index column.
adaSub.to_csv("1_Ada_Submission.csv", index=False)