Machine Learning - AdaBoost

Data import

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
print(os.listdir("./input"))
train = pd.read_csv("./input/train.csv")
train.head()
['test.csv', 'train.csv', 'gender_submission.csv']
test = pd.read_csv("./input/test.csv")
test.head()
   PassengerId  Pclass                                          Name     Sex   Age  SibSp  Parch   Ticket     Fare Cabin Embarked
0          892       3                              Kelly, Mr. James    male  34.5      0      0   330911   7.8292   NaN        Q
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   363272   7.0000   NaN        S
2          894       2                     Myles, Mr. Thomas Francis    male  62.0      0      0   240276   9.6875   NaN        Q
3          895       3                              Wirz, Mr. Albert    male  27.0      0      0   315154   8.6625   NaN        S
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1  3101298  12.2875   NaN        S
train.info()
# test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB

Merge train and test

# Merge train and test so that feature engineering is applied to both consistently
all = pd.concat([train, test], sort = False)

# Replace missing values with the median
all['Age'] = all['Age'].fillna(value=all['Age'].median())
all['Fare'] = all['Fare'].fillna(value=all['Fare'].median())
all.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1309 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1309 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB
# Port of embarkation
sns.catplot(x = 'Embarked', kind = 'count', data = all) 
<seaborn.axisgrid.FacetGrid at 0x127f25d60>

[Figure: count plot of Embarked (titanic-adaboost_files/titanic-adaboost_6_1.png)]

# Fill missing embarkation-port values with 'S'
all['Embarked'] = all['Embarked'].fillna('S')
all.info()
all.head()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1309 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1309 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1309 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 165.2+ KB
   PassengerId  Survived  Pclass                                               Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0            1       0.0       3                            Braund, Mr. Owen Harris    male  22.0      1      0         A/5 21171   7.2500   NaN        S
1            2       1.0       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1      0          PC 17599  71.2833   C85        C
2            3       1.0       3                             Heikkinen, Miss. Laina  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3            4       1.0       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1      0            113803  53.1000  C123        S
4            5       0.0       3                           Allen, Mr. William Henry    male  35.0      0      0            373450   8.0500   NaN        S
# Bin Age into ordinal groups (0-4)
all.loc[ all['Age'] <= 16, 'Age'] = 0
all.loc[(all['Age'] > 16) & (all['Age'] <= 32), 'Age'] = 1
all.loc[(all['Age'] > 32) & (all['Age'] <= 48), 'Age'] = 2
all.loc[(all['Age'] > 48) & (all['Age'] <= 64), 'Age'] = 3
all.loc[ all['Age'] > 64, 'Age'] = 4 
all.head()
   PassengerId  Survived  Pclass                                               Name     Sex  Age  SibSp  Parch            Ticket     Fare Cabin Embarked
0            1       0.0       3                            Braund, Mr. Owen Harris    male  1.0      1      0         A/5 21171   7.2500   NaN        S
1            2       1.0       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  2.0      1      0          PC 17599  71.2833   C85        C
2            3       1.0       3                             Heikkinen, Miss. Laina  female  1.0      0      0  STON/O2. 3101282   7.9250   NaN        S
3            4       1.0       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  2.0      1      0            113803  53.1000  C123        S
4            5       0.0       3                           Allen, Mr. William Henry    male  2.0      0      0            373450   8.0500   NaN        S
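The five .loc assignments above can also be written more compactly with pd.cut. The sketch below is an illustrative alternative that would replace the manual assignments (not follow them); the bin edges mirror the same thresholds, and the variable name age_binned is hypothetical.

import numpy as np
import pandas as pd

# Equivalent binning: intervals (-inf, 16], (16, 32], (32, 48], (48, 64], (64, inf)
age_binned = pd.cut(all['Age'],
                    bins=[-np.inf, 16, 32, 48, 64, np.inf],
                    labels=[0, 1, 2, 3, 4]).astype(float)
# all['Age'] = age_binned  # would take the place of the .loc assignments above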
# Extract the passenger's title from the Name column
# e.g. Mrs., Mr., Miss.
import re
def get_title(name):
    title_search = re.search(r' ([A-Za-z]+\.)', name)
    
    if title_search:
        return title_search.group(1)
    return ""

# Example: extract the title from a name
print(get_title('Heikkinen, Miss. Laina'))
Miss.
all['Title'] = all['Name'].apply(get_title)
all['Title'].value_counts()
all.head()
   PassengerId  Survived  Pclass                                               Name     Sex  Age  SibSp  Parch            Ticket     Fare Cabin Embarked  Title
0            1       0.0       3                            Braund, Mr. Owen Harris    male  1.0      1      0         A/5 21171   7.2500   NaN        S    Mr.
1            2       1.0       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  2.0      1      0          PC 17599  71.2833   C85        C   Mrs.
2            3       1.0       3                             Heikkinen, Miss. Laina  female  1.0      0      0  STON/O2. 3101282   7.9250   NaN        S  Miss.
3            4       1.0       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  2.0      1      0            113803  53.1000  C123        S   Mrs.
4            5       0.0       3                           Allen, Mr. William Henry    male  2.0      0      0            373450   8.0500   NaN        S    Mr.
all['Title'] = all['Title'].replace(['Capt.', 'Dr.', 'Major.', 'Rev.'], 'Officer.')
all['Title'] = all['Title'].replace(['Lady.', 'Countess.', 'Don.', 'Sir.', 'Jonkheer.', 'Dona.'], 'Royal.')
all['Title'] = all['Title'].replace(['Mlle.', 'Ms.'], 'Miss.')
all['Title'] = all['Title'].replace(['Mme.'], 'Mrs.')
all['Title'].value_counts()
Mr.         757
Miss.       264
Mrs.        198
Master.      61
Officer.     19
Royal.        6
Col.          4
Name: Title, dtype: int64
# Cabin
all['Cabin'] = all['Cabin'].fillna('Missing') # fill missing cabins with 'Missing'
all['Cabin'] = all['Cabin'].str[0]
all['Cabin'].value_counts()
all.head()
   PassengerId  Survived  Pclass                                               Name     Sex  Age  SibSp  Parch            Ticket     Fare Cabin Embarked  Title
0            1       0.0       3                            Braund, Mr. Owen Harris    male  1.0      1      0         A/5 21171   7.2500     M        S    Mr.
1            2       1.0       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  2.0      1      0          PC 17599  71.2833     C        C   Mrs.
2            3       1.0       3                             Heikkinen, Miss. Laina  female  1.0      0      0  STON/O2. 3101282   7.9250     M        S  Miss.
3            4       1.0       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  2.0      1      0            113803  53.1000     C        S   Mrs.
4            5       0.0       3                           Allen, Mr. William Henry    male  2.0      0      0            373450   8.0500     M        S    Mr.
# Flag passengers travelling alone
all['Family_Size'] = all['SibSp'] + all['Parch'] + 1 # family size including the passenger
all['IsAlone'] = 0

# a family size of 1 means the passenger is travelling alone
all.loc[all['Family_Size']==1, 'IsAlone'] = 1
all.head()
   PassengerId  Survived  Pclass                                               Name     Sex  Age  SibSp  Parch            Ticket     Fare Cabin Embarked  Title  Family_Size  IsAlone
0            1       0.0       3                            Braund, Mr. Owen Harris    male  1.0      1      0         A/5 21171   7.2500     M        S    Mr.            2        0
1            2       1.0       1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  2.0      1      0          PC 17599  71.2833     C        C   Mrs.            2        0
2            3       1.0       3                             Heikkinen, Miss. Laina  female  1.0      0      0  STON/O2. 3101282   7.9250     M        S  Miss.            1        1
3            4       1.0       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  2.0      1      0            113803  53.1000     C        S   Mrs.            2        0
4            5       0.0       3                           Allen, Mr. William Henry    male  2.0      0      0            373450   8.0500     M        S    Mr.            1        1
# Drop columns that will not be used as features
all_1 = all.drop(['Name', 'Ticket'], axis = 1)
all_1.head()
   PassengerId  Survived  Pclass     Sex  Age  SibSp  Parch     Fare Cabin Embarked  Title
0            1       0.0       3    male  1.0      1      0   7.2500     M        S    Mr.
1            2       1.0       1  female  2.0      1      0  71.2833     C        C   Mrs.
2            3       1.0       3  female  1.0      0      0   7.9250     M        S  Miss.
3            4       1.0       1  female  2.0      1      0  53.1000     C        S   Mrs.
4            5       0.0       3    male  2.0      0      0   8.0500     M        S    Mr.
all_dummies = pd.get_dummies(all_1, drop_first = True)
all_dummies.head()
   PassengerId  Survived  Pclass  Age  SibSp  Parch     Fare  Sex_male  Cabin_B  Cabin_C  ...  Cabin_M  Cabin_T  Embarked_Q  Embarked_S  Title_Master.  Title_Miss.  Title_Mr.  Title_Mrs.  Title_Officer.  Title_Royal.
0            1       0.0       3  1.0      1      0   7.2500         1        0        0  ...        1        0           0           1              0            0          1           0               0             0
1            2       1.0       1  2.0      1      0  71.2833         0        0        1  ...        0        0           0           0              0            0          0           1               0             0
2            3       1.0       3  1.0      0      0   7.9250         0        0        0  ...        1        0           0           1              0            1          0           0               0             0
3            4       1.0       1  2.0      1      0  53.1000         0        0        1  ...        0        0           0           1              0            0          0           1               0             0
4            5       0.0       3  2.0      0      0   8.0500         1        0        0  ...        1        0           0           1              0            0          1           0               0             0

5 rows × 24 columns

# notna() detects existing / non-missing values; rows with a known Survived label form the training set
all_train = all_dummies[all_dummies['Survived'].notna()]
# all_train.info()

all_test = all_dummies[all_dummies['Survived'].isna()]
all_test.head()
   PassengerId  Survived  Pclass  Age  SibSp  Parch     Fare  Sex_male  Cabin_B  Cabin_C  ...  Cabin_M  Cabin_T  Embarked_Q  Embarked_S  Title_Master.  Title_Miss.  Title_Mr.  Title_Mrs.  Title_Officer.  Title_Royal.
0          892       NaN       3  2.0      0      0   7.8292         1        0        0  ...        1        0           1           0              0            0          1           0               0             0
1          893       NaN       3  2.0      1      0   7.0000         0        0        0  ...        1        0           0           1              0            0          0           1               0             0
2          894       NaN       2  3.0      0      0   9.6875         1        0        0  ...        1        0           1           0              0            0          1           0               0             0
3          895       NaN       3  1.0      0      0   8.6625         1        0        0  ...        1        0           0           1              0            0          1           0               0             0
4          896       NaN       3  1.0      1      1  12.2875         0        0        0  ...        1        0           0           1              0            0          0           1               0             0

5 rows × 24 columns

from sklearn.model_selection import train_test_split
# stratify: split so that train and test keep the same class proportions as the label y
X_train, X_test, y_train, y_test = train_test_split(all_train.drop(['PassengerId','Survived'],axis=1), 
                                                    all_train['Survived'], test_size=0.30, 
                                                    random_state=101, stratify = all_train['Survived'])
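
As a quick sanity check on the stratify argument, the snippet below (illustrative, not part of the original notebook) prints the survival-rate proportions, which should be nearly identical in the full training data, the train fold, and the test fold.

# Verify that stratified splitting preserves the class balance of Survived
print(all_train['Survived'].value_counts(normalize=True))
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))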

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoostClassifier model
# base estimator: DecisionTreeClassifier (unrestricted depth by default)

ada = AdaBoostClassifier(DecisionTreeClassifier(),n_estimators=100, random_state=0)
ada.fit(X_train,y_train)

predictions = ada.predict(X_test)
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

print (f'Train Accuracy - : {ada.score(X_train,y_train):.3f}')
print (f'Test Accuracy - : {ada.score(X_test,y_test):.3f}')
              precision    recall  f1-score   support

         0.0       0.80      0.83      0.82       165
         1.0       0.71      0.67      0.69       103

    accuracy                           0.77       268
   macro avg       0.76      0.75      0.75       268
weighted avg       0.77      0.77      0.77       268

Train Accuracy - : 0.961
Test Accuracy - : 0.769
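
The gap between the train accuracy (0.961) and the test accuracy (0.769) suggests the unrestricted decision tree base learner overfits. Below is a minimal sketch of a variant, assuming the same X_train/X_test split as above: it swaps in a depth-1 decision stump (AdaBoost's classic weak learner). The name ada_stump and the hyperparameter values are illustrative, not tuned results.

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Depth-1 stump as the weak learner; hyperparameters are illustrative, not tuned
ada_stump = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                               n_estimators=200, learning_rate=0.5, random_state=0)
ada_stump.fit(X_train, y_train)

print(f'Train Accuracy - : {ada_stump.score(X_train, y_train):.3f}')
print(f'Test Accuracy - : {ada_stump.score(X_test, y_test):.3f}')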

Final prediction

TestForPred = all_test.drop(['PassengerId', 'Survived'], axis = 1)
t_pred = ada.predict(TestForPred).astype(int)
PassengerId = all_test['PassengerId']

adaSub = pd.DataFrame({'PassengerId': PassengerId, 'Survived':t_pred })
adaSub.head()
   PassengerId  Survived
0          892         0
1          893         0
2          894         1
3          895         0
4          896         0
adaSub.to_csv("1_Ada_Submission.csv", index = False)