1_泰坦尼克号EDA

该博客主要展示了泰坦尼克号数据集的预处理过程,包括缺失值处理、特征编码、数据可视化和相关性分析。通过对年龄、性别、仓位等级、登船港口等特征的分析,确定了与票价相关的因素,并使用线性回归模型进行了预测。此外,还进行了五折交叉验证评估模型性能。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

import pandas as pd 
import numpy as np
import os
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False #用来正常显示负号

warnings.filterwarnings('ignore')
#查看当前路径
os.getcwd()
'C:\\Develop\\python_project\\ML\\learning'
# Load the training set, replacing the CSV's own header row (header=0) with
# Chinese column names and using the passenger ID as the index.
data = pd.read_csv(
    r'./data/train.csv',
    names=[
        '乘客ID', '是否幸存', '仓位等级', '姓名', '性别', '年龄',
        '兄弟姐妹个数', '父母子女个数', '船票信息', '票价', '客舱', '登船港口',
    ],
    index_col='乘客ID',
    header=0,
)
data.head()
是否幸存仓位等级姓名性别年龄兄弟姐妹个数父母子女个数船票信息票价客舱登船港口
乘客ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
503Allen, Mr. William Henrymale35.0003734508.0500NaNS
# data.to_csv(r'./data/train1.csv')
data.shape
(891, 11)
data.info(verbose=True)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   是否幸存    891 non-null    int64  
 1   仓位等级    891 non-null    int64  
 2   姓名      891 non-null    object 
 3   性别      891 non-null    object 
 4   年龄      714 non-null    float64
 5   兄弟姐妹个数  891 non-null    int64  
 6   父母子女个数  891 non-null    int64  
 7   船票信息    891 non-null    object 
 8   票价      891 non-null    float64
 9   客舱      204 non-null    object 
 10  登船港口    889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB
data.describe()
是否幸存仓位等级年龄兄弟姐妹个数父母子女个数票价
count891.000000891.000000714.000000891.000000891.000000891.000000
mean0.3838382.30864229.6991180.5230080.38159432.204208
std0.4865920.83607114.5264971.1027430.80605749.693429
min0.0000001.0000000.4200000.0000000.0000000.000000
25%0.0000002.00000020.1250000.0000000.0000007.910400
50%0.0000003.00000028.0000000.0000000.00000014.454200
75%1.0000003.00000038.0000001.0000000.00000031.000000
max1.0000003.00000080.0000008.0000006.000000512.329200
data.describe(include='O')
姓名性别船票信息客舱登船港口
count891891891204889
unique89126811473
topMernagh, Mr. Robertmale1601G6S
freq157774644
#查看缺失值的情况
data.isnull().sum()
是否幸存        0
仓位等级        0
姓名          0
性别          0
年龄        177
兄弟姐妹个数      0
父母子女个数      0
船票信息        0
票价          0
客舱        687
登船港口        2
dtype: int64
# Visualise missing values: bar chart of the per-column NaN counts,
# limited to columns that have at least one NaN, in ascending order.
missing = data.isnull().sum()
missing = missing[missing > 0].sort_values()
missing.plot.bar()

在这里插入图片描述

data['客舱'].value_counts()
G6             4
C23 C25 C27    4
B96 B98        4
F2             3
D              3
              ..
B78            1
A23            1
E63            1
C91            1
E36            1
Name: 客舱, Length: 147, dtype: int64

客舱是否与票价相关?

PS:取客舱编号的首字母作为客舱等级(类别特征)。

# `Iterable` must come from `collections.abc`: the alias in `collections` was
# removed in Python 3.10, so the old import raises ImportError there.  The name
# is no longer needed below and is kept only so it stays in scope.
from collections.abc import Iterable
# Cabin deck = first character of the cabin string; passengers without a cabin
# (NaN) stay NaN.  The vectorised `.str[0]` accessor replaces the element-wise
# lambda + isinstance check.
data['客舱等级'] = data['客舱'].str[0]
data[['客舱','客舱等级']]
客舱客舱等级
乘客ID
1NaNNaN
2C85C
3NaNNaN
4C123C
5NaNNaN
.........
887NaNNaN
888B42B
889NaNNaN
890C148C
891NaNNaN

891 rows × 2 columns

data['客舱等级'].value_counts()
C    59
B    47
D    33
E    32
A    15
F    13
G     4
T     1
Name: 客舱等级, dtype: int64
# One fare distribution plot per cabin deck on a 2x4 grid (8 decks in total).
fig, axes = plt.subplots(2, 4, figsize=(20, 10))
# Figure-level title
fig.suptitle('客舱等级与票价的关系')
# `axes.flat` walks the grid row by row, so the decks land in the same cells
# as the former eight hand-written stanzas: A-D on the first row, E/F/G/T on
# the second.
for deck, ax in zip('ABCDEFGT', axes.flat):
    ax.set_title(deck)
    sns.distplot(data[['票价']].loc[data['客舱等级'] == deck], ax=ax)

在这里插入图片描述

data[['性别','年龄','票价','客舱等级']].loc[data['客舱等级'].notna()]
性别年龄票价客舱等级
乘客ID
2female38.071.2833C
4female35.053.1000C
7male54.051.8625E
11female4.016.7000G
12female58.026.5500C
...............
872female47.052.5542D
873male33.05.0000B
880female56.083.1583C
888female19.030.0000B
890male26.030.0000C

204 rows × 4 columns

# Print the category frequencies of the two text-valued categorical columns.
for column in ("性别", "登船港口"):
    print("字段名", column)
    print("------------------")
    frequencies = data[column].value_counts()
    print(frequencies)
字段名 性别
------------------
male      577
female    314
Name: 性别, dtype: int64
字段名 登船港口
------------------
S    644
C    168
Q     77
Name: 登船港口, dtype: int64
data['性别_标识'] = data['性别'].map({'male': 1, 'female': 2})
data.head()
是否幸存仓位等级姓名性别年龄兄弟姐妹个数父母子女个数船票信息票价客舱登船港口客舱等级性别_标识
乘客ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNSNaN1
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85CC2
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNSNaN2
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123SC2
503Allen, Mr. William Henrymale35.0003734508.0500NaNSNaN1
data['登船港口_标识']= data['登船港口'].map({'S': 1, 'C': 2, 'Q': 3})
data.head()
是否幸存仓位等级姓名性别年龄兄弟姐妹个数父母子女个数船票信息票价客舱登船港口客舱等级性别_标识登船港口_标识
乘客ID
103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNSNaN11.0
211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85CC22.0
313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNSNaN21.0
411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123SC21.0
503Allen, Mr. William Henrymale35.0003734508.0500NaNSNaN11.0
# data1 = data.loc[data['客舱等级'].notna()] #[['年龄','性别_标识','票价','仓位等级','客舱等级','登船港口_标识']]
# Work on a copy so the raw frame `data` stays untouched.
data1 = data.copy()
# Age bands (0,5], (5,15], (15,30], (30,50], (50,80] encoded as 0..4;
# passengers with unknown age keep NaN.
data1['年龄_分箱'] = pd.cut(data1['年龄'], [0, 5, 15, 30, 50, 80], labels=False)
# log(0) is -inf, so non-positive fares are masked to NaN *before* taking the
# log.  This replaces the chained assignment `data1[col][mask] = np.nan`, which
# writes through an intermediate object and is not guaranteed to reach `data1`
# (it is a silent no-op under pandas copy-on-write).
data1['票价_log'] = np.log(data1['票价'].where(data1['票价'] > 0))
data1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 16 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   是否幸存     891 non-null    int64  
 1   仓位等级     891 non-null    int64  
 2   姓名       891 non-null    object 
 3   性别       891 non-null    object 
 4   年龄       714 non-null    float64
 5   兄弟姐妹个数   891 non-null    int64  
 6   父母子女个数   891 non-null    int64  
 7   船票信息     891 non-null    object 
 8   票价       891 non-null    float64
 9   客舱       204 non-null    object 
 10  登船港口     889 non-null    object 
 11  客舱等级     204 non-null    object 
 12  性别_标识    891 non-null    int64  
 13  登船港口_标识  889 non-null    float64
 14  年龄_分箱    714 non-null    float64
 15  票价_log   876 non-null    float64
dtypes: float64(5), int64(5), object(6)
memory usage: 118.3+ KB
#data1['年龄_分箱']=(data1['年龄_分箱'].notna()).astype(int)
# ## 特征与标签组合的散点可视化
# sns.pairplot(data=data1[['年龄','票价_log','客舱等级']],diag_kind='hist', hue= '客舱等级')
# plt.show()

客舱等级缺失严重(891 条中仅 204 条非空),目前的分类效果比较差。

# data1.groupby(['仓位等级','客舱等级','性别_标识'])['客舱等级'].count()
# data1.groupby(['仓位等级','客舱等级','性别_标识'])['票价'].mean()

票价是否和年龄、性别、仓位等级有关系?

# Split by fare: zero-fare rows go to `data2`, strictly positive fares stay in
# `data1`.  Both masks are now built from `data1` itself (the first one used to
# come from `data`, which only worked because the two frames share an index),
# and both results are explicit copies so later column assignments on them are
# unambiguous.
data2 = data1.loc[data1['票价'] == 0].copy()
data1 = data1.loc[data1['票价'] > 0].copy()
data1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 876 entries, 1 to 891
Data columns (total 16 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   是否幸存     876 non-null    int64  
 1   仓位等级     876 non-null    int64  
 2   姓名       876 non-null    object 
 3   性别       876 non-null    object 
 4   年龄       707 non-null    float64
 5   兄弟姐妹个数   876 non-null    int64  
 6   父母子女个数   876 non-null    int64  
 7   船票信息     876 non-null    object 
 8   票价       876 non-null    float64
 9   客舱       201 non-null    object 
 10  登船港口     874 non-null    object 
 11  客舱等级     201 non-null    object 
 12  性别_标识    876 non-null    int64  
 13  登船港口_标识  874 non-null    float64
 14  年龄_分箱    707 non-null    float64
 15  票价_log   876 non-null    float64
dtypes: float64(5), int64(5), object(6)
memory usage: 116.3+ KB
data1[['年龄','性别_标识','仓位等级','年龄_分箱','登船港口_标识','票价','票价_log']]
年龄性别_标识仓位等级年龄_分箱登船港口_标识票价票价_log
乘客ID
122.0132.01.07.25001.981001
238.0213.02.071.28334.266662
326.0232.01.07.92502.070022
435.0213.01.053.10003.972177
535.0133.01.08.05002.085672
........................
88727.0122.01.013.00002.564949
88819.0212.01.030.00003.401197
889NaN23NaN1.023.45003.154870
89026.0112.02.030.00003.401197
89132.0133.03.07.75002.047693

876 rows × 7 columns

data1['票价'].value_counts()
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
32.3208     1
13.8583     1
7.6292      1
15.0500     1
8.6833      1
Name: 票价, Length: 247, dtype: int64
## 1) Overall distribution of the raw fare with three candidate fits
##    (unbounded Johnson SU, normal, log-normal), one figure each
import scipy.stats as st
y = data1['票价']
for fig_num, (title, dist) in enumerate(
        [('Johnson SU', st.johnsonsu),
         ('Normal', st.norm),
         ('Log Normal', st.lognorm)],
        start=1):
    plt.figure(fig_num)
    plt.title(title)
    sns.distplot(y, kde=False, fit=dist)

在这里插入图片描述

在这里插入图片描述

在这里插入图片描述

## 2) Skewness and kurtosis of the raw fare
sns.distplot(data1['票价'])
print(f"Skewness: {data1['票价'].skew():f}")
print(f"Kurtosis: {data1['票价'].kurt():f}")
Skewness: 4.770117
Kurtosis: 33.094179

在这里插入图片描述

# Same three candidate fits, this time on the log-transformed fare.
y = data1['票价_log']
for fig_num, (title, dist) in enumerate(
        [('Johnson SU', st.johnsonsu),
         ('Normal', st.norm),
         ('Log Normal', st.lognorm)],
        start=1):
    plt.figure(fig_num)
    plt.title(title)
    sns.distplot(y, kde=False, fit=dist)

在这里插入图片描述

在这里插入图片描述

在这里插入图片描述

## 2) Skewness and kurtosis of the log-transformed fare
sns.distplot(data1['票价_log'])
print(f"Skewness: {data1['票价_log'].skew():f}")
print(f"Kurtosis: {data1['票价_log'].kurt():f}")
Skewness: 0.901272
Kurtosis: 0.092646

在这里插入图片描述

# data1 still contains text columns (name, ticket, cabin, ...).  Since pandas 2.0
# reductions no longer drop them silently, so numeric_only must be explicit.
data1.skew(numeric_only=True), data1.kurt(numeric_only=True)
(是否幸存       0.454980
 仓位等级      -0.645700
 年龄         0.397549
 兄弟姐妹个数     3.663054
 父母子女个数     2.719314
 票价         4.770117
 性别_标识      0.591376
 登船港口_标识    1.513679
 年龄_分箱     -0.509732
 票价_log     0.901272
 dtype: float64,
 是否幸存       -1.797101
 仓位等级       -1.264175
 年龄          0.181680
 兄弟姐妹个数     17.569865
 父母子女个数      9.571814
 票价         33.094179
 性别_标识      -1.654057
 登船港口_标识     1.009103
 年龄_分箱       0.556544
 票价_log      0.092646
 dtype: float64)
# Distribution of per-column skewness; numeric_only=True skips the text columns,
# which would raise under pandas >= 2.0.
sns.distplot(data1.skew(numeric_only=True),color='blue',axlabel ='Skewness')

在这里插入图片描述

# Distribution of per-column kurtosis; numeric_only=True skips the text columns,
# which would raise under pandas >= 2.0.
sns.distplot(data1.kurt(numeric_only=True),color='orange',axlabel ='Kurtness')

在这里插入图片描述

对目标票价取 log 后,偏度和峰度分别下降到约 0.90 和 0.09,总体接近正态分布,但仍略微正偏(右尾较长)。

## 1) Correlation analysis
# Keep only numeric columns: data1 also holds text columns (name, ticket,
# cabin, ...), and DataFrame.corr() raises on those under pandas >= 2.0
# instead of silently dropping them.
price_numeric = data1.select_dtypes(include='number')
correlation = price_numeric.corr()
# Correlation of every numeric feature with the log fare, strongest first.
print(correlation['票价_log'].sort_values(ascending = False),'\n')
票价_log     1.000000
票价         0.817386
父母子女个数     0.339180
是否幸存       0.325452
兄弟姐妹个数     0.324373
性别_标识      0.247711
年龄         0.135352
年龄_分箱      0.091907
登船港口_标识   -0.012083
仓位等级      -0.754893
Name: 票价_log, dtype: float64 
# Heatmap of the correlation matrix computed above, on a square 7x7-inch figure.
f , ax = plt.subplots(figsize = (7, 7))

plt.title('Correlation of Numeric Features with Price',y=1,size=16)

# vmax caps the colour scale at 0.85.
sns.heatmap(correlation,square = True,  vmax=0.85)

在这里插入图片描述

# One-hot encode the categorical features of the positive-fare rows:
# '年龄_分箱', '性别_标识', '仓位等级', '登船港口_标识'.
data1 = pd.get_dummies(data1, columns=['年龄_分箱', '性别_标识', '仓位等级', '登船港口_标识'])
# Same encoding for the zero-fare rows.
# NOTE(review): the two frames are encoded independently, so data2 only gets
# dummy columns for the categories it actually contains — confirm that code
# using data2 accounts for the columns it lacks relative to data1.
data2 = pd.get_dummies(data2, columns=['年龄_分箱', '性别_标识', '仓位等级', '登船港口_标识'])
data1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 876 entries, 1 to 891
Data columns (total 25 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   是否幸存         876 non-null    int64  
 1   姓名           876 non-null    object 
 2   性别           876 non-null    object 
 3   年龄           707 non-null    float64
 4   兄弟姐妹个数       876 non-null    int64  
 5   父母子女个数       876 non-null    int64  
 6   船票信息         876 non-null    object 
 7   票价           876 non-null    float64
 8   客舱           201 non-null    object 
 9   登船港口         874 non-null    object 
 10  客舱等级         201 non-null    object 
 11  票价_log       876 non-null    float64
 12  年龄_分箱_0.0    876 non-null    uint8  
 13  年龄_分箱_1.0    876 non-null    uint8  
 14  年龄_分箱_2.0    876 non-null    uint8  
 15  年龄_分箱_3.0    876 non-null    uint8  
 16  年龄_分箱_4.0    876 non-null    uint8  
 17  性别_标识_1      876 non-null    uint8  
 18  性别_标识_2      876 non-null    uint8  
 19  仓位等级_1       876 non-null    uint8  
 20  仓位等级_2       876 non-null    uint8  
 21  仓位等级_3       876 non-null    uint8  
 22  登船港口_标识_1.0  876 non-null    uint8  
 23  登船港口_标识_2.0  876 non-null    uint8  
 24  登船港口_标识_3.0  876 non-null    uint8  
dtypes: float64(3), int64(3), object(6), uint8(13)
memory usage: 100.1+ KB
# Correlation matrix after one-hot encoding.  Text columns are excluded
# explicitly (DataFrame.corr() raises on them under pandas >= 2.0); 'bool' is
# included because newer pandas emits boolean dummy columns from get_dummies.
correlation = data1.select_dtypes(include=['number', 'bool']).corr()
f , ax = plt.subplots(figsize = (7, 7))
plt.title('票价相关性',y=1,size=16)
sns.heatmap(correlation,square = True,  vmax=0.8)

在这里插入图片描述

# A large k_features is very slow to run and no server was available,
# so the run was interrupted early.
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
# Forward sequential selection of 6 features for a linear regression, scored by R^2.
# NOTE(review): cv = 0 presumably disables cross-validation inside the selector —
# confirm against the mlxtend documentation.
sfs = SFS(LinearRegression(),
           k_features=6,
           forward=True,
           floating=False,
           scoring = 'r2',
           cv = 0)
# Candidate features: every non-object column except the raw and log fare.
x = data1.drop(['票价','票价_log'], axis=1)
numerical_cols = x.select_dtypes(exclude = 'object').columns
x = x[numerical_cols]
# Remaining NaNs (e.g. unknown ages) are replaced by 0 before fitting.
x = x.fillna(0)
# Target: log fare.
y = data1['票价_log'].fillna(0)
sfs.fit(x, y)
# Names of the selected features.
sfs.k_feature_names_ 
('兄弟姐妹个数', '父母子女个数', '性别_标识_1', '仓位等级_1', '仓位等级_2', '登船港口_标识_2.0')
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt
fig1 = plot_sfs(sfs.get_metric_dict(), kind='std_dev')
plt.grid()
plt.show()

在这里插入图片描述

#将特征分类
#数字特征:'乘客ID','年龄','兄弟姐妹个数','父母子女个数','票价'
#类别特征:'仓位等级','性别','客舱','登船港口'
#文本型特征:'姓名','船票信息'
#目标:'是否幸存'

data1.to_csv('./data/train1.csv',encoding='utf_8_sig')
from sklearn.model_selection import train_test_split
# Feature columns for the fare model, listed once instead of being repeated in
# every call below.
# NOTE(review): the selector output printed earlier lists '仓位等级_2' where this
# list has '仓位等级_3' — confirm which one is intended.
feature_cols = ['兄弟姐妹个数', '父母子女个数', '性别_标识_1', '仓位等级_1', '仓位等级_3', '登船港口_标识_2.0']
# 80/20 train/test split of the log fare.  random_state pins the shuffle so the
# split, and every metric derived from it, is reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    data1[feature_cols], data1['票价_log'], train_size=0.8, random_state=42)

print('x原始数据',data1[feature_cols].shape,
     'x训练数据',x_train.shape,
     'x测试数据',x_test.shape,)
print('y原始数据',data1['票价_log'].shape,
     'y训练数据',y_train.shape,
     'y测试数据',y_test.shape,)
x原始数据 (876, 6) x训练数据 (700, 6) x测试数据 (176, 6)
y原始数据 (876,) y训练数据 (700,) y测试数据 (176,)
# First fit: linear regression on the six selected feature columns of the
# training DataFrame.
from sklearn.linear_model import LinearRegression
model= LinearRegression()
model.fit(x_train[['兄弟姐妹个数', '父母子女个数', '性别_标识_1', '仓位等级_1', '仓位等级_3', '登船港口_标识_2.0']],y_train)   # in sklearn, the X and y passed to model.fit(X, y) must be in matrix form
LinearRegression()
# Convert the training data to plain NumPy arrays; the model is refitted on
# these, so from here on features are identified by column position only.
x_train=x_train.values
y_train=y_train.values
# Step 1: import linear regression
from sklearn.linear_model import LinearRegression
# Step 2: create the model (linear regression)
model = LinearRegression()
# Step 3: train the model
model.fit(x_train , y_train)
LinearRegression()
# Predictions on the training data
y_train_pred = model.predict(x_train)
plt.figure(dpi=300,figsize=(24,8))
# Derive the x axis from the data instead of hard-coding 700 points, so the
# plot keeps working when the training-set size changes.
sample_idx = np.arange(len(y_train))
plt.plot(sample_idx, y_train_pred,  color='skyblue', label='预测值')
plt.plot(sample_idx, y_train, color='blue', label='实际值')
plt.legend()

plt.xlabel('序号ID')
plt.ylabel('票价_log')
plt.show()

在这里插入图片描述

五折交叉验证、均方误差与平均绝对误差

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error,  make_scorer
def log_transfer(func):
    """Wrap a metric so that it is evaluated on log-transformed inputs.

    Parameters
    ----------
    func : callable
        Metric called as ``func(y_true, y_pred, **kwargs)``.

    Returns
    -------
    callable
        ``wrapper(y, yhat, **kwargs)`` returning
        ``func(np.log(y), np.nan_to_num(np.log(yhat)), **kwargs)``.
        It carries ``func``'s name and docstring.
    """
    from functools import wraps

    @wraps(func)
    def wrapper(y, yhat, **kwargs):
        # nan_to_num replaces NaN / infinite log-predictions with finite
        # numbers before they reach the metric.  Extra keyword arguments
        # (e.g. the sample_weight a scorer may forward) are passed through
        # instead of raising TypeError.
        return func(np.log(y), np.nan_to_num(np.log(yhat)), **kwargs)
    return wrapper
# 5-fold cross-validated mean absolute error on the log-fare scale.
# y_train already holds log fares (票价_log), so the metric is applied directly:
# wrapping it in log_transfer would score on log(log(fare)).
scores = cross_val_score(model, X=x_train, y=y_train, verbose=1, cv = 5, scoring=make_scorer(mean_absolute_error))
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
print('AVG:', np.mean(scores))
AVG: 0.09158952675993322
# Cross-validate against the true targets.  Passing y_train_pred as `y` made the
# model re-learn its own linear predictions, which is why the reported MAE was
# ~1e-15 — a value that says nothing about real error.
scores = cross_val_score(model, X=x_train, y=y_train, verbose=1, cv = 5, scoring=make_scorer(mean_absolute_error))
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
print('AVG:', np.mean(scores))
AVG: 1.4134725136370564e-15
# One-row table of the per-fold MAE values with columns cv1..cv5.
scores = pd.DataFrame(
    scores.reshape(1, -1),
    index=['平均绝对误差MAE'],
    columns=['cv' + str(fold) for fold in range(1, 6)],
)
scores
cv1cv2cv3cv4cv5
平均绝对误差MAE1.049954e-152.531308e-156.756500e-161.091191e-151.719260e-15
from sklearn.metrics import mean_squared_error
# Mean squared error between the training targets and the model's predictions
# on that same training data (an in-sample figure).
MSE = mean_squared_error(y_train, y_train_pred)
print('均方误差',MSE)
均方误差 0.17890857693897988
data2.info()#
<class 'pandas.core.frame.DataFrame'>
Int64Index: 15 entries, 180 to 823
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   是否幸存         15 non-null     int64  
 1   姓名           15 non-null     object 
 2   性别           15 non-null     object 
 3   年龄           7 non-null      float64
 4   兄弟姐妹个数       15 non-null     int64  
 5   父母子女个数       15 non-null     int64  
 6   船票信息         15 non-null     object 
 7   票价           15 non-null     float64
 8   客舱           3 non-null      object 
 9   登船港口         15 non-null     object 
 10  客舱等级         3 non-null      object 
 11  票价_log       0 non-null      float64
 12  年龄_分箱_2.0    15 non-null     uint8  
 13  年龄_分箱_3.0    15 non-null     uint8  
 14  性别_标识_1      15 non-null     uint8  
 15  仓位等级_1       15 non-null     uint8  
 16  仓位等级_2       15 non-null     uint8  
 17  仓位等级_3       15 non-null     uint8  
 18  登船港口_标识_1.0  15 non-null     uint8  
dtypes: float64(3), int64(3), object(6), uint8(7)
memory usage: 1.6+ KB
# data2 has no '登船港口_标识_2.0' dummy column (see the info() listing above),
# so add it as all zeros to give the model every feature it expects.
data2['登船港口_标识_2.0']=0
# Predict the log fare of the zero-fare rows from the six model features.
data2_pred = model.predict(data2[['兄弟姐妹个数', '父母子女个数', '性别_标识_1', '仓位等级_1', '仓位等级_3', '登船港口_标识_2.0']])
# Show the predictions on the log scale and, via exp, on the fare scale.
data2_pred,np.exp(data2_pred)

(array([2.12092784, 3.8968038 , 2.12092784, 2.63339506, 2.12092784,
        2.63339506, 2.63339506, 2.63339506, 2.12092784, 3.8968038 ,
        2.63339506, 2.63339506, 3.8968038 , 3.8968038 , 3.8968038 ]),
 array([ 8.33887105, 49.24480131,  8.33887105, 13.92095222,  8.33887105,
        13.92095222, 13.92095222, 13.92095222,  8.33887105, 49.24480131,
        13.92095222, 13.92095222, 49.24480131, 49.24480131, 49.24480131]))
pd.DataFrame(data2_pred,columns=['票价'])
票价
02.120928
13.896804
22.120928
32.633395
42.120928
52.633395
62.633395
72.633395
82.120928
93.896804
102.633395
112.633395
123.896804
133.896804
143.896804
pd.DataFrame(np.exp(data2_pred),columns=['票价'])
票价
08.338871
149.244801
28.338871
313.920952
48.338871
513.920952
613.920952
713.920952
88.338871
949.244801
1013.920952
1113.920952
1249.244801
1349.244801
1449.244801
# Drop the fare column of data2 (all zeros in these rows).
del data2['票价']
# Display data2 with the passenger ID turned back into a regular column;
# the result is not assigned, so data2 itself is unchanged here.
data2.reset_index()
乘客ID是否幸存姓名性别年龄兄弟姐妹个数父母子女个数船票信息客舱登船港口客舱等级票价_log年龄_分箱_2.0年龄_分箱_3.0性别_标识_1仓位等级_1仓位等级_2仓位等级_3登船港口_标识_1.0登船港口_标识_2.0
01800Leonard, Mr. Lionelmale36.000LINENaNSNaNNaN01100110
12640Harrison, Mr. Williammale40.000112059B94SBNaN01110010
22721Tornquist, Mr. William Henrymale25.000LINENaNSNaNNaN10100110
32780Parkes, Mr. Francis "Frank"maleNaN00239853NaNSNaNNaN00101010
43030Johnson, Mr. William Cahoone Jrmale19.000LINENaNSNaNNaN10100110
54140Cunningham, Mr. Alfred FlemingmaleNaN00239853NaNSNaNNaN00101010
64670Campbell, Mr. WilliammaleNaN00239853NaNSNaNNaN00101010
74820Frost, Mr. Anthony Wood "Archie"maleNaN00239854NaNSNaNNaN00101010
85980Johnson, Mr. Alfredmale49.000LINENaNSNaNNaN01100110
96340Parr, Mr. William Henry MarshmaleNaN00112052NaNSNaNNaN00110010
106750Watson, Mr. Ennis HastingsmaleNaN00239856NaNSNaNNaN00101010
117330Knight, Mr. Robert JmaleNaN00239855NaNSNaNNaN00101010
128070Andrews, Mr. Thomas Jrmale39.000112050A36SANaN01110010
138160Fry, Mr. RichardmaleNaN00112058B102SBNaN00110010
148230Reuchlin, Jonkheer. John Georgemale38.00019972NaNSNaNNaN01110010
# Attach the predicted fares (exp of the predicted log fare) as a new '票价'
# column.  Both operands carry a fresh 0..n-1 index, so concat along axis=1
# pairs them row by row.
data2=pd.concat([data2.reset_index(),pd.DataFrame(np.exp(data2_pred),columns=['票价'])],axis=1)
# Fill in the log fare from the predicted fare.
data2['票价_log'] = np.log(data2['票价'])
# Restore the passenger ID as the index.
data2.set_index(["乘客ID"], inplace=True)
data2
是否幸存姓名性别年龄兄弟姐妹个数父母子女个数船票信息客舱登船港口客舱等级票价_log年龄_分箱_2.0年龄_分箱_3.0性别_标识_1仓位等级_1仓位等级_2仓位等级_3登船港口_标识_1.0登船港口_标识_2.0票价
乘客ID
1800Leonard, Mr. Lionelmale36.000LINENaNSNaN2.120928011001108.338871
2640Harrison, Mr. Williammale40.000112059B94SB3.8968040111001049.244801
2721Tornquist, Mr. William Henrymale25.000LINENaNSNaN2.120928101001108.338871
2780Parkes, Mr. Francis "Frank"maleNaN00239853NaNSNaN2.6333950010101013.920952
3030Johnson, Mr. William Cahoone Jrmale19.000LINENaNSNaN2.120928101001108.338871
4140Cunningham, Mr. Alfred FlemingmaleNaN00239853NaNSNaN2.6333950010101013.920952
4670Campbell, Mr. WilliammaleNaN00239853NaNSNaN2.6333950010101013.920952
4820Frost, Mr. Anthony Wood "Archie"maleNaN00239854NaNSNaN2.6333950010101013.920952
5980Johnson, Mr. Alfredmale49.000LINENaNSNaN2.120928011001108.338871
6340Parr, Mr. William Henry MarshmaleNaN00112052NaNSNaN3.8968040011001049.244801
6750Watson, Mr. Ennis HastingsmaleNaN00239856NaNSNaN2.6333950010101013.920952
7330Knight, Mr. Robert JmaleNaN00239855NaNSNaN2.6333950010101013.920952
8070Andrews, Mr. Thomas Jrmale39.000112050A36SA3.8968040111001049.244801
8160Fry, Mr. RichardmaleNaN00112058B102SB3.8968040011001049.244801
8230Reuchlin, Jonkheer. John Georgemale38.00019972NaNSNaN3.8968040111001049.244801

票价为 0 的 15 条记录已用模型预测值补全。

data1.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 876 entries, 1 to 891
Data columns (total 25 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   是否幸存         876 non-null    int64  
 1   姓名           876 non-null    object 
 2   性别           876 non-null    object 
 3   年龄           707 non-null    float64
 4   兄弟姐妹个数       876 non-null    int64  
 5   父母子女个数       876 non-null    int64  
 6   船票信息         876 non-null    object 
 7   票价           876 non-null    float64
 8   客舱           201 non-null    object 
 9   登船港口         874 non-null    object 
 10  客舱等级         201 non-null    object 
 11  票价_log       876 non-null    float64
 12  年龄_分箱_0.0    876 non-null    uint8  
 13  年龄_分箱_1.0    876 non-null    uint8  
 14  年龄_分箱_2.0    876 non-null    uint8  
 15  年龄_分箱_3.0    876 non-null    uint8  
 16  年龄_分箱_4.0    876 non-null    uint8  
 17  性别_标识_1      876 non-null    uint8  
 18  性别_标识_2      876 non-null    uint8  
 19  仓位等级_1       876 non-null    uint8  
 20  仓位等级_2       876 non-null    uint8  
 21  仓位等级_3       876 non-null    uint8  
 22  登船港口_标识_1.0  876 non-null    uint8  
 23  登船港口_标识_2.0  876 non-null    uint8  
 24  登船港口_标识_3.0  876 non-null    uint8  
dtypes: float64(3), int64(3), object(6), uint8(13)
memory usage: 140.1+ KB
data2.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 15 entries, 180 to 823
Data columns (total 20 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   是否幸存         15 non-null     int64  
 1   姓名           15 non-null     object 
 2   性别           15 non-null     object 
 3   年龄           7 non-null      float64
 4   兄弟姐妹个数       15 non-null     int64  
 5   父母子女个数       15 non-null     int64  
 6   船票信息         15 non-null     object 
 7   客舱           3 non-null      object 
 8   登船港口         15 non-null     object 
 9   客舱等级         3 non-null      object 
 10  票价_log       15 non-null     float64
 11  年龄_分箱_2.0    15 non-null     uint8  
 12  年龄_分箱_3.0    15 non-null     uint8  
 13  性别_标识_1      15 non-null     uint8  
 14  仓位等级_1       15 non-null     uint8  
 15  仓位等级_2       15 non-null     uint8  
 16  仓位等级_3       15 non-null     uint8  
 17  登船港口_标识_1.0  15 non-null     uint8  
 18  登船港口_标识_2.0  15 non-null     int64  
 19  票价           15 non-null     float64
dtypes: float64(3), int64(4), object(6), uint8(7)
memory usage: 1.7+ KB

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值