1 Data Restructuring
1.1 Merging Data
Load the files in the data folder and observe how they relate to each other.
# Import libraries
import numpy as np
import pandas as pd
# Load the data
data_ld=pd.read_csv('data/train-left-down.csv')
data_lu=pd.read_csv('data/train-left-up.csv')
data_rd=pd.read_csv('data/train-right-down.csv')
data_ru=pd.read_csv('data/train-right-up.csv')
data_ld.head()
 | PassengerId | Survived | Pclass | Name
---|---|---|---|---|
0 | 440 | 0 | 2 | Kvillner, Mr. Johan Henrik Johannesson |
1 | 441 | 1 | 2 | Hart, Mrs. Benjamin (Esther Ada Bloomfield) |
2 | 442 | 0 | 3 | Hampe, Mr. Leon |
3 | 443 | 0 | 3 | Petterson, Mr. Johan Emil |
4 | 444 | 1 | 2 | Reynaldo, Ms. Encarnacion |
data_lu.head()
 | PassengerId | Survived | Pclass | Name
---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) |
4 | 5 | 0 | 3 | Allen, Mr. William Henry |
data_rd.head()
 | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
---|---|---|---|---|---|---|---|---|
0 | male | 31.0 | 0 | 0 | C.A. 18723 | 10.500 | NaN | S |
1 | female | 45.0 | 1 | 1 | F.C.C. 13529 | 26.250 | NaN | S |
2 | male | 20.0 | 0 | 0 | 345769 | 9.500 | NaN | S |
3 | male | 25.0 | 1 | 0 | 347076 | 7.775 | NaN | S |
4 | female | 28.0 | 0 | 0 | 230434 | 13.000 | NaN | S |
data_ru.head()
 | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
---|---|---|---|---|---|---|---|---|
0 | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
[Hint] Combining this with the train.csv data loaded in earlier chapters, make a rough guess about what the tables above are.
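One quick way to check the guess (a minimal sketch; it assumes data/train.csv from the earlier chapters is still available in the data folder) is to compare shapes and column names:
# Compare the four pieces against the original training set
train = pd.read_csv('data/train.csv')
print(train.shape)                    # the Titanic training set is expected to be (891, 12)
print(data_lu.shape, data_ru.shape)   # the two upper halves
print(data_ld.shape, data_rd.shape)   # the two lower halves
# the left columns followed by the right columns should reproduce train's columns
print(list(data_lu.columns) + list(data_ru.columns) == list(train.columns))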
Use the concat method: merge train-left-up.csv and train-right-up.csv horizontally into one table.
# After merging the tables, save the result to a file
pd.concat([data_lu,data_ru],axis=1).to_csv('result_up.csv')
Use the concat method: merge train-left-down.csv and train-right-down.csv horizontally into one table.
# After merging the tables, save the result to a file
pd.concat([data_ld,data_rd],axis=1).to_csv('result_down.csv')
Merge result_up and result_down vertically into result.
# Load the data
result_down=pd.read_csv('result_down.csv')
result_up=pd.read_csv('result_up.csv')
# Merge the data
result=pd.concat([result_up,result_down])
result.head()
 | Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
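The Unnamed: 0 column above is an artifact of writing the intermediate CSVs together with their row index. Two common ways to avoid it, as a sketch (use one or the other; both are standard pandas options):
# Alternative 1: do not write the row index when saving
pd.concat([data_lu, data_ru], axis=1).to_csv('result_up.csv', index=False)
# Alternative 2: keep the files as they are, and treat the first column as the index when reading
result_up = pd.read_csv('result_up.csv', index_col=0)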
Use the DataFrame methods join and append to merge the tables.
# join concatenates along the columns (adding columns) and supports four join types: left, right, inner, and outer
# append concatenates along the rows (stacking rows); note that DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, where pd.concat is the replacement
result_up=data_lu.join(data_ru)
result_down=data_ld.join(data_rd)
result=result_up.append(result_down)
result.head()
 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
Use the pandas merge method and the DataFrame append method to merge the tables.
# merge() joins datasets; the join keys come from the left and right inputs, defaulting to the columns the two datasets have in common
"""
left_index: use the row index of the left DataFrame as the join key;
right_index: use the row index of the right DataFrame as the join key;
"""
result_up = pd.merge(data_lu,data_ru,left_index=True,right_index=True)
result_down = pd.merge(data_ld,data_rd,left_index=True,right_index=True)
result = result_up.append(result_down)
result.head()
 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Save the data
result.to_csv('result.csv')
1.2 Converting the Data to a Series
# A small example
# Create a DataFrame
test1 = {'name': ['张三', '李四', '王二麻子', '李华'],
'ages': [35, 60, 25, 15],
'height': [170,165,175,180]
}
test1 = pd.DataFrame(test1)
test1
 | name | ages | height
---|---|---|---|
0 | 张三 | 35 | 170 |
1 | 李四 | 60 | 165 |
2 | 王二麻子 | 25 | 175 |
3 | 李华 | 15 | 180 |
test2=test1.stack()
test2
0  name          张三
   ages          35
   height        170
1  name          李四
   ages          60
   height        165
2  name      王二麻子
   ages          25
   height        175
3  name          李华
   ages          15
   height        180
dtype: object
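stack has an inverse, unstack, which pivots the inner index level back into columns; a quick check on the toy example above (test2 is the stacked Series):
# unstack moves the inner index level (name/ages/height) back into columns,
# recovering the original wide-format table
test2.unstack()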
# Load the data
text = pd.read_csv('result.csv')
# Use stack to convert the data into a Series; "stack" here means to pile the column values up on top of each other
unit_result=text.stack()
unit_result.head(13)
0  Unnamed: 0                           0
   PassengerId                          1
   Survived                             0
   Pclass                               3
   Name           Braund, Mr. Owen Harris
   Sex                               male
   Age                                 22
   SibSp                                1
   Parch                                0
   Ticket                       A/5 21171
   Fare                              7.25
   Embarked                             S
1  Unnamed: 0                           1
dtype: object
# Save the data as unit_result.csv
unit_result.to_csv('unit_result.csv')
# Read the data back
test = pd.read_csv('unit_result.csv')
test.head(13)
 | Unnamed: 0 | Unnamed: 1 | 0
---|---|---|---|
0 | 0 | Unnamed: 0 | 0 |
1 | 0 | PassengerId | 1 |
2 | 0 | Survived | 0 |
3 | 0 | Pclass | 3 |
4 | 0 | Name | Braund, Mr. Owen Harris |
5 | 0 | Sex | male |
6 | 0 | Age | 22.0 |
7 | 0 | SibSp | 1 |
8 | 0 | Parch | 0 |
9 | 0 | Ticket | A/5 21171 |
10 | 0 | Fare | 7.25 |
11 | 0 | Embarked | S |
12 | 1 | Unnamed: 0 | 1 |
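Reading the stacked result back in flattens both index levels into ordinary columns. If the goal is to rebuild the original wide table from unit_result.csv, one possible route (a sketch, assuming the file was written exactly as above) is to read those two columns as a MultiIndex and unstack:
# Read the first two columns back as a MultiIndex, take the single value column,
# then unstack the inner level so that each passenger becomes one row again
restored = pd.read_csv('unit_result.csv', index_col=[0, 1]).iloc[:, 0].unstack()
restored.head()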
2 Data Aggregation and Operations
# Import the basic libraries
import numpy as np
import pandas as pd
# Load result.csv and take a look at it
data=pd.read_csv('result.csv')
data.head()
 | Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
2.1 Working with the Data
Understanding GroupBy
# A small example
test={'key1':['a','a','b','c','c'],
'key2':['A','B','C','C','B'],
'num1':[1,8,3,7,5],
'num2':[2,6,3,8,9]
}
test=pd.DataFrame(test)
test
 | key1 | key2 | num1 | num2
---|---|---|---|---|
0 | a | A | 1 | 2 |
1 | a | B | 8 | 6 |
2 | b | C | 3 | 3 |
3 | c | C | 7 | 8 |
4 | c | B | 5 | 9 |
# Size of each key1 group
test['num2'].groupby(test['key1']).size()
key1
a 2
b 1
c 2
Name: num2, dtype: int64
# Mean of num1 within each key1 group
test['num1'].groupby(test['key1']).mean()
key1
a 4.5
b 3.0
c 6.0
Name: num1, dtype: float64
test['num1'].groupby([test['key1'],test['key2']]).size()
key1  key2
a     A       1
      B       1
b     C       1
c     B       1
      C       1
Name: num1, dtype: int64
test['num1'].groupby([test['key1'],test['key2']]).mean()
key1  key2
a     A       1.0
      B       8.0
b     C       3.0
c     B       5.0
      C       7.0
Name: num1, dtype: float64
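The same statistics can also be computed by grouping the DataFrame itself rather than a single column, which is the more common idiom; a small sketch on the toy data above:
# Group the whole DataFrame by key1 and aggregate both numeric columns at once
test.groupby('key1')[['num1', 'num2']].mean()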
Calculate the average fare paid by male and female passengers on the Titanic.
Fare_mean=data['Fare'].groupby(data['Sex']).mean()
Fare_mean
Sex
female 44.479818
male 25.523893
Name: Fare, dtype: float64
Count the numbers of male and female survivors on the Titanic.
Survived_sum_Sex=data['Survived'].groupby(data['Sex']).sum()
Survived_sum_Sex
# Alternative way to write it
#data.groupby(data['Sex'])['Survived'].sum()
Sex
female 233
male 109
Name: Survived, dtype: int64
Calculate the number of survivors in each cabin class.
data['Survived'].groupby(data['Pclass']).sum()
# Alternative way to write it
#data.groupby(data['Pclass'])['Survived'].sum()
Pclass
1 136
2 87
3 119
Name: Survived, dtype: int64
[Think about it] From a data-analysis perspective, what conclusions can be drawn from the statistics above?
# The average fare paid by women is higher than that paid by men
# More women survived than men
# Overall, passengers in higher cabin classes had a better chance of survival
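The class numbers above are raw counts, so they do not account for how many passengers each class carried. A minimal sketch of the corresponding rates (the mean of the 0/1 Survived column within a group is exactly that group's survival rate):
# Survival rate per class: the mean of the 0/1 Survived flag within each group
data.groupby('Pclass')['Survived'].mean()
# The same idea per sex, for comparison with the counts above
data.groupby('Sex')['Survived'].mean()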
Use the agg() function to compute the tasks above in one go, and use rename to change the column names.
data.groupby('Sex').agg({'Fare': 'mean', 'Pclass': 'count'}).rename(columns=
{'Fare': 'mean_fare', 'Pclass': 'count_pclass'})
 | mean_fare | count_pclass
---|---|---|
Sex | ||
female | 44.479818 | 314 |
male | 25.523893 | 577 |
data.groupby(data['Pclass']).agg({'Survived':'sum'})
 | Survived
---|---|
Pclass | |
1 | 136 |
2 | 87 |
3 | 119 |
Calculate the average fare paid at each age within each ticket class.
data['Fare'].groupby([data['Pclass'],data['Age']]).mean().head()
#data.groupby(['Pclass','Age'])['Fare'].mean().head()
Pclass  Age
1       0.92     151.5500
        2.00     151.5500
        4.00      81.8583
        11.00    120.0000
        14.00    120.0000
Name: Fare, dtype: float64
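An equivalent layout for the same statistic, if a two-dimensional table is easier to read, is pivot_table (a sketch, not part of the original exercise):
# Rows are ages, columns are classes, each cell is the average fare for that combination
data.pivot_table(values='Fare', index='Age', columns='Pclass', aggfunc='mean').head()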
Merge the sex-level statistics above (Fare_mean and Survived_sum_Sex) and save the result to sex_fare_survived.csv.
# Merge with concat
sex_fare_survived=pd.concat([Fare_mean,Survived_sum_Sex],axis=1)
sex_fare_survived
# Merge with merge
sex_fare_survived=pd.merge(Fare_mean,Survived_sum_Sex,on='Sex')
sex_fare_survived
'''
Parameters of merge:
on: the column name(s) to join on
left_on, right_on: the column(s) of the left (right) DataFrame to use as join keys
left_index (right_index): use the row index of the left (right) DataFrame as the join key
sort: sort the merged result; defaults to True. For large datasets, setting it to False gives better performance
suffixes: defaults to ('_x', '_y'); see the small sketch after the saved file below
copy: defaults to True; can be set to False for large datasets
'''
 | Fare | Survived
---|---|---|
Sex | ||
female | 44.479818 | 233 |
male | 25.523893 | 109 |
# Save the file
sex_fare_survived.to_csv('sex_fare_survived.csv')
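As a tiny illustration of the suffixes parameter listed above (a sketch with two made-up frames that share a non-key column name; none of this comes from the Titanic data):
# Two made-up frames that both have a 'score' column
left = pd.DataFrame({'id': [1, 2], 'score': [10, 20]})
right = pd.DataFrame({'id': [1, 2], 'score': [30, 40]})
# The suffixes are appended to the overlapping column names after the merge,
# giving the columns id, score_left, score_right
pd.merge(left, right, on='id', suffixes=('_left', '_right'))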
Find the total number of survivors at each age, then find the age with the most survivors, and finally compute the rate for that age (survivors at that age / total survivors).
# First way
Age_survived=data['Survived'].groupby(data['Age']).sum()
# Second way
data.groupby(data['Age'])['Survived'].sum()
Age
0.42 1
0.67 1
0.75 2
0.83 2
0.92 1
..
70.00 0
70.50 0
71.00 0
74.00 0
80.00 1
Name: Survived, Length: 88, dtype: int64
# Age with the most survivors, and the count
Age_survived[Age_survived.values==Age_survived.max()]
Age
24.0 15
Name: Survived, dtype: int64
# Total number of survivors
_sum=data['Survived'].sum()
_sum
# Rate for the age with the most survivors (peak count / total survivors)
Age_survived.max()/_sum
0.043859649122807015
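A slightly more direct route to the same numbers, as a sketch using the Age_survived series computed above, is idxmax:
# idxmax returns the index label (here the age) with the largest survivor count
peak_age = Age_survived.idxmax()
peak_age, Age_survived[peak_age] / data['Survived'].sum()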
The main learning material for this article comes from Datawhale.