import numpy
import pandas
from matplotlib import pyplot
class Titanic:
    """Fixed set of survival analyses over the Titanic passenger list.

    The data is loaded from ``Titanic.csv`` (by default, resolved against the
    current working directory).  Columns of that file:

    - PassengerId: unique id of the passenger
    - Survived: 1 = rescued, 0 = died
    - Pclass: cabin class; 1 = first class (best), 3 = third class (economy)
    - Name: passenger name
    - Sex: ``'male'`` or ``'female'``
    - Age: age in years (may be missing)
    - SibSp: siblings/spouses aboard (0 = none)
    - Parch: parents or other direct relatives aboard (0 = none)
    - Ticket: ticket number
    - Fare: fare paid
    - Cabin: cabin number
    - Embarked: port of embarkation (``'S'``, ``'Q'`` or ``'C'``)

    Tasks answered by the public methods:

    1. overall survival rate
    2. age bracket with the highest survival rate
    3. whether women survived more often than men
    4. whether there is an obvious wealth gap on board
    5. whether first class survived more often than economy class
    6. whether having relatives on board affects survival
    7. whether the port of embarkation affects survival
    8. age bracket in which women had the highest survival rate
    """

    # Width, in years, of each age bracket used by the age analyses.
    _BRACKET_YEARS = 10

    def __init__(self, csv_path='Titanic.csv'):
        """Load the passenger table.

        Args:
            csv_path: path of the CSV file, or any other source accepted by
                ``pandas.read_csv``.  Defaults to ``'Titanic.csv'``.
        """
        self.df_titanic = pandas.read_csv(csv_path)
        self.columns = self.df_titanic.columns
        self.index = self.df_titanic.index
        # Total number of passengers (rows).
        self.sum_ = len(self.df_titanic)

    def _rate_survival(self, dataFarm) -> float:
        """Return the fraction of rows in ``dataFarm`` with ``Survived == 1``.

        An empty frame yields ``0.0`` instead of raising, and a group without
        any survivor yields ``0.0`` rather than a ``KeyError``.
        """
        total = len(dataFarm)
        if total == 0:
            return 0.0
        survivors = int((dataFarm['Survived'] == 1).sum())
        return survivors / total

    def _best_age_bracket(self, data) -> tuple:
        """Return ``(bracket_index, rate)`` of the best-surviving age bracket.

        Bracket ``i`` covers ages in ``(i * 10, (i + 1) * 10]``.  Brackets
        without passengers are skipped; on ties the youngest bracket wins.
        ``(0, 0)`` is returned when no bracket has a survivor.
        """
        width = self._BRACKET_YEARS
        best_index = 0
        best_rate = 0
        oldest = data['Age'].max()
        if pandas.isna(oldest):
            # No usable age at all (empty frame or every age missing).
            return best_index, best_rate
        for i in range(int(oldest // width) + 1):
            in_bracket = data[(i * width < data['Age']) & (data['Age'] <= (i + 1) * width)]
            if in_bracket.empty:
                continue
            rate = self._rate_survival(in_bracket)
            # Compare against the best rate so far, not the bracket index.
            if rate > best_rate:
                best_index, best_rate = i, rate
        return best_index, best_rate

    def survival_rate(self) -> float:
        """Task 1: return the overall survival rate."""
        return self._rate_survival(self.df_titanic)

    def max_survival_age_bracket(self) -> str:
        """Task 2: report the 10-year age bracket with the highest survival rate.

        Missing ages are replaced by the mean age before bracketing.
        """
        ages = self.df_titanic['Age']
        # Impute only the Age column; other columns keep their missing values.
        data = self.df_titanic.assign(Age=ages.fillna(ages.mean()))
        index, rate = self._best_age_bracket(data)
        width = self._BRACKET_YEARS
        return '{}岁--{}岁的人存活率最大,区间存活率为{}'.format(index * width, (index + 1) * width, rate)

    def sex_survival(self) -> str:
        """Task 3: compare the survival rate of women with that of men."""
        sex = self.df_titanic['Sex']
        # Exact match: the substring 'male' also occurs inside 'female'.
        female_rate = self._rate_survival(self.df_titanic[sex == 'female'])
        male_rate = self._rate_survival(self.df_titanic[sex == 'male'])
        if female_rate > male_rate:
            return '女士存活率({}) 大于 男士存活率({})'.format(female_rate, male_rate)
        return '女士存活率({}) 小于 男士存活率({})'.format(female_rate, male_rate)

    def rich_gap(self) -> str:
        """Task 4: judge whether there is an obvious wealth gap on board.

        A gap is reported when class 3 holds more than 1.5 times as many
        passengers as class 1.
        """
        counts = self.df_titanic['Pclass'].value_counts()
        total = self.sum_

        def share(pclass):
            # A class that is absent from the data has a share of 0.
            return int(counts.get(pclass, 0)) / total if total else 0.0

        one_pc_rate = share(1)
        two_pc_rate = share(2)
        three_pc_rate = share(3)
        if one_pc_rate:
            ratio = three_pc_rate / one_pc_rate
        else:
            # No class-1 passengers: any class-3 passenger is an unbounded gap.
            ratio = float('inf') if three_pc_rate else 0.0
        if ratio > 1.5:
            return '座舱等级为3的比率{},座舱等级为2的比率{},座舱等级为1的比率{},存在明显的贫富差距'.format(three_pc_rate, two_pc_rate, one_pc_rate)
        return '座舱等级为3的比率{},座舱等级为2的比率{},座舱等级为1的比率{},不存在明显的贫富差距'.format(three_pc_rate, two_pc_rate, one_pc_rate)

    def pclass_survivesd(self) -> str:
        """Task 5: compare first-class survival with economy-class survival.

        In the standard Titanic dataset ``Pclass == 1`` is first class and
        ``Pclass == 3`` is third (economy) class.
        """
        pclass = self.df_titanic['Pclass']
        first_rate = self._rate_survival(self.df_titanic[pclass == 1])
        economy_rate = self._rate_survival(self.df_titanic[pclass == 3])
        if economy_rate > first_rate:
            return '经济舱{}生存率高于头等舱{}'.format(economy_rate, first_rate)
        return '头等舱{}生存率高于经济舱{}'.format(first_rate, economy_rate)

    def qinshu_survived_ziji(self) -> str:
        """Task 6: compare survival of passengers with and without relatives aboard."""
        # A passenger travels alone when both relative counters are zero.
        alone = (self.df_titanic['SibSp'] == 0) & (self.df_titanic['Parch'] == 0)
        wu_rate = self._rate_survival(self.df_titanic[alone])
        you_rate = self._rate_survival(self.df_titanic[~alone])
        if you_rate > wu_rate:
            return '有亲戚在船上的人的存活率{} 大于 没有亲戚在船上的存活率{}'.format(you_rate, wu_rate)
        return '没有亲戚在船上的存活率{} 大于 有亲戚在船上的人的存活率{}'.format(wu_rate, you_rate)

    def gangkou_survived(self) -> dict:
        """Task 7: return the survival rate for each port of embarkation."""
        port = self.df_titanic['Embarked']
        return {
            's港口': self._rate_survival(self.df_titanic[port == 'S']),
            'q港口': self._rate_survival(self.df_titanic[port == 'Q']),
            'c港口': self._rate_survival(self.df_titanic[port == 'C']),
        }

    def nv_different_age(self) -> str:
        """Task 8: report the age bracket in which women survived most often.

        Women whose age is missing fall into no bracket and are ignored.
        """
        women = self.df_titanic[self.df_titanic['Sex'] == 'female']
        index, rate = self._best_age_bracket(women)
        width = self._BRACKET_YEARS
        return '{}岁--{}岁的女人存活率最大,区间存活率为{}'.format(index * width, (index + 1) * width, rate)
if __name__ == '__main__':
    # Run every fixed analysis in task order and print each result.
    titanic_cc = Titanic()
    reports = (
        titanic_cc.survival_rate,
        titanic_cc.max_survival_age_bracket,
        titanic_cc.sex_survival,
        titanic_cc.rich_gap,
        titanic_cc.pclass_survivesd,
        titanic_cc.qinshu_survived_ziji,
        titanic_cc.gangkou_survived,
        titanic_cc.nv_different_age,
    )
    for report in reports:
        print(report())
# 运行前需要先下载 Titanic.csv 文件，并保存到程序运行的目录中。
# 因为只是浅层分析，分析内容是固定的，烦请谅解。谢谢！