导入源文件
(注:原文件数据存在于百度网盘:链接:https://pan.baidu.com/s/1NVwb1KJ4eobq55BzyRo1nQ?pwd=pyth
提取码:pyth)
import pandas as pd
import numpy as np
titanic_survival = pd.read_csv("titanic_train.csv")
titanic_survival.head()
计算Age字段空值的个数
#The Pandas library uses NaN, which stands for "not a number", to indicate a missing value.
#we can use the pandas.isnull() function which takes a pandas series and returns a series of True and False values
age = titanic_survival["Age"]
print(age.loc[0:10])
age_is_null = pd.isnull(age)#判断是否为缺失值
print (age_is_null)#false 和true 缺失就是true
age_null_true = age[age_is_null]#作为索引
print (age_null_true)#保留了空值字段
age_null_count = len(age_null_true)#计算空值的个数
print(age_null_count)
#mean fare for each class,求得每个Pclass对应的价格平均值,不同舱位,价格不同,求不同仓(Pclass)的价格均值,常见做法
#mean fare for each class,求得每个Pclass对应的价格平均值
passenger_classes = [1, 2, 3]
fares_by_class = {}
for this_class in passenger_classes:
pclass_rows = titanic_survival[titanic_survival["Pclass"] == this_class]#把乘坐同类Pclass的数据挑出来,如this_class=1,把所有Pclass为1的挑出来
pclass_fares = pclass_rows["Fare"]
fare_for_class = pclass_fares.mean()
fares_by_class[this_class] = fare_for_class#在fares_by_class字典中,求得每个Pclass对应的平均价格
print (fares_by_class)#结果为{1: 84.15468749999992, 2: 20.66218315217391, 3: 13.675550101832997}
上述问题对于不同的组进行计算,有没有更为简便的操作呢?答案是有的,如:
#index tells the method which column to group by
#values is the column that we want to apply the calculation to
#aggfunc specifies the calculation we want to perform
passenger_Fare = titanic_survival.pivot_table(index="Pclass", values="Fare", aggfunc=np.mean)
print (passenger_Fare)#按照Pclass的值进行分组,对Fare进行计算均值
#结果为:
'''
Fare
Pclass
1 84.154687
2 20.662183
3 13.675550
'''
#如要计算不同舱的生存率,
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean)
print (passenger_survival)#按照Pclass的值进行分组,对Survived进行计算均值
#结果为:
'''
Pclass
1 0.629630
2 0.472826
3 0.242363
'''
#计算不同Pclass平均年龄,参数aggfunc没赋值时,默认参数为aggfunc=np.mean,求均值。
passenger_age = titanic_survival.pivot_table(index="Pclass", values="Age")#aggfunc=np.mean
print(passenger_age)
'''
Age
Pclass
1 38.233441
2 29.877630
3 25.140620
'''
#根据不同上岸地点Embarked统计,费用Fare之和,统计活下来Survived总人数
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc=np.sum)
print(port_stats)
'''
Fare Survived
Embarked
C 10072.2962 93
Q 1022.2543 30
S 17439.3988 217
'''
如何删除带有空值的行或者列?使用dropna函数。
#specifying axis=1 or axis='columns' will drop any columns that have null values
drop_na_columns = titanic_survival.dropna(axis=1)#删除带有空值的列,返回的是删除带空值列后的矩阵数据
# print(drop_na_columns)
drop_na_rows = titanic_survival.dropna(axis=0)#删除带有空值的行,返回的是删除带空值行后的矩阵数据
# print(drop_na_rows)
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])#限定操作在subset中,如果["Age", "Sex"]有缺失值,则删除行(axis=0).
print (new_titanic_survival)
准确定位到某行某列的元素,使用索引,列索引用
row_index_83_age = titanic_survival.loc[83,"Age"]#第83行,第"Age"列
row_index_1000_pclass = titanic_survival.loc[1000,"Pclass"]#第83行,第"Age"列
print (row_index_83_age)
print (row_index_1000_pclass)
如何把样本按照某一列进行排序呢?使用sort_values,排序后索引更改需使用reset_index
new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)
# help(titanic_survival.sort_values)
print (new_titanic_survival[0:10])#按照"Age"把所有样本排序,但索引未变
titanic_reindexed = new_titanic_survival.reset_index(drop=True)#原索引不用了
print(titanic_reindexed.iloc[0:10])#修改索引
如何对数据矩阵的每行或者每列应用自定义的函数?可以使用apply函数(这样可以简化操作)
#想要对数据矩阵的每行或者每列应用自定义函数,可以使用apply函数(这样可以简化操作)
def not_null_count(column):
column_null = pd.isnull(column)#对每列进行求空值,得到的是true和false组成的向量
null = column[column_null]#通过索引的方式筛选出带数据元素
return len(null)#返回带有数据的元素个数
column_null_count = titanic_survival.apply(not_null_count)#应用在数据的每列中,默认参数apply(func, axis=0)axis=0,表示对列操作,axis=0表示对行操作
print (column_null_count)
#结果为:
'''
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
'''
#对数据元素的行进行操作,axis参数=1,如:
#By passing in the axis=1 argument, we can use the DataFrame.apply() method to iterate over rows instead of columns.
def which_class(row):
pclass = row['Pclass']
if pd.isnull(pclass):
return "Unknown"
elif pclass == 1:
return "First Class"
elif pclass == 2:
return "Second Class"
elif pclass == 3:
return "Third Class"
classes = titanic_survival.apply(which_class, axis=1)
print (classes)
#结果为
'''
0 Third Class
1 First Class
2 Third Class
3 First Class
4 Third Class
...
886 Second Class
887 First Class
888 Third Class
889 First Class
890 Third Class
Length: 891, dtype: object
'''
#如对年龄分级,18以下的为True,其他的都False
def is_minor(row):#挑出年龄小于18岁的数据,低于18为True
if row["Age"] < 18:
return True
else:
return False
minors = titanic_survival.apply(is_minor, axis=1)#对行进行操作,年龄被分为两个档,一个True一个False
print (minors)
'''
0 False
1 False
2 False
3 False
4 False
...
886 False
887 False
888 False
889 False
890 False
Length: 891, dtype: bool
'''
#对年龄分三个label
def generate_age_label(row):#对行进行操作,年龄被分为三个label,unknown,minor,adult
age = row["Age"]
if pd.isnull(age):
return "unknown"
elif age < 18:
return "minor"
else:
return "adult"
age_labels = titanic_survival.apply(generate_age_label, axis=1)
print (age_labels)
'''
0 adult
1 adult
2 adult
3 adult
4 adult
...
886 adult
887 adult
888 unknown
889 adult
890 adult
Length: 891, dtype: object
'''
#这样分好label之后,把分好labe的数据替换原数据列,再去统计数据会简便很多
#如:按年龄label统计各label生还人数均值
titanic_survival['age_labels'] = age_labels
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print (age_group_survival)
'''
Survived
age_labels
adult 0.381032
minor 0.539823
unknown 0.293785
'''