1. Observe the data and get familiar with its format
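The snippets below assume a SparkSession is already available as `spark`, as it is in the pyspark shell or a notebook. If you run them as a standalone script, a minimal sketch for creating one (the master and app name here are just placeholders) looks like this:

from pyspark.sql import SparkSession

# Build or reuse a local SparkSession; adjust master/appName to your environment
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("titanic-eda") \
    .getOrCreate()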
df_train = spark.read.csv('./data/titanic-train.csv',header=True,inferSchema=True).cache()
# Print the schema of the DataFrame df_train
df_train.printSchema()
# Number of rows and columns
print(df_train.count(), len(df_train.columns))
# show() displays the first 20 rows by default
df_train.show()
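Since the file was read with inferSchema=True, Spark scanned the data and guessed a type for each column. Besides printSchema(), the dtypes attribute returns the same information as a plain Python list, which is handy for a quick check:

# List of (column name, inferred type) pairs, e.g. ('PassengerId', 'int')
print(df_train.dtypes)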
2. Descriptive statistics of the data
df_train = spark.read.csv('./data/titanic-train.csv',header=True,inferSchema=True).cache()
# Basic descriptive statistics (count, mean, stddev, min, max) per column
df_train.describe("Age","Pclass","SibSp","Parch").show()
df_train.describe("Sex","Cabin","Embarked","Fare","Survived").show()
# Count passengers grouped by sex and survival, then convert the result to pandas
pdf = df_train.groupBy('sex', 'Survived') \
    .agg({'PassengerId': 'count'}) \
    .withColumnRenamed("count(PassengerId)", "count") \
    .orderBy("sex") \
    .toPandas()
print(pdf)
      sex  Survived  count
0  female         1    233
1  female         0     81
2    male         0    468
3    male         1    109
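The same two-way count can also be produced directly in Spark with crosstab(), which skips the agg/rename boilerplate; a small sketch:

# Cross-tabulation of Sex against Survived (one row per sex, one column per Survived value)
df_train.crosstab('Sex', 'Survived').show()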
print(pdf[pdf["Survived"] == 1])
      sex  Survived  count
0  female         1    233
3    male         1    109
print(pdf[pdf["Survived"] == 0])
      sex  Survived  count
1  female         0     81
2    male         0    468
# Select the count column (a pandas Series)
print(pdf[pdf["Survived"] == 1]["count"])
0    233
3    109
Name: count, dtype: int64
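Since the grouped counts are already in the pandas DataFrame pdf, the survival rate per sex follows by dividing the survivors by each group's total; a sketch based on the counts printed above:

# Survival rate per sex = survivors / (survivors + non-survivors)
totals = pdf.groupby('sex')['count'].sum()
survivors = pdf[pdf['Survived'] == 1].set_index('sex')['count']
print(survivors / totals)  # roughly 0.74 for female, 0.19 for male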