数据的描述
1.数据的集中趋势
(1)平均值
import pandas as pd
# 1. Arithmetic mean
# Load cars_score.csv, then average every column (axis=0 reduces over the rows).
cars_score = pd.read_csv(
    r'D:\Projects\Python\Doing\pythonProject\data\cars_score.csv'
)
column_means = cars_score.mean(axis=0)
print(column_means)
# The bare string below is the output recorded by the author; it has no runtime effect.
'''
油耗 3.752
动力 4.429
外观 4.763
空间 4.701
dtype: float64
'''
# 2. Weighted mean
RFM = pd.read_excel(
    r'D:\Projects\Python\Doing\pythonProject\data\RFM.xlsx'
)
# Scale each score column by its fixed weight (0.2 / 0.5 / 0.3), then add the
# terms left to right to form the per-row weighted mean.
r_part = 0.2 * RFM['R_score']
f_part = 0.5 * RFM['F_score']
m_part = 0.3 * RFM['M_score']
RFM['Weight_Mean'] = r_part + f_part + m_part
# Show the first five rows, including the new Weight_Mean column.
preview = RFM.head()
print(preview)
# The bare string below is the output recorded by the author; it has no runtime effect.
'''
lst_order_date freq tot_amt R_score F_score M_score Weight_Mean
0 2017-06-01 5 1469.99998 6 4 6 5.0
1 2016-07-11 2 168.00000 3 2 2 2.2
2 2017-07-02 1 79.00000 6 1 1 2.0
3 2016-06-01 1 109.00000 1 1 2 1.3
4 2017-02-19 3 316.00000 5 3 4 3.7
'''
# 3. Geometric mean
GDP = pd.read_excel(r'D:\Projects\Python\Doing\pythonProject\data\G_D_P.xlsx')
# Cumulative product of the 'Grouth' column; the last entry is the running
# product over the whole column.
cum_prod = GDP.Grouth.cumprod()
# Take the last entry by position. The previous label lookup
# (cum_prod[GDP.shape[0]-1]) only worked while the frame kept its default
# 0..n-1 index; .iloc[-1] is correct for any index.
res = cum_prod.iloc[-1]
# Geometric mean: the n-th root of the product, n being the number of entries.
print(pow(res, 1 / len(cum_prod)))
# 0.08776443979162651
(2)中位数和四分位数
import pandas as pd
import matplotlib.pyplot as plt
tips = pd.read_csv(r'D:\Projects\Python\Doing\pythonProject\data\tips.csv')
# Work on the 'tip' column as a standalone Series.
tip = tips['tip']
# Draw a histogram of the column with pandas' built-in hist method.
hist_style = {
    'grid': False,             # no grid lines inside the axes
    'facecolor': 'steelblue',  # fill colour of the bars
    'edgecolor': 'black',      # border colour of the bars
}
tip.hist(**hist_style)
# plt.show()
# 1. Median and mean
print(tip.median())  # 2.9
print(tip.mean())  # 2.9982786885245902
# 2. Quartiles: lower (25%) first, then upper (75%)
for q in (0.25, 0.75):
    print(tip.quantile(q=q))
(3)众数
import pandas as pd
titanic = pd.read_excel(
    r'D:\Projects\Python\Doing\pythonProject\data\Titanic.xlsx'
)
# mode() returns a Series holding the most frequent value(s) of the column.
embarked_mode = titanic['Embarked'].mode()
print(embarked_mode)
# The bare string below is the output recorded by the author; it has no runtime effect.
'''
0 S
dtype: object
'''
income=pd.read_excel(r'D:\Projects\Python\Doing\pythonProject\data\Income.xlsx')
# 返回众数所在组的行索引
index=