1.模块 import pandas as pd
import numpy as np
1.1.读表 detail=pd.read_excel(r'E:\DataAnalyst\数据分析表\meal_order_detail.xlsx',sheet_name=1)
2.透视表 pivot_table(分组聚合,功能类似 groupby):
2.1.aggfunc:聚合函数:np.sum/np.mean/np.std
index:行分组键
columns:列分组键
values:想要求统计的元素
fill_value:填充值 将表中的nan值修改为任何一个想要的值
margins:汇总开关
2.1.1.行分组 print(pd.pivot_table(detail[['order_id', 'counts', 'amounts']], index='order_id',aggfunc=np.sum))
amounts counts
order_id
162 1032 18
170 450 12
2.1.2.多个行分组键
detail_pivot=pd.pivot_table(detail[['order_id', 'counts', 'amounts','dishes_name']], index=['order_id','dishes_name'],aggfunc=np.sum)
print(detail_pivot)
print(detail_pivot.index)
amounts counts
order_id dishes_name
162 53度茅台 128 1
姜葱炒花蟹 45 2
意文柠檬汁 13 1
拌土豆丝 25 1
MultiIndex(levels=[[162, 170, 172, ...], ['53度茅台', '姜葱炒花蟹', ...]], codes=[[0, 0, 0, ...], [0, 1, 2, ...]], names=['order_id', 'dishes_name'])
2.1.3.列分组键
print(pd.pivot_table(detail[['order_id', 'counts', 'amounts','dishes_name']],
index='order_id',
columns='dishes_name',
aggfunc=np.sum))
amounts … counts
dishes_name 42度海之蓝 北冰洋汽水 38度剑南春 50度古井贡酒 … 黄尾袋鼠西拉
order_id …
162 NaN NaN NaN NaN … NaN 2.0 NaN NaN
2.1.4.创建pandas
data=pd.DataFrame({'name':['张三','乔大爷','老王','李四'],
'age':[12,16,26,50],
'job':['student','student','doc','doc'],
'provi':['shandong','shanxi','taiwan','taiwan']})
2.1.5.多个行分组键
print(pd.pivot_table(data[['job', 'age','provi']], index=['job','provi']))
age
job provi
doc taiwan 38
student shandong 12
shanxi 16
2.1.6.行分组键与列分组键同时使用
print(pd.pivot_table(data[['job', 'age', 'provi']], columns='provi', index='job'))
print(pd.pivot_table(data[['job', 'age', 'provi']], columns='provi', index='job',values='age'))
print(pd.pivot_table(data[['job', 'age', 'provi']], columns='provi', index='job',values='age',fill_value=0))
age
provi shandong shanxi taiwan
job
doc NaN NaN 38.0
student 12.0 16.0 NaN
provi shandong shanxi taiwan
job
doc NaN NaN 38.0
student 12.0 16.0 NaN
provi shandong shanxi taiwan
job
doc 0 0 38
student 12 16 0
2.1.7.添加汇总开关与aggfunc统一
print(pd.pivot_table(data[['job', 'age', 'provi']],
columns='provi', index='job',values='age',fill_value=0,margins=True))
provi shandong shanxi taiwan All
job
doc 0 0 38 38
student 12 16 0 14
All 12 16 38 26
2.2.交叉表
2.2.1.使用crosstab 函数创建交叉表
print(pd.crosstab(index=data['job'], columns=data['provi'], values=data['age'],aggfunc=np.sum))
provi shandong shanxi taiwan
job
doc NaN NaN 76.0
student 12.0 16.0 NaN
3.合并数据
3.1.df1 = pd.DataFrame({'name': ['张三', '乔大爷', '老王','李四'],
'age': [12, 16, 26, 50],
'job':['student', 'student', 'doc', 'doc'],
'provi': ['shandong', 'shanxi', 'taiwan', 'taiwan']},index=range(4))
df2 = pd.DataFrame({'name': ['张三','赵五'],
'age': [ 12, 40],
'id': ['0', '1']}, index= [0, 4])
3.1.1. name age job provi
0 张三 12 student shandong
1 乔大爷 16 student shanxi
2 老王 26 doc taiwan
3 李四 50 doc taiwan
name age id
0 张三 12 0
4 赵五 40 1
3.1.2.横向堆叠
#横向堆叠外连接 print(pd.concat([df1, df2], axis=1, join='outer'))
name age job provi name age id
0 张三 12.0 student shandong 张三 12.0 0
1 乔大爷 16.0 student shanxi NaN NaN NaN
2 老王 26.0 doc taiwan NaN NaN NaN
3 李四 50.0 doc taiwan NaN NaN NaN
4 NaN NaN NaN NaN 赵五 40.0 1
#横向堆叠内连接 print(pd.concat([df1, df2], axis=1, join='inner'))
name age job provi name age id
0 张三 12 student shandong 张三 12 0
3.1.3.纵向堆叠
#纵向堆叠外连接
print(pd.concat([df1, df2], axis=0, join='outer'))
age id job name provi
0 12 NaN student 张三 shandong
1 16 NaN student 乔大爷 shanxi
2 26 NaN doc 老王 taiwan
3 50 NaN doc 李四 taiwan
0 12 0 NaN 张三 NaN
4 40 1 NaN 赵五 NaN
#纵向堆叠内连接
print(pd.concat([df1, df2], axis=0, join='inner'))
name age
0 张三 12
1 乔大爷 16
2 老王 26
3 李四 50
0 张三 12
4 赵五 40
3.2.
3.2.1.主键连接 merge
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K3'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
key1 key2 A B
0 K0 K0 A0 B0
1 K0 K1 A1 B1
2 K1 K0 A2 B2
3 K3 K1 A3 B3
key1 key2 C D
0 K0 K0 C0 D0
1 K1 K0 C1 D1
2 K1 K0 C2 D2
3 K2 K0 C3 D3
how 可取值:'left'、'right'、'inner'、'outer'
merge_data=pd.merge(left,right,how='inner',on='key1')
print(merge_data)
omerge_data=pd.merge(left,right,how='outer',on='key1')
print(omerge_data)
key1 key2_x A B key2_y C D
0 K0 K0 A0 B0 K0 C0 D0
1 K0 K1 A1 B1 K0 C0 D0
2 K1 K0 A2 B2 K0 C1 D1
3 K1 K0 A2 B2 K0 C2 D2
key1 key2_x A B key2_y C D
0 K0 K0 A0 B0 K0 C0 D0
1 K0 K1 A1 B1 K0 C0 D0
2 K1 K0 A2 B2 K0 C1 D1
3 K1 K0 A2 B2 K0 C2 D2
4 K3 K1 A3 B3 NaN NaN NaN
5 K2 NaN NaN NaN K0 C3 D3
merge_datal=pd.merge(left,right,how='left',on='key1')
print(merge_datal)
merge_datar=pd.merge(left,right,how='right',on='key1')
print(merge_datar)
key1 key2_x A B key2_y C D
0 K0 K0 A0 B0 K0 C0 D0
1 K0 K1 A1 B1 K0 C0 D0
2 K1 K0 A2 B2 K0 C1 D1
3 K1 K0 A2 B2 K0 C2 D2
4 K3 K1 A3 B3 NaN NaN NaN
key1 key2_x A B key2_y C D
0 K0 K0 A0 B0 K0 C0 D0
1 K0 K1 A1 B1 K0 C0 D0
2 K1 K0 A2 B2 K0 C1 D1
3 K1 K0 A2 B2 K0 C2 D2
4 K2 NaN NaN NaN K0 C3 D3
通过多个主键拼接
rmerge_data=pd.merge(left,right,how='right',on=['key1','key2'])
print(rmerge_data)
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
3 K2 K0 NaN NaN C3 D3
imerge_data=pd.merge(left,right,how='inner',on=['key1','key2'])
print(imerge_data)
key1 key2 A B C D
0 K0 K0 A0 B0 C0 D0
1 K1 K0 A2 B2 C1 D1
2 K1 K0 A2 B2 C2 D2
lrimerge_data=pd.merge(left,right,how='inner',left_on='key1',right_on='key2')
print(lrimerge_data)
key1_x key2_x A B key1_y key2_y C D
0 K0 K0 A0 B0 K0 K0 C0 D0
1 K0 K0 A0 B0 K1 K0 C1 D1
2 K0 K0 A0 B0 K1 K0 C2 D2
3 K0 K0 A0 B0 K2 K0 C3 D3
4 K0 K1 A1 B1 K0 K0 C0 D0
5 K0 K1 A1 B1 K1 K0 C1 D1
6 K0 K1 A1 B1 K1 K0 C2 D2
7 K0 K1 A1 B1 K2 K0 C3 D3
lrimerge_data1=pd.merge(left,right,how='left',left_on='key2',right_on='key1')
print(lrimerge_data1)
key1_x key2_x A B key1_y key2_y C D
0 K0 K0 A0 B0 K0 K0 C0 D0
1 K0 K1 A1 B1 K1 K0 C1 D1
2 K0 K1 A1 B1 K1 K0 C2 D2
3 K1 K0 A2 B2 K0 K0 C0 D0
4 K3 K1 A3 B3 K1 K0 C1 D1
5 K3 K1 A3 B3 K1 K0 C2 D2
3.2.2.重叠合并
dict1 = {'ID':[1,2,3,4,5,6,7,8,9],'System':['W10','w10',np.nan,'w10',np.nan,np.nan,'w7','w7','w8']}
dict2 = {'ID':[1,2,3,4,5,6,7,8,9],'System':[np.nan,np.nan,'w7','w7','w7','w7','w8',np.nan,np.nan]}
df1 = pd.DataFrame(dict1)
df2 = pd.DataFrame(dict2)
ID System
0 1 W10
1 2 w10
2 3 NaN
3 4 w10
4 5 NaN
5 6 NaN
6 7 w7
7 8 w7
8 9 w8
ID System
0 1 NaN
1 2 NaN
2 3 w7
3 4 w7
4 5 w7
5 6 w7
6 7 w8
7 8 NaN
8 9 NaN
主表如果有值,保留原值:df2.combine_first(df1) 中 df2 为主表、df1 为副表;df1.combine_first(df2) 中 df1 为主表、df2 为副表
print('合并后的数据为:\n',df2.combine_first(df1))
合并后的数据为:
ID System
0 1 W10
1 2 w10
2 3 w7
3 4 w7
4 5 w7
5 6 w7
6 7 w8
7 8 w7
8 9 w8
print('合并后的数据为:\n',df1.combine_first(df2))
合并后的数据为:
ID System
0 1 W10
1 2 w10
2 3 w7
3 4 w10
4 5 w7
5 6 w7
6 7 w7
7 8 w7
8 9 w8
4.清洗数据
4.1.处理重复值
4.1.1.left = pd.DataFrame({'key1': ['k2', 'K0', 'K1', np.nan],
'key2': [4, 3,2,1],
'A': [np.nan, np.nan, np.nan, np.nan],
'B': [1, 1, 2, 3]})
key1 key2 A B
0 k2 4 NaN 1
1 K0 3 NaN 1
2 K1 2 NaN 2
3 NaN 1 NaN 3
4.1.2.#isnull/notnull
#print(left.isnull().sum())
#print(left.notnull().sum())
key1 1
key2 0
A 4
B 0
dtype: int64
key1 3
key2 4
A 0
B 4
dtype: int64
4.1.3.删除法
print(left.dropna(axis=1,how='any'))
print(left.dropna(axis=0,how='any'))
print(left.dropna(axis=1,how='all'))
key2 B
0 4 1
1 3 1
2 2 2
3 1 3
Empty DataFrame
Columns: [key1, key2, A, B]
Index: []
key1 key2 B
0 k2 4 1
1 K0 3 1
2 K1 2 2
3 NaN 1 3
4.1.4.替换法
print(left.fillna('-1')) #也可用众数、中位数、均值填充
key1 key2 A B
0 k2 4 -1 1
1 K0 3 -1 1
2 K1 2 -1 2
3 -1 1 -1 3
4.1.5.插值法
线性插值
x=np.array([1,2,3,4,5,8,9,10])
y=np.array([3,5,7,9,11,17,19,10])
y1 = np.array([2,8,18,32,50,128,162,200])
from scipy.interpolate import interp1d
linear=interp1d(x,y,kind='linear') #y=kx+b
linear1=interp1d(x,y1,kind='linear')
print(linear([3,6]))
[ 7. 13.]
拉格朗日插值
from scipy.interpolate import lagrange
#larger_value=lagrange(x,y)
larger_value=lagrange(x,y1)
print(larger_value([6,7]))
[72. 98.]
4.2.检测与处理异常值
4.2.1.#(1)根据经验值判断:速度、加速度 、温度
x=np.array([1,200000,3,4,5,8,9,10])
4.2.2.3σ原则
#μ均值 、σ标准差
#当μ-σ<x<μ+σ(0.68)
#当μ-2σ<x<μ+2σ(0.954)
#当μ-3σ<x<μ+3σ(0.997)
ser=detail['amounts']
mask1=ser.mean()-3*ser.std()>ser
mask2=ser.mean()+3*ser.std()<ser
mask=mask1 | mask2 ##异常值对应的bool值
print(ser[mask])
95 158
150 175
151 169
191 159
255 175
260 159
4.3.转换数据
4.3.1.哑变量处理
left1 = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K3'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
print(pd.get_dummies(left1['A']))
A0 A1 A2 A3
0 1 0 0 0
1 0 1 0 0
2 0 0 1 0
3 0 0 0 1
4.3.2.离散化连续型数据
print(pd.cut(detail['amounts'], 5))
Categories (5, interval[float64]): [(0.823, 36.4] < (36.4, 71.8] < (71.8, 107.2] < (107.2, 142.6] < (142.6, 178.0]]
4.4.标准化数据 将不同的属性值化成相同的标准
4.4.1.离差标准化
#x1=(x-min)/(max-min)
##x1范围:0<x1<=1
#问题:(1)min=max,(2)min/max异常值
4.4.2.标准差标准化
#(x-μ)/σ
##输出:均值为0, 标准差为1
4.4.3.小数定标标准化
#x1=x/10^k
ser=detail['amounts']
print(ser / 10 ** np.ceil((np.log10(ser.abs().max()))))
0 0.089
1 0.045
2 0.055
4.5.相似度求法
4.5.1.# 加载数据
detail = pd.read_excel('meal_order_detail.xlsx')
#展示detail
print(detail)
#求取相似度
#res = detail[['amounts', 'counts']].corr()
#res = detail[['amounts', 'counts']].corr(method='spearman')
res = detail[['amounts', 'counts']].corr(method='kendall')