层次化索引
层次化也就是在一个轴上拥有多个索引级别
Series的层次化索引
# Build a Series whose row index has two levels: the outer level is a letter,
# the inner level is a number. The values are random normal draws.
outer_level = ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd']
inner_level = [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]
data = Series(np.random.randn(10), index=[outer_level, inner_level])
data
a 1 0.965999
2 -0.271733
3 0.133910
b 1 -0.806885
2 -0.622905
3 -0.355330
c 1 -0.659194
2 -1.082872
d 2 -0.043984
3 -1.125324
dtype: float64
# 选取数据子集
data['b']
1 -0.806885
2 -0.622905
3 -0.355330
dtype: float64
data['b':'c'] # 在pandas中顾头也顾尾
b 1 -0.806885
2 -0.622905
3 -0.355330
c 1 -0.659194
2 -1.082872
dtype: float64
# Select several outer-level labels by name. The .ix indexer was removed in
# pandas 1.0; .loc is the label-based replacement.
data.loc[['b', 'd']]
b 1 -0.806885
2 -0.622905
3 -0.355330
d 2 -0.043984
3 -1.125324
dtype: float64
# 在内层索引中进行选取:取出内层索引为2的所有行
data[:,2]
a -0.271733
b -0.622905
c -1.082872
d -0.043984
dtype: float64
# 层次化索引在数据重塑和基于分组的操作中扮演着重要的角色
# 这个函数会把层次化索引转为DataFrame格式,最外层的行索引作为DataFrame的行索引,内层的索引作为列索引
data.unstack()
          1         2         3
a  0.965999 -0.271733  0.133910
b -0.806885 -0.622905 -0.355330
c -0.659194 -1.082872       NaN
d       NaN -0.043984 -1.125324
# unstack()的逆运算,转回来
data.unstack().stack()
a 1 0.965999
2 -0.271733
3 0.133910
b 1 -0.806885
2 -0.622905
3 -0.355330
c 1 -0.659194
2 -1.082872
d 2 -0.043984
3 -1.125324
dtype: float64
DataFrame的层次化索引
# DataFrame with hierarchical labels on both axes: two row levels
# (letter, number) and two column levels (state, color).
frame_rows = [['a', 'a', 'b', 'b'], [1, 2, 1, 2]]
frame_cols = [['ohio', 'ohio', 'color'], ['green', 'red', 'green']]
frame = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=frame_rows,
    columns=frame_cols,
)
frame
     ohio     color
    green red green
a 1     0   1     2
  2     3   4     5
b 1     6   7     8
  2     9  10    11
# Name the levels of the hierarchical index on each axis so that later
# operations can refer to a level by name instead of by position.
row_level_names = ['key1', 'key2']
column_level_names = ['state', 'color']
frame.index.names = row_level_names
frame.columns.names = column_level_names
frame
state      ohio     color
color     green red green
key1 key2
a    1        0   1     2
     2        3   4     5
b    1        6   7     8
     2        9  10    11
frame['ohio']
color      green  red
key1 key2
a    1         0    1
     2         3    4
b    1         6    7
     2         9   10
重排分级顺序
frame
state      ohio     color
color     green red green
key1 key2
a    1        0   1     2
     2        3   4     5
b    1        6   7     8
     2        9  10    11
# Swap the two row levels so key2 becomes the outer level, then sort the rows
# by level 0 (now key2). DataFrame.sortlevel() was removed in pandas 1.0;
# sort_index(level=...) is its replacement.
frame.swaplevel(0, 1).sort_index(level=0)
state      ohio     color
color     green red green
key2 key1
1    a        0   1     2
     b        6   7     8
2    a        3   4     5
     b        9  10    11
# After swapping the row levels, sort by level 1, which is now key1.
# DataFrame.sortlevel() was removed in pandas 1.0; sort_index(level=...) is
# its replacement.
frame.swaplevel(0, 1).sort_index(level=1)
state      ohio     color
color     green red green
key2 key1
1    a        0   1     2
2    a        3   4     5
1    b        6   7     8
2    b        9  10    11
根据层次索引级别汇总统计
frame
state      ohio     color
color     green red green
key1 key2
a    1        0   1     2
     2        3   4     5
b    1        6   7     8
     2        9  10    11
# Add together the rows that share the same key2 label (1 with 1, 2 with 2).
# The level= argument of DataFrame.sum() was removed in pandas 2.0; grouping
# by the index level and summing gives the same result.
frame.groupby(level='key2').sum()
state  ohio     color
color green red green
key2
1         6   8    10
2        12  14    16
# Add together the columns that share the same label on the 'color' level of
# the column index: the two 'green' columns are summed, and the single 'red'
# column is left unchanged.
# sum(level=..., axis=1) was removed in pandas 2.0, so transpose, group the
# rows by that level, sum, and transpose back.
frame.T.groupby(level='color').sum().T
color      green  red
key1 key2
a    1         2    1
     2         8    4
b    1        14    7
     2        20   10
使用DataFrame的列
# Plain DataFrame with a default integer index; columns 'c' and 'd' are used
# below as the source of a hierarchical row index.
frame1 = pd.DataFrame(
    dict(
        a=range(7),
        b=range(7, 0, -1),
        c=['one', 'one', 'one', 'two', 'two', 'two', 'two'],
        d=[0, 1, 2, 0, 1, 2, 3],
    )
)
frame1
   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3
# 把c、d两列设置为行索引,默认会从列中删除这两列;如果想保留这两列,可以传入drop=False
frame1.set_index(['c','d'])
       a  b
c   d
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1
# reset_index() is the inverse of set_index(): it moves the current row index
# back into ordinary columns. The original call used `frame2`, which is not
# defined anywhere in the visible source; frame1 is used instead.
# The set_index() call above did not assign its result, so frame1 still has
# its default integer index, which comes back as a new 'index' column.
# NOTE(review): if frame2 was meant to be frame1.set_index(['c', 'd']), assign
# it first and call frame2.reset_index() to get c and d back as columns.
frame1.reset_index()
   index  a  b    c  d
0      0  0  7  one  0
1      1  1  6  one  1
2      2  2  5  one  2
3      3  3  4  two  0
4      4  4  3  two  1
5      5  5  2  two  2
6      6  6  1  two  3