mysql 层次化_Pandas基本功能之层次化索引及层次化汇总-CSDN博客

本文链接：https://blog.csdn.net/weixin_29165725/article/details/114333864

本文详细介绍了Pandas中层次化索引的概念及其在Series和DataFrame中的应用，包括数据子集的选择、数据重塑及基于分组的操作，并通过实例展示了如何进行层级索引的创建、重排和汇总统计。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

层次化索引

层次化也就是在一个轴上拥有多个索引级别

Series的层次化索引

data=Series(np.random.randn(10),index=[

['a','a','a','b','b','b','c','c','d','d'],

[1,2,3,1,2,3,1,2,2,3]

])

data

a 1 0.965999

2 -0.271733

3 0.133910

b 1 -0.806885

2 -0.622905

3 -0.355330

c 1 -0.659194

2 -1.082872

d 2 -0.043984

3 -1.125324

dtype: float64

# 选取数据子集

data['b']

1 -0.806885

2 -0.622905

3 -0.355330

dtype: float64

data['b':'c'] # 按标签切片时，pandas同时包含起点和终点（闭区间），这与Python按位置切片不包含终点不同

b 1 -0.806885

2 -0.622905

3 -0.355330

c 1 -0.659194

2 -1.082872

dtype: float64

data.ix[['b','d']] # 按行索引名称选择（注：.ix已在pandas 1.0中移除，新版本请使用 data.loc[['b','d']]）

b 1 -0.806885

2 -0.622905

3 -0.355330

d 2 -0.043984

3 -1.125324

dtype: float64

# 在内层索引中进行选取：选出每个外层索引下内层索引为2的数据

data[:,2]

a -0.271733

b -0.622905

c -1.082872

d -0.043984

dtype: float64

# 层次化索引在数据重塑和基于分组的操作中扮演着重要的角色

# unstack()会把层次化索引的Series转为DataFrame：外层索引作为DataFrame的行索引，内层索引作为列索引，缺失的组合用NaN填充

data.unstack()

          1         2         3
a  0.965999 -0.271733  0.133910
b -0.806885 -0.622905 -0.355330
c -0.659194 -1.082872       NaN
d       NaN -0.043984 -1.125324

# unstack()的逆运算，转回来

data.unstack().stack()

a 1 0.965999

2 -0.271733

3 0.133910

b 1 -0.806885

2 -0.622905

3 -0.355330

c 1 -0.659194

2 -1.082872

d 2 -0.043984

3 -1.125324

dtype: float64

DataFrame的层次化索引

frame = pd.DataFrame(np.arange(12).reshape(4,3),index=[['a','a','b','b'],[1,2,1,2]],

columns=[['ohio','ohio','color'],['green','red','green']]

)

frame

     ohio     color
    green red green
a 1     0   1     2
  2     3   4     5
b 1     6   7     8
  2     9  10    11

# 给层级行索引加名字

frame.index.names = ['key1','key2']

# 给层级列索引加名字

frame.columns.names = ['state','color']

frame

state      ohio     color
color     green red green
key1 key2
a    1        0   1     2
     2        3   4     5
b    1        6   7     8
     2        9  10    11

frame['ohio']

color      green  red
key1 key2
a    1         0    1
     2         3    4
b    1         6    7
     2         9   10

重排分级顺序

frame

state      ohio     color
color     green red green
key1 key2
a    1        0   1     2
     2        3   4     5
b    1        6   7     8
     2        9  10    11

# swaplevel(0,1)交换key1和key2两个级别；sortlevel(0)表示按交换后的第0级（即key2）排序。注：sortlevel已在pandas 1.0中移除，新版本请使用sort_index(level=0)

frame.swaplevel(0,1).sortlevel(0)

state      ohio     color
color     green red green
key2 key1
1    a        0   1     2
     b        6   7     8
2    a        3   4     5
     b        9  10    11

# sortlevel(1)中的1指按交换后的第1级（即key1）排序

frame.swaplevel(0,1).sortlevel(1)

state      ohio     color
color     green red green
key2 key1
1    a        0   1     2
2    a        3   4     5
1    b        6   7     8
2    b        9  10    11

根据层次索引级别汇总统计

frame

state      ohio     color
color     green red green
key1 key2
a    1        0   1     2
     2        3   4     5
b    1        6   7     8
     2        9  10    11

# 按行索引的key2级别分组求和：key2为1的行相加，key2为2的行相加。注：sum的level参数已在pandas 2.0中移除，新版本请使用frame.groupby(level='key2').sum()

frame.sum(level='key2')

state  ohio     color
color green red green
key2
1         6   8    10
2        12  14    16

# 按列索引的color级别求和（axis=1）：同为green的两列相加；red只有一列，所以保持不变

frame.sum(level='color',axis=1)

color      green  red
key1 key2
a    1         2    1
     2         8    4
b    1        14    7
     2        20   10

使用DataFrame的列

frame1 = pd.DataFrame({'a':range(7),'b':range(7,0,-1),

'c':['one','one','one','two','two','two','two'],

'd':[0,1,2,0,1,2,3]

})

frame1

   a  b    c  d
0  0  7  one  0
1  1  6  one  1
2  2  5  one  2
3  3  4  two  0
4  4  3  two  1
5  5  2  two  2
6  6  1  two  3

# 把c、d两列设置为行索引；默认会从列中删除这两列，如果想保留，可以传入drop=False

frame1.set_index(['c','d'])

       a  b
c   d
one 0  0  7
    1  1  6
    2  2  5
two 0  3  4
    1  4  3
    2  5  2
    3  6  1

# reset_index的功能与set_index相反：把行索引的各级别还原为普通的列，了解即可（原文未给出frame2的定义；从下面的输出看，它使用的是默认整数索引，所以原索引变成了名为index的新列）

frame2.reset_index()

indexabcd

0007one0

1116one1

2225one2

3334two0

4443two1

5552two2

6661two3