1.创建层次化索引
- 隐式构造
(1)最常用的方法是给DataFrame构造函数的index参数传递两个或者更多的数组
import numpy as np
import pandas as pd

# Implicit construction: a list of two parallel label arrays is passed as
# `index` / `columns`, giving a two-level index on each axis.
class_labels = ['一班'] * 3 + ['二班'] * 3
student_labels = ['学生1', '学生2', '学生3', '学生4', '学生5', '学生6']
exam_labels = ['期中'] * 3 + ['期末'] * 3
subject_labels = ['语文', '数学', '英语'] * 2

data = np.random.randint(0, 100, size=(6, 6))
index = [class_labels, student_labels]
columns = [exam_labels, subject_labels]

df = pd.DataFrame(data, index=index, columns=columns)
print(df)
(2)Series创建多层索引
import numpy as np
import pandas as pd

# A Series takes the same list-of-arrays form to get a two-level row index.
outer_labels = ['一班'] * 3 + ['二班'] * 3
inner_labels = ['学生1', '学生2', '学生3', '学生4', '学生5', '学生6']

data = np.random.randint(0, 100, size=6)
index = [outer_labels, inner_labels]

s = pd.Series(data=data, index=index)
print(s)
- 显式构造pd.MultiIndex
(1)使用数组
import numpy as np
import pandas as pd

# Explicit construction: build the row MultiIndex from two parallel arrays,
# one array per level.
level_arrays = [
    ['一班'] * 3 + ['二班'] * 3,
    ['学生1', '学生2', '学生3', '学生4', '学生5', '学生6'],
]
index = pd.MultiIndex.from_arrays(level_arrays)

# The columns still use the implicit list-of-arrays form.
columns = [
    ['期中'] * 3 + ['期末'] * 3,
    ['语文', '数学', '英语'] * 2,
]

data = np.random.randint(0, 100, size=(6, 6))
df = pd.DataFrame(data, index=index, columns=columns)
print(df)
(2)使用tuple
import numpy as np
import pandas as pd

# Explicit construction from tuples: each tuple is one full row key,
# written as (class, student).
row_keys = (
    ('一班', '学生1'), ('一班', '学生2'), ('一班', '学生3'),
    ('二班', '学生4'), ('二班', '学生5'), ('二班', '学生6'),
)
index = pd.MultiIndex.from_tuples(row_keys)

columns = [
    ['期中'] * 3 + ['期末'] * 3,
    ['语文', '数学', '英语'] * 2,
]

data = np.random.randint(0, 100, size=(6, 6))
df = pd.DataFrame(data, index=index, columns=columns)
print(df)
(3)使用product----笛卡尔积
import numpy as np
import pandas as pd

# Explicit construction from a Cartesian product: every class is paired with
# every student label, giving 2 x 3 = 6 row keys.
class_names = ['一班', '二班']
student_names = ['学生1', '学生2', '学生3']
index = pd.MultiIndex.from_product([class_names, student_names])

columns = [
    ['期中'] * 3 + ['期末'] * 3,
    ['语文', '数学', '英语'] * 2,
]

data = np.random.randint(0, 100, size=(6, 6))
df = pd.DataFrame(data, index=index, columns=columns)
print(df)
2.多层索引中Series的索引与切片操作(可以使用loc和iloc)
- 索引
import numpy as np
import pandas as pd

# Demonstrates label-based and position-based element access on a Series
# that carries a two-level (class, student) index.
data = np.random.randint(0, 100, size=6)
index = [
    ['一班', '一班', '一班', '二班', '二班', '二班'],
    ['学生1', '学生2', '学生3', '学生4', '学生5', '学生6'],
]
s = pd.Series(data=data, index=index)
print(s)
print('###########################')

# Explicit (label-based) indexing: an outer-level label selects the
# sub-Series for that class; a further inner label selects one element.
print(s['一班'])
print(s['一班']['学生3'])
print('###########################')

# Implicit (positional) indexing must go through .iloc. Plain s[0] on a
# non-integer index relies on a positional fallback that pandas has
# deprecated in favour of treating integer keys as labels.
first_value = s.iloc[0]
print(first_value)
print(s.iloc[[1, 2]])
- 切片
import numpy as np
import pandas as pd

separator = '###########################'

data = np.random.randint(0, 100, size=6)
index = [
    ['一班'] * 3 + ['二班'] * 3,
    ['学生1', '学生2', '学生3', '学生4', '学生5', '学生6'],
]
s = pd.Series(data=data, index=index)
print(s)
print(separator)

# Explicit (label-based) slice over the outer level; both end labels are
# included in the result.
label_slice = s['一班':'二班']
print(label_slice)
print(separator)

# Implicit (positional) slice: positions 1 up to, but not including, 5.
position_slice = s[1:5]
print(position_slice)
3.多层索引中DataFrame的索引与切片操作(可以使用loc和iloc)
- 索引
import numpy as np
import pandas as pd

separator = '#################################'

data = np.random.randint(0, 100, size=(6, 6))
index = [
    ['一班'] * 3 + ['二班'] * 3,
    ['学生1', '学生2', '学生3', '学生4', '学生5', '学生6'],
]
columns = [
    ['期中'] * 3 + ['期末'] * 3,
    ['语文', '数学', '英语'] * 2,
]
df = pd.DataFrame(data, index=index, columns=columns)
print(df)
print(separator)

# Column selection: by label (outer level, then inner level) and by position.
midterm = df['期中']
print(midterm['数学'])
print(df.iloc[:, [0, 2, 1]])
print(separator)

# Row selection: a whole class by outer label, a single row by its full
# (class, student) key, and several rows by position.
print(df.loc['二班'])
print(df.loc[('二班', '学生6')])
print(df.iloc[[1, 3, 4, 2]])
print(separator)

# Single element: by chained labels (column outer/inner, then row
# outer/inner) and by position (row 0, column 1).
midterm_math = df['期中']['数学']
print(midterm_math['一班']['学生1'])
print(df.iloc[0, 1])
- 切片
import numpy as np
import pandas as pd

separator = '#################################'

data = np.random.randint(0, 100, size=(6, 6))
index = [
    ['一班'] * 3 + ['二班'] * 3,
    ['学生1', '学生2', '学生3', '学生4', '学生5', '学生6'],
]
columns = [
    ['期中'] * 3 + ['期末'] * 3,
    ['语文', '数学', '英语'] * 2,
]
df = pd.DataFrame(data, index=index, columns=columns)
print(df)
print(separator)

# Row slice by position: rows 1 up to, but not including, 5.
print(df.iloc[1:5])
print(separator)

# Row slice by label: full (class, student) keys, both ends included.
first_key = ('一班', '学生2')
last_key = ('二班', '学生5')
print(df.loc[first_key:last_key])
print(separator)

# Column slice by position: columns 1 up to, but not including, 5.
print(df.iloc[:, 1:5])
print(separator)

# Column slice by outer-level label, both ends included.
print(df.loc[:, '期中':'期末'])
⚠️注意:建议切片操作使用隐式索引,即使用数字下标。
4.索引的堆叠
- df.stack(level=-1):将列索引变成行索引,level用于选择变哪一层的列索引,默认-1(最里层),最外层的索引值为0,变化过后的列索引将会插入行索引的最里层
import numpy as np
import pandas as pd

separator = "#############################"

index = pd.MultiIndex.from_product([
    ['一班', '二班'],
    ['学生1', '学生2', '学生3'],
])
columns = [
    ['期中'] * 3 + ['期末'] * 3,
    ['语文', '数学', '英语'] * 2,
]
data = np.random.randint(0, 100, size=(6, 6))
df = pd.DataFrame(data, index=index, columns=columns)
print(df)
print(separator)

# Default call: stacks the innermost column level into the row index.
print(df.stack())
print(separator)

# level=-1 names the innermost column level explicitly (same as the default).
print(df.stack(level=-1))
print(separator)

# level=0 stacks the outermost column level instead.
print(df.stack(level=0))
- df.unstack(level=-1):将行索引变成列索引,level用于选择变哪一层的行索引,默认-1(最里层),最外层的索引值为0,变化过后的行索引将会插入列索引的最里层
import numpy as np
import pandas as pd

separator = "#############################"

index = pd.MultiIndex.from_product([
    ['一班', '二班'],
    ['学生1', '学生2', '学生3'],
])
columns = [
    ['期中'] * 3 + ['期末'] * 3,
    ['语文', '数学', '英语'] * 2,
]
data = np.random.randint(0, 100, size=(6, 6))
df = pd.DataFrame(data, index=index, columns=columns)

# Stack the outer column level first so the frame has a three-level row
# index to unstack from.
df2 = df.stack(level=0)
print(df2)
print(separator)

# Default call: moves the innermost row level into the columns.
print(df2.unstack())
print(separator)

# -1 names the innermost row level explicitly (same as the default).
print(df2.unstack(-1))
print(separator)

# 1 moves the middle row level into the columns.
print(df2.unstack(1))
- 用fill_value填充NaN
import numpy as np
import pandas as pd

separator = '##################################'

# Student labels differ between the two classes, so unstacking the student
# level yields (class, student) combinations that have no data.
row_keys = (
    ('一班', '学生1'), ('一班', '学生2'), ('一班', '学生3'),
    ('二班', '学生4'), ('二班', '学生5'), ('二班', '学生6'),
)
index = pd.MultiIndex.from_tuples(row_keys)
columns = [
    ['期中'] * 3 + ['期末'] * 3,
    ['语文', '数学', '英语'] * 2,
]
data = np.random.randint(0, 100, size=(6, 6))
df = pd.DataFrame(data, index=index, columns=columns)
print(df)
print(separator)

# Without fill_value the missing combinations show up as NaN.
print(df.unstack())
print(separator)

# fill_value=0 replaces those missing entries with 0.
print(df.unstack(fill_value=0))
5.聚合操作
- DataFrame聚合函数:求和、平均值、最大值、最小值.......
import numpy as np
import pandas as pd

separator = '######################################'

row_keys = (
    ('一班', '学生1'), ('一班', '学生2'), ('一班', '学生3'),
    ('二班', '学生4'), ('二班', '学生5'), ('二班', '学生6'),
)
index = pd.MultiIndex.from_tuples(row_keys)
columns = [
    ['期中'] * 3 + ['期末'] * 3,
    ['语文', '数学', '英语'] * 2,
]
data = np.random.randint(0, 100, size=(6, 6))
df = pd.DataFrame(data, index=index, columns=columns)

# Sub-frame: the rows of class '一班' and the columns of exam '期中'.
df2 = df.loc['一班', '期中']
print(df2)
print(separator)

# Grand total over every cell of the sub-frame.
grand_total = df2.values.sum()
print('所有数的和:', grand_total)
print(separator)

# axis defaults to 0: sum down the rows, giving one total per column.
print('sum:\n', df2.sum())
print(separator)

# axis=1: sum across the columns, giving one total per row.
print('sum:\n', df2.sum(axis=1))
- 多层索引聚合操作
import numpy as np
import pandas as pd

# Demonstrates aggregation on a DataFrame whose rows are keyed by a
# two-level (class, student) MultiIndex.
data = np.random.randint(0, 100, size=(6, 6))
index = pd.MultiIndex.from_tuples(
    (
        ('一班', '学生1'), ('一班', '学生2'), ('一班', '学生3'),
        ('二班', '学生4'), ('二班', '学生5'), ('二班', '学生6'),
    )
)
columns = [
    ['期中', '期中', '期中', '期末', '期末', '期末'],
    ['语文', '数学', '英语', '语文', '数学', '英语'],
]
df = pd.DataFrame(data, index=index, columns=columns)
print(df)
print('######################################')

# axis defaults to 0: sum down the rows, giving one total per column.
print('sum:\n', df.sum())
print('######################################')

# axis=1: sum across the columns, giving one total per row.
print('sum:\n', df.sum(axis=1))
print('######################################')

# Per-class totals: group the rows by the outermost index level and sum.
# The older spelling df.sum(axis=0, level=0) no longer works because the
# `level` keyword was removed from DataFrame.sum in pandas 2.0.
class_totals = df.groupby(level=0).sum()
print('sum:\n', class_totals)
知识点为听课总结笔记,课程为B站“千锋教育Pandas数据分析从入门到实战,零基础小白保姆级Python数据分析教程”:001_Pandas_Pandas介绍_哔哩哔哩_bilibili