类似SQL:
select city max(temperature) from city_weather group by city
group by :先对数据分组,然后在每个分组上应用聚合函数,转换函数
- 分组使用聚合函数做数据统计
- 遍历group by的结果理解执行流程
- 实例分组探索天气数据
一,分组使用聚合函数做数据统计
1,单个列groupby,查询所有数据列的统计
import numpy as np
import pandas as pd
# 创建 DataFrame
df = pd.DataFrame({
'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C': np.random.rand(8),
'D': np.random.rand(8)
})
# 打印原始 DataFrame
print("原始 DataFrame:")
print(df)
# 按列 A 分组并求和
a = df.groupby('A').sum(numeric_only=True)
原始 DataFrame:
A B C D
0 foo one 0.037578 0.915455
1 bar one 0.808149 0.172095
2 foo two 0.550795 0.320976
3 bar three 0.162670 0.833281
4 foo two 0.258594 0.270333
5 bar two 0.431245 0.094254
6 foo one 0.640197 0.169021
7 foo three 0.758293 0.526107
按列 A 分组求和结果:
C D
A
bar 1.402065 1.099630
foo 2.245456 2.201892
2. 多个列groupby,查询所有数据列的统计。('A','B')成对变成了二级索引。
A B C D
0 foo one 0.268145 0.012703
1 bar one 0.763292 0.942677
2 foo two 0.816369 0.880583
3 bar three 0.011418 0.978870
4 foo two 0.012794 0.413313
5 bar two 0.508946 0.153901
6 foo one 0.844478 0.563549
7 foo three 0.025693 0.573663
df.groupby(['A','B']).mean()
b = df.groupby(['A','B']).mean()
print(b)
C D
A B
bar one 0.763292 0.942677
three 0.011418 0.978870
two 0.508946 0.153901
foo one 0.556312 0.288126
three 0.025693 0.573663
two 0.414582 0.646948
c = df.groupby(['A','B'], as_index=False).mean()
print(c)
A B C D
0 bar one 0.685117 0.136034
1 bar three 0.241754 0.830470
2 bar two 0.432996 0.350951
3 foo one 0.477358 0.765364
4 foo three 0.005063 0.719531
5 foo two 0.393793 0.287368
3. 同时相看多种数据统计
import numpy as np
import pandas as pd
# 创建 DataFrame
df = pd.DataFrame({
'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C': np.random.rand(8),
'D': np.random.rand(8)
})
# 打印原始 DataFrame
print("原始 DataFrame:")
print(df)
# 按列 A 分组并进行多种聚合操作
d = df.groupby('A').agg([np.sum, np.mean, np.std])
# 打印聚合结果
print("\n分组聚合结果:")
print(d)
## 原始DataFrame
A B C D
0 foo one 0.123456 0.654321
1 bar one 0.234567 0.765432
2 foo two 0.345678 0.876543
3 bar three 0.456789 0.987654
4 foo two 0.567890 0.098765
5 bar two 0.678901 0.109876
6 foo one 0.789012 0.210987
7 foo three 0.890123 0.321098
## 分组聚合结果
C D
sum mean std sum mean std
A
bar 1.370257 0.456752 0.222167 1.862962 0.620987 0.438889
foo 2.716159 0.543232 0.301234 2.161714 0.432343 0.321098
4. 查看单列的结果数据统计
# 方法1 :预过滤
e = df.groupby('A')['C'].agg([np.sum, np.mean, np.std])
print(e)
sum mean std
A
bar 1.278805 0.426268 0.270575
foo 2.369430 0.473886 0.216729
方法2 .
# 创建 DataFrame
df = pd.DataFrame({
'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C': np.random.rand(8),
'D': np.random.rand(8)
})
# 打印原始 DataFrame
print("原始 DataFrame:")
print(df)
原始 DataFrame:
A B C D
0 foo one 0.517518 0.134908
1 bar one 0.509692 0.758133
2 foo two 0.200657 0.389188
3 bar three 0.034803 0.825717
4 foo two 0.667955 0.780484
5 bar two 0.356315 0.180094
6 foo one 0.164015 0.070777
7 foo three 0.005900 0.173317
f = df.groupby('A').agg([np.sum, np.mean, np.std])['C']
print(f)
# 提取列 C 的聚合结果
f = agg_result['C'] # 提取列 C 的聚合结果
print("\n列 C 的聚合结果:")
print(f)
按列 A 分组求和结果:
C D
A
bar 0.900810 1.763944
foo 1.556046 1.548674
sum mean std
A
bar 0.900810 0.300270 0.242354
foo 1.556046 0.311209 0.272457
5. 不同列使用不同的聚合函数
import numpy as np
import pandas as pd
# 创建 DataFrame
df = pd.DataFrame({
'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C': np.random.rand(8),
'D': np.random.rand(8)
})
# 打印原始 DataFrame
print("原始 DataFrame:")
print(df)
原始 DataFrame:
A B C D
0 foo one 0.022609 0.455736
1 bar one 0.524275 0.586531
2 foo two 0.649971 0.864277
3 bar three 0.379156 0.983663
4 foo two 0.917491 0.817069
5 bar two 0.729709 0.800121
6 foo one 0.408776 0.161176
7 foo three 0.744711 0.728075
g = df.groupby('A').agg({'C': np.sum,'D': np.mean})
print(g)
C D
A
bar 1.633141 0.790105
foo 2.743558 0.605267
二,遍历groupby的结果理解执行流程
for 循环可以直接遍历每个group
1. 遍历单个列聚合的分组
# 创建 DataFrame
df = pd.DataFrame({
'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C': np.random.rand(8),
'D': np.random.rand(8)
})
print(df)
A B C D
0 foo one 0.257037 0.665599
1 bar one 0.712079 0.828423
2 foo two 0.537003 0.098247
3 bar three 0.051362 0.665340
4 foo two 0.462230 0.410000
5 bar two 0.092589 0.705635
6 foo one 0.186831 0.964534
7 foo three 0.919710 0.127256
g = df.groupby('A')
print(g)
for name, group in g:
print(name)
print(group)
print()
h = g.get_group('bar')
print(h)
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000022D9DBC7FA0>
bar
A B C D
1 bar one 0.239312 0.416462
3 bar three 0.538912 0.377047
5 bar two 0.349983 0.110356
foo
A B C D
0 foo one 0.590524 0.033294
2 foo two 0.164536 0.471936
4 foo two 0.301538 0.585096
6 foo one 0.616674 0.446310
7 foo three 0.498085 0.063923
A B C D
1 bar one 0.239312 0.416462
3 bar three 0.538912 0.377047
5 bar two 0.349983 0.110356
2. 遍历多个列聚合的分组
# 创建 DataFrame
df = pd.DataFrame({
'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
'C': np.random.rand(8),
'D': np.random.rand(8)
})
print(df)
A B C D
0 foo one 0.958409 0.006504
1 bar one 0.314079 0.703237
2 foo two 0.885358 0.860236
3 bar three 0.109789 0.332691
4 foo two 0.885469 0.637880
5 bar two 0.102636 0.709535
6 foo one 0.204329 0.497091
7 foo three 0.675232 0.999516
g = df.groupby(['A', 'B'])
for name, group in g:
print(name)
print(group)
print()
print(g)
('bar', 'one')
A B C D
1 bar one 0.981852 0.489836
('bar', 'three')
A B C D
3 bar three 0.810885 0.730725
('bar', 'two')
A B C D
5 bar two 0.259971 0.611428
('foo', 'one')
A B C D
0 foo one 0.015916 0.699206
6 foo one 0.067090 0.784468
('foo', 'three')
A B C D
7 foo three 0.461789 0.299584
('foo', 'two')
A B C D
2 foo two 0.461944 0.290143
4 foo two 0.328032 0.348542
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000025D32337940>
i = g.get_group(('foo', 'one'))
print(i)
A B C D
0 foo one 0.958379 0.286958
6 foo one 0.164123 0.035663
- 可以直接查询group后的某几列,生成Series或者子DataFrame
- 其实所有的聚合统计,都是在dataframe和series上进行的
三, 实例分组探索天气数据
import pandas as pd
file_path = r'C:\TELCEL_MEXICO_BOT\A\Weather.csv'
df = pd.read_csv(file_path,encoding='utf-8')
1. 查看每个月的最高温度
df['bWendu'] = df['bWendu'].str.replace('°C','').astype('int32')
df['yWendu'] = df['yWendu'].str.replace('°C','').astype('int32')
print(df.head())
ymd bWendu yWendu tianqi fengxiang fengji aqi aqiInfo aqiLevel
0 2025-01-01 -25 -6 晴~多云 西北风 1-2级 59 优 2
1 2025-01-02 2 -9 阴 东南风 3-4级 48 优 1
2 2025-01-03 -11 -2 晴~多云 西风 4-8级 28 良 1
3 2025-01-04 0 -4 晴~多云 东风 2-5级 30 良 1
4 2025-01-05 3 -1 小雨 东风 3-5级 25 良 1
df['month'] = df['ymd'].str[:7]
print(df.head())
ymd bWendu yWendu tianqi ... aqi aqiInfo aqiLevel month
0 2025-01-01 -25 -6 晴~多云 ... 59 优 2 2025-01
1 2025-01-02 2 -9 阴 ... 48 优 1 2025-01
2 2025-01-03 -11 -2 晴~多云 ... 28 良 1 2025-01
3 2025-01-04 0 -4 晴~多云 ... 30 良 1 2025-01
4 2025-01-05 3 -1 小雨 ... 25 良 1 2025-01
[5 rows x 10 columns]
data = df.groupby('month')['bWendu'].max()
print(data)
month
2025-01 10
2025-02 10
2025-03 10
Name: bWendu, dtype: int32
print(type(data))
<class 'pandas.core.series.Series'>
2. 查看每个月的最高温度,最低温度,平均空气质量指数
import pandas as pd
import numpy as np
file_path = r'C:\TELCEL_MEXICO_BOT\A\Weather.csv'
df = pd.read_csv(file_path,encoding='utf-8')
df['bWendu'] = df['bWendu'].str.replace('°C','').astype('int32')
df['yWendu'] = df['yWendu'].str.replace('°C','').astype('int32')
print(df.head())
ymd bWendu yWendu tianqi fengxiang fengji aqi aqiInfo aqiLevel
0 2025-01-01 -25 -6 晴~多云 西北风 1-2级 59 优 2
1 2025-01-02 2 -9 阴 东南风 3-4级 48 优 1
2 2025-01-03 -11 -2 晴~多云 西风 4-8级 28 良 1
3 2025-01-04 0 -4 晴~多云 东风 2-5级 30 良 1
4 2025-01-05 3 -1 小雨 东风 3-5级 25 良 1
df['month'] = df['ymd'].str[:7]
print(df.head())
ymd bWendu yWendu tianqi ... aqi aqiInfo aqiLevel month
0 2025-01-01 -25 -6 晴~多云 ... 59 优 2 2025-01
1 2025-01-02 2 -9 阴 ... 48 优 1 2025-01
2 2025-01-03 -11 -2 晴~多云 ... 28 良 1 2025-01
3 2025-01-04 0 -4 晴~多云 ... 30 良 1 2025-01
4 2025-01-05 3 -1 小雨 ... 25 良 1 2025-01
[5 rows x 10 columns]
group_data = df.groupby('month').agg({'bWendu':np.max, 'yWendu':np.min, 'aqi':np.mean})
print(group_data)
bWendu yWendu aqi
month
2025-01 10 -10 39.157895
2025-02 10 -10 37.916667
2025-03 10 -10 35.888889

被折叠的 条评论
为什么被折叠?



