>>> import numpy as np
>>> import pandas as pd
>>> df = pd.DataFrame({'key1':['a','a','b','b','a'],'key2':['one','two','one','two','one'],'data1':np.random.randn(5),'data2':np.random.randn(5)})
>>> df
data1 data2 key1 key2
0 -0.123333 0.977207 a one
1 -1.252341 -1.700965 a two
2 1.293032 -1.942645 b one
3 0.101999 -0.346671 b two
4 -0.926819 -1.112701 a one
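# Group data1 by key1, then compute the mean and the count. groupd is only a GroupBy object; nothing is computed until an aggregation such as mean() is called.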
>>> groupd =df['data1'].groupby(df['key1'])
>>> groupd.mean()
key1
a -0.767497
b 0.697515
Name: data1, dtype: float64
>>> groupd.count()
key1
a 3
b 2
Name: data1, dtype: int64
# You can also group by two keys at once and compute the mean and the count.
>>> means = df['data1'].groupby([df['key1'],df['key2']]).mean()
>>> means
key1 key2
a one -0.525076
two -1.252341
b one 1.293032
two 0.101999
Name: data1, dtype: float64
>>> means = df['data1'].groupby([df['key1'],df['key2']]).count()
>>> means
key1 key2
a one 2
two 1
b one 1
two 1
Name: data1, dtype: int64
>>> means.unstack()
key2 one two
key1
a 2 1
b 1 1
# The grouping keys can be any arrays of the appropriate length (one entry per row).
>>> states = np.array(['ohio','california','california','ohio','ohio'])
>>> years = np.array([2005,2005,2006,2005,2006])
>>> df['data1'].groupby([states,years]).mean()
california 2005 -1.252341
2006 1.293032
ohio 2005 -0.010667
2006 -0.926819
Name: data1, dtype: float64
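# The non-numeric column key2 is left out of the result below; pandas 2.0+ requires mean(numeric_only=True) to get the same behavior.
>>> df.groupby('key1').mean()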
data1 data2
key1
a -0.767497 -0.612153
b 0.697515 -1.144658
>>> df.groupby(['key1','key2']).count()
data1 data2
key1 key2
a one 2 2
two 1 1
b one 1 1
two 1 1
>>> df.groupby(['key1','key2']).size()
key1 key2
a one 2
two 1
b one 1
two 1
dtype: int64
>>> for n,g in df.groupby('key1'):
... print(n)
... print(g)
...
a
data1 data2 key1 key2
0 -0.123333 0.977207 a one
1 -1.252341 -1.700965 a two
4 -0.926819 -1.112701 a one
b
data1 data2 key1 key2
2 1.293032 -1.942645 b one
3 0.101999 -0.346671 b two
>>> for n,g in df.groupby(['key1','key2']):
... print(n)
... print(g)
...
('a', 'one')
data1 data2 key1 key2
0 -0.123333 0.977207 a one
4 -0.926819 -1.112701 a one
('a', 'two')
data1 data2 key1 key2
1 -1.252341 -1.700965 a two
('b', 'one')
data1 data2 key1 key2
2 1.293032 -1.942645 b one
('b', 'two')
data1 data2 key1 key2
3 0.101999 -0.346671 b two
# The earlier df['data1'].groupby(df['key1']).mean() is equivalent to the following column-selection form.
>>> df.groupby('key1')['data1'].mean()
key1
a -0.767497
b 0.697515
Name: data1, dtype: float64