利用Python进行数据分析的学习笔记——chap9

数据聚合与分组运算

GroupBy技术

import numpy as np
import pandas as pd
from pandas import DataFrame,Series
# Demo frame for the GroupBy examples: two string key columns plus two
# columns of five random normal draws each.
df = DataFrame(dict(
    key1=['a', 'a', 'b', 'b', 'a'],
    key2=['one', 'two', 'one', 'two', 'one'],
    data1=np.random.randn(5),
    data2=np.random.randn(5),
))
df
key1 key2 data1 data2
0 a one -0.074122 -0.571432
1 a two 0.347874 -0.794645
2 b one 0.399766 -0.596056
3 b two 1.209857 -0.266257
4 a one -0.001175 0.180895
#根据key1进行分组,并计算data1列的平均值。
grouped = df['data1'].groupby(df['key1'])
grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001CCD8450910>
grouped.mean()
key1
a    0.090859
b    0.804812
Name: data1, dtype: float64
means = df['data1'].groupby([df['key1'],df['key2']]).mean()
means
key1  key2
a     one    -0.037649
      two     0.347874
b     one     0.399766
      two     1.209857
Name: data1, dtype: float64
means.unstack()
key2 one two
key1
a -0.037649 0.347874
b 0.399766 1.209857
states = np.array(['Ohio','California','California','Ohio','Ohio'])
years = np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()
California  2005    0.347874
            2006    0.399766
Ohio        2005    0.567867
            2006   -0.001175
Name: data1, dtype: float64
df.groupby('key1').mean()
data1 data2
key1
a 0.090859 -0.395061
b 0.804812 -0.431157
df.groupby(['key1','key2']).mean()
data1 data2
key1 key2
a one -0.037649 -0.195268
two 0.347874 -0.794645
b one 0.399766 -0.596056
two 1.209857 -0.266257
#GroupBy的size方法可以返回一个含有各分组大小的Series。默认情况下(dropna=True),分组键中的任何缺失值都会被排除在结果之外;如需保留这些分组,可在groupby中传入dropna=False。
df.groupby(['key1','key2']).size()
key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

对分组进行迭代

for name,group in df.groupby('key1'):
    print(name)
    print(group)
a
  key1 key2     data1     data2
0    a  one -0.074122 -0.571432
1    a  two  0.347874 -0.794645
4    a  one -0.001175  0.180895
b
  key1 key2     data1     data2
2    b  one  0.399766 -0.596056
3    b  two  1.209857 -0.266257
for (k1,k2),group in df.groupby(['key1','key2']):
    print(k1,k2)
    print(group)
a one
  key1 key2     data1     data2
0    a  one -0.074122 -0.571432
4    a  one -0.001175  0.180895
a two
  key1 key2     data1     data2
1    a  two  0.347874 -0.794645
b one
  key1 key2     data1     data2
2    b  one  0.399766 -0.596056
b two
  key1 key2     data1     data2
3    b  two  1.209857 -0.266257
pieces = dict(list(df.groupby('key1')))
pieces['b']
key1 key2 data1 data2
2 b one 0.399766 -0.596056
3 b two 1.209857 -0.266257
df.dtypes
key1      object
key2      object
data1    float64
data2    float64
dtype: object
grouped = df.groupby(df.dtypes,axis=1)
dict(list(grouped))
{dtype('float64'):       data1     data2
 0 -0.074122 -0.571432
 1  0.347874 -0.794645
 2  0.399766 -0.596056
 3  1.209857 -0.266257
 4 -0.001175  0.180895,
 dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

选取一个或一组列

df.groupby('key1')['data1']
df.groupby('key1')[['data2']]
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001CCCF3F55A0>
df.groupby(['key1','key2'])[['data2']].mean()
data2
key1 key2
a one -0.195268
two -0.794645
b one -0.596056
two -0.266257
s_grouped = df.groupby(['key1','key2'])['data2']
s_grouped
<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001CCD8452DA0>
s_grouped.mean()
key1  key2
a     one    -0.195268
      two    -0.794645
b     one    -0.596056
      two    -0.266257
Name: data2, dtype: float64

通过字典或Series进行分组

# Demo frame of random normal values: five named rows, columns 'a' to 'e'.
people = DataFrame(
    np.random.randn(5, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
)
# Add a few NA values. The slice 2:3 is positional (third row only), so it
# goes through .iloc: using .loc with a positional slice on this string index
# emits a FutureWarning and is slated to raise TypeError in later pandas.
people.iloc[2:3, [1, 2]] = np.nan  # positions 1 and 2 are columns 'b' and 'c'
people
a b c d e
Joe 0.309327 1.658107 1.146959 -0.123471 0.159285
Steve 1.380735 -0.703245 0.158134 -1.602958 1.455772
Wes -0.766580 NaN NaN 0.074462 1.430541
Jim -0.615666 2.578830 -0.002766 0.885567 -0.375239
Travis -0.033534 1.158113 0.637327 1.473547 0.373215
# Map each column label to a group name. The 'f' key matches no column of
# `people`; it does not appear in the result.
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}
# Group the columns by their mapped name. DataFrame.groupby(axis=1) is
# deprecated, so group the rows of the transposed frame instead and transpose
# the aggregate back to get one row per person and one column per group.
by_column = people.T.groupby(mapping)
by_column.sum().T
blue red
Joe 1.023488 2.126719
Steve -1.444824 2.133263
Wes 0.074462 0.663960
Jim 0.882800 1.587925
Travis 2.110874 1.497794
map_series = Series(mapping)
map_series
a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object
# Count non-NA values per column group. groupby(axis=1) is deprecated, so
# group the transposed frame by the Series and transpose the counts back.
people.T.groupby(map_series).count().T
blue red
Joe 2 3
Steve 2 3
Wes 1 2
Jim 2 3
Travis 2 3

通过函数进行分组

people.groupby(len).sum()
a b c d e
3 -1.072920 4.236937 1.144193 0.836558 1.214587
5 1.380735 -0.703245 0.158134 -1.602958 1.455772
6 -0.033534 1.158113 0.637327 1.473547 0.373215
key_list = ['one','one','one','two','two']
people.groupby([len,key_list]).min()
a b c d e
3 one -0.766580 1.658107 1.146959 -0.123471 0.159285
two -0.615666 2.578830 -0.002766 0.885567 -0.375239
5 one 1.380735 -0.703245 0.158134 -1.602958 1.455772
6 two -0.033534 1.158113 0.637327 1.473547 0.373215

根据索引级别分组

# Two-level column index whose levels are named 'cty' and 'tenor'.
columns = pd.MultiIndex.from_arrays(
    [
        ['US', 'US', 'US', 'JP', 'JP'],
        [1, 3, 5, 1, 3],
    ],
    names=['cty', 'tenor'],
)
# Four rows of random normal values under the hierarchical columns.
hier_df = DataFrame(data=np.random.randn(4, 5), columns=columns)
hier_df
cty US JP
tenor 1 3 5 1 3
0 0.971689 -0.207027 0.641528 1.197729 -0.800907
1 0.906871 -0.087288 0.204273 -0.009374 0.637842
2 0.649755 -0.800055 -0.057130 -1.087200 0.435762
3 -0.618737 0.325816 -0.702310 -0.519860 -0.101653
# Count columns per 'cty' level value. groupby(axis=1) is deprecated, so
# group the transposed frame on its 'cty' index level and transpose back.
hier_df.T.groupby(level='cty').count().T
cty JP US
0 2 3
1 2 3
2 2 3
3 2 3

数据聚合

grouped = df.groupby('key1')
#如果传入的百分位上没有值,则quantile会进行线性插值
grouped['data1'].quantile(0.9)
key1
a    0.278064
b    1.128848
Name: data1, dtype: float64
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` only needs to provide ``max()`` and ``min()`` methods whose
    results support subtraction.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
grouped.agg(peak_to_peak)
FutureWarning: ['key2'] did not aggregate successfully. If any error is raised this will raise in a future version of pandas. Drop these columns/ops to avoid this warning.
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值