数据聚合与分组运算

#数据聚合与分组
import pandas as pd
import numpy as np
# Demo frame: two string key columns plus two random-valued data columns.
raw = {
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
}
df = pd.DataFrame(raw)
df
key1key2data1data2
0aone1.3645960.352792
1atwo1.6856260.236429
2bone-0.537077-0.018004
3btwo1.3898660.826195
4aone0.8497331.619383
grouped = df['data1'].groupby(df['key1'])
grouped
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x00000261B3E2CEB8>
grouped.mean()
key1
a    1.299985
b    0.426394
Name: data1, dtype: float64
means = df['data1'].groupby([df['key1'],df['key2']]).mean()#两个参数分组
means
key1  key2
a     one     1.107165
      two     1.685626
b     one    -0.537077
      two     1.389866
Name: data1, dtype: float64
means.unstack()
key2onetwo
key1
a1.1071651.685626
b-0.5370771.389866
states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])
years = np.array([2005, 2005, 2006, 2005, 2006])
#分组键可以是任何长度适当的数组
df['data1'].groupby([states, years]).mean()
California  2005    1.685626
            2006   -0.537077
Ohio        2005    1.377231
            2006    0.849733
Name: data1, dtype: float64
df.groupby('key1').mean()
data1data2
key1
a1.2999850.736201
b0.4263940.404096
df.groupby(['key1','key2']).mean()
data1data2
key1key2
aone1.1071650.986087
two1.6856260.236429
bone-0.537077-0.018004
two1.3898660.826195
df.groupby(['key1','key2']).size()#返回分组大小
key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64
#对分组进行迭代,GroupBy对象支持迭代
# A GroupBy object is iterable: each step yields (group key, sub-frame).
for group_name, subset in df.groupby('key1'):
    print(group_name)
    print(subset)

a
  key1 key2     data1     data2
0    a  one  1.364596  0.352792
1    a  two  1.685626  0.236429
4    a  one  0.849733  1.619383
b
  key1 key2     data1     data2
2    b  one -0.537077 -0.018004
3    b  two  1.389866  0.826195
# With a list of keys, the group key is a tuple of the key values.
for keys, subset in df.groupby(['key1', 'key2']):
    print(keys)
    print(subset)
('a', 'one')
  key1 key2     data1     data2
0    a  one  1.364596  0.352792
4    a  one  0.849733  1.619383
('a', 'two')
  key1 key2     data1     data2
1    a  two  1.685626  0.236429
('b', 'one')
  key1 key2     data1     data2
2    b  one -0.537077 -0.018004
('b', 'two')
  key1 key2     data1     data2
3    b  two  1.389866  0.826195
#将这些数据片段做成一个字典:
pieces = dict(list(df.groupby('key1')))
pieces['b']
key1key2data1data2
2bone-0.537077-0.018004
3btwo1.3898660.826195
#对列进行分组
df.dtypes
key1      object
key2      object
data1    float64
data2    float64
dtype: object
grouped = df.groupby(df.dtypes,axis=1)
# Iterate the column-wise (axis=1) grouping: one sub-frame per dtype.
for col_dtype, cols in grouped:
    print(col_dtype)
    print(cols)
float64
      data1     data2
0  1.364596  0.352792
1  1.685626  0.236429
2 -0.537077 -0.018004
3  1.389866  0.826195
4  0.849733  1.619383
object
  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one
#选取一列或列的子集
df['data1'].groupby(df['key1'])
<pandas.core.groupby.groupby.SeriesGroupBy object at 0x00000261B3E74FD0>
df.groupby(['key1', 'key2'])[['data2']].mean()
data2
key1key2
aone0.986087
two0.236429
bone-0.018004
two0.826195
#通过字典或Series进行分组
# 5x5 random frame indexed by names; knock out two of Wes's values so the
# mapping/grouping examples below have NaNs to deal with.
names = ['Joe', 'Steve', 'Wes', 'Jim', 'Travis']
people = pd.DataFrame(np.random.randn(5, 5),
                      columns=list('abcde'),
                      index=names)
people.iloc[2:3, [1, 2]] = np.nan
people
abcde
Joe0.0377961.6310830.455609-1.4053270.495940
Steve0.3043930.3260050.1463500.075903-0.263559
Wes-0.055827NaNNaN1.268622-0.541199
Jim2.0343940.8188110.3339910.1587341.187207
Travis2.719235-0.459516-0.2922500.1581691.102169
# Column -> color-group mapping used as a grouping dict; 'f' has no
# matching column, which is fine — unused keys are simply ignored.
mapping = {
    'a': 'red',
    'b': 'red',
    'c': 'blue',
    'd': 'blue',
    'e': 'red',
    'f': 'orange',
}
by_column = people.groupby(mapping,axis=1)
by_column.sum()
bluered
Joe-0.9497182.164819
Steve0.2222530.366838
Wes1.268622-0.597026
Jim0.4927264.040412
Travis-0.1340823.361889
map_series = pd.Series(mapping)
map_series
a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object
people.groupby(map_series,axis=1).count()
bluered
Joe23
Steve23
Wes12
Jim23
Travis23
#通过函数进行分组
people.groupby(len).sum()
abcde
32.0163632.4498940.7896010.0220291.141948
50.3043930.3260050.1463500.075903-0.263559
62.719235-0.459516-0.2922500.1581691.102169
key_list = ['one', 'one', 'one', 'two', 'two']
people.groupby([len, key_list]).min()
abcde
3one-0.0558271.6310830.455609-1.405327-0.541199
two2.0343940.8188110.3339910.1587341.187207
5one0.3043930.3260050.1463500.075903-0.263559
6two2.719235-0.459516-0.2922500.1581691.102169
#根据索引级别分组
# Hierarchically-indexed columns: country on the outer level, tenor inner.
level_cty = ['US', 'US', 'US', 'JP', 'JP']
level_tenor = [1, 3, 5, 1, 3]
columns = pd.MultiIndex.from_arrays([level_cty, level_tenor],
                                    names=['cty', 'tenor'])
hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)
hier_df
ctyUSJP
tenor13513
0-1.8658610.1454541.4766890.7984110.546048
11.8871490.2186130.9511652.790860-2.419909
2-0.681806-0.490238-2.2159090.3755480.145144
30.5013221.358101-1.0694531.916614-0.018305
#根据级别分组,使用level关键字传递级别序号或名字:
hier_df.groupby(level='cty',axis=1).count()
ctyJPUS
023
123
223
323
#数据聚合
#聚合指的是任何能够从数组产生标量值的数据转换过程
df
key1key2data1data2
0aone1.3645960.352792
1atwo1.6856260.236429
2bone-0.537077-0.018004
3btwo1.3898660.826195
4aone0.8497331.619383
grouped = df.groupby('key1')
grouped['data1'].quantile(0.9)
key1
a    1.621420
b    1.197171
Name: data1, dtype: float64
def peak_to_peak(arr):
    """Range of the values in arr: maximum minus minimum."""
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
grouped.agg(peak_to_peak)#使用你自己的聚合函数,只需将其传入aggregate或agg方法
data1data2
key1
a0.8358931.382954
b1.9269430.844199
grouped.describe()
data1data2
countmeanstdmin25%50%75%maxcountmeanstdmin25%50%75%max
key1
a3.01.2999850.4216750.8497331.1071651.3645961.5251111.6856263.00.7362010.7670680.2364290.2946100.3527920.9860871.619383
b2.00.4263941.362554-0.537077-0.0553420.4263940.9081301.3898662.00.4040960.596939-0.0180040.1930460.4040960.6151450.826195
#面向列的多函数应用
tips = pd.read_csv('examples/tips.csv')
tips['tip_pct'] = tips['tip']/tips['total_bill']
tips[:6]
total_billtipsmokerdaytimesizetip_pct
016.991.01NoSunDinner20.059447
110.341.66NoSunDinner30.160542
221.013.50NoSunDinner30.166587
323.683.31NoSunDinner20.139780
424.593.61NoSunDinner40.146808
525.294.71NoSunDinner40.186240
grouped = tips.groupby(['day','smoker'])
grouped_pct = grouped['tip_pct']
grouped_pct.agg('mean')
day   smoker
Fri   No        0.151650
      Yes       0.174783
Sat   No        0.158048
      Yes       0.147906
Sun   No        0.160113
      Yes       0.187250
Thur  No        0.160298
      Yes       0.163863
Name: tip_pct, dtype: float64
grouped_pct.agg(['mean', 'std', peak_to_peak])
meanstdpeak_to_peak
daysmoker
FriNo0.1516500.0281230.067349
Yes0.1747830.0512930.159925
SatNo0.1580480.0397670.235193
Yes0.1479060.0613750.290095
SunNo0.1601130.0423470.193226
Yes0.1872500.1541340.644685
ThurNo0.1602980.0387740.193350
Yes0.1638630.0393890.151240
#修改聚合名字
grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])
foobar
daysmoker
FriNo0.1516500.028123
Yes0.1747830.051293
SatNo0.1580480.039767
Yes0.1479060.061375
SunNo0.1601130.042347
Yes0.1872500.154134
ThurNo0.1602980.038774
Yes0.1638630.039389
functions = ['count','mean','max']
result = grouped['tip_pct','total_bill'].agg(functions)
result
tip_pcttotal_bill
countmeanmaxcountmeanmax
daysmoker
FriNo40.1516500.187735418.42000022.75
Yes150.1747830.2634801516.81333340.17
SatNo450.1580480.2919904519.66177848.33
Yes420.1479060.3257334221.27666750.81
SunNo570.1601130.2526725720.50666748.17
Yes190.1872500.7103451924.12000045.35
ThurNo450.1602980.2663124517.11311141.19
Yes170.1638630.2412551719.19058843.11
ftuples = [('Durchschnitt', 'mean'),('Abweichung', np.var)]
grouped['tip_pct', 'total_bill'].agg(ftuples)
tip_pcttotal_bill
DurchschnittAbweichungDurchschnittAbweichung
daysmoker
FriNo0.1516500.00079118.42000025.596333
Yes0.1747830.00263116.81333382.562438
SatNo0.1580480.00158119.66177879.908965
Yes0.1479060.00376721.276667101.387535
SunNo0.1601130.00179320.50666766.099980
Yes0.1872500.02375724.120000109.046044
ThurNo0.1602980.00150317.11311159.625081
Yes0.1638630.00155119.19058869.808518
#对一个列或不同的列应用不同的函数
grouped.agg({'tip' : np.max, 'size' : 'sum'})
grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'],
                'size' : 'sum'})
tip_pctsize
minmaxmeanstdsum
daysmoker
FriNo0.1203850.1877350.1516500.0281239
Yes0.1035550.2634800.1747830.05129331
SatNo0.0567970.2919900.1580480.039767115
Yes0.0356380.3257330.1479060.061375104
SunNo0.0594470.2526720.1601130.042347167
Yes0.0656600.7103450.1872500.15413449
ThurNo0.0729610.2663120.1602980.038774112
Yes0.0900140.2412550.1638630.03938940
#apply:一般性的拆分应用合并
def top(df, n=5, column='tip_pct'):
    """Return the n rows of df with the largest values in `column`.

    Rows come back in ascending order of `column` (the tail of a sort).
    """
    ordered = df.sort_values(by=column)
    return ordered[-n:]
top(tips,n=6)
total_billtipsmokerdaytimesizetip_pct
10914.314.00YesSatDinner20.279525
18323.176.50YesSunDinner40.280535
23211.613.39NoSatDinner20.291990
673.071.00YesSatDinner10.325733
1789.604.00YesSunDinner20.416667
1727.255.15YesSunDinner20.710345
tips.groupby('smoker').apply(top)
total_billtipsmokerdaytimesizetip_pct
smoker
No8824.715.85NoThurLunch20.236746
18520.695.00NoSunDinner50.241663
5110.292.60NoSunDinner20.252672
1497.512.00NoThurLunch20.266312
23211.613.39NoSatDinner20.291990
Yes10914.314.00YesSatDinner20.279525
18323.176.50YesSunDinner40.280535
673.071.00YesSatDinner10.325733
1789.604.00YesSunDinner20.416667
1727.255.15YesSunDinner20.710345
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')
total_billtipsmokerdaytimesizetip_pct
smokerday
NoFri9422.753.25NoFriDinner20.142857
Sat21248.339.00NoSatDinner40.186220
Sun15648.175.00NoSunDinner60.103799
Thur14241.195.00NoThurLunch50.121389
YesFri9540.174.73YesFriDinner40.117750
Sat17050.8110.00YesSatDinner30.196812
Sun18245.353.50YesSunDinner30.077178
Thur19743.115.00YesThurLunch40.115982
result = tips.groupby('smoker')['tip_pct'].describe()
result
countmeanstdmin25%50%75%max
smoker
No151.00.1593280.0399100.0567970.1369060.1556250.1850140.291990
Yes93.00.1631960.0851190.0356380.1067710.1538460.1950590.710345
result.unstack('smoker')
       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64
#禁止分组键
tips.groupby('smoker',group_keys=False).apply(top)
total_billtipsmokerdaytimesizetip_pct
8824.715.85NoThurLunch20.236746
18520.695.00NoSunDinner50.241663
5110.292.60NoSunDinner20.252672
1497.512.00NoThurLunch20.266312
23211.613.39NoSatDinner20.291990
10914.314.00YesSatDinner20.279525
18323.176.50YesSunDinner40.280535
673.071.00YesSatDinner10.325733
1789.604.00YesSunDinner20.416667
1727.255.15YesSunDinner20.710345
#分位数和桶分析
# Two columns of 1000 random normals; bin data1 into 4 equal-width buckets.
sample = {'data1': np.random.randn(1000),
          'data2': np.random.randn(1000)}
frame = pd.DataFrame(sample)
quartiles = pd.cut(frame.data1, 4)
quartiles[:10]
0    (-1.415, 0.195]
1     (0.195, 1.805]
2    (-1.415, 0.195]
3     (0.195, 1.805]
4    (-1.415, 0.195]
5    (-1.415, 0.195]
6     (0.195, 1.805]
7    (-1.415, 0.195]
8    (-1.415, 0.195]
9    (-1.415, 0.195]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.032, -1.415] < (-1.415, 0.195] < (0.195, 1.805] < (1.805, 3.414]]
def get_stats(group):
    """Summary statistics for one group of values, as a plain dict."""
    return {
        'min': group.min(),
        'max': group.max(),
        'count': group.count(),
        'mean': group.mean(),
    }
grouped = frame.data2.groupby(quartiles)
grouped.apply(get_stats).unstack()
countmaxmeanmin
data1
(-3.032, -1.415]75.02.0550510.217488-2.419102
(-1.415, 0.195]511.03.1343270.002096-2.709974
(0.195, 1.805]379.03.146306-0.014438-2.561375
(1.805, 3.414]35.02.2096090.293813-1.865174
grouping = pd.qcut(frame.data1,10,labels=False)#大小相同的桶
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
countmaxmeanmin
data1
0100.02.0550510.128959-2.419102
1100.03.134327-0.106525-2.131247
2100.02.1789580.041848-1.964788
3100.02.4437150.176331-2.587572
4100.02.2319730.016023-2.709974
5100.02.432079-0.082424-2.189209
6100.03.1463060.072272-2.282556
7100.01.831667-0.011231-2.538154
8100.02.060099-0.060095-2.561375
9100.02.2096090.046783-2.049609
# Build a 6-element Series of random normals, then blank out every other
# value so the fillna examples below have NaNs to work with.
s = pd.Series(np.random.randn(6))
s[::2] = np.nan  # positions 0, 2, 4 become NaN
s
0         NaN
1   -0.162668
2         NaN
3    0.273627
4         NaN
5    0.345890
dtype: float64
s.fillna(s.mean())
0    0.152283
1   -0.162668
2    0.152283
3    0.273627
4    0.152283
5    0.345890
dtype: float64
# Eight states split into two regions; the group key is a plain list the
# same length as the data.
east = ['Ohio', 'New York', 'Vermont', 'Florida']
west = ['Oregon', 'Nevada', 'California', 'Idaho']
states = east + west
group_key = ['East'] * len(east) + ['West'] * len(west)
data = pd.Series(np.random.randn(8), index=states)
data
Ohio         -1.141165
New York      0.077290
Vermont       1.178992
Florida       1.413521
Oregon       -0.840100
Nevada       -0.262705
California    0.410535
Idaho        -2.736216
dtype: float64
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data
Ohio         -1.141165
New York      0.077290
Vermont            NaN
Florida       1.413521
Oregon       -0.840100
Nevada             NaN
California    0.410535
Idaho              NaN
dtype: float64
data.groupby(group_key).mean()
East    0.116549
West   -0.214783
dtype: float64
fill_mean = lambda g:g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
Ohio         -1.141165
New York      0.077290
Vermont       0.116549
Florida       1.413521
Oregon       -0.840100
Nevada       -0.214783
California    0.410535
Idaho        -0.214783
dtype: float64
#随机采样和排列
# Build a 52-card deck as a Series: index is "<name><suit>", value is the
# blackjack card value (ace = 1, face cards = 10).
suits = ['H', 'S', 'C', 'D']
card_val = (list(range(1, 11)) + [10] * 3) * 4  # A..10, then J/K/Q all worth 10
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
cards = []
for suit in suits:  # fix: use the `suits` variable instead of repeating the literal
    cards.extend(str(num) + suit for num in base_names)
deck = pd.Series(card_val, index=cards)
deck[:13]
AH      1
2H      2
3H      3
4H      4
5H      5
6H      6
7H      7
8H      8
9H      9
10H    10
JH     10
KH     10
QH     10
dtype: int64
def draw(deck, n=5):
    """Pick n random entries from `deck` without replacement."""
    hand = deck.sample(n=n)
    return hand
draw(deck)
QH    10
AS     1
2S     2
4C     4
5C     5
dtype: int64
get_suit = lambda card:card[-1]
deck.groupby(get_suit).apply(draw,n=2)
C  9C      9
   2C      2
D  QD     10
   10D    10
H  5H      5
   KH     10
S  3S      3
   2S      2
dtype: int64
#分组加权平均数和相关系数
# Frame for the weighted-average example: a group label, random data,
# and random (0, 1) weights.
df = pd.DataFrame({
    'category': list('aaaa') + list('bbbb'),
    'data': np.random.randn(8),
    'weights': np.random.rand(8),
})
df


categorydataweights
0a-0.3260190.107540
1a1.3339570.220552
2a-0.0555860.189389
3a-0.9471620.598072
4b-0.7644370.505189
5b1.2040080.921015
6b-1.0481680.660169
7b-1.4958280.525462
grouped = df.groupby('category')
get_wavg = lambda g:np.average(g['data'],weights=g['weights'])#加权平均数
grouped.apply(get_wavg)
category
a   -0.284928
b   -0.289163
dtype: float64
close_px = pd.read_csv('examples/stock_px_2.csv', parse_dates=True,
                           index_col=0)
close_px.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2214 entries, 2003-01-02 to 2011-10-14
Data columns (total 4 columns):
AAPL    2214 non-null float64
MSFT    2214 non-null float64
XOM     2214 non-null float64
SPX     2214 non-null float64
dtypes: float64(4)
memory usage: 86.5 KB
close_px[-4:]
AAPLMSFTXOMSPX
2011-10-11400.2927.0076.271195.54
2011-10-12402.1926.9677.161207.25
2011-10-13408.4327.1876.371203.66
2011-10-14422.0027.2778.111224.58
spx_corr = lambda x:x.corrwith(x['SPX'])
rets = close_px.pct_change().dropna()
get_year = lambda x: x.year
by_year = rets.groupby(get_year)
by_year.apply(spx_corr)
AAPLMSFTXOMSPX
20030.5411240.7451740.6612651.0
20040.3742830.5885310.5577421.0
20050.4675400.5623740.6310101.0
20060.4282670.4061260.5185141.0
20070.5081180.6587700.7862641.0
20080.6814340.8046260.8283031.0
20090.7071030.6549020.7979211.0
20100.7101050.7301180.8390571.0
20110.6919310.8009960.8599751.0
#透视表和交叉表
tips.pivot_table(index=['day', 'smoker'])
sizetiptip_pcttotal_bill
daysmoker
FriNo2.2500002.8125000.15165018.420000
Yes2.0666672.7140000.17478316.813333
SatNo2.5555563.1028890.15804819.661778
Yes2.4761902.8754760.14790621.276667
SunNo2.9298253.1678950.16011320.506667
Yes2.5789473.5168420.18725024.120000
ThurNo2.4888892.6737780.16029817.113111
Yes2.3529413.0300000.16386319.190588
 tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],
                     columns='smoker')
sizetip_pct
smokerNoYesNoYes
timeday
DinnerFri2.0000002.2222220.1396220.165347
Sat2.5555562.4761900.1580480.147906
Sun2.9298252.5789470.1601130.187250
Thur2.000000NaN0.159744NaN
LunchFri3.0000001.8333330.1877350.188937
Thur2.5000002.3529410.1603110.163863
 tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],
                     columns='smoker', margins=True)
sizetip_pct
smokerNoYesAllNoYesAll
timeday
DinnerFri2.0000002.2222222.1666670.1396220.1653470.158916
Sat2.5555562.4761902.5172410.1580480.1479060.153152
Sun2.9298252.5789472.8421050.1601130.1872500.166897
Thur2.000000NaN2.0000000.159744NaN0.159744
LunchFri3.0000001.8333332.0000000.1877350.1889370.188765
Thur2.5000002.3529412.4590160.1603110.1638630.161301
All2.6688742.4086022.5696720.1593280.1631960.160803
tips.pivot_table('tip_pct', index=['time', 'smoker'], columns='day',
                    aggfunc=len, margins=True)
dayFriSatSunThurAll
timesmoker
DinnerNo3.045.057.01.0106.0
Yes9.042.019.0NaN70.0
LunchNo1.0NaNNaN44.045.0
Yes6.0NaNNaN17.023.0
All19.087.076.062.0244.0
tips.pivot_table('tip_pct', index=['time', 'size', 'smoker'],
                 columns='day', aggfunc='mean', fill_value=0)
dayFriSatSunThur
timesizesmoker
Dinner1No0.0000000.1379310.0000000.000000
Yes0.0000000.3257330.0000000.000000
2No0.1396220.1627050.1688590.159744
Yes0.1712970.1486680.2078930.000000
3No0.0000000.1546610.1526630.000000
Yes0.0000000.1449950.1526600.000000
4No0.0000000.1500960.1481430.000000
Yes0.1177500.1245150.1933700.000000
5No0.0000000.0000000.2069280.000000
Yes0.0000000.1065720.0656600.000000
6No0.0000000.0000000.1037990.000000
Lunch1No0.0000000.0000000.0000000.181728
Yes0.2237760.0000000.0000000.000000
2No0.0000000.0000000.0000000.166005
Yes0.1819690.0000000.0000000.158843
3No0.1877350.0000000.0000000.084246
Yes0.0000000.0000000.0000000.204952
4No0.0000000.0000000.0000000.138919
Yes0.0000000.0000000.0000000.155410
5No0.0000000.0000000.0000000.121389
6No0.0000000.0000000.0000000.173706
#交叉表计算分组频率
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值