Pandas数据处理4:高性能计算eval()和query()

import pandas as pd 
import numpy as np
#数据透视表(pivot table)
import seaborn as sns 
titanic = sns.load_dataset('titanic')
titanic.head()
   survived  pclass     sex   age  sibsp  parch     fare embarked  class    who  adult_male deck  embark_town alive  alone
0         0       3    male  22.0      1      0   7.2500        S  Third    man        True  NaN  Southampton    no  False
1         1       1  female  38.0      1      0  71.2833        C  First  woman       False    C    Cherbourg   yes  False
2         1       3  female  26.0      0      0   7.9250        S  Third  woman       False  NaN  Southampton   yes   True
3         1       1  female  35.0      1      0  53.1000        S  First  woman       False    C  Southampton   yes  False
4         0       3    male  35.0      0      0   8.0500        S  Third    man        True  NaN  Southampton    no   True
#分组:例如这样统计不同性别乘客的生还率
titanic.groupby('sex')[['survived']].mean()
        survived
sex
female  0.742038
male    0.188908
#更复杂的情况
titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack()
class      First    Second     Third
sex
female  0.968085  0.921053  0.500000
male    0.368852  0.157407  0.135447
#用 DataFrame 的 pivot_table 实现的效果等同于上面
titanic.pivot_table('survived', index='sex', columns='class')
class      First    Second     Third
sex
female  0.968085  0.921053  0.500000
male    0.368852  0.157407  0.135447
#如果想把年龄('age')也加进去作为第三个维度,这就可以通过 pd.cut 函数将年龄进行分段:
age = pd.cut(titanic['age'], [0, 18, 80]) 
titanic.pivot_table('survived', ['sex', age], 'class')
class               First    Second     Third
sex    age
female (0, 18]   0.909091  1.000000  0.511628
       (18, 80]  0.972973  0.900000  0.423729
male   (0, 18]   0.800000  0.600000  0.215686
       (18, 80]  0.375000  0.071429  0.133663
#再用 pd.qcut 按分位数把票价('fare')等分为两段,作为列方向上的又一个维度
fare = pd.qcut(titanic['fare'], 2) 
titanic.pivot_table('survived', ['sex', age], [fare, 'class'])
fare            (-0.001, 14.454]                     (14.454, 512.329]
class                      First    Second     Third             First    Second     Third
sex    age
female (0, 18]               NaN  1.000000  0.714286          0.909091  1.000000  0.318182
       (18, 80]              NaN  0.880000  0.444444          0.972973  0.914286  0.391304
male   (0, 18]               NaN  0.000000  0.260870          0.800000  0.818182  0.178571
       (18, 80]              0.0  0.098039  0.125000          0.391304  0.030303  0.192308
#向量化字符串
data = ['peter', 'Paul', 'MARY', 'gUIDO']
names = pd.Series(data) 
names
0    peter
1     Paul
2     MARY
3    gUIDO
dtype: object
names.str.capitalize() #首字母大写、其余字母小写,缺失值自动跳过
0    Peter
1     Paul
2     Mary
3    Guido
dtype: object
#由于 Numexpr 在计算代数式时不需要为临时数组分配全部内存,因此计算比 NumPy 更高效,尤其适合处理大型数组。马上要介绍的 Pandas 的 eval() 和 query()工具其实也是基于 Numexpr 实现的。
import pandas as pd 
nrows, ncols = 100000, 100 
rng = np.random.RandomState(42) 
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols)) for i in range(4))
print(df1.shape)
(100000, 100)
%timeit df1 + df2 + df3 + df4
%timeit pd.eval('df1 + df2 + df3 + df4')
#这个 eval() 版本的代数式比普通方法快一倍(而且内存消耗更少),结果也是一样的
66.5 ms ± 924 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
30.3 ms ± 599 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
 df = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C']) 
 df.head()
          A         B         C
0  0.615875  0.525167  0.047354
1  0.330858  0.412879  0.441564
2  0.689047  0.559068  0.230350
3  0.290486  0.695479  0.852587
4  0.424280  0.534344  0.245216
#用DataFrame.eval()实现列间运算
#使用 dataframe.eval() 方法的好处是可以借助 列名称 进行运算
result1 = (df['A'] + df['B']) / (df['C'] - 1) 
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
result3 = df.eval('(A + B) / (C - 1)') 
print(result3)
np.allclose(result1, result3)
0     -1.197761
1     -1.331822
2     -1.621667
3     -6.688481
4     -1.270064
         ...   
995   -3.349773
996   -2.163240
997   -0.936554
998   -2.263292
999   -3.781258
Length: 1000, dtype: float64





True
#用df.eval创建新的列
df.eval('D = (A + B) / C', inplace=True) 
df.head()
          A         B         C          D
0  0.615875  0.525167  0.047354  24.095868
1  0.330858  0.412879  0.441564   1.684325
2  0.689047  0.559068  0.230350   5.418335
3  0.290486  0.695479  0.852587   1.156439
4  0.424280  0.534344  0.245216   3.909296
#DataFrame.eval() 方法还支持通过 @ 符号使用 Python 的局部变量
column_mean = df.mean(1) 
result1 = df['A'] + column_mean 
result2 = df.eval('A + @column_mean') 
np.allclose(result1, result2)
True
#query
result1 = df[(df.A < 0.5) & (df.B < 0.5)] 
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]') 
np.allclose(result1, result2)
True
#和前面介绍过的 DataFrame.eval() 一样,这是一个用 DataFrame 列创建的代数式,但是不能用 DataFrame.eval() 语法实现,因为你要的结果是包含 DataFrame 全部列的行子集;对于这种过滤运算,应该使用 query() 方法。
result2 = df.query('A < 0.5 and B < 0.5') 
np.allclose(result1, result2)
True
#query() 方法也支持用 @ 符号引用局部变量
Cmean = df['C'].mean() 
result1 = df[(df.A < Cmean) & (df.B < Cmean)] 
result2 = df.query('A < @Cmean and B < @Cmean')
np.allclose(result1, result2)
True

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值