import pandas as pd
import numpy as np
import seaborn as sns
# Load seaborn's example Titanic dataset and preview its first five rows.
titanic = sns.load_dataset('titanic')
titanic.head(n=5)
| survived | pclass | sex | age | sibsp | parch | fare | embarked | class | who | adult_male | deck | embark_town | alive | alone |
---|
0 | 0 | 3 | male | 22.0 | 1 | 0 | 7.2500 | S | Third | man | True | NaN | Southampton | no | False |
---|
1 | 1 | 1 | female | 38.0 | 1 | 0 | 71.2833 | C | First | woman | False | C | Cherbourg | yes | False |
---|
2 | 1 | 3 | female | 26.0 | 0 | 0 | 7.9250 | S | Third | woman | False | NaN | Southampton | yes | True |
---|
3 | 1 | 1 | female | 35.0 | 1 | 0 | 53.1000 | S | First | woman | False | C | Southampton | yes | False |
---|
4 | 0 | 3 | male | 35.0 | 0 | 0 | 8.0500 | S | Third | man | True | NaN | Southampton | no | True |
---|
# Mean of `survived` per sex, returned as a one-column DataFrame indexed by sex.
titanic[['sex', 'survived']].groupby('sex').mean()
| survived |
---|
sex | |
---|
female | 0.742038 |
---|
male | 0.188908 |
---|
# Mean survival per (sex, class), with class levels unstacked into columns.
# `class` is a categorical column, so `observed` is stated explicitly:
# observed=False keeps every category in the result and avoids the
# FutureWarning newer pandas emits when the argument is left to its default.
titanic.groupby(['sex', 'class'], observed=False)['survived'].mean().unstack()
class | First | Second | Third |
---|
sex | | | |
---|
female | 0.968085 | 0.921053 | 0.500000 |
---|
male | 0.368852 | 0.157407 | 0.135447 |
---|
# Same sex-by-class survival table as the groupby/unstack version, built with
# pivot_table (whose default aggregation is the mean).
# `class` is categorical; observed=False pins the current behaviour and avoids
# the FutureWarning raised when `observed` is omitted for categorical groupers.
titanic.pivot_table(
    values='survived',
    index='sex',
    columns='class',
    observed=False,
)
class | First | Second | Third |
---|
sex | | | |
---|
female | 0.968085 | 0.921053 | 0.500000 |
---|
male | 0.368852 | 0.157407 | 0.135447 |
---|
# Bin passenger ages into two intervals: (0, 18] and (18, 80].
age = pd.cut(titanic['age'], bins=[0, 18, 80])
# Survival rate by sex and age bin (rows) against class (columns).
# Both `age` (from pd.cut) and `class` are categorical groupers, so `observed`
# is given explicitly to keep the current output and avoid the FutureWarning
# newer pandas emits when it is omitted.
titanic.pivot_table(
    values='survived',
    index=['sex', age],
    columns='class',
    observed=False,
)
| class | First | Second | Third |
---|
sex | age | | | |
---|
female | (0, 18] | 0.909091 | 1.000000 | 0.511628 |
---|
(18, 80] | 0.972973 | 0.900000 | 0.423729 |
---|
male | (0, 18] | 0.800000 | 0.600000 | 0.215686 |
---|
(18, 80] | 0.375000 | 0.071429 | 0.133663 |
---|
# Split fares into two quantile-based bins (below / above the median fare).
fare = pd.qcut(titanic['fare'], q=2)
# Four-way breakdown: sex and age bin on the rows, fare bin and class on the
# columns. All three binned/categorical groupers are involved here, so
# `observed` is set explicitly to keep the current output and avoid the
# FutureWarning newer pandas emits when it is omitted.
titanic.pivot_table(
    values='survived',
    index=['sex', age],
    columns=[fare, 'class'],
    observed=False,
)
| fare | (-0.001, 14.454] | (14.454, 512.329] |
---|
| class | First | Second | Third | First | Second | Third |
---|
sex | age | | | | | | |
---|
female | (0, 18] | NaN | 1.000000 | 0.714286 | 0.909091 | 1.000000 | 0.318182 |
---|
(18, 80] | NaN | 0.880000 | 0.444444 | 0.972973 | 0.914286 | 0.391304 |
---|
male | (0, 18] | NaN | 0.000000 | 0.260870 | 0.800000 | 0.818182 | 0.178571 |
---|
(18, 80] | 0.0 | 0.098039 | 0.125000 | 0.391304 | 0.030303 | 0.192308 |
---|
# Names with inconsistent capitalization, wrapped in a Series with the
# default integer index.
data = ['peter', 'Paul', 'MARY', 'gUIDO']
names = pd.Series(data=data)
names
0 peter
1 Paul
2 MARY
3 gUIDO
dtype: object
# Vectorized string method: capitalize every element of the Series
# (first letter upper-case, the rest lower-case).
names.str.capitalize()
0 Peter
1 Paul
2 Mary
3 Guido
dtype: object
import pandas as pd
# Four large random frames, drawn one after another from a single seeded
# generator so the data is reproducible.
nrows = 100_000
ncols = 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = [
    pd.DataFrame(rng.rand(nrows, ncols)) for _ in range(4)
]
print(df1.shape)
(100000, 100)
# IPython `%timeit` line magics (notebook/IPython syntax, not plain Python):
# time the ordinary chained addition against the same expression evaluated
# from a string by pd.eval.
%timeit df1 + df2 + df3 + df4
%timeit pd.eval('df1 + df2 + df3 + df4')
66.5 ms ± 924 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
30.3 ms ± 599 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# 1000 rows of uniform random values in three columns named A, B and C,
# drawn from the shared seeded generator `rng`.
df = pd.DataFrame(data=rng.rand(1000, 3), columns=list('ABC'))
df.head(n=5)
| A | B | C |
---|
0 | 0.615875 | 0.525167 | 0.047354 |
---|
1 | 0.330858 | 0.412879 | 0.441564 |
---|
2 | 0.689047 | 0.559068 | 0.230350 |
---|
3 | 0.290486 | 0.695479 | 0.852587 |
---|
4 | 0.424280 | 0.534344 | 0.245216 |
---|
# Three equivalent ways to compute (A + B) / (C - 1) for every row.
# Baseline: ordinary pandas arithmetic.
result1 = (df['A'] + df['B']) / (df['C'] - 1)
# Top-level pd.eval: columns are reached through the `df` variable.
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
# DataFrame.eval: column names can be used bare inside the expression.
result3 = df.eval('(A + B) / (C - 1)')
print(result3)
# Compare both eval variants with the baseline so neither goes unverified.
np.allclose(result1, result2) and np.allclose(result1, result3)
0 -1.197761
1 -1.331822
2 -1.621667
3 -6.688481
4 -1.270064
...
995 -3.349773
996 -2.163240
997 -0.936554
998 -2.263292
999 -3.781258
Length: 1000, dtype: float64
True
# Add column D = (A + B) / C directly on `df` (inplace=True mutates the
# frame instead of returning a copy), then preview the result.
df.eval(expr='D = (A + B) / C', inplace=True)
df.head(n=5)
| A | B | C | D |
---|
0 | 0.615875 | 0.525167 | 0.047354 | 24.095868 |
---|
1 | 0.330858 | 0.412879 | 0.441564 | 1.684325 |
---|
2 | 0.689047 | 0.559068 | 0.230350 | 5.418335 |
---|
3 | 0.290486 | 0.695479 | 0.852587 | 1.156439 |
---|
4 | 0.424280 | 0.534344 | 0.245216 | 3.909296 |
---|
# Despite its name, `column_mean` holds one value per row: the mean taken
# across the columns (axis=1).
column_mean = df.mean(axis=1)
result1 = df.A + column_mean
# Inside DataFrame.eval, the `@` prefix refers to a Python variable rather
# than a column of the frame.
result2 = df.eval('A + @column_mean')
np.allclose(result1, result2)
True
# Rows where both A and B are below 0.5, selected with a boolean mask ...
result1 = df.loc[(df['A'] < 0.5) & (df['B'] < 0.5)]
# ... and the same selection written as a string for pd.eval.
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]')
np.allclose(result1, result2)
True
# Same row filter expressed with DataFrame.query, compared against the
# boolean-mask result already held in `result1`.
result2 = df.query(expr='A < 0.5 and B < 0.5')
np.allclose(result1, result2)
True
# Threshold: the mean of column C.
Cmean = df.C.mean()
# Rows where both A and B fall below that threshold, via a boolean mask ...
result1 = df.loc[(df['A'] < Cmean) & (df['B'] < Cmean)]
# ... and via query, where `@Cmean` refers to the Python variable.
result2 = df.query('A < @Cmean and B < @Cmean')
np.allclose(result1, result2)
True