import pandas as pd
import numpy as np
# Load the Titanic passenger table from the CSV file into a DataFrame.
titanic = pd.read_csv("../../data/titanic.csv")
# Preview the first two rows.
titanic.iloc[:2]
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
筛选满足条件的行
本质:基于等长的 bool Series 筛选数据,True就把数据留下,否则跳过。
布尔Series生成方法:
1、关系表达式:titanic["Age"] > 35。注意:组合多个条件时,每个条件表达式外要加 (),并用 &、|、~ 连接,而不能用 and、or、not
2、isin()函数:titanic["Pclass"].isin([2, 3])
3、notna()函数:titanic["Age"].notna(),例如 titanic[titanic["Age"].notna()]
4、Series map/apply函数
5、DataFrame apply 函数(axis=1 时逐行计算布尔值)
# Hand-built boolean mask: True for the first two rows, False for the rest.
# The mask is sized from the frame and shares its index, because boolean
# Series indexing aligns on index labels; this also keeps the lengths equal
# when the frame has fewer than two rows.
tect = pd.Series(np.arange(len(titanic)) < 2, index=titanic.index)
# Rows whose mask entry is True are kept.
titanic[tect]
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
# Keep passengers whose Age is greater than 35 and show the first two matches.
titanic.loc[titanic["Age"].gt(35)].head(2)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
6 | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 17463 | 51.8625 | E46 | S |
---|
# Small demo frame: two string label columns plus a column of random normals.
df2 = pd.DataFrame(
    dict(
        a="one one two three two one six".split(),
        b=list("xyyxyxx"),
        c=np.random.randn(7),
    )
)
df2
| a | b | c |
---|
0 | one | x | 1.785649 |
---|
1 | one | y | 0.867109 |
---|
2 | two | y | -0.752323 |
---|
3 | three | x | -2.338581 |
---|
4 | two | y | 0.799080 |
---|
5 | one | x | 0.396302 |
---|
6 | six | x | 2.006012 |
---|
# Boolean mask built with Series.map: True where the value in column "a"
# begins with "t".
criterion = df2["a"].map(lambda label: label.startswith("t"))
# Select the rows flagged by the mask.
df2.loc[criterion]
| a | b | c |
---|
2 | two | y | -0.752323 |
---|
3 | three | x | -2.338581 |
---|
4 | two | y | 0.799080 |
---|
# Filter rows whose "a" value starts with "t", building the mask with
# Series.apply.
df2.loc[df2["a"].apply(lambda label: label.startswith("t"))]
| a | b | c |
---|
2 | two | y | -0.752323 |
---|
3 | three | x | -2.338581 |
---|
4 | two | y | 0.799080 |
---|
# Combine two masks with `&`; the comparison is parenthesised because `&`
# binds tighter than `==`. The result is not assigned to anything.
df2[criterion & (df2['b'] == 'x')]
# Row-wise predicate via DataFrame.apply(axis=1): each row arrives as a Series
# indexed by column name, so fields are read by label. Integer keys such as
# row[0] on a label-indexed Series rely on a deprecated positional fallback.
df2[df2.apply(lambda row: row["a"] == "one" and row["b"] != "y", axis=1)]
| a | b | c |
---|
0 | one | x | 1.785649 |
---|
5 | one | x | 0.396302 |
---|
# Rows flagged by `criterion` whose "b" value is also "y".
df2[criterion & df2["b"].eq("y")]
| a | b | c |
---|
2 | two | y | -0.752323 |
---|
4 | two | y | 0.799080 |
---|