7.1处理缺失数据
NaN(Not a Number)在pandas中表示缺失值
import pandas as pd
import numpy as np
string_data=pd.Series(['aardvark','artichoke',np.nan,'avocado'])
string_data
0 aardvark
1 artichoke
2 NaN
3 avocado
dtype: object
string_data.isnull()
0 False
1 False
2 True
3 False
dtype: bool
string_data[0]=None
string_data.isnull()
0 True
1 False
2 True
3 False
dtype: bool
- dropna:删除缺失数据
- fillna:用指定值或插值方法(如ffill、bfill)填充缺失数据
- isnull: 返回布尔值,表明哪些是缺失值
- notnull:isnull的否定,对非缺失值返回True
滤除缺失数据
from numpy import nan as NA
data=pd.Series([1,NA,3.5,NA,7])
data.dropna()
0 1.0
2 3.5
4 7.0
dtype: float64
data[data.notnull()]
0 1.0
2 3.5
4 7.0
dtype: float64
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
[NA, NA, NA], [NA, 6.5, 3.]])
data
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
cleaned=data.dropna()
cleaned
data.dropna(how='all')
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0
data[4]=NA
data
     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  NaN  NaN  NaN NaN
3  NaN  6.5  3.0 NaN
data.dropna(axis=1,how='all')
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
df=pd.DataFrame(np.random.randn(7,3))
df.iloc[:4,1]=NA
df.iloc[:2,2]=NA
df
          0         1         2
0  1.230124       NaN       NaN
1 -0.671868       NaN       NaN
2 -0.596658       NaN  0.002418
3 -1.061044       NaN -0.246041
4 -0.677290 -1.394329 -1.870510
5 -0.313459  0.133874 -1.172282
6 -0.495465 -0.954127  0.150156
df.dropna()
          0         1         2
4 -0.677290 -1.394329 -1.870510
5 -0.313459  0.133874 -1.172282
6 -0.495465 -0.954127  0.150156
df.dropna(thresh=2)
          0         1         2
2 -0.596658       NaN  0.002418
3 -1.061044       NaN -0.246041
4 -0.677290 -1.394329 -1.870510
5 -0.313459  0.133874 -1.172282
6 -0.495465 -0.954127  0.150156
填充缺失数据
df.fillna(0)
          0         1         2
0  1.230124  0.000000  0.000000
1 -0.671868  0.000000  0.000000
2 -0.596658  0.000000  0.002418
3 -1.061044  0.000000 -0.246041
4 -0.677290 -1.394329 -1.870510
5 -0.313459  0.133874 -1.172282
6 -0.495465 -0.954127  0.150156
df.fillna({
1:0.5,2:0})
          0         1         2
0  1.230124  0.500000  0.000000
1 -0.671868  0.500000  0.000000
2 -0.596658  0.500000  0.002418
3 -1.061044  0.500000 -0.246041
4 -0.677290 -1.394329 -1.870510
5 -0.313459  0.133874 -1.172282
6 -0.495465 -0.954127  0.150156
_=df.fillna(0,inplace=True)
df
          0         1         2
0  1.230124  0.000000  0.000000
1 -0.671868  0.000000  0.000000
2 -0.596658  0.000000  0.002418
3 -1.061044  0.000000 -0.246041
4 -0.677290 -1.394329 -1.870510
5 -0.313459  0.133874 -1.172282
6 -0.495465 -0.954127  0.150156
df=pd.DataFrame(np.random.randn(6,3))
df.iloc[2:,1]=NA
df.iloc[4:,2]=NA
df
          0         1         2
0  0.536292 -0.231305 -0.944116
1 -0.216595  1.808402  1.086082
2 -0.457510       NaN -0.617013
3 -0.163709       NaN  0.450099
4  0.969959       NaN       NaN
5  1.136978       NaN       NaN
df.fillna(method='ffill')
          0         1         2
0  0.536292 -0.231305 -0.944116
1 -0.216595  1.808402  1.086082
2 -0.457510  1.808402 -0.617013
3 -0.163709  1.808402  0.450099
4  0.969959  1.808402  0.450099
5  1.136978  1.808402  0.450099
df.fillna(method='ffill',limit=2)
          0         1         2
0  0.536292 -0.231305 -0.944116
1 -0.216595  1.808402  1.086082
2 -0.457510  1.808402 -0.617013
3 -0.163709  1.808402  0.450099
4  0.969959       NaN  0.450099
5  1.136978       NaN  0.450099
data=pd.Series([1., NA, 3.5, NA, 7])
data
0 1.0
1 NaN
2 3.5
3 NaN
4 7.0
dtype: float64
data.fillna(data.mean())
0 1.000000
1 3.833333
2 3.500000
3 3.833333
4 7.000000
dtype: float64
fillna参数:
- value:用于填充缺失值的标量值或字典对象
- method:填充方式,'ffill'为向前填充,'bfill'为向后填充;默认为None(新版pandas已弃用该参数,建议改用ffill()/bfill()方法)
- axis:默认为axis=0
- inplace:如果为True,则直接修改原对象,而不是返回新对象
- limit:向前/向后填充时,可以连续填充的最大数目
7.2数据转换
移除重复数据
data = pd.DataFrame({
'k1': ['one', 'two'] * 3 + ['two'],
'k2': [1, 1, 2, 3, 3, 4, 4]})
data
    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
6  two   4
data.duplicated()
0 False
1 False
2 False
3 False
4 False
5 False
6 True
dtype: bool
data.drop_duplicates()
    k1  k2
0  one   1
1  two   1
2  one   2
3  two   3
4  one   3
5  two   4
data['v1']=range(7)
data
|
k1 |
k2 |
v1 |
0 |
one |
1 |
0 |
1 |
two |
1 |
1 |
2 |
one |
2 |
2 |
3 |
two |
3 |
3 |
4 |
one |