import numpy as np
import pandas as pd
# Part 1: handling missing data
data1 = pd.Series(['a', 'b', np.nan, 'd'])

# --- Detecting missing values ---
# The null mask is True wherever an entry is missing.
print(pd.isnull(data1))
# None is treated as a missing value too.
data1[0] = None
print(pd.isnull(data1))

# --- Removing missing values ---
# Approach 1: dropna() returns only the non-missing entries.
print(data1.dropna())
# Approach 2: boolean indexing with the not-null mask selects the same entries.
print(data1[pd.notnull(data1)])
# A 4x3 frame mixing a complete row, partially missing rows and an all-NaN row.
data2 = pd.DataFrame([
    [1., 6.5, 3.],
    [1., np.nan, np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 6.7, 7.],
])

# With no arguments, dropna() discards every row that contains any missing value.
print(data2.dropna())
# Expected output:
#      0    1    2
# 0  1.0  6.5  3.0

# how='all' discards only the rows in which every value is NaN.
print(data2.dropna(how='all'))
# Expected output:
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 3  NaN  6.7  7.0

# axis=1 applies the same 'all' rule to columns; no column here is entirely
# NaN, so the frame comes back unchanged.
print(data2.dropna(how='all', axis=1))
# Expected output:
#      0    1    2
# 0  1.0  6.5  3.0
# 1  1.0  NaN  NaN
# 2  NaN  NaN  NaN
# 3  NaN  6.7  7.0
# A 7x3 frame of random normal values to punch holes into.
df = pd.DataFrame(np.random.randn(7, 3))
# Blank out column 1 for the first four rows.
df.iloc[0:4, 1] = np.nan
print(df)
# Blank out column 2 for the first two rows, leaving rows 0 and 1 with a
# single non-NaN value each.
df.iloc[0:2, 2] = np.nan
# thresh=2 keeps only the rows that have at least two non-NaN values,
# so rows 0 and 1 are dropped.
print(df.dropna(thresh=2))
# Sample output (the numbers differ on every run):
#           0         1         2
# 2 -0.020213       NaN -0.574773
# 3  0.209291       NaN  0.647866
# 4  1.790216 -1.095708 -0.511631
# 5 -1.427289  0.660130 -1.234292
# 6  0.133132 -1.020107  0.770566

# --- Filling missing values ---
# Replace every NaN with 0.
print(df.fillna(0))
# Per-column fill values: column 1 gets 0.9, column 2 gets 0.
column_fill = {1: 0.9, 2: 0}
print(df.fillna(column_fill))
# inplace=True modifies df itself instead of returning a new frame.
df.fillna(column_fill, inplace=True)
# A 6x3 random frame with trailing gaps: column 1 is NaN from row 2 onward
# and column 2 is NaN from row 4 onward.
df2 = pd.DataFrame(np.random.randn(6, 3))
df2.iloc[2:, 1] = np.nan
df2.iloc[4:, 2] = np.nan
print(df2)

# Forward fill: each NaN takes the last non-NaN value above it in its column.
# NOTE: the older spelling fillna(method='ffill') is deprecated and emits a
# FutureWarning ("will raise in a future version"), so ffill() is used directly.
print(df2.ffill())
# limit=2 fills at most two consecutive NaNs per gap; the remaining entries of
# the longer gap in column 1 stay NaN.
print(df2.ffill(limit=2))