"""Pandas data-cleaning walkthrough.

Demonstrates four cleaning tasks, printing the intermediate frames:
missing values, mixed date formats, out-of-range values and duplicate rows.
Run as a script to see all demos; each demo is also a function that returns
the cleaned frame(s) it printed.
"""
import pandas as pd

# Location of the sample CSV used by the missing-value demo.
PROPERTY_CSV = 'F:/python学习文件/实验文件夹/Pandas学习/property-data.csv'


def banner(title, width=30):
    """Print ``title`` framed by ``width`` dashes on each side."""
    rule = width * '-'
    print(rule, title, rule)


def clean_missing_values(csv_path=PROPERTY_CSV):
    """Show several ways of handling missing values in the property CSV.

    Args:
        csv_path: Path of the CSV file to load. It must contain the
            ``ST_NUM`` and ``PID`` columns.

    Returns:
        A ``(df1, df2)`` tuple: ``df1`` is the default-parsed table after the
        row drops and fills, ``df2`` is the table parsed with the custom
        missing-value markers and ``ST_NUM`` filled with its mean.
    """
    banner('空值清洗', 50)

    df1 = pd.read_csv(csv_path)
    banner('源数据表')
    print(df1)

    # Extra markers to treat as missing. pandas already recognises "n/a"
    # (and "NA") by default, but not "na" or "--", so they are listed
    # explicitly here.
    missing_values = ["n/a", "na", "--"]
    df2 = pd.read_csv(csv_path, na_values=missing_values)
    banner('指定空值后原表中的全部置换为NaN')
    print(df2)

    banner('源数据表删除包含空字符的行')
    df3 = df1.dropna()  # new frame; df1 itself is left untouched here
    print(df3)

    df1.dropna(subset=['ST_NUM'], inplace=True)
    banner('源数据表移除"ST_NUM"中为空的行')
    print(df1)

    # Assign the filled column back instead of calling
    # ``df1['PID'].fillna(..., inplace=True)``: the chained in-place form
    # operates on a temporary under Copy-on-Write and leaves df1 unchanged.
    df1['PID'] = df1['PID'].fillna(100005100)
    banner('源数据表"PID"列空值全部填充为100005100')
    print(df1)

    df1.fillna(0, inplace=True)
    banner('源数据表全部空值全部填充为0')
    print(df1)

    # median() or mode() could be used here instead of the mean.
    x = df2["ST_NUM"].mean()
    df2["ST_NUM"] = df2["ST_NUM"].fillna(x)
    banner('df2数据表"ST_NUM"列空值替换成其平均值')
    print(df2, x)

    return df1, df2


def clean_dates():
    """Normalise a column holding dates written in different formats.

    Returns:
        The demo frame with ``Date`` converted to datetime values.
    """
    banner('日期数据格式清洗', 50)
    data = {
        "Date": ['2020/12/01', '2020/12/02', '20201226',
                 '20201227', '20201228', '20201231'],
        "duration": [50, 40, 45, 25, 65, 89],
    }
    df4 = pd.DataFrame(data)
    # Parse each value on its own: a single vectorised to_datetime call
    # infers one format from the first element on pandas >= 2.0 and then
    # rejects the entries written without separators.
    df4['Date'] = df4['Date'].apply(pd.to_datetime)
    print(df4)
    return df4


def fix_outliers():
    """Replace implausible ages (>= 100) with 80.

    Returns:
        The demo frame after the replacement.
    """
    banner('异常值修改', 50)
    stu = {"name": ['Google', 'Runoob', 'Taobao'], "age": [50, 40, 12345]}
    df5 = pd.DataFrame(stu)
    # Boolean-mask assignment replaces the per-row Python loop.
    df5.loc[df5['age'] >= 100, 'age'] = 80
    print(df5)
    return df5


def drop_duplicate_rows():
    """Remove fully duplicated rows from the demo frame.

    Returns:
        The demo frame with duplicates removed (first occurrence kept).
    """
    banner('删除重复值', 50)
    stu1 = {"name": ['Google', 'Google', 'Taobao'], "age": [50, 50, 12345]}
    df6 = pd.DataFrame(stu1)
    df6.drop_duplicates(inplace=True)
    print(df6)
    return df6


def main():
    """Run every cleaning demo in the original order."""
    clean_missing_values()
    clean_dates()
    fix_outliers()
    drop_duplicate_rows()


if __name__ == "__main__":
    main()