# 1、处理缺失数据 (1. Handling missing data)
import pandas as pd
# Load the log file and inspect where values are missing.
log_data = pd.read_csv('log.csv')
log_data  # bare expression: only displayed in an interactive session
print(log_data.isna())             # cell-by-cell missing-value mask
print(log_data['paused'].isna())   # same mask for the 'paused' column only
has_volume = log_data['volume'].notna()
print(log_data.loc[has_volume])    # rows whose 'volume' is present

# Use (time, user) as a sorted two-level index.
log_data = log_data.set_index(['time', 'user']).sort_index()
print(log_data)

# Each call below returns a new frame; log_data itself is left unchanged.
print(log_data.fillna(value=0))    # missing values replaced by 0
print(log_data.dropna())           # rows containing a missing value removed
print(log_data.ffill())            # gaps filled from the previous row
print(log_data.bfill())            # gaps filled from the next row
# 2、数据变形 (2. Reshaping / transforming data)
# Seven-row frame whose last two rows are identical, used for the
# duplicate-detection examples.
k1_values = ['one', 'two'] * 3 + ['two']
k2_values = [1, 1, 2, 3, 3, 4, 4]
data = pd.DataFrame({'k1': k1_values, 'k2': k2_values})
print(data)
# 判断数据是否重复 (Check whether rows are duplicated)
# Boolean mask marking rows that repeat an earlier row (result not stored).
data.duplicated(keep='first')
# Copy of the frame without repeated rows; `data` itself is not modified.
data.drop_duplicates(keep='first')

# Add a running counter column; range(7) must match the frame's row count.
data['v1'] = range(7)
print(data)

# Deduplicate on a subset of columns (results not stored):
# first occurrence per 'k1', then last occurrence per ('k1', 'k2') pair.
data.drop_duplicates(subset=['k1'])
data.drop_duplicates(subset=['k1', 'k2'], keep='last')
# Foods (with inconsistent capitalisation) and their weights in ounces.
foods = ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
         'Bacon', 'pastrami', 'honey ham', 'nova lox']
weights = [4, 3, 12, 6, 7.5, 8, 3, 5, 6]
data = pd.DataFrame({'food': foods, 'ounces': weights})
print(data)

# Lookup table keyed by the lower-cased food name.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

# Variant 1: normalise the case first, then map through the dict.
lowercased = data['food'].str.lower()
data['animal'] = lowercased.map(meat_to_animal)
print(data)


# Variant 2: do the normalisation and lookup per element in one function.
def _animal_for(food_name):
    """Return the animal for *food_name*, ignoring capitalisation."""
    return meat_to_animal[food_name.lower()]


data['animal2'] = data['food'].map(_animal_for)
print(data)
import numpy as np

# Series containing the values -999 and -1000 that the replace() calls
# below target.
data = pd.Series([1., -999., 2., -999., -1000., 3.])
print(data)

# replace() returns a new Series each time; none of these results is stored,
# so `data` keeps its original values.
data.replace(to_replace=-999, value=np.nan)                # one value -> NaN
data.replace(to_replace=[-999, -1000], value=np.nan)       # several values -> NaN
data.replace(to_replace=[-999, -1000], value=[np.nan, 0])  # pairwise replacement
data.replace(to_replace={-999: np.nan, -1000: 0})          # same, as a mapping
# 3、离散化和分箱操作 (3. Discretization and binning)
# Bucket the ages into the intervals (18, 25], (25, 35], (35, 60], (60, 100];
# pd.cut closes each interval on the right by default.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
print(type(cats))
print(cats)
print(cats.codes)        # integer bin index for each age
print(cats.categories)   # the interval for each bin
# The top-level pd.value_counts() is deprecated; the Series method is the
# supported way to count members per bin. The result is not stored.
pd.Series(cats).value_counts()

# Same binning, with a readable label per interval instead of the interval.
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
cats = pd.cut(ages, bins, labels=group_names)
# Categorical.get_values() no longer exists in current pandas; np.asarray()
# produces the same array of labels.
print(np.asarray(cats))
# Small frame with a categorical-like 'key' column and a counter column.
key_labels = ['b', 'b', 'a', 'c', 'a', 'b']
df = pd.DataFrame({'key': key_labels, 'data1': range(6)})
print(df)
# One indicator column per distinct key; the result is not stored.
pd.get_dummies(df.loc[:, 'key'])
# Name -> e-mail address; the entry for 'Wes' is missing (NaN).
data = pd.Series({
    'Dave': 'dave@google.com',
    'Steve': 'steve@gmail.com',
    'Rob': 'rob@gmail.com',
    'Wes': np.nan,
})
print(data)

# Vectorised string operations via the .str accessor.
print(data.str.contains('gmail'))   # does the address mention gmail?
print(data.str.slice(stop=5))       # first five characters of each address

# Split each address at '@' into two columns (0 and 1).
split_df = data.str.split(pat='@', expand=True)
print(split_df)
# NOTE: str.cat returns a new Series that is not stored here, so the print
# below shows the same, unmodified split_df again.
split_df[0].str.cat(others=split_df[1], sep='@')
print(split_df)