3.3 分组聚合和透视图
(1)数据排序
import pandas as pd
vgsales = pd.read_csv('vgsales.csv', encoding='utf-8', dtype={'Year': float})
# ascending=True按照升序排序,na_position='last'缺失值排最后
vgsales.sort_values('Year', ascending=True, na_position='last', inplace=True)
vgsales.reset_index(drop=True, inplace=True) # 重置索引
vgsales.sort_values(['NA_Sales', 'EU_Sales'])
(2)分组聚合
import pandas as pd
import numpy as np
vgsales = pd.read_csv('vgsales.csv', dtype={'Year': float})
var_name = ['NA', 'EU', 'JP', 'GLO']
np.min(vgsales[var_name], axis=0)
np.median(vgsales[var_name], axis=0)
np.max(vgsales[var_name], axis=0)
vgsales[var_name].min(axis=0)
vgsales[var_name].max(axis=0)
vgsales[var_name].cumsum(axis=0)
vgsales[var_name].quantile([0, 0.2, 0.5, 1], axis=0)
vgsales[var_name].describe()
# 分类汇总
groupde = vgsales.groupby('Year')
groupde.mean()
groupde.max()
groupde.count()
groupde.size()
groupde.cumcount()
names = ['Year', 'Genre', 'NA_Sales', 'EU_Sales', 'JP_Sales']
groupde = vgsales[names].groupby(by=['Year', 'Genre'])
groupde.mean()
groupde.median().loc[([1980, 1986], 'Fighting'), :]
# 聚合函数
groupde = vgsales[names].groupby('Year')
groupde.agg([np.mean, np.sum])
groupde.agg([np.mean, np.sum]).loc[[1980, 1981], ('NA_Sales', ['mean', 'sum'])]
groupde.agg({'NA_Sales': np.sum, 'EU_Sales': [np.mean, np.sum]})
# 自定义函数
def DoubleSum(data):
s = data.sum()*2
return s
groupde.agg({'EU_Sales': DoubleSum})
var_names = ['Year', 'Genre', 'NA_Sales', 'EU_Sales', 'JP_Sales']
vgsales[var_names].apply(np.sum, axis=1)
(3)透视图和交叉表
透视图:
import pandas as pd
import numpy as np
import os
# 透视表
os.chdir('路径')
df = pd.read_csv('文件名', dtype={'Year': float})
pd.pivot_table(df, index='Year', values='Global_Sales', columns='Genre', aggfunc=np.mean, fill_value=0, margins=True,
margins_name='总计求均值')
# 交叉表(用来计算频率和频数)
pd.crosstab(index=df['Platform'], columns=df['Genre'], margins=True)
pd.crosstab(index=df['Platform'], columns=df['Genre'], normalize='all', margins=True)
3.4 Pandas数据预处理
(1)Pandas其他函数运用
import pandas as pd
import numpy as np
import os
os.chdir('地址')
sec_cars = pd.read_csv('文件名', na_values='暂无')
sec_cars.dtypes
sec_cars['Boarding_time'] = pd.to_datetime(sec_cars['Boarding_time'], format='%Y年%m月', errors='coerce')
sec_cars['New_price'] = sec_cars['New_price'].str[:-1].astype('float')
sec_cars['Discharge'].value_counts()
sec_cars['Discharge'] = sec_cars['Discharge'].str.replace('--', '缺失值')
sec_cars.describe()
sec_cars['diff_day'] = pd.datetime.today() - sec_cars['Boarding_time']
sec_cars['diff_day'] = pd.datetime.today().year
df = pd.read_csv('文件名')
df['tel'] = df['tel'].apply(lambda x: x.replace(x[3:7], '****'))
df['email'] = df['email'].apply(lambda x: x.split('@')[1])
df = pd.read_csv('文件名')
data = df.sample(n=50000, replace=False)
data['data'] = data['InvoiceDate'].dt.date
data['total_price'] = data[['Quantity', 'UnitPrice']].apply(np.prod, axis=1)
grouped_data = data.groupby(by='data').sum()
# 移动平均
grouped_data['SMA5'] = grouped_data['total_price'].rolling(5).mean()
grouped_data['SMA10'] = grouped_data['total_price'].rolling(10).mean()
# 向下平移一个单位(-1向上)
grouped_data['total_price'].shift(1)
(2)数据清洗
1. 重复值处理
import pandas as pd
import numpy as np
import os
os.chdir('路径')
df = pd.read_csv('文件名')
# 判断重复
df.duplicated()
df[df.duplicated(subset=['appname', 'comments'], keep='last')]
np.sum(df.duplicated())
# 去掉重复值
df.drop_duplicates()
df.drop_duplicates(subset=['comments', 'install'], inplace=True)
2. 缺失值处理
import pandas as pd
import numpy as np
import os
os.chdir('路径')
df = pd.read_excel('文件名')
np.sum(df.isnull())
df.apply(lambda x: sum(x.isnull()) / len(x), axis=0)
# 直接删除法
df.dropna()
df.dropna(how='any', axis=0)
df.dropna(how='all', axis=0)
df.drop(['age', 'gender'], axis=1)
df.drop(labels=['age', 'gender'], axis=1)
df.dropna(subset=['age', 'gender'], how='any')
# 替换法
df.age.fillna(df.age.mean()) # 平均
df.gender.fillna(df.gender.mode()[0]) # 众数
df.fillna(20)
df.age.fillna(20)
df.fillna(value={'gender':df.gender.mode()[0], 'age': df.age.mean(), 'income': df.income.median()})
# 前向填补,后向填补
df.fillna(method='ffill')
df.fillna(method='bfill')
# 插值法
df.fillna(method='linear')
df.fillna(method='polynomial', order=1)
3. 异常值处理
import pandas as pd
import os
os.chdir('地址')
sunspots = pd.read_csv('文件名', sep=',')
# 标准差法(均值的正负两个标准差)
xbar = sunspots.counts.mean()
xstd = sunspots.counts.std()
xbar + 2 * xstd
xbar - 2 * xstd
any(sunspots.counts > xbar + 2 * xstd)
any(sunspots.counts < xbar - 2 * xstd)
sunspots.counts.plot(kind='hist')
# 箱型图法
Q1 = sunspots.counts.quantile(q=0.25) # 下四分位数
Q3 = sunspots.counts.quantile(q=0.75) # 上四分位数
IQR = Q3 - Q1
any(sunspots.counts > Q3 + 1.5 * IQR)
any(sunspots.counts < Q1 - 1.5 * IQR)
sunspots.counts.plot(kind='box')
# 解决:替换法
sunspots.counts.describe()
UL = Q3 + 1.5 * IQR
replace_value = sunspots.counts[sunspots.counts < UL].max()
sunspots.loc[sunspots.counts > UL, 'counts'] = replace_value
P1 = sunspots.counts.quantile(q=0.01)
P99 = sunspots.counts.quantile(q=0.99)
sunspots['counts_new'] = sunspots['counts']
sunspots.loc[sunspots['counts'] > P99, 'counts_new'] = P99
sunspots.loc[sunspots['counts'] < P1, 'counts_new'] = P1
(3)数据离散化
import pandas as pd
import numpy as np
import os
os.chdir('地址')
sunspots = pd.read_csv('文件名', sep=',')
# 等宽分段(容易受异常值影响)
sunspots['counts_bin'] = pd.cut(sunspots['counts'], 4, labels=range(1, 5))
sunspots['counts_bin'] = pd.cut(sunspots['counts'], 4, labels=['1', '2', '3', 'four'])
# 等频分段
k = 4
w = [i/k for i in range(k+1)]
sunspots['counts_bin'] = pd.qcut(sunspots['counts'], w, labels=range(0, 4))
k = 4
w1 = sunspots['counts'].quantile([i/k for i in range(k+1)])
pd.cut(sunspots['counts'], w1, labels=range(0, 4))