#目录
5.Aggregating data
5.Aggregating data
#import required libraries
import pandas as pd
import numpy as np
# Load the BigMart dataset, discard every row that contains at least one
# missing value, and renumber the surviving rows from 0 (the old index is
# dropped rather than kept as a column).
data_BM = (
    pd.read_csv('bigmart_data.csv')
    .dropna(how="any")
    .reset_index(drop=True)
)
# Preview the first rows when needed:
# data_BM.head()
# Group the rows by item type.
# groupby sorts the group keys by default, so the item types come out in
# alphabetical order.
price_by_item = data_BM.groupby(by="Item_Type")
# Preview the first rows of every group when needed:
# price_by_item.head()
# Mean Item_MRP within each item type.
price_by_item["Item_MRP"].mean()
#out<<
Item_Type
Baking Goods 125.795653
Breads 141.300639
Breakfast 134.090683
Canned 138.551179
Dairy 149.481471
Frozen Foods 140.095830
Fruits and Vegetables 145.418257
Hard Drinks 140.102908
Health and Hygiene 131.437324
Household 149.884244
Meat 140.279344
Others 137.640870
Seafood 146.595782
Snack Foods 147.569955
Soft Drinks 130.910182
Starchy Foods 151.256747
Name: Item_MRP, dtype: float64
# Group on two columns at once; only the first 10 rows of data_BM are grouped.
multiple_groups = data_BM.head(10).groupby(by=['Item_Type', 'Item_Fat_Content'])
# Preview the first rows of every group when needed:
# multiple_groups.head()
# Cross-tabulate outlet location type (rows) against outlet size (columns);
# margins=True appends the "All" row and column holding the totals.
pd.crosstab(
    index=data_BM["Outlet_Location_Type"],
    columns=data_BM["Outlet_Size"],
    margins=True,
)
#out<<
Outlet_Size High Medium Small All
Outlet_Location_Type
Tier 1 0 930 930 1860
Tier 2 0 0 930 930
Tier 3 932 928 0 1860
All 932 1858 1860 4650
# Create a pivot table: one row per outlet establishment year, holding the
# mean Item_Outlet_Sales for that year (mean is pivot_table's default
# aggregation).
pd.pivot_table(data_BM, index=['Outlet_Establishment_Year'], values="Item_Outlet_Sales")
#out<<
Item_Outlet_Sales
Outlet_Establishment_Year
1987 2298.995256
1997 2277.844267
1999 2348.354635
2004 2438.841866
2009 1995.498739
# Pivot table indexed by (establishment year, outlet size); each cell is the
# mean Item_Outlet_Sales for that combination.
pd.pivot_table(
    data=data_BM,
    values="Item_Outlet_Sales",
    index=['Outlet_Establishment_Year', 'Outlet_Size'],
    aggfunc="mean",
)
#out<<
Item_Outlet_Sales
Outlet_Establishment_Year Outlet_Size
1987 High 2298.995256
1997 Small 2277.844267
1999 Medium 2348.354635
2004 Small 2438.841866
2009 Medium 1995.498739
本文通过使用Pandas和NumPy库,对BigMart销售数据进行了深入分析。从数据预处理到聚合操作,展示了如何按商品类型分组计算平均售价,以及如何创建交叉表和透视表来揭示销售趋势和模式。

被折叠的 条评论
为什么被折叠?



