# Basic packages (standard library)
import datetime  # manipulating date formats
import random as rd  # generating random numbers
import warnings  # settings

# Basic packages (third party)
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Viz
import matplotlib.pyplot as plt  # basic plotting
import seaborn as sns  # for prettier plots

# TIME SERIES
# NOTE(review): statsmodels.tsa.arima_model was removed in newer statsmodels
# releases (ARIMA now lives in statsmodels.tsa.arima.model) — confirm the
# installed version before relying on this import.
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, acf, pacf,arma_order_select_ic
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs
warnings.filterwarnings("ignore")
# Load all of the competition CSV files.
# The data directory is written once so it only has to be changed in one place
# when the files live somewhere else; the resulting paths are unchanged.
DATA_DIR = "D:/2018_BigData/Python/Kaggle_learning/Predict Future Sales - data science/data-sources"

sales = pd.read_csv(f"{DATA_DIR}/sales_train.csv")
item_cat = pd.read_csv(f"{DATA_DIR}/item_categories.csv")
item = pd.read_csv(f"{DATA_DIR}/items.csv")
shops = pd.read_csv(f"{DATA_DIR}/shops.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
# Source data files
# sales_train.csv - the training set. Daily historical data from January 2013 to October 2015.
# test.csv - the test set. You need to forecast the sales for these shops and products for November 2015.
# sample_submission.csv - a sample submission file in the correct format.
# items.csv - supplemental information about the items/products.
# item_categories.csv - supplemental information about the item categories.
# shops.csv - supplemental information about the shops.
#
# Column descriptions
# ID - an Id that represents a (Shop, Item) tuple within the test set
# shop_id - unique identifier of a shop
# item_id - unique identifier of a product
# item_category_id - unique identifier of item category
# item_cnt_day - number of products sold. You are predicting a monthly amount of this measure
# item_price - current price of an item
# date - date in format dd.mm.yyyy (e.g. 02.01.2013)
# date_block_num - a consecutive month number, used for convenience. January 2013 is 0, February 2013 is 1,..., October 2015 is 33
# item_name - name of item
# shop_name - name of shop
# item_category_name - name of item category
# Preview the first rows of the daily sales table.
sales.head()
# Output recorded from the notebook run:
#          date  date_block_num  shop_id  item_id  item_price  item_cnt_day
# 0  02.01.2013               0       59    22154      999.00           1.0
# 1  03.01.2013               0       25     2552      899.00           1.0
# 2  05.01.2013               0       25     2552      899.00          -1.0
# 3  06.01.2013               0       25     2554     1709.05           1.0
# 4  15.01.2013               0       25     2555     1099.00           1.0
# Preview the first rows of the item-category table.
item_cat.head()
# Output recorded from the notebook run:
#         item_category_name  item_category_id
# 0  PC - Гарнитуры/Наушники                 0
# 1         Аксессуары - PS2                 1
# 2         Аксессуары - PS3                 2
# 3         Аксессуары - PS4                 3
# 4         Аксессуары - PSP                 4
# Preview the first rows of the items table.
item.head()
# Output recorded from the notebook run:
#                                            item_name  item_id  item_category_id
# 0                  ! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D        0                40
# 1  !ABBYY FineReader 12 Professional Edition Full...        1                76
# 2                           ***В ЛУЧАХ СЛАВЫ (UNV) D        2                40
# 3                          ***ГОЛУБАЯ ВОЛНА (Univ) D        3                40
# 4                              ***КОРОБКА (СТЕКЛО) D        4                40
# Preview the first three rows of the shops table.
shops.head(3)
# Output recorded from the notebook run:
#                        shop_name  shop_id
# 0  !Якутск Орджоникидзе, 56 фран        0
# 1  !Якутск ТЦ "Центральный" фран        1
# 2               Адыгея ТЦ "Мега"        2
# Preview the first three rows of the test set.
test.head(3)
# Output recorded from the notebook run:
#    ID  shop_id  item_id
# 0   0        5     5037
# 1   1        5     5320
# 2   2        5     5233
# Format the date column correctly: parse the dd.mm.yyyy strings into datetimes.
# pd.to_datetime with an explicit format converts the whole column in one call
# instead of invoking strptime once per row through apply.
sales["date"] = pd.to_datetime(sales["date"], format="%d.%m.%Y")

# check
sales.info()  # info() prints its own report and returns None, so no print() around it
print(sales.head(2))
# Aggregate the required metrics to monthly level.
#
# Breaking down the statement below:
#   - group by date block (month), shop_id and item_id
#   - select the columns date, item_price and item_cnt_day (sales)
#   - pass a dictionary saying which aggregation to perform on which column:
#       * min and max of the date
#       * mean of item_price
#       * sum of item_cnt_day
#
# The columns are selected with a list (double brackets). Selecting several
# groupby columns with a bare tuple of names is deprecated in pandas and is
# rejected by current releases.
monthly_sales = sales.groupby(["date_block_num", "shop_id", "item_id"])[
    ["date", "item_price", "item_cnt_day"]
].agg({"date": ["min", "max"], "item_price": "mean", "item_cnt_day": "sum"})
# Preview the first eight rows of the monthly aggregate.
monthly_sales.head(8)
# Output recorded from the notebook run (rows are indexed by
# date_block_num / shop_id / item_id; columns are two-level):
#                                       date             item_price  item_cnt_day
#                                        min         max       mean           sum
# date_block_num  shop_id  item_id
# 0               0        32     2013-01-03  2013-01-31      221.0           6.0
#                          33     2013-01-03  2013-01-28      347.0           3.0
#                          35     2013-01-31  2013-01-31      247.0           1.0
#                          43     2013-01-31  2013-01-31      221.0           1.0
#                          51     2013-01-13  2013-01-31      128.5           2.0
#                          61     2013-01-10  2013-01-10      195.0           1.0
#                          75     2013-01-17  2013-01-17       76.0           1.0
#                          88     2013-01-16  2013-01-16       76.0           1.0
# Number of items per category: count the rows in each item_category_id group,
# order the categories from most to fewest items, and keep the top ten with
# item_category_id restored as an ordinary column.
x = (
    item.groupby(['item_category_id'])
    .count()
    .sort_values(by='item_id', ascending=False)
    .head(10)
    .reset_index()
)
# Show the ten categories with the most items.
x
# Output recorded from the notebook run:
#    item_category_id  item_name  item_id
# 0                40       5035     5035
# 1                55       2365     2365
# 2                37       1780     1780
# 3                31       1125     1125
# 4                58        790      790
# 5                30        756      756
# 6                72        666      666
# 7                19        628      628
# 8                61        598      598
# 9                23        501      501
# Plot the number of items in each of the ten largest categories.
plt.figure(figsize=(8, 4))
# x and y are passed by keyword: newer seaborn releases no longer accept the
# data vectors as positional arguments.
ax = sns.barplot(x=x.item_category_id, y=x.item_id, alpha=0.8)
plt.title("Items per Category")
plt.ylabel('# of items', fontsize=12)
plt.xlabel('Category', fontsize=12)
plt.show()
# Notes (translated from the original author's comments):
# - x is limited to the top ten categories because ten bars suit a bar chart;
#   the full set of 84 categories would be too many to plot.
# - The author found seaborn's barplot noticeably better looking than
#   matplotlib's bar chart.
# - The bars come out ordered by ascending item_category_id rather than by
#   count; the author guessed this is seaborn's default ordering.
# - Reading the chart: category 40 holds by far the most items (about 5000),
#   followed by category 55 (about 2400) and category 37 (about 1800). These are
#   counts of distinct items per category, not sales volumes.
# The original kernel's goal is to predict next month's sales for every item in
# every shop, which is a time-series forecasting problem. As a warm-up, start
# with a simpler series: the total sold across all items and all shops per month.
ts = sales.groupby(["date_block_num"])["item_cnt_day"].sum()  # monthly total of item_cnt_day
# astype returns a new Series, so the result has to be assigned back for the
# float conversion to take effect.
ts = ts.astype('float')
plt.figure(figsize=(16, 8))
plt.title('Total Sales of the company')
plt.xlabel('Time')
plt.ylabel('Sales')
plt.plot(ts)  # line plot of the monthly totals
# Render explicitly. A trailing ";" only suppresses the notebook's echo of the
# return value and does not display anything when run as a script.
plt.show()
# Observation (translated from the original author's comment): the series shows
# two peaks, around the 12th and the 24th month, i.e. one year apart, which
# suggests a yearly seasonal pattern.