#Dodger促销和上座率的预测模型
library(car) # 线性回归的包
library(lattice) # 绘图软件包
# 读入数据,并建立数据框导入
dodgers <- read.csv("/Users/lily/Documents/model_predict/Chapter_2/dodgers.csv")
print(str(dodgers)) # 查看数据结构
'data.frame': 81 obs. of 12 variables:$ month : Factor w/ 7 levels "APR","AUG","JUL",..: 1 1 1 1 1 1 1 1 1 1 ...
$ day : int 10 11 12 13 14 15 23 24 25 27 ...
$ attend : int 56000 29729 28328 31601 46549 38359 26376 44014 26345 44807 ...
$ day_of_week: Factor w/ 7 levels "Friday","Monday",..: 6 7 5 1 3 4 2 6 7 1 ...
$ opponent : Factor w/ 17 levels "Angels","Astros",..: 13 13 13 11 11 11 3 3 3 10 ...
$ temp : int 67 58 57 54 57 65 60 63 64 66 ...
$ skies : Factor w/ 2 levels "Clear ","Cloudy": 1 2 2 2 2 1 2 2 2 1 ...
$ day_night : Factor w/ 2 levels "Day","Night": 1 2 2 2 2 1 2 2 2 2 ...
$ cap : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
$ shirt : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
$ fireworks : Factor w/ 2 levels "NO","YES": 1 1 1 2 1 1 1 1 1 2 ...
$ bobblehead : Factor w/ 2 levels "NO","YES": 1 1 1 1 1 1 1 1 1 1 ...
NULL
# 定义一周七天的次序变量,以便绘图和数据小结时用
dodgers$ordered_day_of_week <- with(data = dodgers,
ifelse((day_of_week == 'Monday'), 1,
ifelse((day_of_week == 'Tuesday'), 2,
ifelse((day_of_week == 'Wednesday'), 3,
ifelse((day_of_week == 'Thursday'), 4,
ifelse((day_of_week == 'Friday'), 5,
ifelse((day_of_week == 'Saturday'), 6, 7)))))))
dodgers$ordered_day_of_week <- factor(dodgers$ordered_day_of_week, levels = 1:7,
labels = c("Mon", "Tue", "Wed", "Thur", "Fri", "Sat", "Sun"))
#以标准绘图观测数据分析,一周七天的观众人数
par(family='STKaiti')
with(data = dodgers, plot(ordered_day_of_week, attend/1000,xlab = "一周内的每一天",
ylab = "出席率(单位:1000)", col = "violet", las = 1))
#当dodger采用摇头娃娃促销时
with(dodgers, table(bobblehead, ordered_day_of_week)) # 星期二的摇头娃娃促销
ordered_day_of_weekbobblehead Mon Tue Wed Thur Fri Sat Sun
NO 12 7 12 3 13 11 12
YES 0 6 0 2 0 2 1
#定义月次序变量,绘制数据小结时用
dodgers$ordered_month <- with(data = dodgers,
ifelse((month == 'APR'), 4,
ifelse((month == 'MAY'), 5,
ifelse((month == 'JUN'), 6,
ifelse((month == 'JUL'), 7,
ifelse((month == 'AUG'), 8,
ifelse((month == 'SEP'), 9, 10)))))))
dodgers$ordered_month <- factor(dodgers$ordered_month, levels = 4:10,
labels = c("April", "MAY", "June", "July", "Aug", "Sept", "Oct"))
#每个月的观众人数
par(family = 'STKaiti')
with(dodgers, plot(ordered_month, attend/1000, xlab = "月份", ylab = "出席率(单位:1000人)",
col = "light blue", las = 1))
#使用更多的变量来观测数据分析,观察观众人数与日场/夜场
#天空是否晴朗、气温,或者是否燃放烟火的关系
group.labels <- c("No Fireworks", "Fireworks")
group.symbols <- c(21, 24)
group.colors <- c("black", "black")
group.fill <- c("black", "red")
xyplot(attend/1000 ~ temp | skies + day_night,
data = dodgers,
groups = fireworks,
pch = group.symbols,
aspect = 0.5, cex = 1.5, col = group.colors, fill = group.fill,
layout = c(2, 2), type = c("p", "g"),
strip = strip.custom(strip.levels = TRUE, strip.names = FALSE, style = 1),
xlab = "Temperature (Degrees Fahrenheit)", ylab = "Attendance(thousands)",
key = list(space = "top",
text = list(rev(group.labels), col = rev(group.colors)),
points = list(pch = rev(group.symbols), col = rev(group.colors),
fill = rev(group.fill))))
# 观众人数与比赛对手,日场、夜场的关系
group.labels <- c("Day", "Night")
group.symbols <- c(1, 20)
group.symbols.size <- c(2, 2.75)
bwplot(opponent ~ attend/1000, data = dodgers, groups = day_night,
xlab = "Attend(thousands)", aspect = 2,
panel = function(x, y, groups, subscripts,...)
{
panel.grid(h = (length(levels(dodgers$opponent)) -1), v = -1)
panel.stripplot(x, y, groups = groups, subscripts = subscripts, cex = group.symbols.size,
pch = group.symbols, col = "darkblue")
},
key = list(space = "top",
text = list(group.labels, col = "black"),
points = list(pch = group.symbols, cex = group.symbols.size, col = "darkblue"))
)
#采用训练并测试的方案
set.seed(1234)
training_test <- c(rep(1, length = trunc((2/3) * nrow(dodgers))),rep(2, length = (nrow(dodgers) - trunc((2/3) * nrow(dodgers)))))
dodgers$training_test <- sample(training_test) # 随机排列
dodgers$training_test <- factor(dodgers$training_test, levels = c(1, 2), labels = c("TRAIN", "TEST"))
dodgers.train <- subset(dodgers, training_test == 'TRAIN')
print(str(dodgers.train)) # 查看数据框属性
dodgers.test <- subset(dodgers, training_test == 'TEST')
print(str(dodgers.test))