本文辑录了《R语言实战——机器学习与数据分析》(电子工业出版社2016年出版)一书第4章至第5章之代码。本书引言请见如下链接:
http://blog.youkuaiyun.com/baimafujinji/article/details/51596171
内容简介:本书系统地介绍了统计分析和机器学习领域中最为重要和流行的多种技术及它们的基本原理,在详解有关算法的基础上,结合大量R语言实例演示了这些理论在实践中的使用方法。具体内容被分成三个部分,即R语言编程基础、基于统计的数据分析方法以及机器学习理论。统计分析与机器学习部分又具体介绍了包括参数估计、假设检验、极大似然估计、非参数检验方法(包括列联分析、符号检验、符号秩检验等)、方差分析、线性回归(包括岭回归和Lasso方法)、逻辑回归、支持向量机、聚类分析(包括K均值算法和EM算法)和人工神经网络等内容。同时,统计理论的介绍也为深化读者对于后续机器学习部分的理解提供了很大助益。知识结构和阅读进度的安排上既兼顾了循序渐进的学习规律,亦统筹考虑了夯实基础的必要性。
网上书店地址:
电子工业出版社官网
中国互动出版网China-pub
京东商城(1)
京东商城(2)
Chapter 4
P49~52
# Probability mass functions of four discrete distributions, drawn as step
# plots over 0..10.
# n = 11 makes curve() evaluate the expression only at the integers 0, 1, ...,
# 10. With the default of 101 points most x values are non-integer, where these
# density functions warn and return 0, turning every step into a spike.
curve(dbinom(x, prob = 0.5, size = 10), from = 0, to = 10, n = 11,
      type = "s", main = "Binomial")
curve(dnbinom(x, size = 10, prob = 0.75), from = 0, to = 10, n = 11,
      type = "s", main = "Negative Binomial")
curve(dgeom(x, prob = 0.2), from = 0, to = 10, n = 11,
      type = "s", main = "Geometric")
curve(dpois(x, lambda = 3), from = 0, to = 10, n = 11,
      type = "s", main = "Poisson")
P53
# Exponential densities for three rate parameters on a single set of axes.
# The first curve() call creates the plot; the next two use add = TRUE to draw
# onto it instead of starting a new figure.
curve(dexp(x, rate = 1/2), from = 0, to = 5, ylim = c(0, 1.5),
      main = "Exponential", col = "red")
curve(dexp(x, rate = 1), from = 0, to = 5,
      add = TRUE, col = "blue")
curve(dexp(x, rate = 2), from = 0, to = 5,
      add = TRUE, col = "green")
# Legend entries are listed in the same order as the curves were drawn.
text.legend <- c("lambda = 0.5", "lambda = 1", "lambda = 2")
legend("topright", legend = text.legend, lty = c(1, 1, 1),
       col = c("red", "blue", "green"))
P57~58
# Ten standard-normal random variates (auto-printed at top level).
rnorm(n = 10)
# A simulated sample of 1000 standard-normal draws.
normal.pop <- rnorm(n = 1000)
# Split the device into one row of two panels: the kernel density of the
# simulated sample on the left, the exact normal density on the right.
par(mfrow = c(1, 2))
plot(density(normal.pop), xlim = c(-4, 4), main = "标准正态分布(模拟)")
curve(dnorm(x), from = -4, to = 4, main = "标准正态分布(标准)")
# Binomial(size = 10, prob = 0.5) on its support 0..10: the mass function as
# vertical lines, the distribution function as a step plot, then the quantile
# function applied to the distribution-function values.
x1 <- 0:10
pmf <- dbinom(x1, size = 10, prob = 0.5)
plot(pmf ~ x1, type = "h")
cdf <- pbinom(x1, size = 10, prob = 0.5)
plot(cdf ~ x1, type = "s")
inverse_cdf <- qbinom(cdf, size = 10, prob = 0.5)
P66~67
# Five values drawn without replacement from 1..52.
sample(1:52, size = 5)
# Three draws with replacement from the labels "H" and "T".
sample(c("H", "T"), size = 3, replace = TRUE)
# Ten draws with replacement from 1..6.
sample(1:6, size = 10, replace = TRUE)
# Ten draws with replacement from {1, 0}, where 1 has probability 0.8.
sample(c(1, 0), size = 10, replace = TRUE, prob = c(0.8, 0.2))
Chapter 5
P72
# Data for the pie charts: one GDP figure per country, in matching order.
countries <- c("Brazil", "Russia", "India", "China", "South Africa")
GDP <- c(23920, 20790, 18618, 94906, 3660)
# Default pie chart.
pie(GDP, labels = countries, main = "GDP of BRICS countries (2013)")
# Same chart with the slices laid out clockwise.
pie(GDP, labels = countries, clockwise = TRUE,
    main = "GDP of BRICS countries (2013)")
# Explicit colour for each slice.
pie(GDP, labels = countries,
    col = c("purple", "violetred1", "green3", "cornsilk", "cyan"),
    main = "GDP of BRICS countries (2013)")
# Five grey levels from 0.4 to 1.0 (length.out spelled out instead of relying
# on partial matching of `length`).
pie(GDP, labels = countries, col = gray(seq(0.4, 1.0, length.out = 5)),
    main = "GDP of BRICS countries (2013)")
P73
# Share of each country in the total, as a percentage rounded to two decimals.
# Uses GDP and countries as defined for the earlier pie charts.
percentage <- round(GDP / sum(GDP) * 100, 2)
# Slice labels of the form "<country> <percentage>%".
index <- paste0(countries, " ", percentage, "%")
pie(GDP, labels = index, col = rainbow(length(index)),
    main = "Pie Chart with Percentages")
# pie3D() and fan.plot() come from the plotrix package.
library(plotrix)
pie3D(GDP, labels = countries, explode = 0.1,
      main = "3D Pie Chart")
fan.plot(GDP, labels = countries, main = "Fan Plot")
P74~76
# Histograms of mtcars$mpg.
# attach() puts the mtcars columns on the search path; this and later snippets
# refer to mpg, cyl and am by bare name, so the call is kept.
attach(mtcars)
hist(mpg)
# String literals below use plain ASCII double quotes; typographic quotes are
# not valid string delimiters in R.
hist(mpg, breaks = 12, col = "lightblue", border = "pink",
     xlab = "Miles/Gallon", main = "Colored Histogram Example.1")
hist(mpg, breaks = 12, col = "blue1", xlim = c(10, 35),
     xlab = "Miles/Gallon", main = "Colored Histogram Example.2")
hist(mpg, breaks = 15, xlim = c(10, 35),
     xlab = "Miles/Gallon", main = "Histogram Example (breaks = 15)")
hist(mpg, breaks = 10, xlim = c(10, 35),
     xlab = "Miles/Gallon", main = "Histogram Example (breaks = 10)")
# Break points pretty() proposes for the same range with 12, 10 and 15
# requested intervals.
pretty(min(mpg):max(mpg), 12)
pretty(min(mpg):max(mpg), 10)
pretty(min(mpg):max(mpg), 15)
# Explicit, unequally spaced breaks: 10, 12, ..., 18, then 20, 25, 30, 35.
hist(mpg, breaks = c(2*5:9, 5*4:7), col = "blue1",
     ylim = c(0, 0.12), xlab = "Miles/Gallon",
     main = "Example with Non-equidistant Breaks")
P77~78
# Histogram on the density scale (freq = FALSE) rather than counts.
hist(mpg, breaks = 12, col = "blue1", ylim = c(0, 0.12),
     xlim = c(10, 35), freq = FALSE, xlab = "Miles/Gallon",
     main = "Histogram Example of Density ")
# Same histogram with a kernel density estimate drawn on top.
hist(mpg, breaks = 12, col = "blue1", ylim = c(0, 0.12),
     xlim = c(10, 35), freq = FALSE, xlab = "Miles/Gallon",
     main = "Histogram Example with Density Curve")
lines(density(mpg), col = "red", lwd = 2)
# Count-scale histogram with a fitted normal curve. The histogram object is
# kept so that its bin mid-points can be used to rescale the curve.
h <- hist(mpg, breaks = 12, col = "blue", xlim = c(10, 35),
          xlab = "Miles/Gallon", main = "Histogram Example with Normal Curve")
xfit <- seq(min(mpg), max(mpg), length.out = length(mpg))
yfit <- dnorm(xfit, mean = mean(mpg), sd = sd(mpg))
# Convert density to expected counts: density * bin width * sample size, with
# the bin width taken as the gap between the first two bin mid-points.
yfit <- yfit * diff(h$mids[1:2]) * length(mpg)
lines(xfit, yfit, col = "red", lwd = 2)
box()
# Kernel density estimate plotted on its own, then filled, with a slightly
# jittered rug of the observations along the axis.
d <- density(mpg)
plot(d)
plot(d, main = "Density of Miles/Gallon")
polygon(d, col = "wheat", border = "blue")
rug(jitter(mpg, amount = 0.01), col = "brown")
P79~81
# Rug of the raw (un-jittered) mpg values on the current plot.
rug(mpg, col = "brown")
# Kernel density of mpg for each cylinder count, overlaid in one figure.
# Every plot() call is a complete high-level plot; par(new = TRUE) makes the
# following one draw over the existing figure, and the identical xlim/ylim keep
# the three sets of axes aligned. Only the last call supplies the axis label
# and title, so they are drawn once.
plot(density(mtcars[mtcars$cyl == 4, ]$mpg), col = "red", lty = 1,
     xlim = c(5, 40), ylim = c(0, 0.25), xlab = "", main = "")
par(new = TRUE)
plot(density(mtcars[mtcars$cyl == 6, ]$mpg), col = "blue", lty = 2,
     xlim = c(5, 40), ylim = c(0, 0.25), xlab = "", main = "")
par(new = TRUE)
plot(density(mtcars[mtcars$cyl == 8, ]$mpg), col = "green", lty = 3,
     xlim = c(5, 40), ylim = c(0, 0.25),
     xlab = "Miles/Gallon", main = "MPG Distribution by Cylinders")
text.legend <- c("cyl=4", "cyl=6", "cyl=8")
legend("topright", legend = text.legend, lty = c(1, 2, 3),
       col = c("red", "blue", "green"))
# Normal density with each cylinder group's sample mean and standard deviation
# of mpg, overlaid in one figure. par(new = TRUE) makes the next curve() draw
# over the existing figure; the shared from/to/ylim keep the axes aligned, and
# only the last call supplies axis labels and the title.
curve(dnorm(x, mean(mtcars[mtcars$cyl == 4, ]$mpg),
            sd(mtcars[mtcars$cyl == 4, ]$mpg)), from = 5, to = 40,
      ylim = c(0, 0.28), col = "red", lty = 1, xlab = "", ylab = "", main = "")
par(new = TRUE)
curve(dnorm(x, mean(mtcars[mtcars$cyl == 6, ]$mpg),
            sd(mtcars[mtcars$cyl == 6, ]$mpg)), from = 5, to = 40,
      ylim = c(0, 0.28), col = "blue", lty = 2, xlab = "", ylab = "", main = "")
par(new = TRUE)
curve(dnorm(x, mean(mtcars[mtcars$cyl == 8, ]$mpg),
            sd(mtcars[mtcars$cyl == 8, ]$mpg)), from = 5, to = 40,
      ylim = c(0, 0.28), col = "green", lty = 3, xlab = "Miles/Gallon",
      ylab = "Density", main = "MPG Distribution by Cylinders")
text.legend <- c("cyl=4", "cyl=6", "cyl=8")
legend("topright", legend = text.legend, lty = c(1, 2, 3),
       col = c("red", "blue", "green"))
# Box plot of mpg (found through attach(mtcars)).
boxplot(mpg, main = "Box plot", ylab = "Miles per Gallon")
# Signature of boxplot.stats(), kept for reference only. It is not runnable as
# a call here because no data vector `x` is defined in the preceding code:
#   boxplot.stats(x, coef = 1.5, do.conf = TRUE, do.out = TRUE)
P82~83
# Box-plot statistics and five-number summary of mpg.
boxplot.stats(mpg)
fivenum(mpg)
# NOTE(review): `x` and `y` are not assigned anywhere in the preceding code;
# presumably they are example vectors defined in the accompanying text --
# confirm and define them before running these four lines.
x
fivenum(x)
y
fivenum(y)
# Five-number summary of a numeric vector: minimum, lower hinge, median, upper
# hinge and maximum.
#
# x: numeric vector. sort() drops missing values, so they are ignored.
#
# Returns a numeric vector of length 5. If no values remain after sorting, the
# result is five NAs.
my.fivenum <- function(x) {
  x <- sort(x)
  n <- length(x)
  # With no data every position below would be out of range and the two index
  # vectors would have different lengths, so answer explicitly.
  if (n == 0) {
    return(rep(NA_real_, 5))
  }
  # Depth of the hinges; it may end in .5, meaning "halfway between two
  # neighbouring order statistics".
  n4 <- floor((n + 3) / 2) / 2
  # Possibly fractional positions of the five statistics in the sorted data.
  d <- c(1, n4, (n + 1) / 2, n + 1 - n4, n)
  # Averaging the elements at floor(d) and ceiling(d) gives the element itself
  # for whole positions and the midpoint of the two neighbours otherwise.
  0.5 * (x[floor(d)] + x[ceiling(d)])
}
# Built-in summary of mpg (found through attach(mtcars)), for comparison with
# the five-number summaries above.
summary(mpg)
P84
# Built-in sample quantiles of mpg at the default probabilities.
quantile(mpg)
# Sample quartiles (probabilities 0, 0.25, 0.5, 0.75, 1) of a numeric vector,
# obtained by linear interpolation between neighbouring order statistics.
#
# x: numeric vector without missing values.
#
# Returns an unnamed numeric vector of length 5; five NAs for empty input.
# Stops with an error if x contains missing values.
my.quantile <- function(x) {
  # n is taken before sorting and sort() drops missing values, so with NAs the
  # positions computed below would refer to the wrong elements. Refuse instead.
  if (anyNA(x)) {
    stop("missing values and NaN's not allowed", call. = FALSE)
  }
  n <- length(x)
  if (n == 0) {
    return(rep(NA_real_, 5))
  }
  probs <- seq(0, 1, 0.25)
  # Possibly fractional position of each quantile in the sorted data.
  index <- 1 + (n - 1) * probs

  lo <- floor(index)
  hi <- ceiling(index)

  # A partial sort is enough: only the elements at positions lo and hi need to
  # be in their final sorted place.
  x <- sort(x, partial = unique(c(lo, hi)))
  qs <- x[lo]
  # Interpolate only where the position falls strictly between two elements.
  i <- which(index > lo)
  h <- (index - lo)[i]
  qs[i] <- (1 - h) * qs[i] + h * x[hi[i]]
  qs
}
# One box of mpg per cylinder count, using the formula interface on mtcars.
boxplot(mpg ~ cyl, data = mtcars, main = "Car Mileage Data",
        xlab = "Number of Cylinders", ylab = "Miles/Gallon")
P86
# Notched box plot by cylinder count. xaxt = "n" suppresses the default x axis
# so that custom tick labels can be added with axis().
boxplot(mpg ~ cyl, data = mtcars, notch = TRUE,
        main = "Car Mileage Data", ylab = "Miles/Gallon", xaxt = "n")
axis(side = 1, at = c(1, 2, 3),
     labels = c("4 cylinders", "6 cylinders", "8 cylinders"))
# Labelled factors built from the attached mtcars columns cyl and am.
cyl.f <- factor(cyl, levels = c(4, 6, 8),
                labels = c("4 cyls", "6 cyls", "8 cyls"))
am.f <- factor(am, levels = c(0, 1), labels = c("auto", "std"))
# One box per transmission-by-cylinder combination; varwidth = TRUE makes the
# box widths depend on the group sizes.
boxplot(mpg ~ am.f*cyl.f, data = mtcars, varwidth = TRUE,
        col = c("wheat", "orange"), xlab = "Types",
        main = "MPG Distribution by Multi-types")
P87~88
# A 1 x 5 matrix with one named column per country; barplot() draws one bar per
# column and uses the column names as bar labels.
my.data <- matrix(c(5.87, 7.94, 3.77, 7.41, 5.37), nrow = 1)
colnames(my.data) <- c("US", "Japan", "China", "Brazil", "India")
# Vertical bars; the value axis runs from 0 to the rounded maximum.
barplot(my.data, ylim = c(0, round(max(my.data))),
        main = "Barplot Example (Vertical)",
        xlab = "Countries", ylab = "GDP per Energy")
# Horizontal bars: the value axis is now x, so xlim and the labels swap roles.
barplot(my.data, xlim = c(0, round(max(my.data))),
        horiz = TRUE, main = "Barplot Example (Horizontal)",
        xlab = "GDP per Energy", ylab = "Countries")
GDP.Energy <- c(3.77, 6.87, 4.56)
# Widen the left margin (second element of mar) and set las = 1 so that axis
# labels are drawn horizontally; both settings stay in effect for later plots.
par(mar = c(4, 10, 3, 2))
par(las = 1)
barplot(GDP.Energy, horiz = TRUE, cex.names = 0.9,
        names.arg = c("China", "OECD Countries", "Middle Income Countries"))
# Title added separately so that its size, colour and font can be set.
title(main = list("GDP per Unit Energy Consumption",
                  cex = 1.2, col = "brown", font = 3))
# 2 x 3 matrix filled column by column: rows are countries, columns are
# industry sectors.
my.data <- matrix(c(38.1, 1.7, 27.8, 28.7, 34.1, 69.6), nrow = 2)
rownames(my.data) <- c("China", "Germany")
colnames(my.data) <- c("primary", "secondary", "tertiary")
my.data
P89~90
# Stacked bars: `beside` is left at its default (FALSE), so the rows of
# my.data are stacked within each column. The title says so; it previously
# read "Grouped Barplot", which describes the next plot, not this one.
barplot(my.data, main = "Stacked Barplot", xlab = "Industries",
        ylab = "Employment(%)", col = c("wheat", "orange"),
        legend = rownames(my.data), args.legend = list(x = "top"))
# Grouped bars: beside = TRUE draws the rows side by side within each column.
barplot(my.data, main = "Grouped Barplot",
        ylim = c(0, round(max(my.data))), xlab = "Industries",
        ylab = "Employment(%)", col = c("wheat", "orange"), beside = TRUE,
        legend = rownames(my.data), args.legend = list(x = "top"))
# spine() is provided by the vcd package.
library(vcd)
spine(my.data, main = "Employment in Three Industries")
P92~93
# Standard normal distribution function on a grid from -3 to 3.
x <- seq(-3, 3, by = 0.1)
cdf <- pnorm(x, mean = 0, sd = 1)
plot(cdf ~ x, type = "o")
# The distribution function (dashed) and the same points with the axes swapped
# (solid) in one figure: par(new = TRUE) makes the second plot() draw over the
# first, and both use the range -3..3 on the axis that carries x.
plot(cdf ~ x, ylim = c(-3, 3), type = "l", lty = 2, xlab = "", ylab = "")
par(new = TRUE)
plot(x ~ cdf, xlim = c(-3, 3), type = "l")
# The quantile function applied to cdf, next to x plotted against cdf.
plot(qnorm(cdf) ~ cdf, type = "l")
plot(x ~ cdf, type = "l")
P96~97
# Ten plotting positions 0.05, 0.15, ..., 0.95 and the matching standard normal
# quantiles.
q.dset <- seq(0.05, 0.95, by = 0.1)
q.dset
q.norm <- qnorm(q.dset)
round(q.norm, 2)
# Two panels side by side: a hand-made normal Q-Q plot and the one produced by
# qqnorm().
# NOTE(review): `dset` is not assigned anywhere in the preceding code;
# presumably it holds ten sorted observations defined in the accompanying
# text -- confirm and define it before running.
par(mfrow = c(1, 2))
plot(dset ~ q.norm, main = "Normal Q-Q Plot (Manually)", col = "red")
# qqline() only adds a line to the current panel, so no par(new = TRUE) is
# needed before it. Setting it would stay in effect until the next high-level
# plot and make qqnorm() draw over this panel instead of moving to the second.
qqline(dset)
qqnorm(dset, main = "Normal Q-Q Plot (By R)", col = "blue")
qqline(dset)
# A simulated sample of 1000 exponential(rate = 1) draws.
exponential.pop <- rexp(1000, rate = 1)
# Means of 1000 samples of size 15 drawn without replacement from it.
# vapply() with numeric(1) guarantees a plain numeric vector; the loop index
# itself is not used.
exp.means <- vapply(seq_len(1000), function(i) {
  mean(sample(exponential.pop, size = 15))
}, numeric(1))
# Normal Q-Q plot of the first 50 sample means.
my.data <- exp.means[1:50]
qqnorm(my.data)
qqline(my.data)
# Q-Q plots of an exponential sample against itself and against a fresh
# exponential sample of the same size, side by side.
exp.pop <- rexp(100, rate = 1)
par(mfrow = c(1, 2))
qqplot(exp.pop, exp.pop)
qqplot(exp.pop, rexp(100, rate = 1))