《R语言编程艺术》书上代码实现---第六章因子和表

本文链接：https://blog.youkuaiyun.com/weixin_45652435/article/details/127176760

第六章因子和表

6.1因子与水平

#因子设计思想来源于统计学上的分类变量
x=c(5,12,13,12)
xf=factor(x)
xf#5 12 13就是水平
str(xf)
unclass(xf)#xf内核是（1,2,3,2）数据重新编码为水平了
length(xf)
#插入新的水平
xff=factor(x,levels = c(5,12,13,88))
xff
xff[2]=88
xff
xff[2]=28#不可以添加一个”非法“的水平

6.2因子的常用函数

#6.2.1tapply()函数
#将相邻分割成组，针对每组使用指定函数
#tapply(x,f,g) 暂时将x分组，每组对应一个因子水平，得到x的子向量，然后这些子向量应用函数g()
ages=c(25,26,55,37,21,42)
affils=c("R","D","D","R","U","D")
tapply(ages,affils,mean)

d=data.frame(list(gender=c("M","M","F","M","F","F"),age=c(47,59,21,32,33,24),income=c(55000,88000,32450,76500,123000,45650)))
d
d$over25=ifelse(d$age>25,1,0) #转换成分类变量
head(d)
tapply(d$income,list(d$gender,d$over25),mean)
#f为一对因子 性别，年龄是否大于25，tapply会分四组算均值


#6.2.2split()函数
#将向量分割为组
#split(x,f) f是因子或因子的列表 x为向量或数据框
d
split(d$income,list(d$gender,d$over25)) #收入按照性别和年龄是否超过25分割

g=c("M","F","F","I","M","M","F")
split(1:7,g)

findwords <- function(tf) {
   # read in the words from the file, into a vector of mode character
   txt <- scan(tf,"") 
   words <- split(1:length(txt),txt)
   return(words)
}

#6.2.3by()函数
#tapply输入只能是向量
aba <- read.csv("abalone.data",header=T,as.is=T)
#数据框重命名
names(aba)=c(	"sex",	"Length",	"Diam",	"Height",	"Whole",	"Shucked",	"Viscera",	"Shell",	"Rings" )
head(aba)
#对不同性别的编码组分别做直径对长度的回归分析
by(aba,aba$sex,function(m) lm(m[,2]~m[,3]))

6.3表的操作

u=c(22,8,33,6,8,29,-2)
f1=list(c(5,12,13,12,13,5,13),c("a","bc","a","a","bc","a","a"))
tapply(u,f1,length) #列联表
table(f1)
#没找到ct.dat数据

table(c(5,12,13,12,8,5))#创建一维频数表

#三维表
v=data.frame(gender=c("M","M","F"),race=c("W","W","W"),pol=c("L","L","C"))
vt=table(v)
vt
#6.3.1表中有关矩阵和类似数组的操作
class(vt)
table(f1)[1,]#返回第一行
vt/5#table可以除
apply(table(f1),1,sum)#计算边际值

#计算边际值的函数
addmargins(table(f1))
dimnames(table(f1)) #获取维度的名称和水平值

#6.3.2扩展案例：提取子表
#想去除行中选not sure的
subtable <- function(tbl,subnames) {
   # get array of cell counts in tbl
   tblarray <- unclass(tbl)
   # we'll get the subarray of cell counts corresponding to subnames by
   # calling do.call() on the "[" function; we need to build up a list
   # of arguments first
   dcargs <- list(tblarray)
   ndims <- length(subnames)  # number of dimensions 
   for (i in 1:ndims) {
      dcargs[[i+1]] <- subnames[[i]]
   }
   subarray <- do.call("[",dcargs)
   # now we'll build the new table, consisting of the subarray, the
   # numbers of levels in each dimension, and the dimnames() value, plus
   # the "table" class attribute
   dims <- lapply(subnames,length)
   subtbl <- array(subarray,dims,dimnames=subnames)
   class(subtbl) <- "table"
   return(subtbl)
}

#do.call(f,argslist)函数
#list(tblarry,fo.x=c("no","yes"),fo.x.l=c("no","yes"))

#6.3.3扩展案例：在表中寻找频数最大的单元格
# finds the cells in table tbl with the k highest frequencies; handling
# of ties is unrefined
tabdom <- function(tbl,k) {
   # create a data frame representation of tbl, adding a Freq column
   tbldf <- as.data.frame(tbl)
   # determine the proper positions of the frequencies in a sorted order
   freqord <- order(tbldf$Freq,decreasing=TRUE)
   # rearrange the data frame in that order, and take the first k rows
   dom <- tbldf[freqord,][1:k,]
   return(dom)
}


d=c(5,12,13,4,3,28,12,12,9,5,5,13,5,4,12)
dtab=table(d)
#显示出频数前三
tabdom(dtab,3)

6.4其他与因子和表有关的函数

#6.4.1aggregate()函数
#对分组中的每一个变量调用tapply函数
aggregate(aba[,-1],list(aba$sex),median)

#6.4.2cut()函数
y=cut(x,b,labels=F)#调用形式
z=c(0.88811482,0.7298749797,0.479173,0.9827397)
seq(from=0.0,to=1.0,by=0.1)
binmarks=seq(from=0.0,to=1.0,by=0.1)
cut(z,binmarks,labels = F) #z中第一个数据落入第九个区间