#6.2.1tapply()函数
#将相邻分割成组,针对每组使用指定函数
#tapply(x,f,g) 暂时将x分组,每组对应一个因子水平,得到x的子向量,然后这些子向量应用函数g()
ages=c(25,26,55,37,21,42)
affils=c("R","D","D","R","U","D")
tapply(ages,affils,mean)
d=data.frame(list(gender=c("M","M","F","M","F","F"),age=c(47,59,21,32,33,24),income=c(55000,88000,32450,76500,123000,45650)))
d
d$over25=ifelse(d$age>25,1,0) #转换成分类变量
head(d)
tapply(d$income,list(d$gender,d$over25),mean)
#f为一对因子 性别,年龄是否大于25,tapply会分四组算均值
#6.2.2split()函数
#将向量分割为组
#split(x,f) f是因子或因子的列表 x为向量或数据框
d
split(d$income,list(d$gender,d$over25)) #收入按照性别和年龄是否超过25分割
g=c("M","F","F","I","M","M","F")
split(1:7,g)
findwords <- function(tf) {
# read in the words from the file, into a vector of mode character
txt <- scan(tf,"")
words <- split(1:length(txt),txt)
return(words)
}
#6.2.3by()函数
#tapply输入只能是向量
aba <- read.csv("abalone.data",header=T,as.is=T)
#数据框重命名
names(aba)=c( "sex", "Length", "Diam", "Height", "Whole", "Shucked", "Viscera", "Shell", "Rings" )
head(aba)
#对不同性别的编码组分别做直径对长度的回归分析
by(aba,aba$sex,function(m) lm(m[,2]~m[,3]))
6.3表的操作
u=c(22,8,33,6,8,29,-2)
f1=list(c(5,12,13,12,13,5,13),c("a","bc","a","a","bc","a","a"))
tapply(u,f1,length) #列联表
table(f1)
#没找到ct.dat数据
table(c(5,12,13,12,8,5))#创建一维频数表
#三维表
v=data.frame(gender=c("M","M","F"),race=c("W","W","W"),pol=c("L","L","C"))
vt=table(v)
vt
#6.3.1表中有关矩阵和类似数组的操作
class(vt)
table(f1)[1,]#返回第一行
vt/5#table可以除
apply(table(f1),1,sum)#计算边际值
#计算边际值的函数
addmargins(table(f1))
dimnames(table(f1)) #获取维度的名称和水平值
#6.3.2扩展案例:提取子表
#想去除行中选not sure的
subtable <- function(tbl,subnames) {
# get array of cell counts in tbl
tblarray <- unclass(tbl)
# we'll get the subarray of cell counts corresponding to subnames by
# calling do.call() on the "[" function; we need to build up a list
# of arguments first
dcargs <- list(tblarray)
ndims <- length(subnames) # number of dimensions
for (i in 1:ndims) {
dcargs[[i+1]] <- subnames[[i]]
}
subarray <- do.call("[",dcargs)
# now we'll build the new table, consisting of the subarray, the
# numbers of levels in each dimension, and the dimnames() value, plus
# the "table" class attribute
dims <- lapply(subnames,length)
subtbl <- array(subarray,dims,dimnames=subnames)
class(subtbl) <- "table"
return(subtbl)
}
#do.call(f,argslist)函数
#list(tblarry,fo.x=c("no","yes"),fo.x.l=c("no","yes"))
#6.3.3扩展案例:在表中寻找频数最大的单元格
# finds the cells in table tbl with the k highest frequencies; handling
# of ties is unrefined
tabdom <- function(tbl,k) {
# create a data frame representation of tbl, adding a Freq column
tbldf <- as.data.frame(tbl)
# determine the proper positions of the frequencies in a sorted order
freqord <- order(tbldf$Freq,decreasing=TRUE)
# rearrange the data frame in that order, and take the first k rows
dom <- tbldf[freqord,][1:k,]
return(dom)
}
d=c(5,12,13,4,3,28,12,12,9,5,5,13,5,4,12)
dtab=table(d)
#显示出频数前三
tabdom(dtab,3)