############################################################
自定义分层抽样函数:
############################################################
stratified <- function(df, group, size, select = NULL,
replace = FALSE, bothSets = FALSE) {
if (is.null(select)) {
df <- df
} else {
if (is.null(names(select))) stop("'select' must be a named list")
if (!all(names(select) %in% names(df)))
stop("Please verify your 'select' argument")
temp <- sapply(names(select),
function(x) df[[x]] %in% select[[x]])
df <- df[rowSums(temp) == length(select), ]
}
df.interaction <- interaction(df[group], drop = TRUE)
df.table <- table(df.interaction)
df.split <- split(df, df.interaction)
if (length(size) > 1) {
if (length(size) != length(df.split))
stop("Number of groups is ", length(df.split),
" but number of sizes supplied is ", length(size))
if (is.null(names(size))) {
n <- setNames(size, names(df.split))
message(sQuote("size"), " vector entered as:\n\nsize = structure(c(",
paste(n, collapse = ", "), "),\n.Names = c(",
paste(shQuote(names(n)), collapse = ", "), ")) \n\n")
} else {
ifelse(all(names(size) %in% names(df.split)),
n <- size[names(df.split)],
stop("Named vector supplied with names ",
paste(names(size), collapse = ", "),
"\n but the names for the group levels are ",
paste(names(df.split), collapse = ", ")))
}
} else if (size < 1) {
n <- round(df.table * size, digits = 0)
} else if (size >= 1) {
if (all(df.table >= size) || isTRUE(replace)) {
n <- setNames(rep(size, length.out = length(df.split)),
names(df.split))
} else {
message(
"Some groups\n---",
paste(names(df.table[df.table < size]), collapse = ", "),
"---\ncontain fewer observations",
" than desired number of samples.\n",
"All observations have been returned from those groups.")
n <- c(sapply(df.table[df.table >= size], function(x) x = size),
df.table[df.table < size])
}
}
temp <- lapply(
names(df.split),
function(x) df.split[[x]][sample(df.table[x],
n[x], replace = replace), ])
set1 <- do.call("rbind", temp)
if (isTRUE(bothSets)) {
set2 <- df[!rownames(df) %in% rownames(set1), ]
list(SET1 = set1, SET2 = set2)
} else {
set1
}
}
================================================
参数说明:
#######################################################
The arguments to stratified are:
df: The inputdata.framegroup: A character vector of the column or columns that make up the "strata".size: The desired sample size.- If
sizeis a value less than 1, a proportionate sample is taken from each stratum. - If
sizeis a single integer of 1 or more, that number of samples is taken from each stratum. - If
sizeis a vector of integers, the specified number of samples is taken for each stratum. It is recommended that you use a named vector. For example, if you have two strata, "A" and "B", and you wanted 5 samples from "A" and 10 from "B", you would entersize = c(A = 5, B = 10).
- If
select: This allows you to subset the groups in the sampling process. This is alist. For instance, if yourgroupvariable was "Group", and it contained three strata, "A", "B", and "C", but you only wanted to sample from "A" and "C", you can useselect = list(Group = c("A", "C")).replace: For sampling with replacement.
########################################################
例子说明:
########################################################
set.seed(1)
##创建数据框 dat1 <- data.frame(ID = 1:100, A = sample(c("AA", "BB", "CC", "DD", "EE"), 100,replace = TRUE), B = rnorm(100), C = abs(round(rnorm(100), digits = 1)), D = sample(c("CA", "NY", "TX"), 100, replace = TRUE), E = sample(c("M","F"), 100, replace = TRUE))
##
library(devtools)
source_gist("https://gist.github.com/mrdwab/6424112")
## [1] "https://raw.github.com/gist/6424112"
## SHA-1 hash of file is 0006d8548785ec8a5651c3dd599648cc88d153a4
# Generate a couple of sample data.frames to play with
set.seed(1)
dat1 <- data.frame(ID = 1:100, A = sample(c("AA", "BB", "CC", "DD", "EE"), 100,
replace = TRUE), B = rnorm(100), C = abs(round(rnorm(100), digits = 1)),
D = sample(c("CA", "NY", "TX"), 100, replace = TRUE), E = sample(c("M",
"F"), 100, replace = TRUE))
# What do the data look like in general?
summary(dat1)
## ID A B C D E
## Min. : 1.0 AA:13 Min. :-1.9144 Min. :0.000 CA:23 F:54
## 1st Qu.: 25.8 BB:25 1st Qu.:-0.6141 1st Qu.:0.300 NY:42 M:46
## Median : 50.5 CC:19 Median :-0.1176 Median :0.650 TX:35
## Mean : 50.5 DD:26 Mean :-0.0176 Mean :0.825
## 3rd Qu.: 75.2 EE:17 3rd Qu.: 0.5382 3rd Qu.:1.200
## Max. :100.0 Max. : 2.4016 Max. :2.900
# Let's take a 10% sample from all -A- groups in dat1
stratified(dat1, "A", 0.1)
## ID A B C D E
## 92 92 AA 1.1766 1.0 CA F
## 86 86 BB -1.5364 0.1 NY F
## 74 74 BB -0.1796 0.3 CA F
## 60 60 CC 1.6822 0.5 TX F
## 58 58 CC 0.9102 0.5 CA F
## 9 9 DD 0.5697 1.4 TX F
## 42 42 DD 1.2079 0.4 TX M
## 46 46 DD 0.5585 1.0 CA M
## 99 99 EE -1.2863 0.2 NY F
## 72 72 EE 1.3430 0.0 NY F
# Let's take a 10% sample from only 'AA' and 'BB' groups from -A- in dat1
stratified(dat1, "A", 0.1, select = list(A = c("AA", "BB")))
## ID A B C D E
## 69 69 AA 0.4942 0.6 CA M
## 54 54 BB 0.1580 0.3 TX M
## 5 5 BB 1.4330 1.5 NY F
# Let's take 5 samples from all -D- groups in dat1, specified by column
# number
stratified(dat1, group = 5, size = 5)
## ID A B C D E
## 73 73 BB -0.2146 0.6 CA F
## 45 45 CC 1.5868 1.2 CA F
## 87 87 DD -0.3010 0.6 CA M
## 13 13 DD 0.6897 1.1 CA F
## 24 24 AA -0.9341 0.1 CA M
## 40 40 CC 0.2671 0.9 NY F
## 83 83 BB 0.5315 0.6 NY M
## 1 1 BB 0.3981 0.5 NY F
## 56 56 AA 1.7673 2.5 NY M
## 88 88 AA -0.5283 1.2 NY M
## 66 66 BB -0.3928 1.5 TX F
## 22 22 BB -0.7099 0.1 TX M
## 79 79 DD -0.6817 0.2 TX M
## 27 27 AA -0.4433 0.8 TX M
## 51 51 CC -0.6204 0.4 TX F
# Let's take a sample from all -A- groups in dat1, where we specify the
# number wanted from each group
stratified(dat1, "A", size = c(3, 5, 4, 5, 2))
## 'size' vector entered as:
##
## size = structure(c(3, 5, 4, 5, 2), .Names = c("AA", "BB", "CC", "DD",
## "EE"))
## ID A B C D E
## 38 38 AA -0.3042 0.8 NY M
## 10 10 AA -0.1351 1.9 NY M
## 56 56 AA 1.7673 2.5 NY M
## 62 62 BB -0.4616 0.4 CA F
## 74 74 BB -0.1796 0.3 CA F
## 2 2 BB -0.6120 0.0 TX F
## 66 66 BB -0.3928 1.5 TX F
## 22 22 BB -0.7099 0.1 TX M
## 40 40 CC 0.2671 0.9 NY F
## 67 67 CC -0.3200 0.3 CA F
## 16 16 CC 0.1888 2.2 CA M
## 3 3 CC 0.3411 0.3 NY M
## 39 39 DD 0.3700 0.4 CA F
## 49 49 DD -1.2246 0.4 NY F
## 50 50 DD -0.4734 0.4 TX F
## 23 23 DD 0.6107 0.5 NY F
## 17 17 DD -1.8050 0.3 NY M
## 35 35 EE 0.5939 0.5 CA M
## 41 41 EE -0.5425 0.2 NY F
# Use a two-column strata: -E- and -D- -E- varies more slowly, so it is
# better to put that first
stratified(dat1, c("E", "D"), size = 0.15)
## ID A B C D E
## 84 84 BB -1.518394 0.6 CA F
## 34 34 AA -1.523567 1.5 CA F
## 53 53 CC -0.910922 1.6 CA M
## 36 36 DD 0.332950 0.2 CA M
## 97 97 CC 2.087167 0.5 NY F
## 49 49 DD -1.224613 0.4 NY F
## 5 5 BB 1.433024 1.5 NY F
## 37 37 DD 1.063100 1.5 NY M
## 17 17 DD -1.804959 0.3 NY M
## 33 33 CC 1.178087 0.2 NY M
## 14 14 BB 0.028002 0.9 TX F
## 90 90 AA -0.056897 0.0 TX F
## 28 28 BB 0.001105 2.1 TX F
## 80 80 EE -0.324270 0.3 TX M
## 22 22 BB -0.709946 0.1 TX M
# Use a two-column strata (-E- and -D-) but only interested in cases where
# -E- == 'M'
stratified(dat1, c("E", "D"), 0.15, select = list(E = "M"))
## ID A B C D E
## 69 69 AA 0.49419 0.6 CA M
## 32 32 CC -0.13518 1.0 CA M
## 12 12 AA -0.03924 0.2 NY M
## 100 100 DD -1.64061 1.0 NY M
## 56 56 AA 1.76729 2.5 NY M
## 79 79 DD -0.68166 0.2 TX M
## 80 80 EE -0.32427 0.3 TX M
## As above, but where -E- == 'M' and -D- == 'CA' or 'TX'
stratified(dat1, c("E", "D"), 0.15, select = list(E = "M", D = c("CA", "TX")))
## ID A B C D E
## 53 53 CC -0.9109 1.6 CA M
## 36 36 DD 0.3330 0.2 CA M
## 80 80 EE -0.3243 0.3 TX M
## 8 8 DD -1.0441 0.6 TX M
# Use a three-column strata: -E-, -D-, and -A-
s.out <- stratified(dat1, c("E", "D", "A"), size = 2)
## Some groups ---M.CA.BB, M.CA.EE--- contain fewer observations than desired
## number of samples. All observations have been returned from those groups.
list(head(s.out), tail(s.out))
## [[1]]
## ID A B C D E
## 92 92 AA 1.1766 1.0 CA F
## 34 34 AA -1.5236 1.5 CA F
## 24 24 AA -0.9341 0.1 CA M
## 69 69 AA 0.4942 0.6 CA M
## 56 56 AA 1.7673 2.5 NY M
## 10 10 AA -0.1351 1.9 NY M
##
## [[2]]
## ID A B C D E
## 61 61 EE -0.63574 0.2 NY M
## 18 18 EE 1.46555 1.4 NY M
## 21 21 EE 0.47551 2.3 TX F
## 70 70 EE -0.17733 0.0 TX F
## 80 80 EE -0.32427 0.3 TX M
## 77 77 EE -0.07356 0.3 TX M
# How many samples were taken from each strata?
table(interaction(s.out[c("E", "D", "A")]))
##
## F.CA.AA M.CA.AA F.NY.AA M.NY.AA F.TX.AA M.TX.AA F.CA.BB M.CA.BB F.NY.BB
## 2 2 0 2 2 2 2 1 2
## M.NY.BB F.TX.BB M.TX.BB F.CA.CC M.CA.CC F.NY.CC M.NY.CC F.TX.CC M.TX.CC
## 2 2 2 2 2 2 2 2 0
## F.CA.DD M.CA.DD F.NY.DD M.NY.DD F.TX.DD M.TX.DD F.CA.EE M.CA.EE F.NY.EE
## 2 2 2 2 2 2 0 1 2
## M.NY.EE F.TX.EE M.TX.EE
## 2 2 2
# Can we verify the message about group sizes?
names(which(table(interaction(dat1[c("E", "D", "A")])) < 2))
## [1] "F.NY.AA" "M.CA.BB" "M.TX.CC" "F.CA.EE" "M.CA.EE"
names(which(table(interaction(s.out[c("E", "D", "A")])) < 2))
## [1] "F.NY.AA" "M.CA.BB" "M.TX.CC" "F.CA.EE" "M.CA.EE"
本文介绍了一个自定义分层抽样函数,用于从数据集中按特定分层条件进行有放回或不放回的样本抽取。通过该函数,用户可以根据需求灵活地指定分层变量、样本大小、是否进行子集选择以及是否使用放回抽样。
1115





