[置顶]R语言 分层抽样---分层随机抽样(SRS)(二 )

本文介绍了一个自定义分层抽样函数,用于从数据集中按特定分层条件进行有放回或不放回的样本抽取。通过该函数,用户可以根据需求灵活地指定分层变量、样本大小、是否进行子集选择以及是否使用放回抽样。

############################################################

自定义分层抽样函数:

############################################################

stratified <- function(df, group, size, select = NULL,
                       replace = FALSE, bothSets = FALSE) {
  if (is.null(select)) {
    df <- df
  } else {
    if (is.null(names(select))) stop("'select' must be a named list")
    if (!all(names(select) %in% names(df)))
      stop("Please verify your 'select' argument")
    temp <- sapply(names(select),
                   function(x) df[[x]] %in% select[[x]])
    df <- df[rowSums(temp) == length(select), ]
  }
  df.interaction <- interaction(df[group], drop = TRUE)
  df.table <- table(df.interaction)
  df.split <- split(df, df.interaction)
  if (length(size) > 1) {
    if (length(size) != length(df.split))
      stop("Number of groups is ", length(df.split),
           " but number of sizes supplied is ", length(size))
    if (is.null(names(size))) {
      n <- setNames(size, names(df.split))
      message(sQuote("size"), " vector entered as:\n\nsize = structure(c(",
              paste(n, collapse = ", "), "),\n.Names = c(",
              paste(shQuote(names(n)), collapse = ", "), ")) \n\n")
    } else {
      ifelse(all(names(size) %in% names(df.split)),
             n <- size[names(df.split)],
             stop("Named vector supplied with names ",
                  paste(names(size), collapse = ", "),
                  "\n but the names for the group levels are ",
                  paste(names(df.split), collapse = ", ")))
    }
  } else if (size < 1) {
    n <- round(df.table * size, digits = 0)
  } else if (size >= 1) {
    if (all(df.table >= size) || isTRUE(replace)) {
      n <- setNames(rep(size, length.out = length(df.split)),
                    names(df.split))
    } else {
      message(
        "Some groups\n---",
        paste(names(df.table[df.table < size]), collapse = ", "),
        "---\ncontain fewer observations",
        " than desired number of samples.\n",
        "All observations have been returned from those groups.")
      n <- c(sapply(df.table[df.table >= size], function(x) x = size),
             df.table[df.table < size])
    }
  }
  temp <- lapply(
    names(df.split),
    function(x) df.split[[x]][sample(df.table[x],
                                     n[x], replace = replace), ])
  set1 <- do.call("rbind", temp)
 
  if (isTRUE(bothSets)) {
    set2 <- df[!rownames(df) %in% rownames(set1), ]
    list(SET1 = set1, SET2 = set2)
  } else {
    set1
  }
}

================================================
参数说明:
#######################################################

The arguments to stratified are:

  • df: The input data.frame
  • group: A character vector of the column or columns that make up the "strata".
  • size: The desired sample size.
    • If size is a value less than 1, a proportionate sample is taken from each stratum.
    • If size is a single integer of 1 or more, that number of samples is taken from each stratum.
    • If size is a vector of integers, the specified number of samples is taken for each stratum. It is recommended that you use a named vector. For example, if you have two strata, "A" and "B", and you wanted 5 samples from "A" and 10 from "B", you would enter size = c(A = 5, B = 10).
  • select: This allows you to subset the groups in the sampling process. This is a list. For instance, if your group variable was "Group", and it contained three strata, "A", "B", and "C", but you only wanted to sample from "A" and "C", you can use select = list(Group = c("A", "C")).
  • replace: For sampling with replacement.
########################################################
例子说明:
########################################################
set.seed(1)
##创建数据框
dat1 <- data.frame(ID = 1:100, A = sample(c("AA", "BB", "CC", "DD", "EE"), 100,replace = TRUE), B = rnorm(100), C = abs(round(rnorm(100), digits = 1)), D = sample(c("CA", "NY", "TX"), 100, replace = TRUE), E = sample(c("M","F"), 100, replace = TRUE))
##

library(devtools)
source_gist("https://gist.github.com/mrdwab/6424112")
## [1] "https://raw.github.com/gist/6424112"
## SHA-1 hash of file is 0006d8548785ec8a5651c3dd599648cc88d153a4
# Generate a couple of sample data.frames to play with
set.seed(1)
dat1 <- data.frame(ID = 1:100, A = sample(c("AA", "BB", "CC", "DD", "EE"), 100, 
    replace = TRUE), B = rnorm(100), C = abs(round(rnorm(100), digits = 1)), 
    D = sample(c("CA", "NY", "TX"), 100, replace = TRUE), E = sample(c("M", 
        "F"), 100, replace = TRUE))

# What do the data look like in general?
summary(dat1)
##        ID         A            B                 C          D      E     
##  Min.   :  1.0   AA:13   Min.   :-1.9144   Min.   :0.000   CA:23   F:54  
##  1st Qu.: 25.8   BB:25   1st Qu.:-0.6141   1st Qu.:0.300   NY:42   M:46  
##  Median : 50.5   CC:19   Median :-0.1176   Median :0.650   TX:35         
##  Mean   : 50.5   DD:26   Mean   :-0.0176   Mean   :0.825                 
##  3rd Qu.: 75.2   EE:17   3rd Qu.: 0.5382   3rd Qu.:1.200                 
##  Max.   :100.0           Max.   : 2.4016   Max.   :2.900
# Let's take a 10% sample from all -A- groups in dat1
stratified(dat1, "A", 0.1)
##    ID  A       B   C  D E
## 92 92 AA  1.1766 1.0 CA F
## 86 86 BB -1.5364 0.1 NY F
## 74 74 BB -0.1796 0.3 CA F
## 60 60 CC  1.6822 0.5 TX F
## 58 58 CC  0.9102 0.5 CA F
## 9   9 DD  0.5697 1.4 TX F
## 42 42 DD  1.2079 0.4 TX M
## 46 46 DD  0.5585 1.0 CA M
## 99 99 EE -1.2863 0.2 NY F
## 72 72 EE  1.3430 0.0 NY F
# Let's take a 10% sample from only 'AA' and 'BB' groups from -A- in dat1
stratified(dat1, "A", 0.1, select = list(A = c("AA", "BB")))
##    ID  A      B   C  D E
## 69 69 AA 0.4942 0.6 CA M
## 54 54 BB 0.1580 0.3 TX M
## 5   5 BB 1.4330 1.5 NY F
# Let's take 5 samples from all -D- groups in dat1, specified by column
# number
stratified(dat1, group = 5, size = 5)
##    ID  A       B   C  D E
## 73 73 BB -0.2146 0.6 CA F
## 45 45 CC  1.5868 1.2 CA F
## 87 87 DD -0.3010 0.6 CA M
## 13 13 DD  0.6897 1.1 CA F
## 24 24 AA -0.9341 0.1 CA M
## 40 40 CC  0.2671 0.9 NY F
## 83 83 BB  0.5315 0.6 NY M
## 1   1 BB  0.3981 0.5 NY F
## 56 56 AA  1.7673 2.5 NY M
## 88 88 AA -0.5283 1.2 NY M
## 66 66 BB -0.3928 1.5 TX F
## 22 22 BB -0.7099 0.1 TX M
## 79 79 DD -0.6817 0.2 TX M
## 27 27 AA -0.4433 0.8 TX M
## 51 51 CC -0.6204 0.4 TX F
# Let's take a sample from all -A- groups in dat1, where we specify the
# number wanted from each group
stratified(dat1, "A", size = c(3, 5, 4, 5, 2))
## 'size' vector entered as:
## 
## size = structure(c(3, 5, 4, 5, 2), .Names = c("AA", "BB", "CC", "DD",
## "EE"))
##    ID  A       B   C  D E
## 38 38 AA -0.3042 0.8 NY M
## 10 10 AA -0.1351 1.9 NY M
## 56 56 AA  1.7673 2.5 NY M
## 62 62 BB -0.4616 0.4 CA F
## 74 74 BB -0.1796 0.3 CA F
## 2   2 BB -0.6120 0.0 TX F
## 66 66 BB -0.3928 1.5 TX F
## 22 22 BB -0.7099 0.1 TX M
## 40 40 CC  0.2671 0.9 NY F
## 67 67 CC -0.3200 0.3 CA F
## 16 16 CC  0.1888 2.2 CA M
## 3   3 CC  0.3411 0.3 NY M
## 39 39 DD  0.3700 0.4 CA F
## 49 49 DD -1.2246 0.4 NY F
## 50 50 DD -0.4734 0.4 TX F
## 23 23 DD  0.6107 0.5 NY F
## 17 17 DD -1.8050 0.3 NY M
## 35 35 EE  0.5939 0.5 CA M
## 41 41 EE -0.5425 0.2 NY F
# Use a two-column strata: -E- and -D- -E- varies more slowly, so it is
# better to put that first
stratified(dat1, c("E", "D"), size = 0.15)
##    ID  A         B   C  D E
## 84 84 BB -1.518394 0.6 CA F
## 34 34 AA -1.523567 1.5 CA F
## 53 53 CC -0.910922 1.6 CA M
## 36 36 DD  0.332950 0.2 CA M
## 97 97 CC  2.087167 0.5 NY F
## 49 49 DD -1.224613 0.4 NY F
## 5   5 BB  1.433024 1.5 NY F
## 37 37 DD  1.063100 1.5 NY M
## 17 17 DD -1.804959 0.3 NY M
## 33 33 CC  1.178087 0.2 NY M
## 14 14 BB  0.028002 0.9 TX F
## 90 90 AA -0.056897 0.0 TX F
## 28 28 BB  0.001105 2.1 TX F
## 80 80 EE -0.324270 0.3 TX M
## 22 22 BB -0.709946 0.1 TX M
# Use a two-column strata (-E- and -D-) but only interested in cases where
# -E- == 'M'
stratified(dat1, c("E", "D"), 0.15, select = list(E = "M"))
##      ID  A        B   C  D E
## 69   69 AA  0.49419 0.6 CA M
## 32   32 CC -0.13518 1.0 CA M
## 12   12 AA -0.03924 0.2 NY M
## 100 100 DD -1.64061 1.0 NY M
## 56   56 AA  1.76729 2.5 NY M
## 79   79 DD -0.68166 0.2 TX M
## 80   80 EE -0.32427 0.3 TX M
## As above, but where -E- == 'M' and -D- == 'CA' or 'TX'
stratified(dat1, c("E", "D"), 0.15, select = list(E = "M", D = c("CA", "TX")))
##    ID  A       B   C  D E
## 53 53 CC -0.9109 1.6 CA M
## 36 36 DD  0.3330 0.2 CA M
## 80 80 EE -0.3243 0.3 TX M
## 8   8 DD -1.0441 0.6 TX M
# Use a three-column strata: -E-, -D-, and -A-
s.out <- stratified(dat1, c("E", "D", "A"), size = 2)
## Some groups ---M.CA.BB, M.CA.EE--- contain fewer observations than desired
## number of samples. All observations have been returned from those groups.
list(head(s.out), tail(s.out))
## [[1]]
##    ID  A       B   C  D E
## 92 92 AA  1.1766 1.0 CA F
## 34 34 AA -1.5236 1.5 CA F
## 24 24 AA -0.9341 0.1 CA M
## 69 69 AA  0.4942 0.6 CA M
## 56 56 AA  1.7673 2.5 NY M
## 10 10 AA -0.1351 1.9 NY M
## 
## [[2]]
##    ID  A        B   C  D E
## 61 61 EE -0.63574 0.2 NY M
## 18 18 EE  1.46555 1.4 NY M
## 21 21 EE  0.47551 2.3 TX F
## 70 70 EE -0.17733 0.0 TX F
## 80 80 EE -0.32427 0.3 TX M
## 77 77 EE -0.07356 0.3 TX M
# How many samples were taken from each strata?
table(interaction(s.out[c("E", "D", "A")]))
## 
## F.CA.AA M.CA.AA F.NY.AA M.NY.AA F.TX.AA M.TX.AA F.CA.BB M.CA.BB F.NY.BB 
##       2       2       0       2       2       2       2       1       2 
## M.NY.BB F.TX.BB M.TX.BB F.CA.CC M.CA.CC F.NY.CC M.NY.CC F.TX.CC M.TX.CC 
##       2       2       2       2       2       2       2       2       0 
## F.CA.DD M.CA.DD F.NY.DD M.NY.DD F.TX.DD M.TX.DD F.CA.EE M.CA.EE F.NY.EE 
##       2       2       2       2       2       2       0       1       2 
## M.NY.EE F.TX.EE M.TX.EE 
##       2       2       2
# Can we verify the message about group sizes?
names(which(table(interaction(dat1[c("E", "D", "A")])) < 2))
## [1] "F.NY.AA" "M.CA.BB" "M.TX.CC" "F.CA.EE" "M.CA.EE"
names(which(table(interaction(s.out[c("E", "D", "A")])) < 2))
## [1] "F.NY.AA" "M.CA.BB" "M.TX.CC" "F.CA.EE" "M.CA.EE"
















评论 1
成就一亿技术人!
拼手气红包6.0元
还能输入1000个字符
 
红包 添加红包
表情包 插入表情
 条评论被折叠 查看
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值