[置顶]R语言 分层抽样---分层随机抽样(SRS)(二 )

############################################################

自定义分层抽样函数:

############################################################

stratified <- function(df, group, size, select = NULL,
                       replace = FALSE, bothSets = FALSE) {
  if (is.null(select)) {
    df <- df
  } else {
    if (is.null(names(select))) stop("'select' must be a named list")
    if (!all(names(select) %in% names(df)))
      stop("Please verify your 'select' argument")
    temp <- sapply(names(select),
                   function(x) df[[x]] %in% select[[x]])
    df <- df[rowSums(temp) == length(select), ]
  }
  df.interaction <- interaction(df[group], drop = TRUE)
  df.table <- table(df.interaction)
  df.split <- split(df, df.interaction)
  if (length(size) > 1) {
    if (length(size) != length(df.split))
      stop("Number of groups is ", length(df.split),
           " but number of sizes supplied is ", length(size))
    if (is.null(names(size))) {
      n <- setNames(size, names(df.split))
      message(sQuote("size"), " vector entered as:\n\nsize = structure(c(",
              paste(n, collapse = ", "), "),\n.Names = c(",
              paste(shQuote(names(n)), collapse = ", "), ")) \n\n")
    } else {
      ifelse(all(names(size) %in% names(df.split)),
             n <- size[names(df.split)],
             stop("Named vector supplied with names ",
                  paste(names(size), collapse = ", "),
                  "\n but the names for the group levels are ",
                  paste(names(df.split), collapse = ", ")))
    }
  } else if (size < 1) {
    n <- round(df.table * size, digits = 0)
  } else if (size >= 1) {
    if (all(df.table >= size) || isTRUE(replace)) {
      n <- setNames(rep(size, length.out = length(df.split)),
                    names(df.split))
    } else {
      message(
        "Some groups\n---",
        paste(names(df.table[df.table < size]), collapse = ", "),
        "---\ncontain fewer observations",
        " than desired number of samples.\n",
        "All observations have been returned from those groups.")
      n <- c(sapply(df.table[df.table >= size], function(x) x = size),
             df.table[df.table < size])
    }
  }
  temp <- lapply(
    names(df.split),
    function(x) df.split[[x]][sample(df.table[x],
                                     n[x], replace = replace), ])
  set1 <- do.call("rbind", temp)
 
  if (isTRUE(bothSets)) {
    set2 <- df[!rownames(df) %in% rownames(set1), ]
    list(SET1 = set1, SET2 = set2)
  } else {
    set1
  }
}

================================================
参数说明:
#######################################################

The arguments to stratified are:

  • df: The input data.frame
  • group: A character vector of the column or columns that make up the "strata".
  • size: The desired sample size.
    • If size is a value less than 1, a proportionate sample is taken from each stratum.
    • If size is a single integer of 1 or more, that number of samples is taken from each stratum.
    • If size is a vector of integers, the specified number of samples is taken for each stratum. It is recommended that you use a named vector. For example, if you have two strata, "A" and "B", and you wanted 5 samples from "A" and 10 from "B", you would enter size = c(A = 5, B = 10).
  • select: This allows you to subset the groups in the sampling process. This is a list. For instance, if your group variable was "Group", and it contained three strata, "A", "B", and "C", but you only wanted to sample from "A" and "C", you can use select = list(Group = c("A", "C")).
  • replace: For sampling with replacement.
########################################################
例子说明:
########################################################
set.seed(1)
##创建数据框
dat1 <- data.frame(ID = 1:100, A = sample(c("AA", "BB", "CC", "DD", "EE"), 100,replace = TRUE), B = rnorm(100), C = abs(round(rnorm(100), digits = 1)), D = sample(c("CA", "NY", "TX"), 100, replace = TRUE), E = sample(c("M","F"), 100, replace = TRUE))
##

library(devtools)
source_gist("https://gist.github.com/mrdwab/6424112")
## [1] "https://raw.github.com/gist/6424112"
## SHA-1 hash of file is 0006d8548785ec8a5651c3dd599648cc88d153a4
# Generate a couple of sample data.frames to play with
set.seed(1)
dat1 <- data.frame(ID = 1:100, A = sample(c("AA", "BB", "CC", "DD", "EE"), 100, 
    replace = TRUE), B = rnorm(100), C = abs(round(rnorm(100), digits = 1)), 
    D = sample(c("CA", "NY", "TX"), 100, replace = TRUE), E = sample(c("M", 
        "F"), 100, replace = TRUE))

# What do the data look like in general?
summary(dat1)
##        ID         A            B                 C          D      E     
##  Min.   :  1.0   AA:13   Min.   :-1.9144   Min.   :0.000   CA:23   F:54  
##  1st Qu.: 25.8   BB:25   1st Qu.:-0.6141   1st Qu.:0.300   NY:42   M:46  
##  Median : 50.5   CC:19   Median :-0.1176   Median :0.650   TX:35         
##  Mean   : 50.5   DD:26   Mean   :-0.0176   Mean   :0.825                 
##  3rd Qu.: 75.2   EE:17   3rd Qu.: 0.5382   3rd Qu.:1.200                 
##  Max.   :100.0           Max.   : 2.4016   Max.   :2.900
# Let's take a 10% sample from all -A- groups in dat1
stratified(dat1, "A", 0.1)
##    ID  A       B   C  D E
## 92 92 AA  1.1766 1.0 CA F
## 86 86 BB -1.5364 0.1 NY F
## 74 74 BB -0.1796 0.3 CA F
## 60 60 CC  1.6822 0.5 TX F
## 58 58 CC  0.9102 0.5 CA F
## 9   9 DD  0.5697 1.4 TX F
## 42 42 DD  1.2079 0.4 TX M
## 46 46 DD  0.5585 1.0 CA M
## 99 99 EE -1.2863 0.2 NY F
## 72 72 EE  1.3430 0.0 NY F
# Let's take a 10% sample from only 'AA' and 'BB' groups from -A- in dat1
stratified(dat1, "A", 0.1, select = list(A = c("AA", "BB")))
##    ID  A      B   C  D E
## 69 69 AA 0.4942 0.6 CA M
## 54 54 BB 0.1580 0.3 TX M
## 5   5 BB 1.4330 1.5 NY F
# Let's take 5 samples from all -D- groups in dat1, specified by column
# number
stratified(dat1, group = 5, size = 5)
##    ID  A       B   C  D E
## 73 73 BB -0.2146 0.6 CA F
## 45 45 CC  1.5868 1.2 CA F
## 87 87 DD -0.3010 0.6 CA M
## 13 13 DD  0.6897 1.1 CA F
## 24 24 AA -0.9341 0.1 CA M
## 40 40 CC  0.2671 0.9 NY F
## 83 83 BB  0.5315 0.6 NY M
## 1   1 BB  0.3981 0.5 NY F
## 56 56 AA  1.7673 2.5 NY M
## 88 88 AA -0.5283 1.2 NY M
## 66 66 BB -0.3928 1.5 TX F
## 22 22 BB -0.7099 0.1 TX M
## 79 79 DD -0.6817 0.2 TX M
## 27 27 AA -0.4433 0.8 TX M
## 51 51 CC -0.6204 0.4 TX F
# Let's take a sample from all -A- groups in dat1, where we specify the
# number wanted from each group
stratified(dat1, "A", size = c(3, 5, 4, 5, 2))
## 'size' vector entered as:
## 
## size = structure(c(3, 5, 4, 5, 2), .Names = c("AA", "BB", "CC", "DD",
## "EE"))
##    ID  A       B   C  D E
## 38 38 AA -0.3042 0.8 NY M
## 10 10 AA -0.1351 1.9 NY M
## 56 56 AA  1.7673 2.5 NY M
## 62 62 BB -0.4616 0.4 CA F
## 74 74 BB -0.1796 0.3 CA F
## 2   2 BB -0.6120 0.0 TX F
## 66 66 BB -0.3928 1.5 TX F
## 22 22 BB -0.7099 0.1 TX M
## 40 40 CC  0.2671 0.9 NY F
## 67 67 CC -0.3200 0.3 CA F
## 16 16 CC  0.1888 2.2 CA M
## 3   3 CC  0.3411 0.3 NY M
## 39 39 DD  0.3700 0.4 CA F
## 49 49 DD -1.2246 0.4 NY F
## 50 50 DD -0.4734 0.4 TX F
## 23 23 DD  0.6107 0.5 NY F
## 17 17 DD -1.8050 0.3 NY M
## 35 35 EE  0.5939 0.5 CA M
## 41 41 EE -0.5425 0.2 NY F
# Use a two-column strata: -E- and -D- -E- varies more slowly, so it is
# better to put that first
stratified(dat1, c("E", "D"), size = 0.15)
##    ID  A         B   C  D E
## 84 84 BB -1.518394 0.6 CA F
## 34 34 AA -1.523567 1.5 CA F
## 53 53 CC -0.910922 1.6 CA M
## 36 36 DD  0.332950 0.2 CA M
## 97 97 CC  2.087167 0.5 NY F
## 49 49 DD -1.224613 0.4 NY F
## 5   5 BB  1.433024 1.5 NY F
## 37 37 DD  1.063100 1.5 NY M
## 17 17 DD -1.804959 0.3 NY M
## 33 33 CC  1.178087 0.2 NY M
## 14 14 BB  0.028002 0.9 TX F
## 90 90 AA -0.056897 0.0 TX F
## 28 28 BB  0.001105 2.1 TX F
## 80 80 EE -0.324270 0.3 TX M
## 22 22 BB -0.709946 0.1 TX M
# Use a two-column strata (-E- and -D-) but only interested in cases where
# -E- == 'M'
stratified(dat1, c("E", "D"), 0.15, select = list(E = "M"))
##      ID  A        B   C  D E
## 69   69 AA  0.49419 0.6 CA M
## 32   32 CC -0.13518 1.0 CA M
## 12   12 AA -0.03924 0.2 NY M
## 100 100 DD -1.64061 1.0 NY M
## 56   56 AA  1.76729 2.5 NY M
## 79   79 DD -0.68166 0.2 TX M
## 80   80 EE -0.32427 0.3 TX M
## As above, but where -E- == 'M' and -D- == 'CA' or 'TX'
stratified(dat1, c("E", "D"), 0.15, select = list(E = "M", D = c("CA", "TX")))
##    ID  A       B   C  D E
## 53 53 CC -0.9109 1.6 CA M
## 36 36 DD  0.3330 0.2 CA M
## 80 80 EE -0.3243 0.3 TX M
## 8   8 DD -1.0441 0.6 TX M
# Use a three-column strata: -E-, -D-, and -A-
s.out <- stratified(dat1, c("E", "D", "A"), size = 2)
## Some groups ---M.CA.BB, M.CA.EE--- contain fewer observations than desired
## number of samples. All observations have been returned from those groups.
list(head(s.out), tail(s.out))
## [[1]]
##    ID  A       B   C  D E
## 92 92 AA  1.1766 1.0 CA F
## 34 34 AA -1.5236 1.5 CA F
## 24 24 AA -0.9341 0.1 CA M
## 69 69 AA  0.4942 0.6 CA M
## 56 56 AA  1.7673 2.5 NY M
## 10 10 AA -0.1351 1.9 NY M
## 
## [[2]]
##    ID  A        B   C  D E
## 61 61 EE -0.63574 0.2 NY M
## 18 18 EE  1.46555 1.4 NY M
## 21 21 EE  0.47551 2.3 TX F
## 70 70 EE -0.17733 0.0 TX F
## 80 80 EE -0.32427 0.3 TX M
## 77 77 EE -0.07356 0.3 TX M
# How many samples were taken from each strata?
table(interaction(s.out[c("E", "D", "A")]))
## 
## F.CA.AA M.CA.AA F.NY.AA M.NY.AA F.TX.AA M.TX.AA F.CA.BB M.CA.BB F.NY.BB 
##       2       2       0       2       2       2       2       1       2 
## M.NY.BB F.TX.BB M.TX.BB F.CA.CC M.CA.CC F.NY.CC M.NY.CC F.TX.CC M.TX.CC 
##       2       2       2       2       2       2       2       2       0 
## F.CA.DD M.CA.DD F.NY.DD M.NY.DD F.TX.DD M.TX.DD F.CA.EE M.CA.EE F.NY.EE 
##       2       2       2       2       2       2       0       1       2 
## M.NY.EE F.TX.EE M.TX.EE 
##       2       2       2
# Can we verify the message about group sizes?
names(which(table(interaction(dat1[c("E", "D", "A")])) < 2))
## [1] "F.NY.AA" "M.CA.BB" "M.TX.CC" "F.CA.EE" "M.CA.EE"
names(which(table(interaction(s.out[c("E", "D", "A")])) < 2))
## [1] "F.NY.AA" "M.CA.BB" "M.TX.CC" "F.CA.EE" "M.CA.EE"
















评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值