4

This is my first shot at trying to make a reproducible question. Trying to become a better member of SO.

here is the dataset

reach.dat <- structure(list(stream = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L), .Label = c("Brooks Creek", "Buncombe Hollow Creek", 
"Bypass Channel", "Cape Horn Creek", "Cougar Creek", "Dog Creek", 
"Indian George Creek", "Jim Creek", "Lower Speelyai", "North Siouxon Creek", 
"Ole Creek", "Panamaker Creek", "Siouxon Creek", "Speelyai Creek", 
"West Fork Speelyai Creek", "West Tributary Speelyai Creek"), class = "factor"), 
    reach = structure(c(21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 
    29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 1L, 2L, 3L, 
    4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 
    17L, 18L, 19L, 20L, 46L, 47L, 38L, 39L, 40L, 41L, 42L, 43L, 
    44L, 45L), .Label = c("BNH_0001", "BNH_0002", "BNH_0003", 
    "BNH_0004", "BNH_0005", "BNH_0006", "BNH_0007", "BNH_0008", 
    "BNH_0009", "BPC_0001", "BPC_0002", "BPC_0003", "BPC_0004", 
    "BPC_0005", "BPC_0006", "BPC_0007", "BPC_0008", "BPC_0009", 
    "BPC_0010", "BPC_0011", "BRK_001", "BRK_002", "BRK_003", 
    "BRK_004", "BRK_005", "BRK_006", "BRK_007", "BRK_008", "BRK_009", 
    "BRK_010", "BRK_011", "BRK_012", "BRK_013", "BRK_014", "BRK_015", 
    "BRK_016", "BRK_017", "CGR_0001", "CGR_0002", "CGR_0003", 
    "CGR_0004", "CGR_0005", "CGR_0006", "CGR_0007", "CGR_0008", 
    "CHC_0001", "CHC_0002", "DOG_0001", "ING_0001", "ING_0002", 
    "ING_0003", "ING_0004", "ING_0005", "ING_0006", "ING_0007", 
    "JMC_0001", "JMC_0002", "LSPL_0001", "NSX_0001", "NSX_0002", 
    "OLE_0001", "OLE_0002", "OLE_0003", "OLE_0004", "OLE_0005", 
    "OLE_0006", "PMR_0001", "PMR_0002", "SPL_0001", "SPL_0002", 
    "SPL_0003", "SPL_0004", "SPL_0005", "SPL_0006", "SPL_0007", 
    "SPL_0008", "SPL_0009", "SPL_0010", "SPL_0011", "SPL_0012", 
    "SPL_0013", "SPL_0014", "SPL_0015", "SPL_0016", "SPL_0017", 
    "SPL_0018", "SPL_0019", "SPL_0020", "SPL_0021", "SPL_0022", 
    "SPL_0023", "SXN_0001", "SXN_0002", "SXN_0003", "SXN_0004", 
    "SXN_0005", "SXN_0006", "SXN_0007", "SXN_0008", "WFSPL_0001", 
    "WFSPL_0002", "WFSPL_0003", "WFSPL_0004", "WFSPL_0005", "WTSPL_0001", 
    "WTSPL_0002", "WTSPL_0003", "WTSPL_0004", "WTSPL_0005"), class = "factor"), 
    length = c(178.9, 170.1, 185.8, 199.7, 190.3, 190, 172.3, 
    196.4, 179, 183.4, 182.4, 196.7, 188.4, 181.5, 178.9, 196.8, 
    181.2, 116.2, 121.4, 124.2, 180, 111.7, 130.3, 127.5, 143.4, 
    75.7, 591.1, 640.9, 582.2, 659, 534.9, 621.9, 636, 564.2, 
    547.1, 537.2, 589.6, 329.5, 192.7, 454.4, 464.5, 477.6, 544.8, 
    500.1, 473.1, 481.7, 506.4), SUM_LWD = c(0.288093907, 1.324221046, 
    1.763186222, 0.774161242, 0.391907514, 0.889842105, 0, 1.5200611, 
    1.097765363, 0.573173391, 0, 0.622267412, 0.427070064, 1.43338843, 
    1.151928452, 1.935365854, 1.856622517, 0.326333907, 0.07199341, 
    0.037520129, 0, 0, 0, 0.169647059, 0.138493724, 0.098678996, 
    0, 0, 0.217279285, 0.179044006, 0.097868761, 0.210644798, 
    0, 0.708259482, 0.145567538, 0.03877513, 0.026611262, 2.302579666, 
    2.125583809, 0.092033451, 0.176533907, 0.955904523, 0.175624082, 
    1.434553089, 1.038575354, 0.198048578, 1.042061611), Avg_RPD = c(0.352, 
    0.343333333, 0.325, 0.34, 0.225, 0.2475, 0.254, 0.2125, 0.276666667, 
    0.22, 0.32875, 0.23125, 0.215, 0.268, 0.305, 0.243636, 0.335714286, 
    0.2, 0.216666667, 0.24, 0.243333333, 0.56, 0.2575, 0.208, 
    0.335833333, 0.376666667, 0.175, 0.695, 0.75, 0.546666667, 
    1.188333333, 1.58, 0.885, 0.448571429, NA, NA, NA, 0.611818182, 
    0.50875, NA, 0.77, 0.49875, NA, 0.544, 1.017777778, 0.9175, 
    0.623333333), Avg_Substrate_Pools = c(86.10955531, 104.0366873, 
    83.94224019, 88.24737809, 88.93432719, 82.59257502, 84.02947757, 
    81.71253918, 82.76023619, 82.88298227, 84.91750332, 81.21887219, 
    85.05625823, 82.04273063, 84.01489539, 92.41003006, 117.3416424, 
    88.61396078, 88.86511047, 83.38603629, 71.73828707, 76.57563745, 
    86.2069123, 83.62789949, 81.19546417, 80.89002676, 182.5586, 
    160.63761, 134.82532, 123.1769494, 112.0805653, 93.59420959, 
    180.5731068, 82.91509529, NA, 61.9283, NA, 111.7153167, 83.79134743, 
    61.9283, 89.51477005, NA, NA, 86.08708892, 85.3472397, 110.8504333, 
    91.00371559)), .Names = c("stream", "reach", "length", "SUM_LWD", 
"Avg_RPD", "Avg_Substrate_Pools"), row.names = c(NA, 47L), class = "data.frame")

This is a small piece of the data frame, and I have removed several levels of one of the factor variables (stream in this case). So you will see in the beginning of the script, I add a few more columns, remove rows with NA (this is the source of my problem I believe, as this code works fine for variables that do not have any NA cells). Then I drop levels, as the original dataset had many more streams as I just mentioned. The error takes place at the end of the script when I run ht.means.xx I get an error Error: result size (5), expected 4 or 1. I think 5 is the right number, as there are 5 different stream names. I did not remove entire streams by removing rows with NA values. Each row is actually a reach for a particular stream, and each stream has more than 1 reach. So even though a reach (row) is removed for having a NA value, the stream should still be present due to other reaches (rows) of that stream not having NA values.

Here is the script. This is a little confusing, I am trying my best. I think if you read my description and look at the data.frame that is created (its structure) it will make sense.

# Getting Started ---------------------------------------------------------

require(dplyr)
require(sampling)
require(xtable)
require(car)
require(lattice)

## Read in Data on reaches for generating 1st order probabilities

#add a column for stream ID
reach.dat$streamID <- as.numeric(reach.dat$stream)

#add a column for reach ID
reach.dat$reachID <- 1:47

#add probabilities of selecting each reach, within each stream (prop to length)
length.totals <- reach.dat %>%
  group_by(stream) %>%
  summarise(totals = sum(length))

reach.dat <- merge(reach.dat, length.totals, by  = "stream")

reach.dat$length.probs <- with(reach.dat, length/totals)

#reorder columns for organization
reach.dat <- reach.dat[, c("stream","streamID","reach","reachID", "length", 
                           "length.probs", "totals", "Avg_RPD", "Avg_Substrate_Pools", "SUM_LWD")]
reach.dat <- reach.dat[,1:8]

# Remove the NA values in specific columns. Specify Column
completeFun <- function(reach.dat, desiredCols) {
  completeVec <- complete.cases(reach.dat[, desiredCols])
  return(reach.dat[completeVec, ])
}
reach.dat <- completeFun(reach.dat, "Avg_RPD")

droplevels(reach.dat)

nsim <- 100

# Running Simulations -----------------------------------------------------


#this function draws a unequal probability sample within each stream
sample.fun.pi <- function(stream.no, n.vec){
  sample <-
    #ifelse statement to deal with the streams that have only 1 reach
    ifelse(length(reach.dat[reach.dat$streamID==stream.no, "reachID"]) == 1,
           #this line says to sample that one reach, if the stream only has one reach
           reach.dat[which(reach.dat$streamID==stream.no), "reachID"],
           #if more than one reach, draw a uneq prob sample of size n.vec
           list(sample(subset(reach.dat, streamID == stream.no)$reachID, 
                       size = n.vec[stream.no], 
                       #probabilities proportional to length
                       prob = reach.dat[reach.dat$streamID == stream.no, "length.probs"],
                       #without replacement
                       replace = FALSE)))
  return(unlist(sample))
}

strata.uneq.pi <- function(n.vec, nsim){
  #store nsim samples in a matrix called sample.all
  sample.all <- matrix(NA, nrow = sum(n.vec), ncol = nsim)
  for(i in 1:nsim){
    #for each sample, apply the above function to all the streams
    sample.all[, i] <- unlist(apply(cbind(1:length(unique(reach.dat$streamID))), 1, 
                                    sample.fun.pi, n.vec))
  }
  return(sample.all)
}
#define sample sizes
n1.6<-c(1,1,1,1,1)
n7.8<-c(1,1,1,1,1)
n9.10<-c(2,1,1,1,1)

#Set seed
set.seed(303)

#build a matrix of samples for every sample size
sample.1.6 <- strata.uneq.pi(n1.6, nsim)
sample.7.8 <- strata.uneq.pi(n7.8, nsim)
sample.9.10 <- strata.uneq.pi(n9.10, nsim)


#for each sample size, calculate the inclusion probabilities for each reach by counting the number of times that reach is
# selected out of the total number of simulations

pi.1.6 <- numeric(length(reach.dat$reach))
for(i in 1:length(reach.dat$reach)) {
  #total number of times the reach is sampled out of the total num of sims
  pi.1.6[i] <- sum(sample.1.6==i)/nsim
}

pi.7.8 <- numeric(length(reach.dat$reach))
for(i in 1:length(reach.dat$reach)) {
  #total number of times the reach is sampled out of the total num of sims
  pi.7.8[i] <- sum(sample.7.8==i)/nsim
}

pi.9.10 <- numeric(length(reach.dat$reach))
for(i in 1:length(reach.dat$reach)) {
  #total number of times the reach is sampled out of the total num of sims
  pi.9.10[i] <- sum(sample.9.10==i)/nsim
}


# Use this to enter the variable values of choice
reach.dat$y<-(reach.dat$Avg_RPD)
#find number of reaches in each stream for calculating ht estimator
reach.dat$stream <- as.factor(as.character(reach.dat$stream))
num.reaches <- unlist(with(reach.dat, tapply(reach, stream, length)))

#find true mean of responses within each stream
true_y <- reach.dat %>%
  group_by(stream) %>%
  summarise(true_y=mean(y)) %>%
  ungroup()

#this function calculates the ht estimator for each simulated sample from above
#uses the estimated inclusion probabilities
ht.fun <- function(simnum, sample.n, pi.n){
  #reaches that were selected in the sample
  reach <- sample.n[, simnum]
  #estimated inclusion probabilities for those reaches
  pi <- pi.n[sample.n[, simnum]]
  #y-values that were selected
  y <- reach.dat[sample.n[, simnum], "y"]
  #the streams that the selected reaches belong to
  stream <- reach.dat[sample.n[, simnum], "stream"]
  #organize into a dataframe so we can use dplyr on it
  data <- cbind.data.frame(reach, pi, y, stream)
  #use dplyr to calculate the ht estimates of the total and the mean
  ht.strata <- data %>%
    tbl_df %>%
    group_by(stream) %>%
    summarise(total = sum(y/pi)) %>%
    mutate(size = num.reaches) %>%
    mutate(mean = total/size)
  return(ht.strata[, "mean"])
}

#apply to every one of the nsim samples for each sampling rate

ht.means.1.6 <- data.frame(apply(cbind(1:nsim), 1, ht.fun, sample.n = sample.1.6, 
                                 pi.n = pi.1.6))


ht.means.7.8 <- data.frame(apply(cbind(1:nsim), 1, ht.fun, sample.n = sample.7.8, 
                                 pi.n = pi.7.8))


ht.means.9.10 <- data.frame(apply(cbind(1:nsim), 1, ht.fun, sample.n = sample.9.10, 
                                  pi.n = pi.9.10))
Christopher
  • 189
  • 1
  • 10
  • 5
    Your example is reproducible (woo!) but not really _minimal_, which is the other part of [asking a great R question](http://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example/5963610#5963610). Shorter, more concise questions are more likely to get answers, because sorting through a lot of code that's fine just to see how it works is a pain. – alistaire Mar 24 '16 at 18:21
  • I feel like the code which is mostly simulations is very relevant to the question. I could be missing something. And because certain functions build off one another I felt I should include them all. I feel it is minimal considering what I started with.... – Christopher Mar 24 '16 at 18:27
  • This line produces an error `reach.dat$length.probs <- with(reach.dat, length/totals)`. There is no column called `totals`. Please run your code again and be sure that we can get to the end without an error. You have `totals.x` and `totals.y` which are you comparing to `length`? – Pierre L Mar 24 '16 at 18:34
  • @Pierre Lafortune I just ran the code again, and it worked for me. – Christopher Mar 24 '16 at 18:38
  • that is because you have `totals` lingering in your global environment. Try `rm(totals)` then try again. Please be aware that we do not have `totals` – Pierre L Mar 24 '16 at 18:39
  • rm(totals) Warning message: In rm(totals) : object 'totals' not found – Christopher Mar 24 '16 at 18:40
  • maybe you are missing this part length.totals <- reach.dat %>% group_by(stream) %>% summarise(totals = sum(length)) – Christopher Mar 24 '16 at 18:41
  • That is one column in `length.totals`. When you merge it to `reach.dat` it is broken into two columns `totals.x` and `totals.y` – Pierre L Mar 24 '16 at 18:42
  • Let us [continue this discussion in chat](http://chat.stackoverflow.com/rooms/107263/discussion-between-christopher-and-pierre-lafortune). – Christopher Mar 24 '16 at 18:43

1 Answers1

1

Here are a number of changes you can make:

#add a column for stream ID
reach.dat$streamID <- as.numeric(reach.dat$stream)

#add a column for reach ID
reach.dat$reachID <- 1:nrow(reach.dat)

#add probabilities of selecting each reach, within each stream (prop to length)
reach.dat <- reach.dat %>%
  group_by(stream) %>%
  mutate(totals = sum(length),
         length.probs=length/totals)

#reorder columns for organization
reach.dat <- reach.dat[, c("stream","streamID","reach","reachID", "length", 
                           "length.probs", "totals", "Avg_RPD", "Avg_Substrate_Pools", "SUM_LWD")]
reach.dat <- reach.dat[,1:8]
reach.dat <- reach.dat[!is.na(reach.dat$Avg_RPD),]
reach.dat <- droplevels(reach.dat)

#add a column for reach ID (Again)
reach.dat$reachID <- 1:nrow(reach.dat)

nsim <- 10

# Running Simulations -----------------------------------------------------
#define sample sizes
sizes <- list(
  n1.6=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
  n7.8=c(1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1),
  n9.10=c(2,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1),
  n11.13=c(2,1,1,1,1,1,1,1,1,1,1,1,1,3,1,1),
  n14=c(2,1,2,1,1,1,1,1,1,1,1,1,1,3,1,1),
  n15=c(3,1,2,1,1,1,1,1,1,1,1,1,1,3,1,1),
  n16=c(3,1,2,1,1,1,1,1,1,1,1,1,1,4,1,1),
  n17.18=c(3,2,2,1,1,1,1,1,1,1,1,1,1,4,1,1),
  n19=c(3,2,2,1,2,1,1,1,1,1,1,1,2,4,1,1),
  n20=c(3,2,2,1,2,1,1,1,1,1,1,1,2,5,1,1),
  n21=c(4,2,2,1,2,1,1,1,1,1,1,1,2,5,1,1),
  n22=c(4,2,2,1,2,1,2,1,1,1,1,1,2,5,1,1),
  n23=c(4,2,3,1,2,1,2,1,1,1,1,1,2,5,1,1),
  n24=c(4,2,3,1,2,1,2,1,1,1,1,1,2,6,1,1),
  n25.26=c(4,2,3,1,2,1,2,1,1,1,2,1,2,6,1,1),
  n27=c(5,2,3,1,2,1,2,1,1,1,2,1,2,6,1,1),
  n28=c(5,3,3,1,2,1,2,1,1,1,2,1,2,6,1,1),
  n29=c(5,3,3,1,2,1,2,1,1,1,2,1,2,7,1,1),
  n30.31=c(5,3,3,1,2,1,2,1,1,1,2,1,2,7,2,2),
  n32=c(5,3,4,1,3,1,2,1,1,1,2,1,3,7,2,2),
  n33.35=c(6,3,4,1,3,1,2,1,1,1,2,1,3,8,2,2),
  n36=c(6,3,4,1,3,1,3,1,1,1,2,1,3,8,2,2),
  n37.38=c(6,3,4,1,3,1,3,1,1,1,2,1,3,9,2,2),
  n39.40=c(7,4,4,1,3,1,3,1,1,1,2,1,3,9,2,2),
  n41=c(7,4,5,1,3,1,3,1,1,1,2,1,3,9,2,2),
  n42.43=c(7,4,5,1,3,1,3,1,1,1,3,1,3,10,2,2),
  n44=c(7,4,5,1,4,1,3,1,1,1,3,1,4,10,2,2),
  n45=c(8,4,5,1,4,1,3,1,1,1,3,1,4,10,2,2),
  n46.49=c(8,4,5,1,4,1,3,1,1,1,3,1,4,11,2,2),
  n50=c(9,5,6,1,4,1,4,1,1,1,3,1,4,12,3,3))

#Set seed
set.seed(303)

s <- split(reach.dat, reach.dat[,"streamID"])
s.reach <- lapply(s, '[[', "reachID")
s.probs <- lapply(s, '[[', "length.probs")
ind1 <- lengths(s.probs) == 1
strata.uneq.pi <- function(n.vec, nsim) {
  replicate(nsim, {out <- vector("list", length(n.vec))
  out[ind1] <- s.reach[ind1]
  out[!ind1] <- Map(sample, x=s.reach[!ind1],
                             size=n.vec[!ind1],
                             prob=s.probs[!ind1],
                             replace=FALSE)
  unlist(out)
  })
}


#build a matrix of samples for every sample size
samples <- lapply(sizes, function(x) strata.uneq.pi(x, nsim))


#for each sample size, calculate the inclusion probabilities for each reach by counting the number of times that reach is
# selected out of the total number of simulations
getpi <- function(smple, nsim) {
  len <- length(reach.dat$reach)
  pi <- numeric(len)
  for(i in 1:len) pi[i] <- sum(smple == i)/nsim
  pi
}

pi.lst <- lapply(samples, getpi, nsim=nsim)

# Use this to enter the variable values of choice
reach.dat$y <- reach.dat$Avg_RPD

#find number of reaches in each stream for calculating ht estimator
num.reaches <- table(reach.dat$stream)

#find true mean of responses within each stream
true_y <- reach.dat %>%
  group_by(stream) %>%
  summarise(true_y=mean(y))
reach.dat$reachID
as.data.frame(reach.dat)
#this function calculates the ht estimator for each simulated sample from above
#uses the estimated inclusion probabilities

ht.fun <- function(sample.n, pi.n) {
  apply(sample.n, 2, function(ind) {
    df <- data.frame(pi=pi.n[ind], 
                     reach.dat[ind, "y"], 
                     reach.dat[ind, "stream"], stringsAsFactors = FALSE)
    #dim(df)
    unique(df$stream)
    df$size <- num.reaches[df$stream]
    df <- na.omit(df %>% group_by(stream) %>%
      summarise(total=sum(y/pi),
             mean=total/size[1]))
    df$mean}
  )
}

#apply to every one of the nsim samples for each sampling rate
ht.lst <- Map(ht.fun, samples, pi.lst)


n <- rep(names(sizes), each=16)

stream <- as.character(rep(sort(unique(reach.dat$stream)), 30))

true_mean <- rep(true_y$true_y,30)

sim_dat <- data.frame(cbind(stream, n, true_mean, do.call(rbind, ht.lst)))
Pierre L
  • 28,203
  • 6
  • 47
  • 69