I have a data.table as follows:
panelID = c(1:50)
year= c(2001:2010)
country = c("NLD", "GRC", "GBR")
n <- 2
library(data.table)
set.seed(123)
DT <- data.table(panelID = rep(sample(panelID), each = n),
country = rep(sample(country, length(panelID), replace = T), each = n),
year = c(replicate(length(panelID), sample(year, n))),
some_NA = sample(0:5, 6),
some_NA_factor = sample(0:5, 6),
norm = round(runif(100)/10,2),
Income = round(rnorm(10,-5,5),2),
Happiness = sample(10,10),
Sex = round(rnorm(10,0.75,0.3),2),
Age = sample(100,100),
Educ = round(rnorm(10,0.75,0.3),2))
DT [, uniqueID := .I] # Creates a unique ID
DT[DT == 0] <- NA
I have been writing a function to automate the use of the Two Stage Least Squares method. As part of the function, I need to subset all rows which have an NA (but only for the variables used in the 2SLS). For some reason however, it does not seem to subset within the function.
subsetfun <- function (dataset, depvar, indepvars) {
sub_set <- subset(dataset, !is.na(depvar))
sub_set <- subset(sub_set, !is.na(indepvars))
return(sub_set)
}
sub_set <- subsetfun(DT, depvar="some_NA", indepvars=c("panelID", "some_NA_factor"))
The subsetting works outside the function, but for some reason not inside the function.
EDIT: the first line within the function does not give an error. The second line probably has a syntax problem. It gives the error:
Error in `[.data.table`(x, r, vars, with = FALSE) :
i evaluates to a logical vector length 2 but there are 100 rows. Recycling of logical i is no longer allowed as it hides more bugs than is worth the rare convenience. Explicitly use rep(...,length=.N) if you really need to recycle.
Desired output:
> dput(sub_set)
structure(list(panelID = c(31L, 15L, 14L, 14L, 3L, 42L, 43L,
43L, 37L, 48L, 25L, 25L, 26L, 27L, 5L, 5L, 40L, 28L, 9L, 9L,
29L, 8L, 41L, 41L, 7L, 10L, 36L, 36L, 19L, 4L, 45L, 45L, 17L,
11L, 32L, 32L, 21L, 12L, 49L, 49L, 50L, 13L, 24L, 24L, 30L, 33L,
20L, 20L, 18L, 46L, 22L, 22L, 39L, 38L, 35L, 35L, 47L, 6L, 1L,
1L, 2L, 23L, 44L, 44L, 16L, 34L), country = c("GBR", "NLD", "GRC",
"GRC", "NLD", "NLD", "GBR", "GBR", "NLD", "GRC", "NLD", "NLD",
"GBR", "NLD", "GBR", "GBR", "GRC", "GBR", "GRC", "GRC", "GRC",
"GBR", "GRC", "GRC", "GRC", "GBR", "GBR", "GBR", "NLD", "GRC",
"GRC", "GRC", "NLD", "GRC", "NLD", "NLD", "NLD", "GRC", "GBR",
"GBR", "GBR", "NLD", "GRC", "GRC", "NLD", "GRC", "NLD", "NLD",
"GBR", "GBR", "GRC", "GRC", "GBR", "NLD", "GRC", "GRC", "GRC",
"GBR", "GRC", "GRC", "NLD", "GBR", "GBR", "GBR", "GBR", "GRC"
), year = c(2010L, 2008L, 2003L, 2002L, 2010L, 2006L, 2004L,
2001L, 2006L, 2003L, 2008L, 2001L, 2007L, 2006L, 2007L, 2005L,
2006L, 2007L, 2004L, 2003L, 2009L, 2009L, 2007L, 2002L, 2003L,
2007L, 2004L, 2001L, 2008L, 2008L, 2006L, 2004L, 2008L, 2010L,
2006L, 2001L, 2010L, 2007L, 2008L, 2005L, 2002L, 2008L, 2010L,
2004L, 2005L, 2008L, 2008L, 2009L, 2008L, 2009L, 2007L, 2010L,
2010L, 2007L, 2010L, 2001L, 2009L, 2005L, 2006L, 2009L, 2004L,
2007L, 2010L, 2004L, 2008L, 2010L), some_NA = c(4L, 1L, 5L, 3L,
4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L,
4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L,
4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L,
4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L), some_NA_factor = c(1L,
5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L,
5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L,
5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L,
5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L,
5L), norm = c(0.05, 0.07, 0.01, 0.04, 0.08, 0.04, 0.09, 0.09,
0.03, 0.01, NA, 0.1, NA, 0.06, 0.03, 0.07, 0.08, 0.07, 0.06,
0.06, 0.1, 0.05, 0.02, 0.05, 0.04, 0.02, 0.06, 0.01, 0.1, 0.06,
0.1, 0.07, 0.04, 0.01, 0.02, 0.09, 0.03, 0.02, NA, 0.04, 0.05,
0.09, 0.05, 0.05, 0.1, 0.03, 0.1, 0.06, 0.08, 0.05, 0.09, 0.02,
0.03, 0.1, 0.03, 0.07, 0.08, 0.03, 0.01, 0.01, 0.1, 0.09, 0.06,
0.04, 0.04, 0.03), Income = c(-2.65, -9.35, -7.31, -14.56, -3.15,
-6.42, -2.65, -0.2, -5.03, -14.56, -3.15, -7.31, -3.39, -0.2,
-5.03, -9.35, -7.31, -7.31, -3.39, -6.42, -2.65, -9.35, -7.31,
-14.56, -3.15, -6.42, -2.65, -0.2, -5.03, -14.56, -3.15, -7.31,
-3.39, -0.2, -5.03, -9.35, -7.31, -7.31, -3.39, -6.42, -2.65,
-9.35, -7.31, -14.56, -3.15, -6.42, -2.65, -0.2, -5.03, -14.56,
-3.15, -7.31, -3.39, -0.2, -5.03, -9.35, -7.31, -7.31, -3.39,
-6.42, -2.65, -9.35, -7.31, -14.56, -3.15, -6.42), Happiness = c(1L,
10L, 2L, 9L, 5L, 4L, 1L, 3L, 6L, 9L, 5L, 7L, 8L, 3L, 6L, 10L,
2L, 7L, 8L, 4L, 1L, 10L, 2L, 9L, 5L, 4L, 1L, 3L, 6L, 9L, 5L,
7L, 8L, 3L, 6L, 10L, 2L, 7L, 8L, 4L, 1L, 10L, 2L, 9L, 5L, 4L,
1L, 3L, 6L, 9L, 5L, 7L, 8L, 3L, 6L, 10L, 2L, 7L, 8L, 4L, 1L,
10L, 2L, 9L, 5L, 4L), Sex = c(1.08, 0.77, 0.54, 0.53, 1.02, 0.72,
1.08, 0.96, 0.64, 0.53, 1.02, 0.45, 1.34, 0.96, 0.64, 0.77, 0.54,
0.45, 1.34, 0.72, 1.08, 0.77, 0.54, 0.53, 1.02, 0.72, 1.08, 0.96,
0.64, 0.53, 1.02, 0.45, 1.34, 0.96, 0.64, 0.77, 0.54, 0.45, 1.34,
0.72, 1.08, 0.77, 0.54, 0.53, 1.02, 0.72, 1.08, 0.96, 0.64, 0.53,
1.02, 0.45, 1.34, 0.96, 0.64, 0.77, 0.54, 0.45, 1.34, 0.72, 1.08,
0.77, 0.54, 0.53, 1.02, 0.72), Age = c(63L, 2L, 19L, 77L, 13L,
22L, 74L, 16L, 69L, 33L, 80L, 28L, 49L, 83L, 66L, 86L, 76L, 6L,
15L, 96L, 59L, 48L, 45L, 10L, 42L, 39L, 18L, 17L, 20L, 98L, 44L,
8L, 95L, 73L, 65L, 26L, 57L, 32L, 81L, 71L, 27L, 92L, 4L, 100L,
62L, 56L, 54L, 87L, 23L, 35L, 75L, 58L, 38L, 52L, 46L, 34L, 72L,
90L, 1L, 93L, 7L, 78L, 68L, 97L, 64L, 25L), Educ = c(0.54, 0.43,
0.62, 0.85, 0.15, 1.36, 0.54, 0.52, 0.47, 0.85, 0.15, 0.81, 1.12,
0.52, 0.47, 0.43, 0.62, 0.81, 1.12, 1.36, 0.54, 0.43, 0.62, 0.85,
0.15, 1.36, 0.54, 0.52, 0.47, 0.85, 0.15, 0.81, 1.12, 0.52, 0.47,
0.43, 0.62, 0.81, 1.12, 1.36, 0.54, 0.43, 0.62, 0.85, 0.15, 1.36,
0.54, 0.52, 0.47, 0.85, 0.15, 0.81, 1.12, 0.52, 0.47, 0.43, 0.62,
0.81, 1.12, 1.36, 0.54, 0.43, 0.62, 0.85, 0.15, 1.36), uniqueID = c(1L,
4L, 5L, 6L, 7L, 10L, 11L, 12L, 13L, 16L, 17L, 18L, 19L, 22L,
23L, 24L, 25L, 28L, 29L, 30L, 31L, 34L, 35L, 36L, 37L, 40L, 41L,
42L, 43L, 46L, 47L, 48L, 49L, 52L, 53L, 54L, 55L, 58L, 59L, 60L,
61L, 64L, 65L, 66L, 67L, 70L, 71L, 72L, 73L, 76L, 77L, 78L, 79L,
82L, 83L, 84L, 85L, 88L, 89L, 90L, 91L, 94L, 95L, 96L, 97L, 100L
)), row.names = c(NA, -66L), .internal.selfref = <pointer: 0x00000231152b1ef0>, class = c("data.table",
"data.frame"))