0

I have a data.table as follows:

panelID = c(1:50)   
year= c(2001:2010)
country = c("NLD", "GRC", "GBR")
n <- 2
library(data.table)
set.seed(123)
DT <- data.table(panelID = rep(sample(panelID), each = n),
                 country = rep(sample(country, length(panelID), replace = T), each = n),
                 year = c(replicate(length(panelID), sample(year, n))),
                 some_NA = sample(0:5, 6),                                             
                 some_NA_factor = sample(0:5, 6),         
                 norm = round(runif(100)/10,2),
                 Income = round(rnorm(10,-5,5),2),
                 Happiness = sample(10,10),
                 Sex = round(rnorm(10,0.75,0.3),2),
                 Age = sample(100,100),
                 Educ = round(rnorm(10,0.75,0.3),2))        
DT [, uniqueID := .I]                                                                        # Creates a unique ID     
DT[DT == 0] <- NA    

I have been writing a function to automate the use of the Two Stage Least Squares method. As part of the function, I need to subset all rows which have an NA (but only for the variables used in the 2SLS). For some reason however, it does not seem to subset within the function.

subsetfun <- function (dataset, depvar, indepvars) {
  sub_set <- subset(dataset, !is.na(depvar))
  sub_set <- subset(sub_set, !is.na(indepvars))
  return(sub_set)
}  

sub_set <- subsetfun(DT, depvar="some_NA", indepvars=c("panelID", "some_NA_factor")) 

The subsetting works outside the function, but for some reason not inside the function.

EDIT: the first line within the function does not give an error. The second line probably has a syntax problem. It gives the error:

Error in `[.data.table`(x, r, vars, with = FALSE) : 
  i evaluates to a logical vector length 2 but there are 100 rows. Recycling of logical i is no longer allowed as it hides more bugs than is worth the rare convenience. Explicitly use rep(...,length=.N) if you really need to recycle.

Desired output:

> dput(sub_set)
structure(list(panelID = c(31L, 15L, 14L, 14L, 3L, 42L, 43L, 
43L, 37L, 48L, 25L, 25L, 26L, 27L, 5L, 5L, 40L, 28L, 9L, 9L, 
29L, 8L, 41L, 41L, 7L, 10L, 36L, 36L, 19L, 4L, 45L, 45L, 17L, 
11L, 32L, 32L, 21L, 12L, 49L, 49L, 50L, 13L, 24L, 24L, 30L, 33L, 
20L, 20L, 18L, 46L, 22L, 22L, 39L, 38L, 35L, 35L, 47L, 6L, 1L, 
1L, 2L, 23L, 44L, 44L, 16L, 34L), country = c("GBR", "NLD", "GRC", 
"GRC", "NLD", "NLD", "GBR", "GBR", "NLD", "GRC", "NLD", "NLD", 
"GBR", "NLD", "GBR", "GBR", "GRC", "GBR", "GRC", "GRC", "GRC", 
"GBR", "GRC", "GRC", "GRC", "GBR", "GBR", "GBR", "NLD", "GRC", 
"GRC", "GRC", "NLD", "GRC", "NLD", "NLD", "NLD", "GRC", "GBR", 
"GBR", "GBR", "NLD", "GRC", "GRC", "NLD", "GRC", "NLD", "NLD", 
"GBR", "GBR", "GRC", "GRC", "GBR", "NLD", "GRC", "GRC", "GRC", 
"GBR", "GRC", "GRC", "NLD", "GBR", "GBR", "GBR", "GBR", "GRC"
), year = c(2010L, 2008L, 2003L, 2002L, 2010L, 2006L, 2004L, 
2001L, 2006L, 2003L, 2008L, 2001L, 2007L, 2006L, 2007L, 2005L, 
2006L, 2007L, 2004L, 2003L, 2009L, 2009L, 2007L, 2002L, 2003L, 
2007L, 2004L, 2001L, 2008L, 2008L, 2006L, 2004L, 2008L, 2010L, 
2006L, 2001L, 2010L, 2007L, 2008L, 2005L, 2002L, 2008L, 2010L, 
2004L, 2005L, 2008L, 2008L, 2009L, 2008L, 2009L, 2007L, 2010L, 
2010L, 2007L, 2010L, 2001L, 2009L, 2005L, 2006L, 2009L, 2004L, 
2007L, 2010L, 2004L, 2008L, 2010L), some_NA = c(4L, 1L, 5L, 3L, 
4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 
4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 
4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 
4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L, 5L, 3L, 4L, 1L), some_NA_factor = c(1L, 
5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 
5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 
5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 
5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 5L, 3L, 2L, 1L, 
5L), norm = c(0.05, 0.07, 0.01, 0.04, 0.08, 0.04, 0.09, 0.09, 
0.03, 0.01, NA, 0.1, NA, 0.06, 0.03, 0.07, 0.08, 0.07, 0.06, 
0.06, 0.1, 0.05, 0.02, 0.05, 0.04, 0.02, 0.06, 0.01, 0.1, 0.06, 
0.1, 0.07, 0.04, 0.01, 0.02, 0.09, 0.03, 0.02, NA, 0.04, 0.05, 
0.09, 0.05, 0.05, 0.1, 0.03, 0.1, 0.06, 0.08, 0.05, 0.09, 0.02, 
0.03, 0.1, 0.03, 0.07, 0.08, 0.03, 0.01, 0.01, 0.1, 0.09, 0.06, 
0.04, 0.04, 0.03), Income = c(-2.65, -9.35, -7.31, -14.56, -3.15, 
-6.42, -2.65, -0.2, -5.03, -14.56, -3.15, -7.31, -3.39, -0.2, 
-5.03, -9.35, -7.31, -7.31, -3.39, -6.42, -2.65, -9.35, -7.31, 
-14.56, -3.15, -6.42, -2.65, -0.2, -5.03, -14.56, -3.15, -7.31, 
-3.39, -0.2, -5.03, -9.35, -7.31, -7.31, -3.39, -6.42, -2.65, 
-9.35, -7.31, -14.56, -3.15, -6.42, -2.65, -0.2, -5.03, -14.56, 
-3.15, -7.31, -3.39, -0.2, -5.03, -9.35, -7.31, -7.31, -3.39, 
-6.42, -2.65, -9.35, -7.31, -14.56, -3.15, -6.42), Happiness = c(1L, 
10L, 2L, 9L, 5L, 4L, 1L, 3L, 6L, 9L, 5L, 7L, 8L, 3L, 6L, 10L, 
2L, 7L, 8L, 4L, 1L, 10L, 2L, 9L, 5L, 4L, 1L, 3L, 6L, 9L, 5L, 
7L, 8L, 3L, 6L, 10L, 2L, 7L, 8L, 4L, 1L, 10L, 2L, 9L, 5L, 4L, 
1L, 3L, 6L, 9L, 5L, 7L, 8L, 3L, 6L, 10L, 2L, 7L, 8L, 4L, 1L, 
10L, 2L, 9L, 5L, 4L), Sex = c(1.08, 0.77, 0.54, 0.53, 1.02, 0.72, 
1.08, 0.96, 0.64, 0.53, 1.02, 0.45, 1.34, 0.96, 0.64, 0.77, 0.54, 
0.45, 1.34, 0.72, 1.08, 0.77, 0.54, 0.53, 1.02, 0.72, 1.08, 0.96, 
0.64, 0.53, 1.02, 0.45, 1.34, 0.96, 0.64, 0.77, 0.54, 0.45, 1.34, 
0.72, 1.08, 0.77, 0.54, 0.53, 1.02, 0.72, 1.08, 0.96, 0.64, 0.53, 
1.02, 0.45, 1.34, 0.96, 0.64, 0.77, 0.54, 0.45, 1.34, 0.72, 1.08, 
0.77, 0.54, 0.53, 1.02, 0.72), Age = c(63L, 2L, 19L, 77L, 13L, 
22L, 74L, 16L, 69L, 33L, 80L, 28L, 49L, 83L, 66L, 86L, 76L, 6L, 
15L, 96L, 59L, 48L, 45L, 10L, 42L, 39L, 18L, 17L, 20L, 98L, 44L, 
8L, 95L, 73L, 65L, 26L, 57L, 32L, 81L, 71L, 27L, 92L, 4L, 100L, 
62L, 56L, 54L, 87L, 23L, 35L, 75L, 58L, 38L, 52L, 46L, 34L, 72L, 
90L, 1L, 93L, 7L, 78L, 68L, 97L, 64L, 25L), Educ = c(0.54, 0.43, 
0.62, 0.85, 0.15, 1.36, 0.54, 0.52, 0.47, 0.85, 0.15, 0.81, 1.12, 
0.52, 0.47, 0.43, 0.62, 0.81, 1.12, 1.36, 0.54, 0.43, 0.62, 0.85, 
0.15, 1.36, 0.54, 0.52, 0.47, 0.85, 0.15, 0.81, 1.12, 0.52, 0.47, 
0.43, 0.62, 0.81, 1.12, 1.36, 0.54, 0.43, 0.62, 0.85, 0.15, 1.36, 
0.54, 0.52, 0.47, 0.85, 0.15, 0.81, 1.12, 0.52, 0.47, 0.43, 0.62, 
0.81, 1.12, 1.36, 0.54, 0.43, 0.62, 0.85, 0.15, 1.36), uniqueID = c(1L, 
4L, 5L, 6L, 7L, 10L, 11L, 12L, 13L, 16L, 17L, 18L, 19L, 22L, 
23L, 24L, 25L, 28L, 29L, 30L, 31L, 34L, 35L, 36L, 37L, 40L, 41L, 
42L, 43L, 46L, 47L, 48L, 49L, 52L, 53L, 54L, 55L, 58L, 59L, 60L, 
61L, 64L, 65L, 66L, 67L, 70L, 71L, 72L, 73L, 76L, 77L, 78L, 79L, 
82L, 83L, 84L, 85L, 88L, 89L, 90L, 91L, 94L, 95L, 96L, 97L, 100L
)), row.names = c(NA, -66L), .internal.selfref = <pointer: 0x00000231152b1ef0>, class = c("data.table", 
"data.frame"))
Tom
  • 2,173
  • 1
  • 17
  • 44
  • What error does it throw if any? – NelsonGon Jan 03 '20 at 12:14
  • The first line doesn't throw an error. The second line might have an added syntax problem. Sorry, should have checked the correct syntax for the second line first. – Tom Jan 03 '20 at 12:18

1 Answers1

1

I am not 100% sure if this is exactly the same as the result, because I can't import the dput statement of the subset, but I was able to filter a subset using dplyr. The main idea was taken from this question dplyr filter with condition on multiple columns

library(tidyverse)
subsetfun <- function (dataset, depvar, indepvars) {

sub_set <- dataset %>% filter_at(vars(indepvars,depvar), all_vars(!is.na(.))) %>% as.data.table(.)
return(sub_set)
}  

sub_set <- subsetfun(DT, depvar="some_NA", indepvars=c("panelID", "some_NA_factor")) 
hannes101
  • 2,410
  • 1
  • 17
  • 40