I am working with the R programming language.
Part 1: I have the following data set ("my_data"):
# Simulate the example data set "my_data": 1000 rows made of an id column,
# five numeric columns and five factor columns.

# Fix the RNG seed so that the simulated values are reproducible across runs.
set.seed(123)

# Row identifier. The original script passed `id` to data.frame() without
# ever defining it, which stops with "object 'id' not found".
id <- seq_len(1000)

# Numeric columns: same mean (10), increasing standard deviation.
num_var_1 <- rnorm(1000, 10, 1)
num_var_2 <- rnorm(1000, 10, 5)
num_var_3 <- rnorm(1000, 10, 10)
num_var_4 <- rnorm(1000, 10, 10)
num_var_5 <- rnorm(1000, 10, 10)

# Candidate levels for the factor columns.
factor_1 <- c("A","B", "C")
factor_2 <- c("AA","BB", "CC")
factor_3 <- c("AAA","BBB", "CCC", "DDD")
factor_4 <- c("AAAA","BBBB", "CCCC", "DDDD", "EEEE")
factor_5 <- c("AAAAA","BBBBB", "CCCCC", "DDDDD", "EEEEE", "FFFFFF")

# Factor columns: levels sampled with replacement using unequal weights.
factor_var_1 <- as.factor(sample(factor_1, 1000, replace = TRUE, prob = c(0.3, 0.5, 0.2)))
factor_var_2 <- as.factor(sample(factor_2, 1000, replace = TRUE, prob = c(0.5, 0.3, 0.2)))
factor_var_3 <- as.factor(sample(factor_3, 1000, replace = TRUE, prob = c(0.5, 0.2, 0.2, 0.1)))
factor_var_4 <- as.factor(sample(factor_4, 1000, replace = TRUE, prob = c(0.5, 0.2, 0.1, 0.1, 0.1)))
# NOTE(review): factor_var_5 is drawn from factor_4 (not factor_5), and its
# weights sum to 0.8 rather than 1. Kept as-is because the printed output of
# my_data shows five "AAAA".."EEEE" levels for this column; confirm whether
# factor_5 was intended.
factor_var_5 <- as.factor(sample(factor_4, 1000, replace = TRUE, prob = c(0.3, 0.2, 0.1, 0.1, 0.1)))

my_data <- data.frame(
  id, num_var_1, num_var_2, num_var_3, num_var_4, num_var_5,
  factor_var_1, factor_var_2, factor_var_3, factor_var_4, factor_var_5
)
> head(my_data)
id num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
1 1 9.439524 5.021006 4.883963 8.496925 11.965498 B AA AAA CCCC AAAA
2 2 9.769823 4.800225 12.369379 6.722429 16.501132 B AA AAA AAAA AAAA
3 3 11.558708 9.910099 4.584108 -4.481653 16.710042 C AA BBB AAAA CCCC
4 4 10.070508 9.339124 22.192276 3.027154 -2.841578 B CC DDD BBBB AAAA
5 5 10.129288 -2.746714 11.741359 35.984902 -10.261096 B AA AAA DDDD DDDD
6 6 11.715065 15.202867 3.847317 9.625850 32.053261 B AA CCC BBBB EEEE
> str(my_data)
'data.frame': 1000 obs. of 11 variables:
$ id : int 1 2 3 4 5 6 7 8 9 10 ...
$ num_var_1 : num 10.25 10.3 12.75 9.18 9.74 ...
$ num_var_2 : num 1.68 8.79 15.58 -2.52 8.39 ...
$ num_var_3 : num 1.64 15.68 -3.24 7.8 6.44 ...
$ num_var_4 : num 19.19 8.53 11.31 6.84 9.7 ...
$ num_var_5 : num 4.912 14.25 -0.112 4.799 1.871 ...
$ factor_var_1: Factor w/ 3 levels "A","B","C": 3 2 1 2 1 2 2 2 1 1 ...
$ factor_var_2: Factor w/ 3 levels "AA","BB","CC": 1 3 1 2 2 2 1 2 2 1 ...
$ factor_var_3: Factor w/ 4 levels "AAA","BBB","CCC",..: 3 2 1 3 3 3 1 1 1 1 ...
$ factor_var_4: Factor w/ 5 levels "AAAA","BBBB",..: 4 1 4 1 4 2 1 5 1 2 ...
$ factor_var_5: Factor w/ 5 levels "AAAA","BBBB",..: 5 2 3 2 3 2 4 1 4 4 ...
> summary(my_data)
id num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
Min. : 1.0 Min. : 6.510 Min. :-7.957 Min. :-20.47 Min. :-17.002 Min. :-21.402 A:319 AA:531 AAA:529 AAAA:524 AAAA:380
1st Qu.: 250.8 1st Qu.: 9.302 1st Qu.: 6.480 1st Qu.: 3.38 1st Qu.: 3.586 1st Qu.: 2.753 B:477 BB:271 BBB:184 BBBB:173 BBBB:258
Median : 500.5 Median :10.025 Median :10.063 Median : 10.21 Median : 10.409 Median : 9.348 C:204 CC:198 CCC:200 CCCC: 97 CCCC:130
Mean : 500.5 Mean : 9.997 Mean : 9.963 Mean : 10.03 Mean : 10.087 Mean : 9.694 DDD: 87 DDDD:125 DDDD:113
3rd Qu.: 750.2 3rd Qu.:10.713 3rd Qu.:13.234 3rd Qu.: 16.48 3rd Qu.: 17.114 3rd Qu.: 16.597 EEEE: 81 EEEE:119
Max. :1000.0 Max. :13.403 Max. :26.731 Max. : 44.68 Max. : 42.056 Max. : 43.102
Part 2 : I also have the following data frame (called "d") that contains a different logical condition on each row:
head(d)
iteration results
1 num_var_5 > 29.80, factor_var_2 = "AA, BB, CC"
2 num_var_1 < 8.44, num_var_4 > 37.01, factor_var_1 = "A, B, C", factor_var_2 = "AA, BB, CC", factor_var_3 = "DDD", factor_var_4 = "AAAA, DDDD", factor_var_5 = "BBBB, CCCC, DDDD, EEEE"
3 num_var_1 < 9.70, num_var_2 < 19.75, num_var_3 < 35.03, num_var_4 > 20.12, num_var_5 < -6.67, factor_var_2 = "AA, BB", factor_var_3 = "AAA, BBB"
4 num_var_4 > 31.51, num_var_5 < 4.49, factor_var_1 = "C", factor_var_4 = "AAAA, BBBB, CCCC, DDDD, EEEE"
5 num_var_4 < 23.87, factor_var_2 = "AA", factor_var_3 = "AAA, DDD", factor_var_5 = "AAAA, CCCC, DDDD, EEEE"
6 num_var_4 < 18.07, factor_var_5 = "AAAA, BBBB, CCCC, EEEE"
7 num_var_3 < 9.19, num_var_4 < 13.21, num_var_5 < 14.59, factor_var_4 = "BBBB, CCCC, EEEE"
8 num_var_1 > 11.07, num_var_3 > 8.17, num_var_4 < 23.00, num_var_5 < 35.93, factor_var_2 = "AA", factor_var_3 = "AAA, BBB, DDD", factor_var_4 = "AAAA, BBBB, EEEE", factor_var_5 = "CCCC, EEEE"
9 num_var_1 < 10.77, num_var_2 > 24.95, num_var_3 > 4.78, factor_var_2 = "CC", factor_var_3 = "AAA, BBB, CCC, DDD"
10 num_var_2 > 2.65, factor_var_1 = "B, C", factor_var_2 = "AA, BB, CC", factor_var_3 = "AAA, BBB, CCC, DDD"
> str(d)
'data.frame': 10 obs. of 2 variables:
$ iteration: int 1 2 3 4 5 6 7 8 9 10
$ results : chr "num_var_5 > 29.80, factor_var_2 = \"AA, BB, CC\"" "num_var_1 < 8.44, num_var_4 > 37.01, factor_var_1 = \"A, B, C\", factor_var_2 = \"AA, BB, CC\", factor_var_3 = "| __truncated__ "num_var_1 < 9.70, num_var_2 < 19.75, num_var_3 < 35.03, num_var_4 > 20.12, num_var_5 < -6.67, factor_var_2 = \""| __truncated__ "num_var_4 > 31.51, num_var_5 < 4.49, factor_var_1 = \"C\", factor_var_4 = \"AAAA, BBBB, CCCC, DDDD, EEEE\"" ...
My Question:
I am trying to select rows from "my_data" based on the logical statements stored within the rows of "d". For example, I can do this manually:
# Manual translation of two rules from "d" into row filters on "my_data".
# which() is used so that only rows where the condition is TRUE are kept.

# Row 1 of "d": num_var_5 > 29.80, factor_var_2 = "AA, BB, CC"
my_data_using_row1_of_d <- my_data[
  which(
    my_data$num_var_5 > 29.80 &
      my_data$factor_var_2 %in% c("AA", "BB", "CC")
  ),
]

# Row 6 of "d": num_var_4 < 18.07, factor_var_5 = "AAAA, BBBB, CCCC, EEEE"
# (the comparison was previously written as `> 18.07`, contradicting the rule)
my_data_using_row6_of_d <- my_data[
  which(
    my_data$num_var_4 < 18.07 &
      my_data$factor_var_5 %in% c("AAAA", "BBBB", "CCCC", "EEEE")
  ),
]

# Row 1 and row 6 together: the two subsets are stacked, so a row of
# "my_data" that satisfies both rules appears twice in the result.
row_1_and_row_6 <- rbind(my_data_using_row1_of_d, my_data_using_row6_of_d)
But is there a way to automatically take the logical conditions from different rows of "d" and use them to select rows from "my_data"?
For example:
# pseudocode: 6th row, 2nd column of "d"
my_data_using_row6_of_d = my_data[which(d[6,2]), ]
Can someone please show me this?
Thanks!
Note: The data frame "d" is also available in this format ("d2") if it makes selecting rows easier:
> d2
num_var_1 num_var_2 num_var_3 num_var_4 num_var_5 factor_var_1 factor_var_2 factor_var_3 factor_var_4 factor_var_5
1 8.251683 27.791314 30.525573 33.95768 2.388074 B <NA> AAA AAAA DDDD
2 9.012602 NA NA NA 20.236515 A AA BB <NA> <NA> BBBB
3 NA 16.778085 28.097324 5.69020 NA B BB CCC DDD DDD <NA> AAAA BBBB CCCC CCCC CCCC
4 12.838667 -3.694075 13.411877 -2.20004 NA <NA> AA AA BB AAA BBB CCC <NA> AAAA AAAA BBBB CCCC DDDD
5 NA NA 11.922439 17.63757 NA A B AA AA BB <NA> AAAA AAAA BBBB
6 12.768595 NA 28.507646 NA NA C AA BBB DDD DDD AAAA AAAA CCCC DDDD AAAA AAAA BBBB EEEE EEEE
7 NA NA -20.424906 NA 20.147004 <NA> AA AA <NA> AAAA AAAA AAAA CCCC EEEE <NA>
8 NA 6.299722 8.569485 24.82825 -17.715862 <NA> BB AAA AAA BBB CCC <NA> BBBB EEEE
9 10.846757 NA NA NA NA A B C AA BB CC <NA> <NA> BBBB BBBB
10 NA 4.663916 22.335404 NA NA B B C AA BB AAA AAA AAA DDD AAAA AAAA CCCC EEEE EEEE <NA>
References:
EDIT : Better "Readable" Formats of "d" and "d2" (these are not exact matches from the examples in the question, but the format is the same):
# d: one rule string per iteration. Each rule is a comma-separated list of
# conditions on the columns of "my_data".
d <- list(
  iteration = 1:6,
  records = c(
    "num_var_2 < 12.51, num_var_3 > 41.50, factor_var_1 = \"A, B\"",
    "num_var_1 < 11.16, num_var_3 > 15.63, num_var_4 > -3.87, factor_var_2 = \"BB\", factor_var_4 = \"AAAA, BBBB, DDDD\"",
    "num_var_1 < 9.87, num_var_2 < -1.32, num_var_3 > -5.54, num_var_4 > 24.09, num_var_5 < 3.28, factor_var_2 = \"AA, BB, CC\", factor_var_3 = \"CCC\"",
    "num_var_1 > 9.72, num_var_2 > -1.93, num_var_3 < 43.27, num_var_4 < 32.11, num_var_5 > -12.77, factor_var_1 = \"B\", factor_var_2 = \"AA\", factor_var_4 = \"AAAA, BBBB, DDDD\", factor_var_5 = \"AAAA\"",
    "num_var_1 > 10.51, num_var_2 > 13.61, num_var_3 > 22.14, num_var_4 < -2.75, factor_var_1 = \"A, B, C\", factor_var_3 = \"AAA\", factor_var_4 = \"BBBB, DDDD, EEEE\"",
    "factor_var_1 = \"A, B, C\", factor_var_5 = \"BBBB, CCCC, EEEE\""
  )
)
# Attach the row names (compact c(NA, n) form) and the class to turn the
# list of columns into a data frame.
attr(d, "row.names") <- c(NA, 6L)
class(d) <- "data.frame"
# d2: the same kind of rules in "wide" form, one column per variable of
# "my_data". Factor columns use NA where no level is given.
d2 <- list(
  num_var_1 = c(
    0, 9.76982251051672, 11.5587083141491, 10.0705083914246,
    10.1292877351609, 11.7150649868833, 0, 8.73493876539347,
    9.31314714810647, 9.55433802990004
  ),
  num_var_2 = c(
    0, 4.80022478116903, 9.91009879711669, 9.33912433571633,
    -2.74671387375216, 15.2028672785969, 11.248628679926,
    22.0810368657416, 13.4259911902555, 7.76520345519004
  ),
  num_var_3 = c(
    4.88396277810419, 12.3693787912763, 4.58410828378302,
    22.1922764704307, 0, 3.84731683903522, -8.06892964085328,
    3.56318894103961, 30.4601885213282, 4.39237578868935
  ),
  num_var_4 = c(
    8.49692522135921, 6.72242867213538, -4.48165290183992,
    3.02715415448378, 0, 9.62584986365951, 0, 8.15473501735272,
    16.0982429616145, 9.47273191012025
  ),
  num_var_5 = c(
    0, 16.5011318866726, 0, -2.84157765132228, 0, 0, 0,
    13.7564226018481, -1.92968518298101, 21.3254984181008
  ),
  factor_var_1 = factor(
    c("B", "B", "C", "B", "B", "B", "C", "C", "C", "B"),
    levels = c("A", "B", "C")
  ),
  factor_var_2 = factor(
    c(NA, "AA", NA, "CC", NA, "AA", NA, "AA", "AA", "AA"),
    levels = c("AA", "BB", "CC")
  ),
  factor_var_3 = factor(
    c("AAA", "AAA", "BBB", "DDD", NA, "CCC", "AAA", "BBB", "BBB", "AAA"),
    levels = c("AAA", "BBB", "CCC", "DDD")
  ),
  factor_var_4 = factor(
    c("CCCC", "AAAA", NA, "BBBB", "DDDD", NA, "DDDD", "AAAA", "AAAA", "AAAA"),
    levels = c("AAAA", "BBBB", "CCCC", "DDDD", "EEEE")
  ),
  factor_var_5 = factor(
    c("AAAA", "AAAA", NA, "AAAA", "DDDD", NA, "CCCC", "BBBB", "DDDD", "DDDD"),
    levels = c("AAAA", "BBBB", "CCCC", "DDDD", "EEEE")
  )
)
# Attach the row names (compact c(NA, n) form) and the class to turn the
# list of columns into a data frame.
attr(d2, "row.names") <- c(NA, 10L)
class(d2) <- "data.frame"