I am working on some simulations in R. However, my code runs very slow, making it unsuitable for high N and high number of sims s. I've identified the bottleneck as the following function (see below). It takes a list of s tibbles as input. Basically, within group g (two rows per g_id) I want to assign the values of my parameters (probability) to the first row within the group (g_first == 1), given that the values for the variables A and B match specific patterns within g (i.e. handcoding an interaction between two categorical variables). The values of A and B are also stored as 1/0 variables (e.g. B_x takes a value of 1 if B is x).
I tried to use a list of data.tables instead of tibbles as input but it actually slowed it down further (maybe bc the subsetting below is slow for data.tables. Further, I couldnt figure out how to setkey() for a list of data.tables).
Per request, I uploaded a sample df (list with 100 simulated tibbles) under this link: https://ufile.io/dsax7us8 . Call the function via lapply(df, assign_p)
. In the code below, you also find a quick glance on one of the elements of the df list.
#a quick glance on the data structure
dput(head(df[[1]]))
structure(list(p_id = c(1L, 1L, 1L, 1L, 2L, 2L), g_id = c(1L,
1L, 2L, 2L, 3L, 3L), pr_id = 1:6, g_first = c(1, 0, 1, 0, 1,
0), A = c(1, 0, 1, 0, 0, 1), B = c("x", "z", "x", "x", "z", "z"
), C = c(1, 1, 1, 1, 1, 0), A_0 = c(0, 1, 0, 1, 1, 0), A_1 = c(1,
0, 1, 0, 0, 1), B_x = c(1, 0, 1, 1, 0, 0), B_z = c(0, 1, 0, 0,
1, 1), C_0 = c(0, 0, 0, 0, 0, 1), C_1 = c(1, 1, 1, 1, 1, 0)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
dput(head(df[[2]]))
structure(list(p_id = c(1L, 1L, 1L, 1L, 2L, 2L), g_id = c(1L,
1L, 2L, 2L, 3L, 3L), pr_id = 1:6, g_first = c(1, 0, 1, 0, 1,
0), A = c(0, 0, 1, 0, 1, 0), B = c("x", "z", "z", "x", "x", "z"
), C = c(1, 1, 1, 0, 1, 0), A_0 = c(1, 1, 0, 1, 0, 1), A_1 = c(0,
0, 1, 0, 1, 0), B_x = c(1, 0, 0, 1, 1, 0), B_z = c(0, 1, 1, 0,
0, 1), C_0 = c(0, 0, 0, 1, 0, 1), C_1 = c(1, 1, 1, 0, 1, 0)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
#params
A_10_B_xx <- 0.04
A_10_B_xz <- 0.04
A_10_B_zz <- 0.003
A_10_B_zx <- 0.003
A_01_B_xx <- -0.04
A_01_B_xz <- -0.04
A_01_B_zz <- -0.003
A_01_B_zx <- -0.003
assign_p <- function(df) {
df$p <- NA_real_
g <- max(df$g_id)
for (i in 1:g) {
#generate logical checks
A_11 = df[df$g_id == i & df$g_first == 1, "A_1"]
A_10 = df[df$g_id == i & df$g_first == 0, "A_1"]
A_01 = df[df$g_id == i & df$g_first == 1, "A_0"]
A_00 = df[df$g_id == i & df$g_first == 0, "A_0"]
B_x1 = df[df$g_id == i & df$g_first == 1, "B_x"]
B_x0 = df[df$g_id == i & df$g_first == 0, "B_x"]
B_z1 = df[df$g_id == i & df$g_first == 1, "B_z"]
B_z0 = df[df$g_id == i & df$g_first == 0, "B_z"]
#multiply logical checks
A_10_B_xx_out <- A_10_B_xx * A_11 * A_00 * B_x1 * B_x0
A_10_B_xz_out <- A_10_B_xz * A_11 * A_00 * B_x1 * B_z0
A_10_B_zz_out <- A_10_B_zz * A_11 * A_00 * B_z1 * B_z0
A_10_B_zx_out <- A_10_B_zx * A_11 * A_00 * B_z1 * B_x0
A_01_B_xx_out <- A_01_B_xx * A_01 * A_10 * B_x1 * B_x0
A_01_B_xz_out <- A_01_B_xz * A_01 * A_10 * B_x1 * B_z0
A_01_B_zz_out <- A_01_B_zz * A_01 * A_10 * B_z1 * B_z0
A_01_B_zx_out <- A_01_B_zx * A_01 * A_10 * B_z1 * B_x0
#add matches to 0.5 and assign to column p
df[df$g_id == i &
df$g_first == 1, "p"] <- 0.50 +
A_10_B_xx_out +
A_10_B_xz_out +
A_10_B_zz_out +
A_10_B_zx_out +
A_01_B_xx_out +
A_01_B_xz_out +
A_01_B_zz_out +
A_01_B_zx_out
}
df
}