I have two data.tables that have 34 columns, where the columns are exactly the same.
Month SpId1 SpId2 ... SpId33
Edit:Here is sample data using the reproduce function from Reproducible Example
AltSuitSp1 <- data.table(structure(list(Month = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,12L, 12L, 12L), .Label = c("1", "10", "11", "12", "2", "3", "4","5", "6", "7", "8", "9"), class = "factor"), SpdSpSuit = c(0,0, 0, 0, 0, 0, 0, 0, 0, 0), SpdIncSuit = c(0, 0, 0,0, 0, 0, 0, 0, 0, 0), SpdGrowSuit = c(0.4625, 0.4625, 0.4625, 0.4625, 0.4625, 0.4625, 0.4625,0.4625, 0.4625, 0.4625), RzbSpSuit = c(0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333,0.283333333, 0.283333333, 0.283333333, 0.283333333), RzbIncSuit = c(0.34,0.34, 0.34, 0.34, 0.34, 0.34, 0.34, 0.34, 0.34, 0.34), RzbGrowSuit = c(0.283333333,0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333), FMSSpSuit = c(0.34,0.34, 0.34, 0.34, 0.34, 0.34, 0.34, 0.34, 0.34, 0.34), FMSIncSuit = c(0.425, 0.425, 0.425, 0.425, 0.425, 0.425, 0.425, 0.425, 0.425, 0.425), FMSGrowSuit = c(0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333), BhsSpSuit = c(0.283333333,0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.233333333, 0.233333333, 0.233333333), BhsIncSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), BhsGrowSuit = c(0.283333333,0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.283333333, 0.233333333, 0.233333333, 0.233333333), BrtSpSuit = c(0.866666667,0.866666667, 0.866666667, 0.866666667, 0.866666667, 0.866666667,0.866666667, 0.54, 0.54, 0.54), BrtIncSuit = c(0.8, 0.8, 0.8, 0.8, 0.8,0.8, 0.8, 0.43, 0.43, 0.43), BrtGrSuit = c(0.8, 0.8, 0.8, 0.8, 0.8, 0.8,0.8, 0.86, 0.86, 0.86), CcfSpSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), CcfIncSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0),CcfGrSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), GsfSpSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), GsfIncSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), GsfGrSuit = c(0, 0, 0, 0, 0,0, 0, 0, 0, 0), RbtSpSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), RbtIncSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0,0), RbtGrSuit = c(0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.95, 0.925, 0.925, 0.925), SmbSpSuit = c(0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.7, 0.675, 0.675,0.675), SmbIncSuit = c(0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.766666667,0.766666667, 0.766666667), SmbGrSuit = c(0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9, 0.0875, 0.0875, 0.0875), StbSpSuit = c(0.425, 0.425, 0.425, 0.425, 0.425, 0.425, 0.425, 0.425, 0.425, 0.425), StbIncSuit = c(0, 0,0, 0, 0, 0, 0, 0, 0, 0), StbGrSuit = c(0, 0, 0, 0, 0,0, 0, 0, 0, 0), HbcSpSuit = c(0, 0, 0, 0, 0, 0, 0,0, 0, 0), HbcIncSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0,0), HbcGrSuit = c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.425, 0.425, 0.425)), .Names = c("Month", "SpdSpSuit", "SpdIncSuit", "SpdGrowSuit","RzbSpSuit", "RzbIncSuit", "RzbGrowSuit", "FMSSpSuit", "FMSIncSuit","FMSGrowSuit", "BhsSpSuit", "BhsIncSuit", "BhsGrowSuit", "BrtSpSuit","BrtIncSuit", "BrtGrSuit", "CcfSpSuit", "CcfIncSuit", "CcfGrSuit","GsfSpSuit", "GsfIncSuit", "GsfGrSuit", "RbtSpSuit", "RbtIncSuit","RbtGrSuit", "SmbSpSuit", "SmbIncSuit", "SmbGrSuit", "StbSpSuit","StbIncSuit", "StbGrSuit", "HbcSpSuit", "HbcIncSuit", "HbcGrSuit"), class = c("data.table", "data.frame"), row.names = c(NA, -10L)))
AltSuitDates <- data.table(structure(list(Month = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 10L, 11L,12L), SpdSpT = c(NA, NA, NA, NA, NA, 1L, 1L, NA, NA, NA), SpdIncT = c(NA,NA, NA, NA, NA, 1L, 1L, NA, NA, NA), SpdGrT = c(1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L), RzbSpT = c(NA, NA, NA, 1L, 1L, 1L, NA,NA, NA, NA), RzbIncT = c(NA, NA, NA, 1L, 1L, 1L, NA, NA, NA,NA), RzbGrT = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), FmsSpT = c(NA,NA, 1L, 1L, NA, NA, NA, NA, NA, NA), FmsIncT = c(NA, NA, 1L,1L, 1L, NA, NA, NA, NA, NA), FMSGrT = c(1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L, 1L), BhsSpT = c(NA, NA, NA, 1L, 1L, 1L, NA, NA, NA,NA), BhsIncT = c(NA, NA, NA, 1L, 1L, 1L, NA, NA, NA, NA), BhsGrT = c(1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), BRTsp = c(1L, 1L, 1L, NA,NA, NA, NA, 1L, 1L, 1L), BRTinc = c(1L, 1L, 1L, 1L, NA, NA, NA,1L, 1L, 1L), BRTgr = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), CCFsp = c(NA, NA, NA, NA, 1L, 1L, 1L, NA, NA, NA), CCFinc = c(NA,NA, NA, NA, 1L, 1L, 1L, NA, NA, NA), CCFgr = c(1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L), GSFsp = c(NA, NA, NA, NA, 1L, 1L, 1L,NA, NA, NA), GSFinc = c(NA, NA, NA, NA, 1L, 1L, 1L, NA, NA, NA), GSFgr = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), RBTsp = c(1L,1L, 1L, 1L, 1L, 1L, 1L, NA, NA, NA), RBTinc = c(1L, 1L, 1L, 1L,1L, 1L, 1L, NA, NA, NA), RBTgr = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,1L, 1L, 1L), SMBsp = c(NA, NA, NA, 1L, 1L, 1L, 1L, NA, NA, NA), SMBinc = c(NA, NA, NA, 1L, 1L, 1L, 1L, NA, NA, NA), SMBgr = c(1L,1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), STBsp = c(NA, NA, NA, NA,NA, 1L, 1L, NA, NA, NA), STBinc = c(NA, NA, NA, NA, NA, 1L, 1L,NA, NA, NA), STBgr = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), HBCsp = c(NA, NA, NA, 1L, 1L, 1L, NA, NA, NA, NA), HBCinc = c(NA,NA, NA, 1L, 1L, 1L, NA, NA, NA, NA), HBCgr = c(1L, 1L, 1L, 1L,1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("Month", "SpdSpT", "SpdIncT","SpdGrT", "RzbSpT", "RzbIncT", "RzbGrT", "FmsSpT", "FmsIncT","FMSGrT", "BhsSpT", "BhsIncT", "BhsGrT", "BRTsp", "BRTinc", "BRTgr","CCFsp", "CCFinc", "CCFgr", "GSFsp", "GSFinc", "GSFgr", "RBTsp","RBTinc", "RBTgr", "SMBsp", "SMBinc", "SMBgr", "STBsp", "STBinc","STBgr", "HBCsp", "HBCinc", "HBCgr"), class = c("data.table","data.frame"), row.names = c(NA, -10L)))
Where SpId is a species identifier. One DT is 5+ million rows long (AltSuitSp1), the other is 12 (AltSuitDates). I am using the DT that is 12 rows long (corresponding to 12 months) to update the larger DT. Currently I am using a nested if, else if structure in a for loop to check a condition and update the larger DT based off the smaller DT (see code below)
h <- 1
n <- length(AltSuitSp1[,Month])
stm <- AltSuitSp1[,Month] # AltSuitSp1 is the 5+ million row DT
cond1 <- which(stm == 1) # list of all rows of AltSuitSp1 where the Month is = 1
cond2 <- which(stm == 2) # list of all rows of AltSuitSp1 where the Month is = 2
...
cond12 <- which(stm == 12)
for (h in seq(n)){
if (any(cond1 == h)){
set(AltSuitSp1,h,2:34,(AltSuitSp1[h,2:34,with=F] * AltSuitDates[1,2:34,with=F]))
}else if (any(cond2 == h)){
set(AltSuitSp1,h,2:34,(AltSuitSp1[h,2:34,with=F] * AltSuitDates[2,2:34,with=F]))
}else if ...
}else if (any(cond12)){
set(AltSuitSp1,h,2:34,(AltSuitSp1[h,2:34,with=F] * AltSuitDates[12,2:34,with=F]))
}else
break
}
Now, I have run this code for 1 minute and checked to see how far h had advanced. Currently I am seeing about 29-30 loops a second and h has advanced to about 1800 iterations. However, even at 30 loops per second (which is rather slow: Using Set in DT), this code will take approximately 2 days to finish. However, as the output below shows, it is doing what I want/expect it to do.
AltSuitSp1Results <- data.table(structure(list(Month = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,12L, 12L, 12L), .Label = c("1", "10", "11", "12", "2", "3", "4","5", "6", "7", "8", "9"), class = "factor"), SpdSpSuit = c(NA,NA, NA, NA, NA, NA, NA, 0, 0, 0), SpdIncSuit = c(NA, NA, NA,NA, NA, NA, NA, 0, 0, 0), SpdGrowSuit = c(0, 0, 0, 0, 0, 0, 0,0.4625, 0.4625, 0.4625), RzbSpSuit = c(NA, NA, NA, NA, NA, NA,NA, 0.283333333, 0.283333333, 0.283333333), RzbIncSuit = c(NA,NA, NA, NA, NA, NA, NA, 0.34, 0.34, 0.34), RzbGrowSuit = c(0,0, 0, 0, 0, 0, 0, 0.283333333, 0.283333333, 0.283333333), FMSSpSuit = c(NA,NA, NA, NA, NA, NA, NA, 0.34, 0.34, 0.34), FMSIncSuit = c(NA,NA, NA, NA, NA, NA, NA, 0.425, 0.425, 0.425), FMSGrowSuit = c(0,0, 0, 0, 0, 0, 0, 0.283333333, 0.283333333, 0.283333333), BhsSpSuit = c(NA,NA, NA, NA, NA, NA, NA, 0.233333333, 0.233333333, 0.233333333), BhsIncSuit = c(NA, NA, NA, NA, NA, NA, NA, 0, 0, 0), BhsGrowSuit = c(0,0, 0, 0, 0, 0, 0, 0.233333333, 0.233333333, 0.233333333), BrtSpSuit = c(0.866666667,0.866666667, 0.866666667, 0.866666667, 0.866666667, 0.866666667,0.866666667, 0, 0, 0), BrtIncSuit = c(0.8, 0.8, 0.8, 0.8, 0.8,0.8, 0.8, 0.43, 0.43, 0.43), BrtGrSuit = c(0, 0, 0, 0, 0, 0,0, 0.86, 0.86, 0.86), CcfSpSuit = c(NA, NA, NA, NA, NA, NA, NA,0, 0, 0), CcfIncSuit = c(NA, NA, NA, NA, NA, NA, NA, 0, 0, 0),CcfGrSuit = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), GsfSpSuit = c(NA,NA, NA, NA, NA, NA, NA, 0, 0, 0), GsfIncSuit = c(NA, NA,NA, NA, NA, NA, NA, 0, 0, 0), GsfGrSuit = c(0, 0, 0, 0, 0,0, 0, 0, 0, 0), RbtSpSuit = c(NA, NA, NA, NA, NA, NA, NA,0, 0, 0), RbtIncSuit = c(NA, NA, NA, NA, NA, NA, NA, 0, 0,0), RbtGrSuit = c(0, 0, 0, 0, 0, 0, 0, 0.925, 0.925, 0.925), SmbSpSuit = c(NA, NA, NA, NA, NA, NA, NA, 0.675, 0.675,0.675), SmbIncSuit = c(NA, NA, NA, NA, NA, NA, NA, 0.766666667,0.766666667, 0.766666667), SmbGrSuit = c(0, 0, 0, 0, 0, 0,0, 0.0875, 0.0875, 0.0875), StbSpSuit = c(NA, NA, NA, NA,NA, NA, NA, 0.425, 0.425, 0.425), StbIncSuit = c(NA, NA,NA, NA, NA, NA, NA, 0, 0, 0), StbGrSuit = c(0, 0, 0, 0, 0,0, 0, 0, 0, 0), HbcSpSuit = c(NA, NA, NA, NA, NA, NA, NA,0, 0, 0), HbcIncSuit = c(NA, NA, NA, NA, NA, NA, NA, 0, 0,0), HbcGrSuit = c(0, 0, 0, 0, 0, 0, 0, 0.425, 0.425, 0.425)), .Names = c("Month", "SpdSpSuit", "SpdIncSuit", "SpdGrowSuit","RzbSpSuit", "RzbIncSuit", "RzbGrowSuit", "FMSSpSuit", "FMSIncSuit","FMSGrowSuit", "BhsSpSuit", "BhsIncSuit", "BhsGrowSuit", "BrtSpSuit","BrtIncSuit", "BrtGrSuit", "CcfSpSuit", "CcfIncSuit", "CcfGrSuit","GsfSpSuit", "GsfIncSuit", "GsfGrSuit", "RbtSpSuit", "RbtIncSuit","RbtGrSuit", "SmbSpSuit", "SmbIncSuit", "SmbGrSuit", "StbSpSuit","StbIncSuit", "StbGrSuit", "HbcSpSuit", "HbcIncSuit", "HbcGrSuit"), class = c("data.table", "data.frame"), row.names = c(NA, -10L)))
Obviously, I am not going about this in an efficient manner and am doing some sloppy programming. However, I am struggling to figure out where exactly I can optimize my code. Am I trying to reinvent a built-in DT function DT? Am I in one of the Circles; Have I missed a place where I can vectorize: R Inferno?
Basically, I need to update columns 2:34 in the AltSuitSp1 DT, based off of the AltSuitDates DT, using the Month column as the condition to know which row to use from AltSuitDates DT to update AltSuitSp1. Any help is appreciated.