There might be a simple solution to this, but I can't think of it.
I have a table that has two columns - NodeID and ClusterID. Each Cluster is made up of Nodes. Nodes can exist in multiple clusters.What I want to do is go through the list of ClusterIDs and cluster the clusters.
The process I go through is:
- Select first ClusterID, then select all NodeIDs associated with that ClusterID
- Select all ClusterIDs that are associated with those NodeIDs.
- I repeat this until the list of NodeIDs doesn't increase in size.
- I then return to the full list of ClusterIDs and I remove all of the ones that have appeared in the working list.
I then select the next starting ID and repeat the process.
This is fine, but there are thousands of clusters and it will probably take a few hours to run. Therefore, I would like to optimize the code to run in parallel, but I am stumped how to do that without issues.
Assuming you create a static master list, you can split the list of clusterIDs into parts and have each worker handle a section, using the same process, but eliminating found clusterIDs from the working list not the master list. This works, but you will create duplicates, when associated clusterIDs show up in different worker's lists.
If I let them all run, then I will have to go back and check for duplicates by seeing if the NodeIDs show up in more then one list. This is do able, but seems inefficient.
Is there a more efficient way to do this work?
master_UniqueClusterIDs <- structure(list(ClusterID = c("AlterRefRecord_100_100_16_150",
"AlterRefRecord_100_101_16_151", "AlterRefRecord_100_102_16_152",
"AlterRefRecord_100_103_16_153", "AlterRefRecord_100_105_16_155",
"AlterRefRecord_100_106_16_156", "AlterRefRecord_100_107_16_157",
"AlterRefRecord_100_108_16_158", "AlterRefRecord_100_10_16_58",
"AlterRefRecord_100_111_16_161", "AlterRefRecord_100_115_16_165",
"AlterRefRecord_100_119_16_169", "AlterRefRecord_100_120_16_170",
"AlterRefRecord_100_121_16_171", "AlterRefRecord_100_122_16_172",
"AlterRefRecord_100_131_16_181", "AlterRefRecord_100_133_16_183",
"AlterRefRecord_100_136_16_186", "AlterRefRecord_100_139_16_189",
"AlterRefRecord_100_13_16_61", "AlterRefRecord_100_14_16_62",
"AlterRefRecord_100_15_16_63", "AlterRefRecord_100_17_16_65",
"AlterRefRecord_100_1_16_48", "AlterRefRecord_100_20_16_70",
"AlterRefRecord_100_23_16_72", "AlterRefRecord_100_27_16_76",
"AlterRefRecord_100_28_16_77", "AlterRefRecord_100_29_16_78",
"AlterRefRecord_100_2_16_49", "AlterRefRecord_100_30_16_79",
"AlterRefRecord_100_31_16_80", "AlterRefRecord_100_32_16_81",
"AlterRefRecord_100_33_16_82", "AlterRefRecord_100_35_16_84",
"AlterRefRecord_100_38_16_87", "AlterRefRecord_100_39_16_88",
"AlterRefRecord_100_41_16_90", "AlterRefRecord_100_43_16_92",
"AlterRefRecord_100_44_16_93", "AlterRefRecord_100_47_16_96",
"AlterRefRecord_100_48_16_97", "AlterRefRecord_100_49_16_98",
"AlterRefRecord_100_4_16_52", "AlterRefRecord_100_54_16_103",
"AlterRefRecord_100_56_16_105", "AlterRefRecord_100_58_16_107",
"AlterRefRecord_100_59_16_108", "AlterRefRecord_100_62_16_111",
"AlterRefRecord_100_63_16_112", "AlterRefRecord_100_64_16_113",
"AlterRefRecord_100_65_16_114", "AlterRefRecord_100_6_16_54",
"AlterRefRecord_100_71_16_121", "AlterRefRecord_100_72_16_122",
"AlterRefRecord_100_74_16_124", "AlterRefRecord_100_78_16_128",
"AlterRefRecord_100_84_16_134", "AlterRefRecord_100_85_16_135",
"AlterRefRecord_100_86_16_136", "AlterRefRecord_100_87_16_137",
"AlterRefRecord_100_88_16_138", "AlterRefRecord_100_89_16_139",
"AlterRefRecord_100_8_16_56", "AlterRefRecord_100_90_16_140",
"AlterRefRecord_100_91_16_141", "AlterRefRecord_100_92_16_142",
"AlterRefRecord_100_93_16_143", "AlterRefRecord_100_94_16_144",
"AlterRefRecord_100_95_16_145", "AlterRefRecord_100_97_16_147",
"AlterRefRecord_100_99_16_149", "AlterRefRecord_101_101_16_151",
"AlterRefRecord_101_102_16_152", "AlterRefRecord_101_103_16_153",
"AlterRefRecord_101_105_16_155", "AlterRefRecord_101_106_16_156",
"AlterRefRecord_101_108_16_158", "AlterRefRecord_101_10_16_58",
"AlterRefRecord_101_115_16_165", "AlterRefRecord_101_119_16_169",
"AlterRefRecord_101_120_16_170", "AlterRefRecord_101_121_16_171",
"AlterRefRecord_101_122_16_172", "AlterRefRecord_101_131_16_181",
"AlterRefRecord_101_136_16_186", "AlterRefRecord_101_139_16_189",
"AlterRefRecord_101_13_16_61", "AlterRefRecord_101_15_16_63",
"AlterRefRecord_101_17_16_65", "AlterRefRecord_101_1_16_48",
"AlterRefRecord_101_20_16_70", "AlterRefRecord_101_23_16_72",
"AlterRefRecord_101_27_16_76", "AlterRefRecord_101_28_16_77",
"AlterRefRecord_101_30_16_79", "AlterRefRecord_101_31_16_80",
"AlterRefRecord_101_32_16_81", "AlterRefRecord_101_33_16_82",
"AlterRefRecord_101_35_16_84")), .Names = "ClusterID", row.names = c(NA,
100L), class = "data.frame")
WS_ClusterTable <- structure(list(NodeID = c(14240L, 133399L, 46191L, 15955L, 46531L,
38692L, 36740L, 11536L, 36966L, 43992L, 42118L, 12682L, 133206L,
25687L, 28591L, 129265L, 36848L, 44253L, 26883L, 32346L, 27122L,
23376L, 23432L, 31887L, 39870L, 99938L, 68767L, 37814L, 49133L,
26759L, 15957L, 32725L, 12758L, 45055L, 47234L, 12522L, 14671L,
42296L, 38910L, 46321L, 79613L, 32761L, 21281L, 51924L, 85561L,
16077L, 19069L, 16731L, 25087L, 24225L, 113682L, 27324L, 51568L,
55478L, 16468L, 51924L, 85561L, 18095L, 14734L, 115162L, 20198L,
52842L, 55552L, 41410L, 32734L, 23058L, 18259L, 51752L, 20268L,
11572L, 45063L, 20432L, 55151L, 43490L, 38843L, 89766L, 19283L,
31875L, 12352L, 38773L, 44337L, 31977L, 24609L, 38902L, 32049L,
41152L, 36610L, 20741L, 25882L, 14031L, 22963L, 41342L, 84910L,
37080L, 44297L, 26815L, 38627L, 51102L, 22480L, 39869L, 97999L,
68766L, 37828L, 14671L, 49224L, 15958L, 24890L, 42340L, 12564L,
42988L, 41671L, 36313L), ClusterID = c("AlterRefRecord_100_100_16_150",
"AlterRefRecord_100_100_16_150", "AlterRefRecord_100_101_16_151",
"AlterRefRecord_100_102_16_152", "AlterRefRecord_100_103_16_153",
"AlterRefRecord_100_105_16_155", "AlterRefRecord_100_106_16_156",
"AlterRefRecord_100_107_16_157", "AlterRefRecord_100_108_16_158",
"AlterRefRecord_100_10_16_58", "AlterRefRecord_100_111_16_161",
"AlterRefRecord_100_115_16_165", "AlterRefRecord_100_115_16_165",
"AlterRefRecord_100_119_16_169", "AlterRefRecord_100_120_16_170",
"AlterRefRecord_100_120_16_170", "AlterRefRecord_100_121_16_171",
"AlterRefRecord_100_122_16_172", "AlterRefRecord_100_131_16_181",
"AlterRefRecord_100_133_16_183", "AlterRefRecord_100_136_16_186",
"AlterRefRecord_100_139_16_189", "AlterRefRecord_100_13_16_61",
"AlterRefRecord_100_14_16_62", "AlterRefRecord_100_15_16_63",
"AlterRefRecord_100_15_16_63", "AlterRefRecord_100_17_16_65",
"AlterRefRecord_100_1_16_48", "AlterRefRecord_100_20_16_70",
"AlterRefRecord_100_23_16_72", "AlterRefRecord_100_27_16_76",
"AlterRefRecord_100_28_16_77", "AlterRefRecord_100_29_16_78",
"AlterRefRecord_100_2_16_49", "AlterRefRecord_100_30_16_79",
"AlterRefRecord_100_31_16_80", "AlterRefRecord_100_32_16_81",
"AlterRefRecord_100_33_16_82", "AlterRefRecord_100_35_16_84",
"AlterRefRecord_100_38_16_87", "AlterRefRecord_100_38_16_87",
"AlterRefRecord_100_39_16_88", "AlterRefRecord_100_41_16_90",
"AlterRefRecord_100_43_16_92", "AlterRefRecord_100_43_16_92",
"AlterRefRecord_100_44_16_93", "AlterRefRecord_100_47_16_96",
"AlterRefRecord_100_48_16_97", "AlterRefRecord_100_49_16_98",
"AlterRefRecord_100_4_16_52", "AlterRefRecord_100_4_16_52", "AlterRefRecord_100_54_16_103",
"AlterRefRecord_100_56_16_105", "AlterRefRecord_100_58_16_107",
"AlterRefRecord_100_59_16_108", "AlterRefRecord_100_62_16_111",
"AlterRefRecord_100_62_16_111", "AlterRefRecord_100_63_16_112",
"AlterRefRecord_100_64_16_113", "AlterRefRecord_100_64_16_113",
"AlterRefRecord_100_65_16_114", "AlterRefRecord_100_6_16_54",
"AlterRefRecord_100_71_16_121", "AlterRefRecord_100_72_16_122",
"AlterRefRecord_100_74_16_124", "AlterRefRecord_100_78_16_128",
"AlterRefRecord_100_84_16_134", "AlterRefRecord_100_85_16_135",
"AlterRefRecord_100_86_16_136", "AlterRefRecord_100_87_16_137",
"AlterRefRecord_100_88_16_138", "AlterRefRecord_100_89_16_139",
"AlterRefRecord_100_8_16_56", "AlterRefRecord_100_90_16_140",
"AlterRefRecord_100_91_16_141", "AlterRefRecord_100_91_16_141",
"AlterRefRecord_100_92_16_142", "AlterRefRecord_100_93_16_143",
"AlterRefRecord_100_94_16_144", "AlterRefRecord_100_95_16_145",
"AlterRefRecord_100_97_16_147", "AlterRefRecord_100_99_16_149",
"AlterRefRecord_101_101_16_151", "AlterRefRecord_101_102_16_152",
"AlterRefRecord_101_103_16_153", "AlterRefRecord_101_105_16_155",
"AlterRefRecord_101_106_16_156", "AlterRefRecord_101_108_16_158",
"AlterRefRecord_101_10_16_58", "AlterRefRecord_101_115_16_165",
"AlterRefRecord_101_119_16_169", "AlterRefRecord_101_120_16_170",
"AlterRefRecord_101_120_16_170", "AlterRefRecord_101_121_16_171",
"AlterRefRecord_101_122_16_172", "AlterRefRecord_101_131_16_181",
"AlterRefRecord_101_136_16_186", "AlterRefRecord_101_139_16_189",
"AlterRefRecord_101_13_16_61", "AlterRefRecord_101_15_16_63",
"AlterRefRecord_101_15_16_63", "AlterRefRecord_101_17_16_65",
"AlterRefRecord_101_1_16_48", "AlterRefRecord_101_20_16_70",
"AlterRefRecord_101_23_16_72", "AlterRefRecord_101_27_16_76",
"AlterRefRecord_101_28_16_77", "AlterRefRecord_101_30_16_79",
"AlterRefRecord_101_31_16_80", "AlterRefRecord_101_32_16_81",
"AlterRefRecord_101_33_16_82", "AlterRefRecord_101_35_16_84")), .Names = c("NodeID",
"ClusterID"), row.names = c(NA, -112L), class = "data.frame")
library(doParallel)
library(foreach)
SplitLists <- split(master_UniqueClusterIDs, sample(1:32, nrow(master_UniqueClusterIDs), replace=T))
registerDoParallel(7)
getDoParWorkers()
foreach(ggg = 1:32 ) %dopar% {
print(ggg)
UniqueClusterIDs <- SplitLists[[ggg]][,1]
BS_ClusterID <- 1 ### Start
BS_ClusterTable <- data.frame()
while(length(UniqueClusterIDs)>0){
print(paste(ggg, "big", length(UniqueClusterIDs)))
NodeIDs_start <- WS_ClusterTable[WS_ClusterTable$ClusterID == UniqueClusterIDs[1],]$NodeID
######################################
Continue <- 1
# While number of clusters and the number of node_ids grows, continue searching
while(Continue == 1 ) {
#######
ClusterIDs <- WS_ClusterTable[WS_ClusterTable$NodeID %in% NodeIDs_start[,1],]$ClusterID
NodeIDs_2 <- WS_ClusterTable[WS_ClusterTable$ClusterID %in% ClusterIDs[,1],]$NodeID
if( nrow(NodeIDs_2) > nrow(NodeIDs_start)){
NodeIDs_start <- NodeIDs_2
Continue <- 1
} else {
Continue <- 0
#### Now that you have reached the end of this cluster, Insert it and then remove
BS_ClusterID <- BS_ClusterID+1
FinalID <- paste("BS_ClusterID",BS_ClusterID,ggg,sep="_")
NodeIDs_2 ## all the nodes
NewRows <- data.frame(NodeID= unique(NodeIDs_2), BS_ClusterID=FinalID)
BS_ClusterTable <- rbind(BS_ClusterTable,NewRows)
### Remove all listed clusters from the clusterlist
UniqueClusterIDs <- UniqueClusterIDs[! UniqueClusterIDs %in% ClusterIDs[,1]]
}
}
}
}