I am making maps of a dataset with 20 million data points in ggplot. A single faceted map takes 10-15 minutes to render, so I checked whether building the maps on multiple cores in parallel would be faster. With foreach, the maps took even longer: some runs went on for 30 minutes to an hour without producing any output. I know that the answer to this question (Why is the parallel package slower than just using apply?) shows that in parallel computation, combining the results from the separate processes can sometimes take longer than running the task itself. But while searching I also saw examples where tasks that take minutes were still sped up. Do you think a speed-up is possible here?
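For reference, this is roughly how I compared the two approaches; make_map() below is only a stand-in for the plotting code shown further down, not a real function in my script:

# Rough timing comparison; make_map() is a placeholder for the
# ggplot/ggsave code shown below, applied to a single entity.
# Cluster setup is the same as in the full code further down.
system.time(for (e in entities) make_map(e))             # single core: ~10-15 min per map
system.time(foreach(e = entities) %dopar% make_map(e))   # parallel: longer, sometimes no output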
Due to the sheer amount of data I have, I sampled my dataset:
structure(list(lat = c(46.791667, 52.958333, 57.375, 62.625,
74.041667, 60.208333, 30.208333, 56.791667, 57.375, 40.958333,
56.541667, 38.958333, 35.291667, 43.625, 71.375, 66.875, 74.375,
66.458333, 47.791667, 48.041667, 41.541667, 40.875, 57.208333,
64.375, 42.625, 43.958333, 69.958333, 72.375, 36.875, 66.958333,
39.791667, 36.625, 52.625, 65.708333, 42.208333, 53.708333, 35.458333,
58.625, 34.875, 57.291667, 59.708333, 61.708333, 72.041667, 59.958333,
32.208333, 43.625, 39.541667, 62.625, 41.208333, 32.291667, 48.958333,
47.291667, 60.375, 49.458333, 37.208333, 65.708333, 57.958333,
31.041667, 63.875, 43.625, 54.541667, 55.541667, 45.458333, 72.375,
54.708333, 37.958333, 32.375, 60.125, 59.041667, 37.875, 42.958333,
44.375, 59.791667, 49.208333, 34.375, 53.208333, 59.458333, 53.375,
45.458333, 72.125, 66.208333, 60.958333, 47.625, 60.291667, 41.125,
67.541667, 54.625, 55.541667, 37.541667, 44.291667, 44.458333,
40.041667, 49.458333, 39.625, 73.375, 41.458333, 71.375, 31.041667,
66.791667, 42.541667), lon = c(-6.125, -19.541667, -29.291667,
-9.2083333, -11.541667, -6.625, -25.708333, -14.458333, -48.291667,
-63.541667, -41.291667, -12.541667, -48.291667, -58.708333, 6.625,
-2.7083333, -69.375, -19.291667, -27.208333, -36.625, -17.791667,
-50.541667, -38.708333, 9.375, -56.208333, -44.958333, -59.041667,
8.875, -21.125, -24.791667, -40.375, -26.208333, -31.875, -11.875,
-60.958333, -39.125, -32.458333, -54.791667, -44.541667, -37.958333,
-48.625, -10.541667, 3.5416667, -17.791667, -16.041667, -9.9583333,
-32.708333, 0.875, -18.625, -54.208333, -63.125, -56.458333,
-55.125, -22.708333, -40.958333, -56.208333, -33.625, -69.125,
-58.125, -17.541667, -23.541667, -17.041667, -18.458333, -64.791667,
-25.208333, -35.875, -44.791667, -11.291667, -58.291667, -46.208333,
-41.208333, -5.0416667, -38.208333, -13.875, -55.291667, -17.291667,
-48.625, -38.791667, -59.375, -19.291667, 5.5416667, -19.625,
-41.375, -66.291667, -17.625, -14.208333, -39.291667, -48.875,
-16.541667, -21.375, -46.375, 2.625, -60.291667, -40.375, 2.4583333,
-16.458333, 4.875, -66.291667, -4.2916667, -36.458333), entity = structure(c(2L,
2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L,
2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L,
2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
2L, 1L, 1L), .Label = c("Cc", "Li"), class = "factor"), watcon = structure(c(1L,
2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L,
2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L,
2L, 2L, 2L), .Label = c("calm", "stormy"), class = "factor"),
step = structure(c(1L, 3L, 2L, 4L, 4L, 2L, 3L, 3L, 3L, 4L,
2L, 2L, 3L, 2L, 1L, 3L, 3L, 3L, 4L, 4L, 4L, 1L, 1L, 1L, 2L,
4L, 1L, 2L, 2L, 4L, 2L, 4L, 4L, 3L, 4L, 2L, 2L, 1L, 2L, 2L,
1L, 4L, 3L, 2L, 3L, 3L, 2L, 4L, 1L, 3L, 2L, 2L, 3L, 4L, 4L,
2L, 2L, 3L, 4L, 1L, 3L, 2L, 1L, 4L, 2L, 3L, 1L, 3L, 1L, 2L,
1L, 4L, 4L, 3L, 1L, 1L, 1L, 3L, 1L, 2L, 2L, 4L, 3L, 4L, 4L,
4L, 1L, 2L, 1L, 4L, 3L, 4L, 2L, 4L, 1L, 4L, 1L, 2L, 2L, 4L
), .Label = c("abundance", "enccomb", "adscomb", "infcomb"
), class = "factor")), row.names = c(NA, -100L), class = "data.frame")
The code I am using is:
library(oceanmap)
library(sf)
library(ggmap)
library(rnaturalearth)
library(rnaturalearthdata)
library(rgeos)
library(tidyverse)
library(doParallel)
library(foreach)

world <- ne_countries(scale = "medium", returnclass = "sf")

cl <- makeCluster(5)
doParallel::registerDoParallel(cl)
entities <- unique(sample$entity)
foreach(i = seq_along(entities), .packages = c("tidyverse", "sf")) %dopar% {
  p <- ggplot(data = world) +
    geom_sf(color = "white", fill = "white") +
    coord_sf(xlim = c(-34, -30), ylim = c(53, 62), expand = FALSE) +
    geom_point(data = subset(sample, entity == entities[i]),
               mapping = aes(x = lon, y = lat, color = log10(value))) +
    facet_grid(watcon ~ step2) +
    ggtitle(entities[i])
  ggsave(filename = paste0(i, ".png"), plot = p)
}
stopCluster(cl)
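One variant I have been wondering about (I am not sure whether it actually helps) is splitting the data by entity before the loop, so each worker only receives its own subset instead of the whole 20-million-row data frame. A rough sketch of what I mean, with the plot code otherwise unchanged:

# Split once in the main session; foreach then ships each list element
# to a worker as the iteration value instead of exporting the full data frame
by_entity <- split(sample, sample$entity)

foreach(dat = by_entity, nm = names(by_entity),
        .packages = c("tidyverse", "sf")) %dopar% {
  p <- ggplot(data = world) +
    geom_sf(color = "white", fill = "white") +
    coord_sf(xlim = c(-34, -30), ylim = c(53, 62), expand = FALSE) +
    geom_point(data = dat, mapping = aes(x = lon, y = lat, color = log10(value))) +
    facet_grid(watcon ~ step2) +
    ggtitle(nm)
  ggsave(filename = paste0(nm, ".png"), plot = p)
}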
P.S. I am using ggplot because of the flexibility it gives me for editing the map itself. Lattice is faster but lacks the customization I am looking for.
Any help on improving my foreach code, or on whether this is even feasible, is greatly appreciated!