
I am making maps of a dataset with 20 million data points in ggplot. A single faceted map takes 10-15 minutes, so I checked whether using multiple cores in parallel would be faster. With foreach, the maps took even longer to make; some foreach runs went on for 30 minutes to an hour without producing any results. I know that the answer to this question (Why is the parallel package slower than just using apply?) shows that in parallel computation, combining the results from the separate processes can sometimes take longer than running the task itself. But while searching I saw examples where jobs running for minutes were still sped up. Do you think that is possible here?

Due to the sheer amount of data I have, I sampled my dataset:

structure(list(lat = c(46.791667, 52.958333, 57.375, 62.625, 
74.041667, 60.208333, 30.208333, 56.791667, 57.375, 40.958333, 
56.541667, 38.958333, 35.291667, 43.625, 71.375, 66.875, 74.375, 
66.458333, 47.791667, 48.041667, 41.541667, 40.875, 57.208333, 
64.375, 42.625, 43.958333, 69.958333, 72.375, 36.875, 66.958333, 
39.791667, 36.625, 52.625, 65.708333, 42.208333, 53.708333, 35.458333, 
58.625, 34.875, 57.291667, 59.708333, 61.708333, 72.041667, 59.958333, 
32.208333, 43.625, 39.541667, 62.625, 41.208333, 32.291667, 48.958333, 
47.291667, 60.375, 49.458333, 37.208333, 65.708333, 57.958333, 
31.041667, 63.875, 43.625, 54.541667, 55.541667, 45.458333, 72.375, 
54.708333, 37.958333, 32.375, 60.125, 59.041667, 37.875, 42.958333, 
44.375, 59.791667, 49.208333, 34.375, 53.208333, 59.458333, 53.375, 
45.458333, 72.125, 66.208333, 60.958333, 47.625, 60.291667, 41.125, 
67.541667, 54.625, 55.541667, 37.541667, 44.291667, 44.458333, 
40.041667, 49.458333, 39.625, 73.375, 41.458333, 71.375, 31.041667, 
66.791667, 42.541667), lon = c(-6.125, -19.541667, -29.291667, 
-9.2083333, -11.541667, -6.625, -25.708333, -14.458333, -48.291667, 
-63.541667, -41.291667, -12.541667, -48.291667, -58.708333, 6.625, 
-2.7083333, -69.375, -19.291667, -27.208333, -36.625, -17.791667, 
-50.541667, -38.708333, 9.375, -56.208333, -44.958333, -59.041667, 
8.875, -21.125, -24.791667, -40.375, -26.208333, -31.875, -11.875, 
-60.958333, -39.125, -32.458333, -54.791667, -44.541667, -37.958333, 
-48.625, -10.541667, 3.5416667, -17.791667, -16.041667, -9.9583333, 
-32.708333, 0.875, -18.625, -54.208333, -63.125, -56.458333, 
-55.125, -22.708333, -40.958333, -56.208333, -33.625, -69.125, 
-58.125, -17.541667, -23.541667, -17.041667, -18.458333, -64.791667, 
-25.208333, -35.875, -44.791667, -11.291667, -58.291667, -46.208333, 
-41.208333, -5.0416667, -38.208333, -13.875, -55.291667, -17.291667, 
-48.625, -38.791667, -59.375, -19.291667, 5.5416667, -19.625, 
-41.375, -66.291667, -17.625, -14.208333, -39.291667, -48.875, 
-16.541667, -21.375, -46.375, 2.625, -60.291667, -40.375, 2.4583333, 
-16.458333, 4.875, -66.291667, -4.2916667, -36.458333), entity = structure(c(2L, 
2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 
2L, 2L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 
1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 
2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 
2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 
2L, 1L, 1L), .Label = c("Cc", "Li"), class = "factor"), watcon = structure(c(1L, 
2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 
2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 
1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 
2L, 2L, 2L), .Label = c("calm", "stormy"), class = "factor"), 
    step = structure(c(1L, 3L, 2L, 4L, 4L, 2L, 3L, 3L, 3L, 4L, 
    2L, 2L, 3L, 2L, 1L, 3L, 3L, 3L, 4L, 4L, 4L, 1L, 1L, 1L, 2L, 
    4L, 1L, 2L, 2L, 4L, 2L, 4L, 4L, 3L, 4L, 2L, 2L, 1L, 2L, 2L, 
    1L, 4L, 3L, 2L, 3L, 3L, 2L, 4L, 1L, 3L, 2L, 2L, 3L, 4L, 4L, 
    2L, 2L, 3L, 4L, 1L, 3L, 2L, 1L, 4L, 2L, 3L, 1L, 3L, 1L, 2L, 
    1L, 4L, 4L, 3L, 1L, 1L, 1L, 3L, 1L, 2L, 2L, 4L, 3L, 4L, 4L, 
    4L, 1L, 2L, 1L, 4L, 3L, 4L, 2L, 4L, 1L, 4L, 1L, 2L, 2L, 4L
    ), .Label = c("abundance", "enccomb", "adscomb", "infcomb"
    ), class = "factor")), row.names = c(NA, -100L), class = "data.frame")

The code I am using is:

library(oceanmap)
library(sf)
library(ggmap)
library(rnaturalearth)
library(rnaturalearthdata)
library(rgeos)
library(tidyverse)

world <- ne_countries(scale = "medium", returnclass = "sf")

library(doParallel)
library(foreach)
cl <- makeCluster(5)
doParallel::registerDoParallel(cl)
entities <- unique(sample$entity)

foreach(i = 1:length(entities), .packages = c("tidyverse", "dplyr")) %dopar% {
  p <- ggplot(data = world) +
    geom_sf(color = "white", fill = "white") +
    coord_sf(xlim = c(-34, -30), ylim = c(53, 62), expand = FALSE) +
    geom_point(subset(sample, sample$entity == entities[i]),
               mapping = aes(x = lon, y = lat, color = log10(value))) +
    facet_grid(watcon ~ step2) + ggtitle(entities[i])
  ggsave(filename = paste0(i, ".png"), plot = p)  # pass the plot explicitly
}

stopCluster(cl)

P.S. I am using ggplot because of the flexibility it gives for customizing the map itself. Lattice is faster but lacks the customization I am looking for.

Any help on how to improve my foreach code, or advice on whether this is even possible, is greatly appreciated!

Kaye11
  • Have you tried using `future_map()` from the `furrr` package? Or other parallel packages in general? Any differences? – Adam B. Apr 14 '20 at 06:46
  • Sorry @Kaye11, your example isn't working for me; `facet_grid(watcon ~ step2)` looks to be causing an issue (no `step2` in the sample df?) – jared_mamrot Apr 14 '20 at 07:08
  • The problem most likely is that foreach has to send the whole dataset to each worker (check your RAM usage). You should split your dataset outside of the loop and iterate over the subsets. – Roland Apr 14 '20 at 09:01
  • @Adam B : I tried snowfall and it is the same. – Kaye11 Apr 14 '20 at 13:06
  • @jpmam1 sorry, step and step2 are interchangeable. If you change my code to facet_grid (watcon~step), it should work – Kaye11 Apr 14 '20 at 13:08
  • How many maps are you making? I think @Roland has the best solution: "split the dataset and iterate over each subset". If you're running the job on a server it might be worth looking into GNU parallel to achieve this e.g. `parallel Rscript plot_multifactor.R ::: subset_{1..32}.csv` (this solution has worked well for me in the past) – jared_mamrot Apr 14 '20 at 23:48
  • Thanks for the advice! I am making faceted maps. I can try splitting the data, but if there is another way that avoids splitting, I would prefer that. I will try `future_map()` when I get the time and let you know! – Kaye11 Apr 21 '20 at 16:23
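
For reference, a minimal sketch of Roland's suggestion above: split the data before the loop so that foreach only ships each entity's subset (plus the world polygons) to a worker, instead of exporting the whole 20-million-row table with every task. This assumes the full data frame is called `dat` and, unlike the posted sample, contains the `value` column used for colouring; it also facets on `step` rather than `step2`, per the comments above.

library(doParallel)
library(foreach)

dat_split <- split(dat, dat$entity)   # one data frame per entity

cl <- makeCluster(length(dat_split))
doParallel::registerDoParallel(cl)

foreach(d = dat_split, nm = names(dat_split),
        .packages = c("ggplot2", "sf")) %dopar% {
  p <- ggplot(data = world) +
    geom_sf(color = "white", fill = "white") +
    coord_sf(xlim = c(-34, -30), ylim = c(53, 62), expand = FALSE) +
    geom_point(data = d, aes(x = lon, y = lat, color = log10(value))) +
    facet_grid(watcon ~ step) +
    ggtitle(nm)
  # pass the plot explicitly; last_plot() is not reliable inside %dopar%
  ggsave(filename = paste0(nm, ".png"), plot = p)
}

stopCluster(cl)

Because the loop iterates over `dat_split` directly, each worker only receives the element it is plotting; the much smaller `world` object is exported automatically.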
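
And a similar sketch with `future_map2()` from `furrr`, as Adam B. suggested (same assumptions about `dat` and `value`; `furrr` and `future` need to be installed):

library(furrr)
library(future)
library(ggplot2)

plan(multisession, workers = 5)       # one background R session per worker

dat_split <- split(dat, dat$entity)   # as above, one data frame per entity

future_map2(dat_split, names(dat_split), function(d, nm) {
  p <- ggplot(data = world) +
    geom_sf(color = "white", fill = "white") +
    coord_sf(xlim = c(-34, -30), ylim = c(53, 62), expand = FALSE) +
    geom_point(data = d, aes(x = lon, y = lat, color = log10(value))) +
    facet_grid(watcon ~ step) +
    ggtitle(nm)
  ggsave(filename = paste0(nm, ".png"), plot = p)
})

plan(sequential)                      # release the workers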
