I have a table cluster (with more than one column):
head(cluster[,c('cuil_direccion')])
[1] "PJE INDEA 98 5 "
[2] "PJE INDE 98 5 "
[3] "B 34 VIV RECRE 57 00 "
[4] "S CASA DE GO 600 "
[5] "RCCA 958 00 o "
[6] "JUAN B 1900 "
I need to run a function that for each line extracts the numbers and paste them in a list. I'm using: str_extract_all. Since the table is huge I'd like to split data and use different cores for each split. I tried:
library(foreach)
library(doParallel)
registerDoParallel(cores=detectCores(all.tests=TRUE))
crea_tabla <- function(x){
xlst <- split(x, 1:nrow(x))
pred <- foreach(i = xlst, .combine = rbind) %dopar% {
library(stringr)
d<-data.frame(dir='a', E_numdir=1)
j=1
DIR<-i$cuil_direccion[j]
E_NUMDIR <- str_extract_all(DIR,"\\(?[0-9]+\\)?")[[1]]
d<-rbind(d, data.frame( dir=DIR ,
E_numdir=toString(E_NUMDIR)))
j=1+j
}
}
then I ran
crea_tabla(cluster)
And I get an empty result.
I'm not sure about the way doparallel uses data. E.G this part:
library(stringr)
d<-data.frame(dir='a', E_numdir=1)
j=1
Should I write before or after %dopar%?
EDITION
num_cores<-detectCores(all.tests=TRUE)
registerDoParallel(cores=detectCores(all.tests=TRUE))
crea_tabla <- function(x, num_cores){
xlst <- split(x, 1:nrow(x))
j=1
d<-data.frame(dir='a', E_numdir=1)
pred <- foreach(i = seq_along(xlst), .combine = rbind) %dopar% {
print(i*num_cores/nrow(x))
library(stringr)
DIR<-xlst[[i]]$cuil_direccion
E_NUMDIR <- str_extract_all(DIR,"\\(?[0-9]+\\)?")[[1]]
data.frame(dir=DIR , E_numdir=toString(E_NUMDIR))
}
d <- rbind(d, pred)
return(d)
}
a<-crea_tabla(cluster, num_cores)