I have two data.frames, block and transactions:
- block - columns: hash, timestamp
- transactions - columns: tx_hash, address, amount
As both data.frames have 200000000 rows, the inner join using
merge(x=block,y=transactions,by.x="hash",by.y="tx_hash",all.x=TRUE)
does not work.
I found a parallelized merge operation in R (https://xrgb.blogspot.com/2013/03/rmark-parallel-fast-merging.html). How can I modify it for my use?
#' Merge a list of data frames into a single data frame, chunk by chunk.
#'
#' The list is cut into consecutive chunks of at most `nparts` elements. Each
#' chunk is folded left to right with `merge(all = TRUE, sort = FALSE)`, which
#' joins on the columns the two inputs have in common. The per-chunk results
#' form a new, shorter list and the process repeats until a single data frame
#' is left. A round with more than one chunk is dispatched through
#' `parallel::parLapply()`; a round with a single chunk is merged in the
#' calling process.
#'
#' @param data.list A non-empty list of data frames to merge.
#' @param nparts Maximum number of list elements merged together per chunk.
#'   Must be a single number >= 2: a chunk of one element cannot shrink the
#'   list, so the merge would never finish.
#' @param cluster A cluster object handed to `parallel::parLapply()`. It is
#'   only used in rounds that have more than one chunk.
#' @return The data frame obtained by merging every element of `data.list`.
mc.fast.merging <- function(data.list, nparts, cluster) {
  if (!is.list(data.list)) stop("data.list isn't a list")
  if (length(data.list) == 0L) stop("data.list is empty")
  nparts_ok <- is.numeric(nparts) && length(nparts) == 1L &&
    !is.na(nparts) && nparts >= 2
  if (!nparts_ok) stop("nparts must be a single number >= 2")

  # Fold one chunk into a single data frame. Reduce() starts from the first
  # element and merges in the remaining ones, so no element is merged with
  # itself (a self-merge would multiply duplicated rows).
  merge_chunk <- function(chunk) {
    Reduce(
      function(acc, nxt) merge(acc, nxt, all = TRUE, sort = FALSE),
      chunk
    )
  }
  # Detach the helper from this call frame: a closure is serialized together
  # with its enclosing environment, which would otherwise ship the whole
  # `data.list` to every worker instead of only the chunk it has to merge.
  environment(merge_chunk) <- globalenv()

  while (length(data.list) > 1L) {
    # Consecutive groups: elements 1..nparts, nparts+1..2*nparts, ...
    group_id <- ceiling(seq_along(data.list) / nparts)
    index_groups <- unname(split(seq_along(data.list), group_id))
    chunks <- lapply(index_groups, function(idx) data.list[idx])

    if (length(chunks) > 1L) {
      data.list <- parallel::parLapply(cluster, chunks, merge_chunk)
    } else {
      # A single chunk gains nothing from the cluster; merge it locally.
      data.list <- lapply(chunks, merge_chunk)
    }
  }

  data.list[[1L]]
}