require(plyr)
library(reshape)
library(iterators)
library(parallel)
library(foreach)
library(doParallel)
getCosine <- function(x,y)
{
this.cosine <- sum(x*y) / (sqrt(sum(x*x)) * sqrt(sum(y*y)))
return(this.cosine)
}
visitordata <- read.csv("~/Hotels.csv", sep = ",", header = TRUE , stringsAsFactors = FALSE )
visitordata <- subset(visitordata, Product.Views > 0)
head(visitordata)
Visitor_ID Products Product.Views
2 1001863689_3519696751 CZ1XQZ 2
3 1001863689_3519696751 CZR3CN 1
4 1001863689_3519696751 CZTNKN 3
5 121021834007_98749174 CZ2LB0 1
6 11029477426_678878300 CZTNKN 1
7 21029477426_678878300 CZVDHR 1
ColumnBasedData <- reshape(visitordata, idvar="Visitor_ID", timevar="Products", direction="wide")
ColumnBasedData[is.na(ColumnBasedData)] <- 0
x <<- (ColumnBasedData[,!(names(ColumnBasedData) %in% c("Visitor_ID"))])
head(x)
Product.Views.CZ1XQZ Product.Views.CZR3CN Product.Views.CZTNKN Product.Views.CZVDHR Product.Views.CZ36D3 Product.Views.CZE0EN
2 1 1 1 0 0 0
6 0 0 1 1 0 0
9 0 0 0 0 1 1
24 0 0 0 0 0 0
37 0 0 0 0 0 0
40 0 0 0 0 0 0
holder <- matrix(NA, nrow=ncol(x),ncol=ncol(x),dimnames=list(colnames(x),colnames(x)))
dataframe_y <<- as.data.frame(holder)
cl<-makeCluster(detectCores() -1)
doParallel::registerDoParallel(cl)
ls <- foreach(i = 1:ncol(x)) %dopar% {
for(j in 1:ncol(x)) {
dataframe_y[i,j] <- getCosine(x[i],x[j])
}
}
stopCluster(cl)
write.csv(dataframe_y,file="~/cosine.csv")
It works with %do%
but doesn't with %dopar%
. With %dopar%
, dataframe_y
returns null. Any idea?
Edit : Libraries, funtions, data examples. I will process with big data, so I am trying to use parallel-processing. Script takes more than one day to complete without parallel-processing.