4

Let's assume that I have a data table with People who watched Movies, like

library(data.table)
DT = fread("
User,        Movie
Alice ,      Fight Club
Alice,       The Godfather
Bob,         Titanic
Charlotte,   The Godfather")

I want to compute, for each pair of movies, the number of people who watched both and the number of people who watched at least one, i.e.

Movie1        Movie2           WatchedOne   WatchedBoth
Fight Club    The Godfather    2            1
The Godfather Titanic          3            0
Fight Club    Titanic          2            0

I have millions of rows and I would need a blazingly fast data.table function :-)

Thanks for help!

Uwe
  • 41,420
  • 11
  • 90
  • 134
mac
  • 165
  • 8

3 Answers3

2

Another way:

DT = DT[, .(Users = list(User)), keyby='Movie']

Y = data.table(t(combn(DT$Movie, 2)))
setnames(Y, c('Movie1','Movie2'))

Y[DT, on=.(Movie1==Movie), Movie1.Users:= Users]
Y[DT, on=.(Movie2==Movie), Movie2.Users:= Users]

#Y[, WatchedOne:= lengths(Map(union, Movie1.Users, Movie2.Users))]
Y[, WatchedBoth:= lengths(Map(intersect, Movie1.Users, Movie2.Users))]
# better:
Y[, WatchedOne:= lengths(Movie1.Users) + lengths(Movie2.Users) - WatchedBoth]

> Y[, -(3:4)]
#           Movie1        Movie2 WatchedBoth WatchedOne
# 1:    Fight Club The Godfather           1          2
# 2:    Fight Club       Titanic           0          2
# 3: The Godfather       Titanic           0          3
sirallen
  • 1,947
  • 14
  • 21
  • is it possible to parallelize the code in order to exploit multiple processors? Thank you – mac Sep 06 '17 at 10:22
  • yes, it's possible -- I think the bottleneck would be the computation of `union` and `intersect`. maybe first try computing the union as `lengths(Movie1.Users) + lengths(Movie2.Users) - WatchedBoth` and see where that gets you. Much more efficient than what I wrote – sirallen Sep 06 '17 at 17:15
1

This achieves what you are after

library(data.table)

mydt <- data.table(User = c("Alice", "Alice", "Bob", "Charlotte"), 
               Movie = c("Fight Club", "The Godfather", "Titanic", "The Godfather"))
##
mydt2 <-  data.table(t(mydt[,combn(unique(Movie), 2, simplify = FALSE)]))
names(mydt2) <- c("Movie1", "Movie2")
##
temp <- apply(mydt2, 1, function(x) mydt[Movie %in% x, .N, by = User])
mydt2[, WatchedOne := lapply(temp, function(x) x[, length(N)])]
mydt2[, WatchedBoth := lapply(temp, function(x) x[, sum(N==2)])]

# Movie1        Movie2 WatchedOne WatchedBoth
# 1:    Fight Club The Godfather          2           1
# 2:    Fight Club       Titanic          2           0
# 3: The Godfather       Titanic          3           0
simone
  • 577
  • 1
  • 7
  • 15
  • is it possible to parallelize the code in order to exploit multiple processors? Thank you – mac Sep 06 '17 at 10:21
0

@sirallen @simone Thank you for your answers, I tried both ways. However, I found the fastest way to be the following:

DT_comb <- as.data.table( t( combn( movie, 2) ) )

colnames(DT_comb) <- c("movie1", "movie2")

function_1 <- function(movie_i, movie_j){
  ur_i = DT[movie == movie_i, user_ID]
  ur_j = DT[movie == movie_j, user_ID]
  x = length(intersect(ur_i, ur_j))
  return(x)
}

function_2 <- function(movie_i, movie_j){
  ur_i = DT[movie == movie_i, user_ID]
  ur_j = DT[movie == movie_j, user_ID]
  x = length(union(ur_i, ur_j))
  return(x)
}

cl <- makeCluster(detectCores() - 1)

clusterExport(cl=cl, varlist=c("DT", "function_1", "function_2"))

clusterCall(cl, function() library(data.table))

DT_comb$Watched_One <- clusterMap(cl,
                                  function_1,
                                  DT_corr$movie1,
                                  DT_corr$movie2)

DT_comb$Watched_Both <- clusterMap(cl,
                                   function_2,
                                   DT_corr$movie1,
                                   DT_corr$movie2)

stopCluster(cl)

Maybe your solutions are even faster than mine when parallelized? :-)

mac
  • 165
  • 8