
I'm computing a distance matrix to be used for some clustering. The distance matrix will be 4.6 GB, so I need very fast code to compute it!

I've read that the best way to do this is to "vectorize" my function; however, I'm not very good at programming in R, and for the moment I've only come up with a solution that has two nested loops!

The distance function takes as input two geographical coordinates and two strings, and computes the distance as follows:

require(Imap)
require(data.table)

mydist <- function(lat1, lon1, lingua1, lat2, lon2, lingua2, DT) {
  delta <- 0.1
  gamma <- 3
  # weighted combination of the geodesic distance (Imap::gdist) and the
  # linguistic distance looked up in the data.table DT
  d <- sqrt(delta * gdist(lon1, lat1, lon2, lat2)^2 +
            gamma * (DT[language1 %in% lingua1 & language2 %in% lingua2]$distance)^2)
  return(d)
}

It looks up the distance between the two strings in the data.table DT, where I have stored all the possible pairwise distances between my strings.
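For reference, here is a minimal sketch of the layout DT is assumed to have, with column names taken from the lookup in mydist (the distance value is copied from the TURKISH/WELSH entry of the DT2 sample in EDIT2 below):

library(data.table)

# Assumed layout of DT: one row per ordered language pair
DT <- data.table(
  language1 = c("Alt.TURKIC.TURKISH", "IE.CELTIC.WELSH"),
  language2 = c("IE.CELTIC.WELSH",    "Alt.TURKIC.TURKISH"),
  distance  = c(0.854317529225,       0.854317529225)
)

DT[language1 %in% "Alt.TURKIC.TURKISH" & language2 %in% "IE.CELTIC.WELSH"]$distance
#> [1] 0.8543175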

The function that allocates the matrix is:

require(bigmemory)

distmatrix <- function(twit2, DT) {
  N <- dim(twit2)[1]
  distmat <- big.matrix(N, N)
  for (i in 1:N) {
    for (j in 1:N) {
      # mydist expects latitude first, then longitude
      distmat[i, j] <- mydist(twit2[i, ]$latitude, twit2[i, ]$longitude, twit2[i, ]$language,
                              twit2[j, ]$latitude, twit2[j, ]$longitude, twit2[j, ]$language, DT)
    }
  }
  return(distmat)
}

EDIT: I'm working on an alternative approach, which is to use library(fossil), which implements a vectorized version of the geodesic distance computation.

Also, I've replaced DT with DT2, which is now a square matrix:

library(fossil)

lingdist <- function(lang1, lang2, DT2) {
  # DT2 is a square matrix whose column names are the language labels
  langs <- colnames(DT2)
  i <- which(langs == lang1)
  j <- which(langs == lang2)
  return(DT2[i, j])
}
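As a quick sanity check against the 5 x 5 sample DT2 given in EDIT2 below:

lingdist("Alt.TURKIC.TURKISH", "An.MESO-PHILIPPINE.TAGALOG", DT2)
#> [1] 0.8811774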

distmatrix <- function(twit2, DT2) {
  N <- dim(twit2)[1]
  long <- as.vector(twit2$longitude)
  lat  <- as.vector(twit2$latitude)
  lang <- as.vector(twit2$language)
  # earth.dist computes all pairwise geodesic distances (in km) at once
  distmat <- as.matrix(earth.dist(as.matrix(cbind(long, lat))))
  for (i in 1:N) {
    for (j in i:N) {
      distmat[i, j] <- sqrt(distmat[i, j]^2 + lingdist(lang[i], lang[j], DT2))
      distmat[j, i] <- distmat[i, j]  # mirror, so the lower triangle is not left as the raw geodesic distance
    }
  }
  return(distmat)
}
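For reference, the linguistic lookup itself can also be vectorized. A minimal sketch (assuming DT2's column names exactly match the language labels used in twit2) that replaces the remaining double loop with a single submatrix lookup:

distmatrix_vec <- function(twit2, DT2) {
  # all pairwise geodesic distances (km) in one call
  geo <- as.matrix(earth.dist(cbind(twit2$longitude, twit2$latitude)))
  # column index of each row's language in DT2
  idx <- match(as.character(twit2$language), colnames(DT2))
  # one N x N submatrix lookup replaces all the lingdist() calls
  ling <- DT2[idx, idx]
  # same combination as in the loop above
  sqrt(geo^2 + ling)
}

This removes the R-level loops entirely, but it still holds two N x N dense matrices in memory, so it does not by itself fix the allocation failure at 24k rows.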

This achieved a substantial speedup (about 20x) on "small" inputs (up to 5k rows), but it fails to allocate distmat with my whole data frame (24k rows).

Do you have any idea how to solve this?

EDIT2: here is a small sample of my data sets:

dput(DT2[1:5,1:5])
structure(c(0, 0.808204378308, 0.873223132147, 0.885209298235, 
0.849854297278, 0.808204378308, 0, 0.881177373275, 0.854352223232, 
0.854317529225, 0.873223132147, 0.881177373275, 0, 0.834454614055, 
0.861541199715, 0.885209298235, 0.854352223232, 0.834454614055, 
0, 0.76583938666, 0.849854297278, 0.854317529225, 0.861541199715, 
0.76583938666, 0), .Dim = c(5L, 5L), .Dimnames = list(c("1", 
"2", "3", "4", "5"), c("AA.SEMITIC.ARABIC_GULF_SPOKEN", "Alt.TURKIC.TURKISH", 
"An.MESO-PHILIPPINE.TAGALOG", "IE.BALTIC.LITHUANIAN", "IE.CELTIC.WELSH"
)))

dput(twit4[1:40,])
structure(list(day = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), nil = c(28L, 
28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L, 71L, 71L, 
20L, 5L, 24L, 49L, 50L, 28L, 28L, 22L, 22L, 21L, 21L, 24L, 20L, 
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 24L
), longitude = c(9.2235078, 9.22355903, 9.22362504, 9.22318987, 
9.22355654, 9.22361992, 9.22348964, 9.22366317, 9.22383346, 9.2238841, 
9.22374533, 9.22351081, 9.1361611, 9.1361805, 9.2144687, 9.1871549, 
9.2504309, 9.14652258, 9.16928, 9.22321188, 9.22387642, 9.2237509, 
9.22372656, 9.22278207, 9.2225214, 9.2470243, 9.22405217, 9.22404052, 
9.22405638, 9.22396956, 9.22402622, 9.2239671, 9.2239646, 9.22400299, 
9.22400299, 9.22403204, 9.22396816, 9.22404027, 9.22407831, 9.246786
), latitude = c(45.45206021, 45.45202558, 45.4523043, 45.45211746, 
45.45204048, 45.45232425, 45.45207132, 45.45205533, 45.45218499, 
45.45216514, 45.45220716, 45.45214255, 45.5053803, 45.5053559, 
45.4871762, 45.4539539, 45.4660934, 45.45278042, 45.455855, 45.45882439, 
45.46055371, 45.47414199, 45.47947343, 45.48080458, 45.48119442, 
45.4658805, 45.49167007, 45.49168084, 45.49160813, 45.49164877, 
45.49165014, 45.49163468, 45.49165405, 45.49169004, 45.49169004, 
45.49160814, 45.49164155, 45.49161845, 45.49160889, 45.4660437
), language = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 4L, 4L, 1L, 8L, 4L, 4L, 8L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("AA.SEMITIC.ARABIC_GULF_SPOKEN", 
"AA.SEMITIC.HEBREW", "Alt.TURKIC.TURKISH", "An.MESO-PHILIPPINE.TAGALOG", 
"AuA.VIET-MUONG.VIETNAMESE", "IE.ARMENIAN.EASTERN_ARMENIAN", 
"IE.BALTIC.LATVIAN", "IE.BALTIC.LITHUANIAN", "IE.CELTIC.WELSH", 
"IE.GERMANIC.DANISH", "IE.GERMANIC.DUTCH", "IE.GERMANIC.ICELANDIC", 
"IE.GERMANIC.NORWEGIAN_BOKMAAL", "IE.GERMANIC.STANDARD_GERMAN", 
"IE.GERMANIC.SWEDISH", "IE.GREEK.GREEK", "IE.INDIC.HINDI", "IE.IRANIAN.PERSIAN", 
"IE.ROMANCE.FRENCH", "IE.ROMANCE.PORTUGUESE", "IE.ROMANCE.ROMANIAN", 
"IE.ROMANCE.SPANISH", "IE.SLAVIC.BOSNIAN", "IE.SLAVIC.BULGARIAN", 
"IE.SLAVIC.CROATIAN", "IE.SLAVIC.POLISH", "IE.SLAVIC.RUSSIAN", 
"IE.SLAVIC.SERBOCROATIAN", "IE.SLAVIC.SLOVAK", "IE.SLAVIC.SLOVENIAN", 
"IE.SLAVIC.UKRAINIAN", "Jap.JAPANESE.JAPANESE", "Kor.KOREAN.KOREAN", 
"Krt.KARTVELIAN.GEORGIAN", "Oth.CREOLES_AND_PIDGINS.HAITIAN_CREOLE", 
"ST.CHINESE.CANTONESE", "TK.KAM-TAI.THAI", "Ura.FINNIC.ESTONIAN", 
"Ura.FINNIC.FINNISH", "Ura.UGRIC.HUNGARIAN"), class = "factor")), .Names = c("day", 
"nil", "longitude", "latitude", "language"), row.names = c("2", 
"6", "7", "8", "13", "15", "16", "20", "25", "29", "30", "32", 
"84", "86", "195", "266", "322", "467", "495", "521", "524", 
"534", "542", "546", "550", "580", "624", "640", "668", "676", 
"679", "699", "742", "751", "754", "768", "779", "800", "803", 
"857"), class = "data.frame")
mariob6
  • one hack would be to just use an upper triangular matrix. You really don't need to compute the distance between points (A and B) and then between points (B and A). To achieve this, just changing `for(j in 1:N)` to `for(j in i:N)` will help. – abhiieor Jun 20 '16 at 09:03
  • You should do the data.frame subsetting outside the loop, e.g., `long <- twit2$longitude`, and then use `long[i]` inside the loop. But most likely, sufficient speed can only be achieved with compiled code, e.g., using Rcpp. – Roland Jun 20 '16 at 09:03
  • Do you know that [post](http://stackoverflow.com/questions/2908822/speed-up-the-loop-operation-in-r)? – Christoph Jun 20 '16 at 09:26
  • see my answer [here](http://stackoverflow.com/questions/27847196/distance-calculation-on-large-vectors-performance/33409695#33409695) – Patric Jun 20 '16 at 23:22
  • Could you provide a reproducible example? Perhaps a small sample of your data? – rafa.pereira Jun 20 '16 at 23:34

1 Answer


This is by far the fastest alternative I've found, and it is based on data.table. It requires the data to be organized in long format, so that each row of your data set holds one combination of origin (lat, long) and destination (lat, long).

Four simple steps:

# load libraries
library(geosphere)
library(data.table)
library(reshape)   # for expand.grid.df(), used in STEP 2

### STEP 1. Reshape your matrix with language distances to long format
setDT(DT)[, language := names(DT)]
DT_long <- melt.data.table(DT, id.vars = "language", variable.name = "language2", value.name = "lingdist")
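If DT is not at hand as a data.frame but only as the square matrix DT2 from the question, a small bridging step (a sketch) gets to the same DT_long:

# Convert the square matrix DT2 (column names = language labels) into a
# data.table, then melt it exactly as in STEP 1
DT <- as.data.table(DT2)
DT[, language := colnames(DT2)]
DT_long <- melt.data.table(DT, id.vars = "language",
                           variable.name = "language2", value.name = "lingdist")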


### STEP 2. Get all possible combinations of origin and destination in long format
df <- expand.grid.df(twit4, twit4)
names(df)[c(3,4,5,8,9,10)] <- c("long_orig", "lat_orig", "language", "long_dest", "lat_dest","language2")


### STEP 3. Efficiently merge the two datasets
setDT(df)   # expand.grid.df returns a data.frame; setkey needs a data.table
setkey(DT_long, "language", "language2")
setkey(df, "language", "language2")
df <- df[DT_long, nomatch=0]


### STEP 4. Calculate distances (distGeo returns metres, hence the /1000 to get km)
df[ , dist := lingdist + distGeo(matrix(c(long_orig, lat_orig), ncol = 2), 
                                 matrix(c(long_dest, lat_dest), ncol = 2))/1000]
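If the clustering routine afterwards expects a square matrix rather than this long table, one possible way back (a sketch; id_orig and id_dest are hypothetical row identifiers that would have to be added to twit4 before STEP 2) is to cast the result:

# id_orig / id_dest are hypothetical id columns, e.g. added beforehand with
# twit4$id <- seq_len(nrow(twit4)) and renamed alongside the others in STEP 2
wide <- dcast(df, id_orig ~ id_dest, value.var = "dist")
distmat <- as.matrix(wide[, -1, with = FALSE])
rownames(distmat) <- wide$id_orig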

This solution should be relatively fast. The bottleneck here is the expand.grid.df operation, which is probably the part of the code that takes the most time, especially when working with large data sets as in your case. I'm pretty sure there must be a faster alternative to expand.grid.df, though; one possibility is sketched below, and I'll update this answer when I find a definitive one.
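One possible alternative (a sketch, not benchmarked) is to build the pairwise combinations with data.table's cross join CJ() on row indices, avoiding the data.frame copying done by expand.grid.df:

library(data.table)

twit_dt <- as.data.table(twit4)
n <- nrow(twit_dt)

# every index pair (i, j), then assemble origin and destination columns
idx <- CJ(i = seq_len(n), j = seq_len(n))
df  <- cbind(twit_dt[idx$i, .(long_orig = longitude, lat_orig = latitude, language)],
             twit_dt[idx$j, .(long_dest = longitude, lat_dest = latitude, language2 = language)])

STEPs 3 and 4 then apply unchanged. Note that this still materializes n^2 rows, so for 24k rows memory, not speed, remains the binding constraint; processing the index pairs in row blocks would keep the footprint bounded.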

rafa.pereira
  • It's quite an interesting solution, but there are two problems: 1) my definition of distance is different, since it also has a "linguistic distance" component // 2) with a large data frame I'm not able to process expand.grid.df – mariob6 Jun 21 '16 at 00:17
  • I've updated my answer to incorporate the linguistic distance in the operation. Did you want to sum the two types of distances? Regarding your point 2, `expand.grid.df` is indeed the bottleneck here, but I'm sure there must be a faster alternative to it – rafa.pereira Jun 21 '16 at 08:38