I'm computing a distance matrix, to be used for some clustering. The distance matrix size will result 4.6 gb, so I need a very fast code in order to compute it!
I've read that the best way to do this is to "vectorialize" my function, however I'm not really good at programming with R and for the moment I came up just with a solution that has 2 nested loops!
The distance function takes as input 2 geographical coordinates and 2 strings and computes the distance in the following way:
require(Imap)
mydist <- function (lat1,lon1,lingua1,lat2,lon2,lingua2,DT){
delta=0.1
gamma=3
d=sqrt(delta*gdist(lon1,lat1,lon2,lat2)^2 + gamma*(DT[language1 %in% lingua1 & language2 %in%lingua2]$distance)^2)
}
It reads the distance of my two strings from the data.table DT where I have stored all the possible distances of my strings
The function that allocates the matrix is:
require(bigmemory)
distmatrix <- function(twit2,DT){
N=dim(twit2)[1]
distmat = big.matrix(N,N)
for(i in 1:N){
for(j in 1:N){
distmat[i,j]=mydist(twit2[i,]$longitude,twit2[i,]$latitude,twit2[i,]$language,twit2[j,]$longitude,twit2[j,]$latitude,twit2[j,]$language,DT)
}
}
return(distmat)
}
EDIT: I'm working on an alternative way, which is to use library(fossil) in which is implemented a vectorial version of the geodesic distance computation
Also, I've moved DT to DT2 which is now a square matrix
library(fossil)
lingdist <- function(lang1,lang2, DT2){
list=colnames(DT2)
i=which(list==lang1)
j=which(list==lang2)
return(DT2[i,j])
}
distmatrix <- function(twit2,DT2){
N=dim(twit2)[1]
long<-as.vector(twit2$longitude)
lat <-as.vector(twit2$latitude)
lang<-as.vector(twit2$language)
distmat = t(as.matrix(earth.dist(as.matrix(cbind(long,lat)))))
for(i in 1:N) {
for (j in i:N) {
distmat[i,j]=sqrt(distmat[i,j]*distmat[i,j] + lingdist(lang[i],lang[j],DT2))
}
}
return(distmat)
}
This has achieved a substantial speedup, 20x, with "small" input (up to 5k rows) but fails to allocate distmat with my whole dataframe (24k rows)
Do you have any idea on how to solve it?
EDIT2: here is a small version of my data bases
dput(DT2[1:5,1:5])
structure(c(0, 0.808204378308, 0.873223132147, 0.885209298235,
0.849854297278, 0.808204378308, 0, 0.881177373275, 0.854352223232,
0.854317529225, 0.873223132147, 0.881177373275, 0, 0.834454614055,
0.861541199715, 0.885209298235, 0.854352223232, 0.834454614055,
0, 0.76583938666, 0.849854297278, 0.854317529225, 0.861541199715,
0.76583938666, 0), .Dim = c(5L, 5L), .Dimnames = list(c("1",
"2", "3", "4", "5"), c("AA.SEMITIC.ARABIC_GULF_SPOKEN", "Alt.TURKIC.TURKISH",
"An.MESO-PHILIPPINE.TAGALOG", "IE.BALTIC.LITHUANIAN", "IE.CELTIC.WELSH"
)))
dput(twit4[1:40,])
structure(list(day = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), nil = c(28L,
28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L, 28L, 71L, 71L,
20L, 5L, 24L, 49L, 50L, 28L, 28L, 22L, 22L, 21L, 21L, 24L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 24L
), longitude = c(9.2235078, 9.22355903, 9.22362504, 9.22318987,
9.22355654, 9.22361992, 9.22348964, 9.22366317, 9.22383346, 9.2238841,
9.22374533, 9.22351081, 9.1361611, 9.1361805, 9.2144687, 9.1871549,
9.2504309, 9.14652258, 9.16928, 9.22321188, 9.22387642, 9.2237509,
9.22372656, 9.22278207, 9.2225214, 9.2470243, 9.22405217, 9.22404052,
9.22405638, 9.22396956, 9.22402622, 9.2239671, 9.2239646, 9.22400299,
9.22400299, 9.22403204, 9.22396816, 9.22404027, 9.22407831, 9.246786
), latitude = c(45.45206021, 45.45202558, 45.4523043, 45.45211746,
45.45204048, 45.45232425, 45.45207132, 45.45205533, 45.45218499,
45.45216514, 45.45220716, 45.45214255, 45.5053803, 45.5053559,
45.4871762, 45.4539539, 45.4660934, 45.45278042, 45.455855, 45.45882439,
45.46055371, 45.47414199, 45.47947343, 45.48080458, 45.48119442,
45.4658805, 45.49167007, 45.49168084, 45.49160813, 45.49164877,
45.49165014, 45.49163468, 45.49165405, 45.49169004, 45.49169004,
45.49160814, 45.49164155, 45.49161845, 45.49160889, 45.4660437
), language = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 4L, 4L, 1L, 8L, 4L, 4L, 8L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("AA.SEMITIC.ARABIC_GULF_SPOKEN",
"AA.SEMITIC.HEBREW", "Alt.TURKIC.TURKISH", "An.MESO-PHILIPPINE.TAGALOG",
"AuA.VIET-MUONG.VIETNAMESE", "IE.ARMENIAN.EASTERN_ARMENIAN",
"IE.BALTIC.LATVIAN", "IE.BALTIC.LITHUANIAN", "IE.CELTIC.WELSH",
"IE.GERMANIC.DANISH", "IE.GERMANIC.DUTCH", "IE.GERMANIC.ICELANDIC",
"IE.GERMANIC.NORWEGIAN_BOKMAAL", "IE.GERMANIC.STANDARD_GERMAN",
"IE.GERMANIC.SWEDISH", "IE.GREEK.GREEK", "IE.INDIC.HINDI", "IE.IRANIAN.PERSIAN",
"IE.ROMANCE.FRENCH", "IE.ROMANCE.PORTUGUESE", "IE.ROMANCE.ROMANIAN",
"IE.ROMANCE.SPANISH", "IE.SLAVIC.BOSNIAN", "IE.SLAVIC.BULGARIAN",
"IE.SLAVIC.CROATIAN", "IE.SLAVIC.POLISH", "IE.SLAVIC.RUSSIAN",
"IE.SLAVIC.SERBOCROATIAN", "IE.SLAVIC.SLOVAK", "IE.SLAVIC.SLOVENIAN",
"IE.SLAVIC.UKRAINIAN", "Jap.JAPANESE.JAPANESE", "Kor.KOREAN.KOREAN",
"Krt.KARTVELIAN.GEORGIAN", "Oth.CREOLES_AND_PIDGINS.HAITIAN_CREOLE",
"ST.CHINESE.CANTONESE", "TK.KAM-TAI.THAI", "Ura.FINNIC.ESTONIAN",
"Ura.FINNIC.FINNISH", "Ura.UGRIC.HUNGARIAN"), class = "factor")), .Names = c("day",
"nil", "longitude", "latitude", "language"), row.names = c("2",
"6", "7", "8", "13", "15", "16", "20", "25", "29", "30", "32",
"84", "86", "195", "266", "322", "467", "495", "521", "524",
"534", "542", "546", "550", "580", "624", "640", "668", "676",
"679", "699", "742", "751", "754", "768", "779", "800", "803",
"857"), class = "data.frame")