0

I have some simple code, but it takes 0.006 seconds for one iteration on a small data set. I'm worried because I'm going to use it on a large data set. Here is my code:

# Weighted squared distance from every observation to every cluster centroid.
# Uses objects defined elsewhere: x, num.clust, d.num, w, num.centroid, nrows.
# d1 is allocated with one column per cluster, and num.centroid[i, ] picks the
# i-th centroid, so the loop runs over clusters rather than over attributes.
d1 <- matrix(0, nrow(x), num.clust)
for (i in seq_len(num.clust)) {
  # For numeric attribute
  # Repeat centroid i once per observation so it lines up with d.num.
  centroid.rep <- matrix(rep(as.numeric(num.centroid[i, ]), nrows),
                         nrow = nrows, byrow = TRUE)
  d1.temp <- (w * (d.num - centroid.rep))^2

  d1[, i] <- rowSums(d1.temp)
}

And this is the data frame that I used:

> head(d.num)
x3        x4
4.842316 11.754403
6.405585 11.643502
6.590780 11.478245
6.656699 11.293404
> num.centroid
     [,1]     [,2]
[1,] 7.605837 12.59816
[2,] 7.895469 12.92275

`w` is a data frame with the same dimensions as `d.num`. Any suggestions for reducing the execution time in this case?

1 Answer

1

Here is a comparison if, as I suggested, you convert the dataframes to matrices.

# original code
# Baseline for the benchmark: the data and the weights stay data frames, so
# the arithmetic inside the loop goes through the data-frame operators.
# n: number of simulated observations.
# Returns an n x 2 numeric matrix; column k holds, for each observation, the
# row sum of (w * (d.num - centroid k))^2.
f1 <- function(n) {
  # Two columns of rounded standard-normal draws; the weights copy the data.
  d.num <- data.frame(x3 = round(rnorm(n)), x4 = round(rnorm(n)))
  w <- d.num
  num.centroid <- matrix(c(7, 8, 9, 10), nrow = 2)
  n_obs <- nrow(d.num)
  dist_sq <- matrix(NA_real_, nrow = n_obs, ncol = ncol(d.num))
  for (k in seq_len(ncol(d.num))) {
    # Row k of the centroid matrix, repeated once per observation.
    centre <- matrix(rep(num.centroid[k, ], n_obs),
                     nrow = n_obs, byrow = TRUE)
    weighted_sq <- (w * (d.num - centre))^2
    dist_sq[, k] <- rowSums(weighted_sq)
  }
  dist_sq
}
# using as.matrix
# Matrix variant for the benchmark: same computation as the data-frame
# version, but the data and the weights are converted with as.matrix() first,
# so the loop works on plain numeric matrices.
# n: number of simulated observations.
# Returns an n x 2 numeric matrix; column j holds, for each observation, the
# row sum of (w * (d.num - centroid j))^2.
f2 <- function(n) {
  sim <- data.frame(x3 = round(rnorm(n)), x4 = round(rnorm(n)))
  # The weights start as a copy of the data; both are converted separately.
  d.num <- as.matrix(sim)
  w <- as.matrix(sim)
  num.centroid <- matrix(c(7, 8, 9, 10), nrow = 2)
  n_obs <- nrow(d.num)
  n_col <- ncol(d.num)
  out <- matrix(NA_real_, nrow = n_obs, ncol = n_col)
  for (j in seq_len(n_col)) {
    # Row j of the centroid matrix, repeated once per observation.
    centre_j <- matrix(rep(num.centroid[j, ], n_obs),
                       nrow = n_obs, byrow = TRUE)
    out[, j] <- rowSums((w * (d.num - centre_j))^2)
  }
  out
}

Benchmarks:

> library(microbenchmark)
> n <- 10
> microbenchmark(
+   code1 = f1(n),
+   code2 = f2(n),
+   times = 1000
+ )
Unit: microseconds
  expr      min        lq        mean   median       uq      max neval
 code1 1432.443 1480.8605 1628.889978 1545.789 1631.022 6229.565  1000
 code2  263.284  278.9020  313.293371  290.505  307.239 3138.880  1000
> n <- 1000
> microbenchmark(
+   code1 = f1(n),
+   code2 = f2(n),
+   times = 1000
+ )
Unit: microseconds
  expr      min       lq        mean   median        uq        max neval
 code1 1884.934 1924.873 2290.409508 1974.183 2111.8490 114038.521  1000
 code2  571.192  583.687  642.682019  601.537  637.4595   3499.891  1000

The second code is clearly faster.

Stéphane Laurent
  • 75,186
  • 15
  • 119
  • 225