0

This is my first stab at this:

library(dplyr)

# Spacing between candidate values on each axis (0, 5, ..., 100).
step_size <- 5

# Every combination of the three axes.
grid <- expand.grid(
  x1 = seq(0, 100, step_size),
  x2 = seq(0, 100, step_size),
  x3 = seq(0, 100, step_size)
)

# Rescale each row so that its three values add up to 100. The all-zero row
# has a raw total of 0, so it turns into NaN here and is filtered out below.
grid$sum <- with(grid, x1 + x2 + x3)
grid[c("x1", "x2", "x3")] <- lapply(
  grid[c("x1", "x2", "x3")],
  function(v) (v / grid$sum) * 100
)
grid$sum <- with(grid, x1 + x2 + x3)

nrow(grid)

# Different raw rows can rescale to the same proportions, so keep one copy of
# each and drop the NaN row.
result <- grid %>%
  distinct() %>%
  filter(!is.na(sum))

head(result, n = 20)
nrow(result)

Basically, I want to create a data frame with as many rows as possible, where the values in each row add up to 100 and the rows are uniformly distributed.

Is there an easier or better approach in R? Thanks!

cs0815
  • 16,751
  • 45
  • 136
  • 299

2 Answers

1

Using data.table...

library(data.table)

# All integer triples with each coordinate in 0..100.
grid <- expand.grid(x1 = 0:100, x2 = 0:100, x3 = 0:100)

# Convert to a data.table by reference (no copy).
setDT(grid)

# Logical mask of the rows whose three values add up to exactly 100.
sums_to_target <- grid[, rowSums(.SD) == 100]
res <- grid[sums_to_target]

# Store the row total alongside the values so it can be checked afterwards.
res[, summation := rowSums(.SD)]

Result:

> res[, unique(summation)]
[1] 100

This can also be done in base R, but data.table is faster:

library(data.table)

# Two identical grids of integer triples (0..100 on each axis): `grid2` stays
# a plain data.frame for the base R variant, `grid` becomes a data.table.
grid2 <- expand.grid(x1 = 0:100, x2 = 0:100, x3 = 0:100)
grid <- expand.grid(x1 = 0:100, x2 = 0:100, x3 = 0:100)
setDT(grid)

# Time the same filter (rows summing to 100) in both implementations.
microbenchmark::microbenchmark(
  data.table = {
    res <- grid[grid[, rowSums(.SD) == 100], ]
  },
  base = {
    res2 <- grid2[rowSums(grid2) == 100, ]
  }
)

Unit: milliseconds
       expr      min       lq     mean   median       uq      max neval cld
 data.table 59.41157  89.6700 109.0462 107.7415 124.2675 183.9730   100  a 
       base 65.70521 109.6471 154.1312 125.4238 156.9168 611.0169   100   b
JdeMello
  • 1,708
  • 15
  • 23
1

Here's a simple function. You can specify how many rows/columns you want, and what each row sums to.

#' Generate random rows that each sum to a fixed total
#'
#' Draws a `rows` x `cols` matrix of independent `runif()` values and rescales
#' every row so that its entries add up to `rowTotal`.
#'
#' Note: rescaling independent uniform draws does not, in general, spread the
#' rows uniformly over all possible rows with that total.
#'
#' @param cols Number of columns (values per row).
#' @param rows Number of rows to generate.
#' @param rowTotal Value that each row should sum to.
#' @return A data frame with `rows` rows and `cols` numeric columns.
func <- function(cols = 3, rows = 10, rowTotal = 100) {
  # Build the matrix explicitly: replicate() collapses to a plain vector when
  # rows = 1, which made the subsequent apply() over columns fail.
  draws <- matrix(runif(rows * cols), nrow = rows, ncol = cols)
  # The row-sum vector recycles down each column, so every entry is divided by
  # the total of its own row; the sums are computed once, not once per column.
  data.frame(draws / rowSums(draws) * rowTotal)
}

rowSums(func()) # default values (3 cols, 10 rows, each row sums to 100)
rowSums(func(cols = 5, rows = 10, rowTotal = 50)) # 5 cols, 10 rows, each row sums to 50
hsl
  • 670
  • 2
  • 10
  • 22
  • Not sure if it's possible to generate variables that are uniformly distributed though. You might also want to check out this question: https://stackoverflow.com/questions/11003967/generate-3-random-number-that-sum-to-1-in-r – hsl Jan 17 '19 at 19:08