1

I have a database of matches with players and players'scores for each game. I am trying to create a rating variable for my prediction model. I am using formula from a blogpost.

Here is the dummy dataset:

df = data.frame(
matchid = c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,4),
playerid = c(2,3,4,5,6,7,8,9,10,11,5,2,3,4,6,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,17,19,21,18,20,22,26,24,25,23),
point = c(52,38,34,33,16,19,16,8,10,2,38,37,31,34,21,18,18,13,9,-2,45,34,37,39,12,9,7,-3,-1,-8,47,38,31,17,26,32,28,17,16,9))

Here is my attempt using for loop. The for loop run extremely slow for 30000 games database. Please give me some pointers on how to improve this process / loop. I really have no idea.

## Initialize initial rating for each player 
players_ratings = data.frame(playerid = unique(df$playerid),rating = 1000, stringsAsFactors = FALSE)

## Initialize unique matches 
unique_matches = df$matchid %>% unique 

## Matches with rating 
relative_rating_matches = list(length(df))

### GENERATE RATING 
for(index in 1:length(unique_matches)){
  match = df %>% filter(matchid == unique_matches[[index]])
  position = index

  ## UPDATE RATING
  match = match %>% left_join(players_ratings,by = 'playerid')
  relative_rating_matches[[position]] = match

  print(match)

  ## BUILD ACTUAL RESULTS MATRIX      
  S = matrix(nrow = 10, ncol = 10)
  rownames(S) = match$playerid
  colnames(S) = match$playerid
  for(i in 1:nrow(S)) {
    for(j in 1:ncol(S)) {
      player_row_point = as.numeric(match %>% filter(playerid == rownames(S)[i]) %>% select(point))
      player_col_point = as.numeric(match %>% filter(playerid == colnames(S)[j]) %>% select(point))
      S[i,j] = ifelse(player_col_point == player_row_point,0.5,
                      ifelse(player_col_point > player_row_point,1,0))
    }
  }
  diag(S)= 0
  print(S)

  ## BUILD EXPECTED WIN/LOSS MATRIX 
  E = matrix(nrow = 10, ncol = 10)
  rownames(E) = match$playerid
  colnames(E) = match$playerid

  for(i in 1:nrow(E)) {
    for(j in 1:ncol(E)) {
      player_row_rating = as.numeric(match %>% filter(playerid == rownames(E)[i]) %>% select(rating))
      player_col_rating = as.numeric(match %>% filter(playerid == colnames(E)[j]) %>% select(rating))

      r = 1 + 10^((player_row_rating - player_col_rating)/400)
      expected_result = 1/r
      E[i,j] = expected_result
    }
  }
  diag(E) = 0
  print(E)

  ## GENERATE INCREMENTAL RATING
  R = 20 * (S-E)
  R = as.data.frame(colSums(R)) %>% rownames_to_column()

  print(R)

  ## UPDATE EXISTING RATING DATABASE
  for(i in 1:nrow(R)){
    player_id  = R[i,1]
    incre_rating = ifelse(is.na(R[i,2]),0,R[i,2])

    cur_rating = players_ratings[players_ratings$playerid == player_id,2]

    players_ratings[players_ratings$playerid == player_id,2] = cur_rating + incre_rating
  }
}
Tung
  • 26,371
  • 7
  • 91
  • 115
Khiem Nguyen
  • 129
  • 1
  • 11
  • why don't you make a small sample, like 1000 observations. it will be 30 times faster. – Sal-laS Sep 12 '18 at 17:29
  • @SalmanLashkarara: I think this sample is enough. My example consists of 4 games (10 players per game). It's not any different with more observations. – Khiem Nguyen Sep 12 '18 at 17:32
  • FYI using `%>%` inside `for` loop can make the code slower https://stackoverflow.com/a/38882226/786542 – Tung Sep 12 '18 at 18:48
  • Also consider `dplyr::if_else()` instead of `ifelse` & `data.table` package to gain more speed & use less RAM – Tung Sep 12 '18 at 18:55

0 Answers0