I have a horse racing dataset. For each race record whose track value is not missing, I want to count how many times the horse won in the past two years at the same venue, on the same track, and over a similar distance (within 200 of the current race's distance). I use `apply` to loop over the rows. I only want to add a new column with the count of previous wins to the original dataset, so the output should have one more column and the same number of rows as the input. However, this is very slow. How can I speed up the loop?
Columns: rdate: race date (year-month-day). venue: ST or HV. track: TURF or All WEATHER TRACK. distance: 1200, 1400, 1600, 1800, etc. ind_win: 0 (the horse did not win 1st place) or 1 (the horse won 1st place).
# Sample race records in dput() form. The literal is bound to `jc.data`
# because the code below reads the data under that name; left unassigned,
# the value would be built and immediately discarded.
jc.data <- structure(list(rdate = structure(c(17450, 17475, 17481, 17496,
17510, 17517, 17532, 17566, 17593, 17615, 17629, 17657, 17667,
17796, 17817, 17839, 17856, 17860, 17881, 17881, 17902, 17902
), class = "Date"), venue = c("HV", "ST", "ST", "ST", "ST", "ST",
"ST", "ST", "ST", "ST", "ST", "ST", "HV", "ST", "ST", "ST", "HV",
"ST", "ST", "ST", "ST", "ST"), track = c("TURF", "TURF", "TURF",
"TURF", "TURF", "TURF", "TURF", "TURF", "TURF", "TURF", "TURF",
"TURF", "TURF", "TURF", "TURF", "TURF", "TURF", "TURF", "TURF",
"TURF", "TURF", "TURF"), horsenum = c("A366", "A366", "A366",
"A366", "A366", "A366", "A366", "A366", "A366", "A366", "A366",
"A366", "A366", "B440", "B440", "B440", "A366", "B440", "A366",
"B440", "A366", "B440"), distance = c(1800L, 1800L, 1600L, 1600L,
1800L, 1600L, 1800L, 1800L, 1800L, 1600L, 1800L, 2000L, 1800L,
1200L, 1400L, 1400L, 1650L, 1400L, 1600L, 1400L, 1800L, 1400L
), ind_win = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L)), row.names = c(NA, -22L
), class = "data.frame")
library(tidyverse)
library(lubridate)
# Shift dates back by two calendar years using base R only.
# A 29 February whose target year is not a leap year resolves to 1 March
# rather than NA, so leap-day races still get a usable look-back window.
.two_years_before <- function(dates) {
  lt <- as.POSIXlt(dates)
  lt$year <- lt$year - 2L
  as.Date(lt)
}

# Count the wins among one set of candidate races (already restricted to the
# same horse, venue and track) that are strictly earlier than `day`, strictly
# later than `start_day`, and within 200 of `dist`. Dates are passed as
# numeric day counts. Returns NA when no candidate falls inside the window.
.wins_in_window <- function(past_day, past_dist, past_win,
                            day, start_day, dist) {
  # which() drops NA comparisons, so incomplete candidates never match.
  hit <- which(
    past_day < day & past_day > start_day & abs(past_dist - dist) <= 200
  )
  if (length(hit) == 0L) {
    return(NA_integer_)
  }
  # A missing ind_win is treated as "not a win" instead of inflating the count.
  sum(past_win[hit] == 1, na.rm = TRUE)
}

#' Count a horse's earlier wins under comparable race conditions.
#'
#' For a race record, looks at the same horse's races at the same venue and
#' track, dated within the two years before the record (exclusive at both
#' ends) and with a distance within 200 of the record's distance, and counts
#' how many of them have `ind_win == 1`.
#'
#' Two call shapes are supported:
#' * a single record (for example a row passed by
#'   `apply(jc.data, 1, HWinCountF)`): the record is looked up in `data` and
#'   one value is returned;
#' * a whole data frame: every row is scored against the other rows of that
#'   same data frame and one value per row is returned, e.g.
#'   `jc.data$h_win_count <- HWinCountF(jc.data)`. Rows are only compared
#'   within their horse/venue/track group, which avoids scanning the full
#'   table once per row.
#'
#' @param df A single record with elements `horsenum`, `venue`, `track`,
#'   `rdate` and `distance`, or a data frame with those columns plus
#'   `ind_win`.
#' @param data Race history searched when `df` is a single record. Defaults
#'   to the global `jc.data`; ignored when `df` is a data frame.
#' @return An integer count per record. `NA` when the record's track is
#'   missing or when no earlier comparable race exists.
HWinCountF <- function(df, data = jc.data) {
  if (!is.data.frame(df)) {
    if (is.na(df[["track"]])) {
      return(NA_integer_)
    }
    same <- which(
      data[["horsenum"]] == df[["horsenum"]] &
        data[["venue"]] == df[["venue"]] &
        data[["track"]] == df[["track"]]
    )
    race_date <- as.Date(df[["rdate"]])
    return(.wins_in_window(
      past_day = as.numeric(as.Date(data[["rdate"]][same])),
      past_dist = as.numeric(data[["distance"]][same]),
      past_win = data[["ind_win"]][same],
      day = as.numeric(race_date),
      start_day = as.numeric(.two_years_before(race_date)),
      dist = as.integer(df[["distance"]])
    ))
  }

  counts <- rep(NA_integer_, nrow(df))
  race_date <- as.Date(df[["rdate"]])
  day <- as.numeric(race_date)
  start_day <- as.numeric(.two_years_before(race_date))
  past_dist <- as.numeric(df[["distance"]])
  race_dist <- as.integer(df[["distance"]])
  won <- df[["ind_win"]]

  # Rows with a missing key can never match anything and keep their NA.
  usable <- !is.na(df[["horsenum"]]) & !is.na(df[["venue"]]) &
    !is.na(df[["track"]])
  key <- paste(df[["horsenum"]], df[["venue"]], df[["track"]], sep = "\037")

  for (idx in split(which(usable), key[usable])) {
    group_day <- day[idx]
    group_dist <- past_dist[idx]
    group_won <- won[idx]
    counts[idx] <- vapply(
      idx,
      function(i) {
        .wins_in_window(
          group_day, group_dist, group_won,
          day[i], start_day[i], race_dist[i]
        )
      },
      integer(1)
    )
  }
  counts
}
# Score each record row by row with HWinCountF and store the results in a
# new column `h_win_count`; the row count of jc.data is unchanged.
jc.data["h_win_count"] <- apply(X = jc.data, MARGIN = 1, FUN = HWinCountF)