Hi, I'm currently doing web usage mining. For that I need to loop through all data entries (204,002 rows; each row is a web session containing the timestamps and the pages accessed) and do some work on them. Here is a dput of the data:
structure(list(cookie = "1",
paths = list(c("LMCash", "LMCash", "LMCash", "LMCash", "LMCash",
"LMCash", "LMCash", "LMCash", "LMCash", "LMCash", "LMCash",
"LMCash", "LMCash", "LMCash", "LMCash", "LMCash", "LMCash",
"SYSTEM", "SYSTEM", "SYSTEM")), time = list(c("2017-05-01T00:00:00.000Z",
"2017-05-01T00:00:10.000Z", "2017-05-01T00:00:41.000Z", "2017-05-01T00:00:48.000Z",
"2017-05-01T00:03:28.000Z", "2017-05-01T00:03:40.000Z", "2017-05-01T00:03:53.000Z",
"2017-05-01T00:04:09.000Z", "2017-05-01T00:04:17.000Z", "2017-05-01T00:04:26.000Z",
"2017-05-01T00:04:30.000Z", "2017-05-01T00:04:34.000Z", "2017-05-01T00:04:40.000Z",
"2017-05-01T00:05:36.000Z", "2017-05-01T00:05:46.000Z", "2017-05-01T00:05:52.000Z",
"2017-05-01T00:06:00.000Z", "2017-05-01T00:06:38.000Z", "2017-05-01T00:06:57.000Z",
"2017-05-01T00:07:01.000Z")), length = 20L, durationInMin = 7.01666666666667), .Names = c("cookie",
"paths", "time", "length", "durationInMin"), class = c("data.table",
"data.frame"), row.names = c(NA, -1L), .internal.selfref = <pointer: 0x00000000001f0788>)
I check whether a session needs to be split into two or more sessions. To do this, I look at every timestamp in a session and compare it with the previous timestamp in that session. If the difference exceeds a threshold (the `border`), the session is split into two sessions. The result is a new data.table containing the new sessions. The code works, but it is very, very slow (multiple hours), and it gets slower over time. At first I thought the cause was the growing list inside the loop, but I ruled that out by running the loop without the result list. My code is as follows:
# Split web sessions wherever two consecutive hits are more than `border`
# minutes apart.
#
# Args:
#   sessions: data.table with one row per session. The columns read here are
#             `time` (list column; element 1 holds the timestamp strings,
#             e.g. "2017-05-01T00:00:00.000Z") and `paths` (list column;
#             element 1 holds the pages). `cookie`, `length` and
#             `durationInMin` are overwritten on every split piece.
#   border:   largest gap in minutes that still belongs to the same session;
#             a strictly larger gap starts a new session.
#   verbose:  if TRUE, emit a progress message for every input row.
#
# Returns:
#   A data.table (via rbindlist) with one row per resulting session. Rows with
#   fewer than two timestamps are passed through unchanged, including their
#   original cookie; every other row is replaced by one or more pieces whose
#   cookie is a running id.
function(sessions, border = 30, verbose = FALSE) {
  timeFormat <- "%Y-%m-%dT%H:%M:%S"

  # Build one output session from the hits `idx` of `row`; `seconds` holds the
  # parsed timestamps of the whole row as epoch seconds.
  buildSession <- function(row, id, idx, seconds) {
    session <- row
    session$cookie <- id
    # Same assignment form as before: the piece is wrapped in a list and
    # stored in slot 1 of the list column.
    session$time[[1]] <- list(row$time[[1]][idx])
    session$paths[[1]] <- list(row$paths[[1]][idx])
    session$durationInMin <-
      (seconds[idx[length(idx)]] - seconds[idx[1]]) / 60
    session$length <- length(idx)
    session
  }

  rowCount <- nrow(sessions)
  # Every input row yields at least one session, so this is a lower bound;
  # the list grows when rows are split.
  finalSessions <- vector("list", rowCount)
  id <- 1

  # seq_len() keeps the loop empty for a zero-row input (1:0 would not).
  for (i in seq_len(rowCount)) {
    if (verbose) {
      message("working on session ", i, " of ", rowCount)
    }
    row <- sessions[i, ]
    stamps <- row$time[[1]]

    # Zero or one timestamp: nothing to compare, keep the row as it is.
    if (length(stamps) < 2) {
      finalSessions[[id]] <- row
      id <- id + 1
      next
    }

    # Parse the whole vector once. The stamps end in "Z", so read them as UTC
    # instead of the local zone to keep gaps free of DST artefacts.
    seconds <- as.numeric(
      as.POSIXct(strptime(stamps, format = timeFormat, tz = "UTC"))
    )
    if (anyNA(seconds)) {
      stop("unparseable timestamp in session row ", i, call. = FALSE)
    }

    # A gap strictly larger than `border` minutes opens a new piece; the
    # cumulative sum turns those break flags into a piece number per hit.
    pieceOf <- cumsum(c(TRUE, diff(seconds) / 60 > border))

    # Each piece covers exactly its own hits. The previous
    # `currentStartPosition:j - 1` parsed as `(currentStartPosition:j) - 1`
    # and dragged the last hit of the preceding piece into every piece after
    # the first one.
    for (idx in split(seq_along(stamps), pieceOf)) {
      finalSessions[[id]] <- buildSession(row, id, idx, seconds)
      id <- id + 1
    }
  }

  rbindlist(finalSessions)
}