0

Hi Im currently doing web usage mining. For that I need to loop through all data entries (204002 rows) (each row is a web session containing the timestamp and the page accessed) and do some work on them. Here is a dput of the data:

structure(list(cookie = "1", 
    paths = list(c("LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
    "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
    "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
    "SYSTEM", "SYSTEM", "SYSTEM")), time = list(c("2017-05-01T00:00:00.000Z", 
    "2017-05-01T00:00:10.000Z", "2017-05-01T00:00:41.000Z", "2017-05-01T00:00:48.000Z", 
    "2017-05-01T00:03:28.000Z", "2017-05-01T00:03:40.000Z", "2017-05-01T00:03:53.000Z", 
    "2017-05-01T00:04:09.000Z", "2017-05-01T00:04:17.000Z", "2017-05-01T00:04:26.000Z", 
    "2017-05-01T00:04:30.000Z", "2017-05-01T00:04:34.000Z", "2017-05-01T00:04:40.000Z", 
    "2017-05-01T00:05:36.000Z", "2017-05-01T00:05:46.000Z", "2017-05-01T00:05:52.000Z", 
    "2017-05-01T00:06:00.000Z", "2017-05-01T00:06:38.000Z", "2017-05-01T00:06:57.000Z", 
    "2017-05-01T00:07:01.000Z")), length = 20L, durationInMin = 7.01666666666667), .Names = c("cookie", 
"paths", "time", "length", "durationInMin"), class = c("data.table", 
"data.frame"), row.names = c(NA, -1L), .internal.selfref = <pointer: 0x00000000001f0788>)

I look if a session needs to be split into two or more sessions. To do this I look at every timestamp in a session and compare them with the previous timestamp in this session. If the diffrence crosses a border, the session gets split into two sessions. The result is a new Data.Table with the new sessions. The code works but it is very very slow (multiple hours). The speed gets slower over time. First I thought it is the growing list inside the loop, but I checked this by doing the loop without the resultlist. My code is as follows:

function(sessions) {
      durationCalc <- function(timeList) {
        last <-
          strptime(timeList[[1]][length(timeList[[1]])], format = "%Y-%m-%dT%H:%M:%S")
        first <-
          strptime(timeList[[1]][length(1)], format = "%Y-%m-%dT%H:%M:%S")
        res <- as.numeric(difftime(last, first, units = 'mins'))
      }




      id <- 1
      border <- 30
      maxCount <- nrow(sessions)

      # list for the final sessions
      finalSessions <- vector("list", maxCount)

      # iterate over every session to break down into smaller sessions
      for (i in 1:maxCount) {
        print(paste("working on session", i, "of", maxCount))
        currentStartPosition <- 1
        row <- sessions[i, ]
        sessionLength <- length(row$time[[1]])

        # if the session containts only one path/timestamp, there is no further processing required
    # if it contains two or more, each timestamp has to be checked.
        if (sessionLength < 2) {
          finalSessions[[id]] <- row
          id <- id + 1
        }
        else{
          currentTime <-
            strptime(row$time[[1]][1], format = "%Y-%m-%dT%H:%M:%S")
          for (j in 2:sessionLength) {
            nextTime = strptime(row$time[[1]][j], format = "%Y-%m-%dT%H:%M:%S")
            diff <-
              as.numeric(difftime(nextTime, currentTime, units = 'mins'))
        # if the timestamp is 30 minutes or more later the current sessions (row) gets split 
            if (diff > border) {
        # make a copy of the original row and modify values, then add the modified row to the finalSessions
        # the currentStartposition gets the currentTimestamp and the loop continues
              currentSession <- row
              currentSession$cookie = id
              currentSession$time[[1]] <-
                list(row$time[[1]][currentStartPosition:j - 1])
              currentSession$paths[[1]] <-
                list(row$paths[[1]][currentStartPosition:j - 1])
              currentSession$durationInMin <-
                durationCalc(currentSession$time)
              currentSession$length <- length(currentSession$paths[[1]])
              currentStartPosition = j

              finalSessions[[id]] <- currentSession
              id <- id + 1

            }
            # at last the currentTimestamp gets the next Time stamp, it iterates over the whole timestamp list
            currentTime = nextTime
          }

      # after the loop the final session gets built. copy the original row, modify the values and add it to the finalSessions
          currentSession <- row
          currentSession$cookie = id
          currentSession$time[[1]] <-
            list(row$time[[1]][currentStartPosition:sessionLength])
          currentSession$paths[[1]] <-
            list(row$paths[[1]][currentStartPosition:sessionLength])
          currentSession$durationInMin <-
            durationCalc(currentSession$time)
          currentSession$length <- length(currentSession$paths[[1]])
          finalSessions[[id]] <- currentSession
          id <- id + 1
        }
      }

      finalSessions <- rbindlist(finalSessions)



    }  
lmo
  • 37,904
  • 9
  • 56
  • 69
webusag
  • 3
  • 2
  • can you add some data? (https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) – minem Jun 29 '17 at 13:20
  • Using lapply instead of for loop and vectorising your work should do the trick. – YCR Jun 29 '17 at 13:32
  • @MārtiņšMiglinieks I added a data row – webusag Jun 29 '17 at 13:44
  • @YCR as far as I understand lapply is not the right thing to do in my use case. lapply uses a function on each row, and as a result I get a data.table with the exact size back. In my use case there can be 1-n rows per row. Im new to r so feel free to correct me If Im wrong. – webusag Jun 29 '17 at 13:46
  • lapply apply a function to all the elements of a list. You can nest lapply function inside lapply. – YCR Jun 29 '17 at 13:48
  • Your data has 5 rows as far as I can tell. The best thing you can do is first to convert your data in a long format then to do your data manipulation. Even if you end with 200 millions rows and 5 variables, you should be able to handle it. – YCR Jun 29 '17 at 13:49
  • @YCR No sorry for the bad formating. My data type is a data.table with 5 columns. The example in my text is one row. There are 204002 rows that look like that. – webusag Jun 29 '17 at 13:52
  • @webusag You should try to simplify your example, and try to better explain your data, because I still do not get everything. Does your data.table cells contain vectors? Try using `dput` to shere your data. – minem Jun 29 '17 at 13:58
  • @MārtiņšMiglinieks I added the dput output (sorry didnt know this function). I also added comments to the code, I hope it clears a bit what Im doing. Simplifying the example is a bit difficult – webusag Jun 29 '17 at 15:30
  • @webusag Was I able to answer your question? If so, then I would suggest, that you accept my answer... – minem Jul 06 '17 at 12:08

1 Answers1

0

Try this:

sessions <- structure(list(cookie = "1", 
               paths = list(c("LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
                              "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
                              "LMCash", 
                              "LMCash", "LMCash", "LMCash", "LMCash", "LMCash", 
                              "LMCash", 
                              "SYSTEM", "SYSTEM", "SYSTEM")),
               time = list(c(
"2017-05-01T00:00:00.000Z",
"2017-05-01T00:00:10.000Z", 
"2017-05-01T00:00:41.000Z", 
"2017-05-01T00:00:48.000Z", 
"2017-05-01T00:03:28.000Z", 
"2017-05-01T00:03:40.000Z",
"2017-05-01T00:03:53.000Z", 
"2017-05-01T00:04:09.000Z", 
"2017-05-01T00:04:17.000Z", 
"2017-05-01T00:04:26.000Z", 
"2017-05-01T00:04:30.000Z", 
"2017-05-01T00:04:34.000Z", 
"2017-05-01T00:04:40.000Z", 
"2017-05-01T00:05:36.000Z", 
"2017-05-01T00:05:46.000Z",
"2017-05-01T00:05:52.000Z", 
"2017-05-01T00:06:00.000Z", 
"2017-05-01T00:06:38.000Z", 
"2017-05-01T00:06:57.000Z", 
"2017-05-01T00:40:01.000Z")),
length = 20L,
durationInMin = 7.01666666666667), .Names = c("cookie",
"paths", "time", "length", "durationInMin"),
class = c("data.table", "data.frame"),
row.names = c(NA, -1L))


s <- replicate(1000, sessions, simplify = F)
# str(s)
s <- rbindlist(s)


ff <- function(s) {

  dFormat <- "%Y-%m-%dT%H:%M:%S"
  durationCalc2 <- function(timeList) {
    tt <- timeList
    (tt[length(tt)] - tt[1]) / 60
  }

  id <- 1
  border <- 30
  maxCount <- nrow(s)

  finalSessions <- vector("list", maxCount)

  for (i in 1:maxCount) {
    # print(paste("working on session", i, "of", maxCount))
    cSP <- 1
    row <- s[i, ]
    TIME <- row$time[[1]]
    PATHS <- row$paths[[1]]
    sessionLength <- length(TIME)
    TIMES <- strptime(TIME, format = dFormat)
    TIMES <- as.numeric(TIMES)

    if (sessionLength < 2) {
      finalSessions[[id]] <- row
      id <- id + 1
    } else{
      # currentTime <- strptime(TIME[1], format = dFormat)
      cT2 <- TIMES[1]
      for (j in 2:sessionLength) {
        # nextTime <-  strptime(TIME[j], format = dFormat)
        nT2 <- TIMES[j]
        # diff <- as.numeric(difftime(nextTime, currentTime, units = 'mins'))
        diff <- (nT2 - cT2) /60

        if (diff > border) {
          cS <- row
          cS$cookie = id
          index <- cSP:j - 1
          cS$time[[1]] <- list(TIME[index])
          cS$paths[[1]] <- list(PATHS[index])
          cS$durationInMin <- durationCalc2(TIMES[index])
          cS$length <- length(cS$paths[[1]])
          cSP <- j
          finalSessions[[id]] <- cS
          id <- id + 1
        }
        cT2 <- nT2
      }
      cS <- row
      cS$cookie = id
      cS$time[[1]] <- list(TIME[cSP:sessionLength])
      cS$paths[[1]] <- list(PATHS[cSP:sessionLength])
      newTIMES <- TIMES[cSP:sessionLength]
      cS$durationInMin <- durationCalc2(newTIMES)
      cS$length <- length(cS$paths[[1]])
      finalSessions[[id]] <- cS
      id <- id + 1
    }
  }
  finalSessions <- rbindlist(finalSessions)
  finalSessions
}  

It should be approximate 2 times faster:

system.time(rez1 <- yourFunction(s)) #5.81
system.time(rez2 <- ff(s)) # 2.74

2.58 / 5.81

all.equal(rez1, rez2)

Next time, try to profile your code (if you would have done that, then you would have seen that difftime is slow, and it could be speed up). Also your provided example data was bad, it did not contain example where all the code executes!!

minem
  • 3,640
  • 2
  • 15
  • 29