The following is a reproducible example of a situation I'm stuck with (it's a test client I'm using to evaluate various approaches to merging datasets for my dissertation research).

testData <- "https://github.com/abnova/test/blob/master/mergeTestData.zip?raw=true"

tmpFile <- tempfile()
tmpDir <- tempdir()

download.file(testData, tmpFile, method = 'curl',
              extra = '-L', quiet = TRUE)
testFiles <- unzip(tmpFile, exdir = tmpDir)

# To enable desired merge option, uncomment corresponding line

#MERGE_OPTION <- "lapply_merge"
#MERGE_OPTION <- "lapply_merge2"
#MERGE_OPTION <- "reduce_merge"
#MERGE_OPTION <- "reduce_merge2"
#MERGE_OPTION <- "reshape"
#MERGE_OPTION <- "plyr"
#MERGE_OPTION <- "dplyr"
MERGE_OPTION <- "data.table"
#MERGE_OPTION <- "data.table2"

loadData <- function (dataFile) {

  if (file.exists(dataFile)) {
    data <- readRDS(dataFile)
  }
  else { # error() is undefined - replaced with stop() for now
    stop("Data file '", dataFile, "' not found! Run 'make' first.")
  }
  }
  return (data)
}

loadDataSets <- function (dataDir) {

  dataFiles <- dir(dataDir, pattern='\\.rds$')
  # load each .rds file found in the data directory
  dataSets <- lapply(dataFiles,
                     function(f) loadData(file.path(dataDir, f)))
  return (dataSets)
}

# load the datasets of transformed data
dataSets <- loadDataSets(tmpDir)

if (MERGE_OPTION == "lapply_merge") { # Option 1

  flossData <- data.frame(dataSets[[1]])

  # merge all loaded datasets by common column ("Project ID");
  # `<<-` is needed, since plain lapply() would discard the merge results
  silent <- lapply(seq(2, length(dataSets)),
                   function(i) {flossData <<- merge(flossData, dataSets[[i]],
                                                    by = "Project ID",
                                                    all = TRUE)})
}

if (MERGE_OPTION == "lapply_merge2") { # Option 1

  pids <- which(sapply(dataSets,
                       FUN=function(x) {'Project ID' %in% names(x)}))

  flossData <- dataSets[[pids[1]]]

  for (id in pids[2:length(pids)]) {
    flossData <- merge(flossData, dataSets[[id]],
                       by='Project ID', all = TRUE)
  }
}

if (MERGE_OPTION == "reduce_merge") { # Option 2

  # merge by the common column; assumes every dataset contains "Project ID"
  flossData <- Reduce(function(...)
    merge(..., by = "Project ID", all = TRUE),
    dataSets)
}

# http://r.789695.n4.nabble.com/merge-multiple-data-frames-tt4331089.html#a4333772
if (MERGE_OPTION == "reduce_merge2") { # Option 2

  mergeAll <- function(..., by = "Project ID", all = TRUE) {
    dotArgs <- list(...)
    dotNames <- lapply(dotArgs, names)
    repNames <- Reduce(intersect, dotNames)
    repNames <- repNames[repNames != by]
    for(i in seq_along(dotArgs)){
      wn <- which( (names(dotArgs[[i]]) %in% repNames) &
                     (names(dotArgs[[i]]) != by))
      names(dotArgs[[i]])[wn] <- paste(names(dotArgs[[i]])[wn],
                                       names(dotArgs)[[i]], sep = ".")
    }
    Reduce(function(x, y) merge(x, y, by = by, all = all), dotArgs)
  }

  # mergeAll() takes individual data frames via '...',
  # so splice the list of datasets with do.call()
  flossData <- do.call(mergeAll, dataSets)
}

if (MERGE_OPTION == "reshape") { # Option 3

  if (!suppressMessages(require(reshape))) install.packages('reshape')
  library(reshape)
  flossData <- reshape::merge_all(dataSets)
}

if (MERGE_OPTION == "plyr") { # Option 4

  if (!suppressMessages(require(plyr))) install.packages('plyr')
  library(plyr)
  # join_all() defaults to joining on all common columns, so be explicit
  flossData <- plyr::join_all(dataSets, by = "Project ID")
}

if (MERGE_OPTION == "dplyr") { # Option 5

  if (!suppressMessages(require(dplyr))) install.packages('dplyr')
  library(dplyr)

  # left-join the remaining datasets onto the first one, one at a time
  flossData <- Reduce(function(x, y) dplyr::left_join(x, y, by = "Project ID"),
                      dataSets)
}

if (MERGE_OPTION == "data.table") { # Option 6

  if (!suppressMessages(require(data.table))) 
    install.packages('data.table')
  library(data.table)

  flossData <- data.table(dataSets[[1]], key="Project ID")

  for (id in 2:length(dataSets)) {
    flossData <- merge(flossData, data.table(dataSets[[id]]),
                       by='Project ID', all.x = TRUE, all.y = FALSE)
  }
}

# http://stackoverflow.com/a/17458887/2872891
if (MERGE_OPTION == "data.table2") { # Option 6

  if (!suppressMessages(require(data.table))) 
    install.packages('data.table')
  library(data.table)

  # chain keyed X[i] joins (per the linked answer): each step keeps
  # all rows of the accumulated table and adds columns from the next dataset
  flossData <- Reduce(function(x, y) {
                        setkeyv(x, "Project ID")
                        data.table(y, key = "Project ID")[x]
                      },
                      dataSets[-1],
                      init = data.table(dataSets[[1]], key = "Project ID"))
}

# Additional Transformations (see TODO above)

# convert presence of Repo URL to integer
flossData[["Repo URL"]] <- as.integer(flossData[["Repo URL"]] != "")

# convert License Restrictiveness' factor levels to integers
#flossData[["License Restrictiveness"]] <- 
#  as.integer(flossData[["License Restrictiveness"]])

# convert User Community Size from character to integer
flossData[["User Community Size"]] <- 
  as.integer(flossData[["User Community Size"]])

# remove NAs
#flossData <- flossData[complete.cases(flossData[,3]),]
rowsNA <- apply(flossData, 1, function(x) {any(is.na(x))})
flossData <- flossData[!rowsNA,]

str(flossData)

The error message is as follows:

Starting bmerge ...done in 0.001 secs
Starting bmerge ...done in 0.002 secs
Error in vecseq(f__, len__, if (allow.cartesian) NULL else as.integer(max(nrow(x),  : 

Join results in 121229 rows; more than 100000 = max(nrow(x),nrow(i)). Check for duplicate key values in i, each of which join to the same group in x over and over again. If that's ok, try including j and dropping by (by-without-by) so that j runs for each group to avoid the large allocation. If you are sure you wish to proceed, rerun with allow.cartesian=TRUE. Otherwise, please search for this error message in the FAQ, Wiki, Stack Overflow and datatable-help for advice.

The current problem is with the enabled data.table option, but, since it's the same package, I would also appreciate advice on the next option, which uses an alternative data.table syntax for merging (I find it confusing, but include it for the sake of completeness). Thank you in advance!

Aleksandr Blekh
  • Forgot to mention: I'm curious why the error message lists 121229 rows, while my **base** data frame (specified via `all.x = TRUE, all.y = FALSE`) for *left join* has exactly 100000 rows. – Aleksandr Blekh Aug 08 '14 at 13:44
  • What don't you understand from the error message? – Arun Aug 08 '14 at 13:50
  • @Arun: As I mentioned in my previous comment, I don't understand how the number of rows in the data frame that results from the *left join* merge can be higher than the number of rows in the *base* (left) data frame. – Aleksandr Blekh Aug 08 '14 at 14:06

1 Answer

I'd approach the issue in this manner:

First, there's an error message. What does it say?

Join results in 121229 rows; more than 100000 = max(nrow(x),nrow(i)). Check for duplicate key values in i, each of which join to the same group in x over and over again. If that's ok, try including j and dropping by (by-without-by) so that j runs for each group to avoid the large allocation. If you are sure you wish to proceed, rerun with allow.cartesian=TRUE. Otherwise, please search for this error message in the FAQ, Wiki, Stack Overflow and datatable-help for advice.

Great! But I have so many datasets I'm working with, and so many packages and so many functions. I've got to narrow this down to the dataset that produces this error.

Testing one by one:

ans1 = merge(as.data.table(dataSets[[1]]), as.data.table(dataSets[[2]]), 
                all.x=TRUE, all.y=FALSE, by="Project ID")
## works fine.

ans2 = merge(as.data.table(dataSets[[1]]), as.data.table(dataSets[[3]]), 
                all.x=TRUE, all.y=FALSE, by="Project ID")
## same error

Aha, got the same error.

Reading the second line of the error message:

So, something seems to be wrong with dataSets[[3]]. The message says to check for duplicate key values in i. Let's do that:

dim(dataSets[[3]])
# [1] 81487     3
dim(unique(as.data.table(dataSets[[3]]), by="Project ID"))
# [1] 49999     3

So, dataSets[[3]] has duplicated 'Project ID' values, and for each duplicated value, all the matching rows from dataSets[[1]] are returned - which is what the 2nd part of the 2nd line explains: each of which join to the same group in x over and over again.
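
To see the mechanics on a small scale, here's a minimal sketch with toy tables (hypothetical data, not the question's datasets): the key "b" is duplicated on both sides, so the join grows beyond max(nrow(x), nrow(i)) and should raise the very same error.

library(data.table)

x <- data.table(`Project ID` = c("a", "b", "b"), v1 = 1:3, key = "Project ID")
i <- data.table(`Project ID` = c("b", "b"), v2 = 4:5, key = "Project ID")

# "b" matches 2 x 2 = 4 times, so the result would have 1 + 4 = 5 rows,
# exceeding max(nrow(x), nrow(i)) = 3 -- this should raise the same error
merge(x, i, by = "Project ID", all.x = TRUE, all.y = FALSE)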

Trying out allow.cartesian=TRUE:

I know that there are duplicate keys and I still wish to proceed. The error message tells us how: add "allow.cartesian=TRUE".

ans2 = merge(as.data.table(dataSets[[1]]), as.data.table(dataSets[[3]]), 
                all.x=TRUE, all.y=FALSE, by="Project ID", allow.cartesian=TRUE)

Aha, now it works fine! So what does allow.cartesian = TRUE do, and why was it added? The error message says to search for it on Stack Overflow (among other things).

Searching for allow.cartesian=TRUE on SO:

And the search lands me on the question Why is allow.cartesian required at times when joining data.tables with duplicate keys?, which explains the purpose, and which also contains, in the comments, another link from @Roland: Merging data.tables uses more than 10 GB RAM, which points to the initial issue that started it all. Let me read those posts now.


Is base::merge giving a different result?

Now, does base::merge return a different result (with 100,000 rows)?

dim(merge(dataSets[[1]], dataSets[[3]], all.x=TRUE, all.y=FALSE, by="Project ID"))
# [1] 121229      4

Not really. It gives the same dimensions as the data.table approach; it just doesn't care whether there are duplicate keys, whereas data.table warns you of the potential explosion of the merged result and lets you make an informed decision.
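
If the duplicates aren't meaningful for your analysis, one way to keep the left join bounded by nrow(x) is to deduplicate the key column before merging. A minimal sketch (whether dropping duplicate "Project ID" rows is acceptable depends on your data; aggregate them instead if they carry information):

library(data.table)

dt1 <- as.data.table(dataSets[[1]])
dt3 <- as.data.table(dataSets[[3]])

# keep only the first row per "Project ID" (assumption: duplicates are droppable)
dt3unique <- unique(dt3, by = "Project ID")

ans <- merge(dt1, dt3unique, by = "Project ID", all.x = TRUE, all.y = FALSE)
nrow(ans)  # now bounded by nrow(dt1)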

Arun
  • It's an awesome explanation! Greatly appreciate it. So, I should just make sure that there are no duplicate values in the key column, in my case `Project ID`, right? It would be great if you could advise on the next option (alternative syntax). Great answer, upvoting and accepting. – Aleksandr Blekh Aug 08 '14 at 14:14
  • By the way, I have noticed that you use `as.data.table` instead of `data.table`, as in my original code. Does it make any difference or is it just a personal preference? – Aleksandr Blekh Aug 08 '14 at 14:19