5

It's hard to debug this issue with the foreach package as my reproducible example works just fine but here is a brief description of the problem and what I am trying to achieve.

I'm using some code originally posted by Steve Weston which will split a data.table based on a key column which is a factor. The iterator will loop through the "chunks" of the data.table and have access to both the split of the table and the index value used to generate the split (key).

Whilst this approach has worked for me on various occasions before, on this occasion I am receiving an error within the foreach loop.

Error in { : undefined columns selected

My code which triggers the issue is as follows:

library(foreach)
library(data.table)

# Inspect the input table before looping: overall structure, column names,
# class of the column used for splitting, and the data.table key.
str(dat.in)
names(dat.in)
class(dat.in$fc.item)
key(dat.in)
library(foreach)
# Loop over one chunk of dat.in per level of fc.item. Each element produced by
# isplitDT is a list with the chunk in $value and the level in $key.
# No .combine is supplied here, so the per-iteration results are not merged
# with dtcomb as they are in the reproducible example.
fc = foreach(dt.sub = isplitDT(dat.in, levels(dat.in$fc.item))) %do%
{
    # code to execute on each core/iteration
    # print the level that selected this chunk, then return the chunk itself
    print(dt.sub$key[1])
    dt.sub$value 
}

The data has been output via dput and can be found at the bottom of the question.

I have checked my dat.in object with the following results:

    > str(dat.in)
Classes ‘data.table’ and 'data.frame':  313 obs. of  3 variables:
 $ fc.item: Factor w/ 1 level "A": 1 1 1 1 1 1 1 1 1 1 ...
 $ period : num  1 2 3 4 5 6 7 8 9 10 ...
 $ y      : int  287718 343083 291241 298469 300267 356797 225253 294265 337773 318346 ...
 - attr(*, ".internal.selfref")=<externalptr> 
 - attr(*, "sorted")= chr "fc.item"
> names(dat.in)
[1] "fc.item" "period"  "y"      
> class(dat.in$fc.item)
[1] "factor"
> key(dat.in)
[1] "fc.item"

So I created a reproducible example to match my scenario, the code is shown below:

library(foreach)
library(data.table)

# Example data: ten items ("item-1" ... "item-10") with ten rows each,
# t running 1..10 within every item, and y a non-negative integer obtained
# from the absolute value of a normal draw. The table is keyed on item.
dt <- data.table(
    item = as.factor(paste0("item-", rep(1:10, each = 10))),
    t = rep(1:10, times = 10),
    y = as.integer(abs(rnorm(n = 100, mean = 0, sd = 10)))
)
setkeyv(dt, cols = "item")

## helper functions written by Steve Weston
#' Iterator over keyed chunks of a data.table
#'
#' Builds an iterator that, for each element of `vals`, yields the rows of
#' `x` selected by that value together with the value itself.
#'
#' @param x A keyed data.table. Each chunk is obtained as `x[val]`, i.e. `val`
#'   is matched against the key of `x`.
#' @param vals Values to iterate over, e.g. the levels of the key column.
#' @return An object of class `c("abstractiter", "iter")` whose `nextElem`
#'   element returns `list(value = <chunk>, key = <val>)` on each call.
isplitDT <- function(x, vals) {
    # Fail early with a clear message. On a plain data.frame `x[val]` selects
    # columns instead of rows (surfacing as "undefined columns selected"),
    # and a data.table without a key has nothing for `val` to match against.
    if (!data.table::is.data.table(x) || !data.table::haskey(x)) {
        stop("isplitDT() requires 'x' to be a keyed data.table", call. = FALSE)
    }
    # iter() and nextElem() come from the iterators package, which foreach
    # imports but does not attach, so they are namespace-qualified here.
    ival <- iterators::iter(vals)
    nextEl <- function() {
        val <- iterators::nextElem(ival)
        list(value = x[val], key = val)
    }
    obj <- list(nextElem = nextEl)
    class(obj) <- c("abstractiter", "iter")
    obj
}
# Combine function for foreach: collects every argument it receives into a
# list and row-binds that list with rbindlist().
dtcomb <- function(...) {
    pieces <- list(...)
    rbindlist(pieces)
}
############################################

## Split-process-combine driver: isplitDT supplies one chunk per level of
## dt$item and dtcomb stacks the per-chunk results back into one table.
result <- foreach(
    dt.sub = isplitDT(dt, levels(dt$item)),
    .combine = "dtcomb"
) %do% {
    # dt.sub$key is the level for this iteration, dt.sub$value its rows
    print(dt.sub$key[1])
    dt.sub$value
}

# Sanity check: the recombined table should equal the original in all
# 300 cells (100 rows x 3 columns).
print(paste("Did it work =", sum(result == dt) == 300))

My difficulty is that this code works just fine but I cannot see any difference with the earlier foreach loop that fails. I'd be very grateful if anyone can throw some light on what I am doing wrong: my guess is that it's a very stupid error I am making!


The data for the original problem is here:

> dput(dat.in)
structure(list(fc.item = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = "A", class = "factor"), period = c(1, 2, 
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 
20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 
36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 
68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 
84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 
100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 
113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 
126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 
139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 
152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 
165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 
178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 
191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 
204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 
217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 
230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 
243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 
256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 
269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 
282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 
295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 
308, 309, 310, 311, 312, 313), y = c(287718L, 343083L, 291241L, 
298469L, 300267L, 356797L, 225253L, 294265L, 337773L, 318346L, 
270013L, 294559L, 265521L, 292651L, 326301L, 274133L, 225154L, 
377162L, 341432L, 308449L, 271186L, 272062L, 296231L, 264176L, 
272708L, 279367L, 265335L, 313174L, 273261L, 327539L, 322067L, 
260082L, 317229L, 268120L, 231941L, 322187L, 255401L, 261383L, 
232523L, 333930L, 291594L, 325835L, 282851L, 309369L, 306474L, 
331198L, 333453L, 282738L, 223454L, 343898L, 404772L, 420113L, 
363688L, 283529L, 304850L, 304265L, 260494L, 286632L, 291025L, 
234396L, 249829L, 243722L, 281929L, 252805L, 291330L, 217721L, 
233124L, 291646L, 214542L, 272663L, 246599L, 248463L, 276895L, 
238617L, 353554L, 240288L, 260862L, 215496L, 264241L, 251804L, 
317853L, 261112L, 241778L, 274305L, 260939L, 284144L, 238942L, 
268412L, 207012L, 322499L, 216205L, 283388L, 210637L, 283405L, 
232547L, 317938L, 232847L, 254665L, 293350L, 356068L, 272952L, 
262610L, 449750L, 369915L, 294255L, 267604L, 244032L, 263057L, 
226927L, 249796L, 235638L, 254442L, 226594L, 255157L, 219919L, 
260555L, 202837L, 282846L, 242090L, 324165L, 195997L, 247319L, 
214422L, 211885L, 238364L, 228117L, 243929L, 183895L, 204071L, 
228919L, 227446L, 244663L, 225126L, 251333L, 199212L, 205160L, 
205272L, 211975L, 201057L, 240099L, 203967L, 276464L, 180230L, 
256560L, 185168L, 209131L, 209283L, 266414L, 221112L, 247453L, 
285895L, 310151L, 236241L, 246656L, 371197L, 346882L, 308349L, 
218239L, 222147L, 240713L, 227690L, 195599L, 254913L, 203627L, 
209650L, 182243L, 213345L, 239517L, 194998L, 220132L, 248232L, 
187663L, 182200L, 180731L, 188778L, 218335L, 234029L, 192304L, 
183598L, 165051L, 207673L, 168798L, 187578L, 175816L, 192978L, 
212731L, 208684L, 176274L, 210670L, 227207L, 203419L, 183886L, 
215670L, 158552L, 209275L, 186366L, 228439L, 176090L, 252070L, 
203126L, 235651L, 216970L, 222579L, 224996L, 241870L, 194938L, 
292197L, 283827L, 281966L, 157419L, 256606L, 184074L, 223767L, 
206831L, 196338L, 177536L, 179195L, 180747L, 228955L, 253872L, 
254636L, 172384L, 181243L, 228535L, 178251L, 166644L, 193261L, 
191703L, 158698L, 184620L, 188777L, 171378L, 176349L, 168550L, 
173176L, 198650L, 176989L, 163293L, 164869L, 165503L, 185504L, 
172217L, 164511L, 160720L, 175902L, 171150L, 140939L, 155618L, 
157323L, 171457L, 165290L, 140833L, 158788L, 162213L, 201366L, 
248834L, 170899L, 159564L, 231487L, 281335L, 268906L, 134745L, 
155222L, 133268L, 223074L, 211489L, 167485L, 139614L, 178060L, 
186616L, 141583L, 172486L, 175021L, 187544L, 153492L, 245626L, 
168411L, 166539L, 148776L, 191410L, 135434L, 153281L, 203938L, 
155049L, 149193L, 168851L, 168000L, 143976L, 167995L, 172333L, 
143025L, 168156L, 175161L, 184271L, 148113L, 153620L, 178359L, 
143852L, 139743L, 159931L, 181351L, 170455L, 140985L, 136863L, 
167934L, 162680L, 181756L, 212960L, 149715L, 168102L, 175952L, 
275313L, 276390L)), .Names = c("fc.item", "period", "y"), row.names = c(NA, 
-313L), class = c("data.table", "data.frame"), .internal.selfref = <pointer: 0x103804778>, sorted = "fc.item") 
Community
  • 1
  • 1
Matt Weller
  • 2,684
  • 2
  • 21
  • 30
  • Please advise how I should improve the question to attract interest. If I don't hear anything today I'm going to have to arrange a bounty as this is a major stumbling block in my work! – Matt Weller Sep 30 '15 at 15:04
  • I can't find the function `iter` trying to run your code – erasmortg Sep 30 '15 at 19:32
  • You're right, it is in the `iterators` package, which I thought was available through `foreach` – Matt Weller Sep 30 '15 at 23:17
  • 2
    foreach used to "Depend" on the iterators package, but now it "Imports" iterators. That means that you have to load iterators explicitly when your code uses "iter", for example. – Steve Weston Oct 01 '15 at 14:34
  • Can you attach your sessionInfo? I'm wondering if one of your packages might be out of date (esp. data.table). – jrshrenk Oct 06 '15 at 20:53
  • Also, when you run this code, are you passing the data to different cores/processes? I tried running with your data and couldn't reproduce your error, but I've had similar experiences before where foreach needs .packages specified to export data.table to the parallel environments. – jrshrenk Oct 06 '15 at 21:24

1 Answers1

0

I figured the full example would be too long for the comments. Other than a problem with setkey (which you posted as key, unless I misunderstood your post) in your MRE I couldn't reproduce. Could that be your issue? Consider this alternative:

> key(dat.in)
NULL
> setkey(dat.in,fc.item)
> foreach(dt.sub = isplitDT(dat.in, levels(dat.in$fc.item))) %do%
 {
     # code to execute on each core/iteration
     print(dt.sub$key[1])
     dt.sub$value 
 }
[1] "A"
[[1]]
     fc.item period      y
  1:       A      1 287718
  2:       A      2 343083
  3:       A      3 291241
  4:       A      4 298469
  5:       A      5 300267
 ---                      
309:       A    309 149715
310:       A    310 168102
311:       A    311 175952
312:       A    312 275313
313:       A    313 276390

> isplitDT = function(x, vals) {
     ival <- iter(vals)
     nextEl <- function() {
         val <- nextElem(ival)
         list(value=x[val], key=val)
     }
     obj <- list(nextElem=nextEl)
     class(obj) <- c('abstractiter', 'iter')
     obj
 }
 dtcomb = function(...) {
     rbindlist(list(...))
 }
 ############################################
> 
> ## main function to split-process-combine using isplitDT and dtcomb
> result = foreach(dt.sub = isplitDT(dt, levels(dt$item)),
                  .combine = "dtcomb") %do%
                  {
                      print(dt.sub$key[1])
                      dt.sub$value
                  }
[1] "item-1"
[1] "item-10"
[1] "item-2"
[1] "item-3"
[1] "item-4"
[1] "item-5"
[1] "item-6"
[1] "item-7"
[1] "item-8"
[1] "item-9"
> print(paste("Did it work =", sum(result == dt) == 300))
[1] "Did it work = TRUE"

Note that in this example, I set the key to fc.item (as period threw an error)

erasmortg
  • 3,246
  • 1
  • 17
  • 34
  • Thanks. There is no issue with the MRE but only when using my data. The `key()` is used to show that my table is keyed. I will fix my data (`dput` outputs all sorts of structures and pointers) so you can try with that. Will revert later today. – Matt Weller Oct 01 '15 at 11:03
  • I still can't seem to be able to reproduce. After the `foreach` call, in an object named `fc`, I get a data.frame/data.table with 313 observations... – erasmortg Oct 05 '15 at 06:59