1

I have a dataframe df

dput(df)
    structure(list(ID = c(4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 
6, 6, 6, 6, 8, 8, 8, 9, 9), Y = c(2268.14043972082, 2147.62290922552, 
2269.1387550775, 2247.31983098201, 1903.39138268307, 2174.78291538358, 
2359.51909126411, 2488.39004804939, 212.851575751527, 461.398994384333, 
567.150629704352, 781.775113821961, 918.303706148872, 1107.37695799186, 
1160.80594193377, 1412.61328924168, 1689.48879626486, 685.154353165934, 
574.088067465695, 650.30821636616, 494.185166497016, 436.312162090908
), P = c(1750.51986303926, 1614.11541634798, 951.847023338079, 
1119.3682884872, 1112.38984390156, 1270.65773075982, 1234.72262170166, 
1338.46096616983, 1198.95775346458, 1136.69287367165, 1265.46480803983, 
1364.70149818063, 1112.37006707489, 1346.49240261316, 1740.56677791104, 
1410.99217295647, 1693.18871380948, 275.447173420805, 396.449789014179, 
251.609239829704, 215.432550271042, 55.5336257666349), A = c(49, 
50, 51, 52, 53, 54, 55, 56, 1, 2, 3, 4, 5, 14, 15, 16, 17, 163, 
164, 165, 153, 154), TA = c(9.10006221322572, 7.65505467142961, 
8.21480062559674, 8.09251754304318, 8.466220758789, 8.48094407814006, 
8.77304120569444, 8.31727518543397, 8.14410265791868, 8.80921738865237, 
9.04091478341757, 9.66233618146246, 8.77015716015164, 9.46037931956657, 
9.59702379240667, 10.1739258740118, 9.39524442215692, -0.00568604734662462, 
-2.12940164413048, -0.428603434930109, 1.52337963973006, -1.04714984064565
), TS = c(9.6499861763085, 7.00622420539595, 7.73511170298675, 
7.68006974050443, 8.07442411510912, 8.27687965909096, 8.76025039592727, 
8.3345638889156, 9.23658956753677, 8.98160722605782, 8.98234210211611, 
9.57066566368204, 8.74444401914267, 8.98719629775988, 9.18169205278566, 
9.98225438314085, 9.56196773059615, 5.47788158053928, 2.58106090926808, 
3.22420704848299, 1.36953555753786, 0.241334267522977), R = c(11.6679680423377, 
11.0166459173372, 11.1851268491296, 10.7404563561694, 12.1054055597684, 
10.9551321815546, 11.1975918244469, 10.7242192465965, 10.1661703705992, 
11.4840412725324, 11.1248456370953, 11.2529612597628, 10.7694642397996, 
12.3300887767583, 12.0478558531771, 12.3212362249214, 11.5650773932264, 
9.56070414783612, 9.61762902218185, 10.2076240621201, 11.8234628013552, 
10.9184029778985)), .Names = c("ID", "Y", "P", "A", "TA", "TS", 
"R"), na.action = structure(77:78, .Names = c("77", "78"), class = "omit"), row.names = c(NA, 
22L), class = "data.frame")

I want to run a random forest on this data set with leave-one-ID-out cross-validation, i.e. the folds must not be drawn at random. In every run I want to hold out all rows that share one ID value, because rows with the same ID are not independent. For instance, the first run will be trained on the data with ID=5,6,8,9 and tested on the data with ID=4, the second run will be trained on the data with ID=4,6,8,9 and tested on the data with ID=5, and so on.

I implemented the command lines below, but I am not quite sure if it is conceptually correct.

# Leave-one-ID-out cross-validation of a random forest.
# Every distinct ID is held out exactly once; the model is fitted on the rows
# of all other IDs and used to predict the rows of the held-out ID.
library(randomForest)

# Create training dataset
df <- na.omit(df)
tvec <- unique(df$ID)
nruns <- length(tvec)
# One fold per ID. Drawing the fold labels with sample(..., replace = TRUE)
# would leave some IDs never held out and some folds empty.
crossclass <- seq_len(nruns)
nobs <- nrow(df)
# Out-of-fold prediction for every row, filled in fold by fold.
crossPredict <- rep(NA_real_, nobs)

# Run a RandomForest with leave-one-ID-out CV
for (i in seq_len(nruns)) {
  indtrain <- which(df$ID %in% tvec[crossclass != i])
  indvalidate <- setdiff(seq_len(nobs), indtrain)
  rf <- randomForest(
    Y ~ P + TA + TS + R + A,
    data = df[indtrain, ],
    ntree = 10000
  )
  crossPredict[indvalidate] <- predict(rf, newdata = df[indvalidate, ])
}

Can somebody help me out with that?

SimonB
  • 670
  • 1
  • 10
  • 25

2 Answers2

2
library(randomForest)
# Toy data: iris plus a grouping column `id` taking the values 1, 2 and 3.
newIris <- data.frame(iris, id = 1 + seq_len(nrow(iris)) %% 3)
id <- unique(newIris$id)
# `id` only defines the folds, so it is kept out of the model columns;
# otherwise `Species ~ .` would use it as a predictor.
model_cols <- names(newIris) != "id"

# One slot per group, addressed by name so that arbitrary ID values
# (e.g. 4, 5, 6, 8, 9) do not create empty positions in the list.
loo <- setNames(vector("list", length(id)), id)
for (i in id) {
  held_out <- newIris$id == i
  rf <- randomForest(Species ~ ., data = newIris[!held_out, model_cols])
  # One prediction per held-out row.
  loo[[as.character(i)]] <- predict(rf, newdata = newIris[held_out, model_cols])
}

print(loo)

Just make a vector of IDs and omit each ID in turn.

Sycorax
  • 1,298
  • 13
  • 26
  • Ok. So for my data, it will be `id <-df$ID pred <- NULL for(i in id){ rf_out <- randomForest(Y~ P + TA + TS + R + A, mtry=2, data=df[-i,], ntree=10000) pred[i] <-predict(rf_out, newdata=df[i,]) } pred` right? – SimonB Oct 04 '15 at 15:27
  • Alright. I indeed have a number of IDs not equal to the number of rows. Here, I have 5 IDs and 22 rows. How could I implement a leave-one-ID-out CV then? Any ideas? – SimonB Oct 05 '15 at 07:52
  • Not really. Because if I do `data=iris[-id==i,]` as `id= unique(id_list)`, there is no ID left to compute the RF at each iteration. – SimonB Oct 05 '15 at 13:24
  • Well, if you follow your procedure using my dataset, you will see that `data=df[-id==i,]` is an empty data frame for each iteration. – SimonB Oct 05 '15 at 13:33
  • It does work, except for the prediction, because it predicts only one value per ID and not one value for each row. – SimonB Oct 05 '15 at 13:55
  • @SimonB The example I've provided gives one prediction for every row. `loo` is a list, and the union of each slot of the list is a prediction for each row of `newIris`. – Sycorax Oct 05 '15 at 13:57
  • Indeed because all the IDs are different in your case. But if you have similar IDs in the dataframe, it does not provide a prediction for every row but only per ID. – SimonB Oct 05 '15 at 14:00
  • @SimonB I don't understand what you mean. Each row has an ID in the set {1,2,3} in my example; there are 150 rows, so some rows must have identical IDs by the pigeonhole principle. And how can IDs be similar? My understanding is that either ID[1] == ID[2] XOR ID[1]!=ID[2], and that there is no third option. – Sycorax Oct 05 '15 at 14:07
  • I don't know, but your example does not seem to work on my dataset. Did you try it out with the dataset I provided? – SimonB Oct 05 '15 at 14:11
0

This seems to be working well.

library (caret)
library(randomForest)
    # Leave-one-ID-out resampling indices for caret: one resample per ID.
    subs <- unique(df$ID)

    # Create training dataset: rows of every ID except the held-out one
    train <- lapply(subs, function(s) which(df$ID != s))
    names(train) <- paste0("ID", subs)

    # Create test dataset: rows of the held-out ID
    test <- lapply(subs, function(s) which(df$ID == s))
    names(test) <- paste0("ID", subs)

    # Run a RandomForest with leave one out ID CV.
    # caret::train is called explicitly because the list `train` above
    # shares its name with the function.
    rfFit <- caret::train(Y ~ P + TA + TS + R + A,
                          data = df,
                          method = "rf",
                          ntree = 100,
                          proximity = TRUE,
                          importance = TRUE,
                          # allowParallel is a trainControl() setting;
                          # indexOut makes the held-out rows explicit.
                          trControl = trainControl(method = "cv",
                                                   index = train,
                                                   indexOut = test,
                                                   allowParallel = TRUE))
SimonB
  • 670
  • 1
  • 10
  • 25