0

Using R I've fit an ordered logistic regression to some data. I need to use the fitted values to predict future values. I have a dataset which has 10 different combinations of parameters I want to predict the probabilities for the 'Result' dependant variable (which the model is fit to). I've used 'split' to split into 10 separate dataframes and I want to try to run the predict statement against each of the outputted dataframes. Can someone show me how to: a. Create a function to run the predict statement, and b. Loop the function through the 10 dataframes? c. Output the prediction probabilities for the three result conditions, and bind them to the relevant input dataframe

Here's what I've tried:

# Fit the model    
library(MASS)
fit <- polr(Result ~ State + score1+ score2 + history + pastdefault, data =   dat, Hess=TRUE)

## Model Summary
summary(fit)

#Take the new data which doesn't have predictions
t13<- dat[ which(dat$Tranche==13), ]
out <- split( t13 , f = r13$Gamen )

#Function
pred <- function(Result, State, score1, score2, history, pastdefault,...){
        cbind(out, predict(fit, out, type = "probs"))}

#Use sapply to run prediction
pred.out <- sapply(out, pred)

Further to a suggestion I've tried running the predict statement against the result of splitting the data into 'out' as per code below:

#Predict against 'out' dataframes
newdat1 <- cbind(out, predict(fit, newdata=out, type = "probs"))

However I get the same error message:

in eval(expr, envir, enclos) : object 'State' not found

Here's what the data looks like using head(data):

     State score1 score2 Salaries history exp moody   Result pastdefault Tranche
1     Minnesota  6.000   17.5   35.948       9 579  1085 moreinfo         yes       1
2    New Mexico  4.586   17.2   28.493      11 530  1015     fail          No       1
3    Washington  5.906   20.2   36.151      48 494   937     fail          No       1
4 West Virginia  6.107   14.8   31.944      17 484   932 moreinfo          No       1
5     Louisiana  4.761   16.8   26.461       9 535  1021     pass         yes       2
6      Oklahoma  4.845   15.5   28.172       9 536  1027     pass         yes       2
  Gamen
1     1
2     2
3     3
4     4
5     1
6     2

Here is the result of dput(data):

ture(list(State = structure(c(23L, 31L, 47L, 48L, 18L, 36L, 
39L, 42L, 13L, 14L, 16L, 35L, 50L, 1L, 20L, 27L, 30L, 7L, 9L, 
34L, 43L, 3L, 4L, 19L, 25L, 26L, 37L, 8L, 29L, 38L, 2L, 21L, 
32L, 6L, 10L, 11L, 15L, 24L, 5L, 17L, 33L, 40L, 41L, 46L, 49L, 
28L, 12L, 22L, 45L, 32L, 9L, 5L, 44L, 13L, 38L, 19L, 25L, 26L, 
37L, 8L), .Label = c("Alabama", "Alaska", "Arizona", "Arkansas", 
"California", "Colorado", "Connecticut", "Delaware", "Florida", 
"Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", 
"Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", 
"Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", 
"Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", 
"New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", 
"Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", 
"Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", 
"West Virginia", "Wisconsin", "Wyoming"), class = "factor"), 
    score1 = c(6, 4.586, 5.906, 6.107, 4.761, 4.845, 7.469, 4.388, 
    6.136, 5.826, 5.817, 6.162, 6.16, 4.405, 7.245, 5.935, 9.774, 
    8.817, 5.718, 4.775, 5.222, 4.778, 4.459, 6.428, 5.383, 5.692, 
    6.436, 7.03, 5.859, 7.109, 8.963, 7.287, 9.623, 5.443, 5.193, 
    6.078, 5.483, 4.08, 4.992, 5.217, 5.077, 4.797, 4.775, 5.327, 
    6.93, 5.16, 4.21, 6.994, 6.75, 5.859, 7.109, 8.963, 3.656, 
    7.287, 9.623, 5.443, 6.078, 5.483, 4.08, 4.992), score2 = c(17.5, 
    17.2, 20.2, 14.8, 16.8, 15.5, 14.7, 18.6, 17.3, 17.5, 15.1, 
    16.6, 14.9, 17.2, 17, 14.5, 13.8, 14.4, 19.1, 15.3, 15.7, 
    19.3, 17.1, 13.8, 15.5, 16.3, 19.9, 16.6, 15.6, 17.1, 17.6, 
    14.8, 15.2, 18.4, 16.3, 17.9, 15.8, 17.5, 24, 17, 16.2, 16.4, 
    14.4, 14.6, 15.9, 18.7, 19.1, 20.1, 13.8, 16.6, 15.6, 17.1, 
    24.3, 17.6, 14.8, 15.2, 24, 17, 16.2, 14.4), Salaries = c(35.948, 
    28.493, 36.151, 31.944, 26.461, 28.172, 40.729, 32.477, 39.431, 
    36.785, 34.652, 36.802, 31.285, 31.144, 40.661, 30.922, 46.087, 
    50.045, 32.588, 26.327, 31.223, 32.175, 28.934, 31.972, 31.189, 
    28.785, 38.555, 39.076, 34.72, 44.51, 47.951, 40.795, 47.612, 
    34.571, 32.291, 38.518, 31.511, 26.818, 41.078, 32.257, 30.793, 
    30.279, 25.994, 33.987, 37.746, 34.836, 29.783, 41.895, 35.406, 
    28.785, 38.555, 39.076, 29.082, 31.511, 26.818, 41.078, 32.257, 
    25.994, 33.987, 37.746), history = c(9L, 11L, 48L, 17L, 9L, 
    9L, 70L, 12L, 13L, 58L, 9L, 23L, 10L, 8L, 64L, 9L, 70L, 81L, 
    48L, 5L, 47L, 27L, 6L, 68L, 9L, 21L, 51L, 68L, 70L, 70L, 
    47L, 80L, 74L, 29L, 65L, 57L, 5L, 4L, 45L, 11L, 60L, 58L, 
    5L, 65L, 9L, 30L, 15L, 11L, 68L, 45L, 11L, 60L, 4L, 58L, 
    5L, 65L, 9L, 5L, 65L, 80L), exp = c(579L, 530L, 494L, 484L, 
    535L, 536L, 463L, 543L, 560L, 467L, 557L, 515L, 525L, 538L, 
    479L, 556L, 478L, 477L, 469L, 592L, 474L, 496L, 523L, 469L, 
    550L, 536L, 499L, 468L, 491L, 461L, 489L, 477L, 473L, 518L, 
    448L, 482L, 583L, 540L, 485L, 522L, 454L, 443L, 563L, 468L, 
    572L, 483L, 511L, 549L, 472L, 526L, 528L, 530L, 563L, 532L, 
    534L, 536L, 538L, 540L, 542L, 544L), moody = c(1085L, 1015L, 
    937L, 932L, 1021L, 1027L, 888L, 1040L, 1048L, 882L, 1060L, 
    975L, 1001L, 1029L, 909L, 1050L, 898L, 908L, 889L, 1107L, 
    893L, 944L, 1005L, 896L, 1045L, 1009L, 947L, 897L, 935L, 
    880L, 934L, 907L, 892L, 980L, 854L, 889L, 1099L, 1036L, 902L, 
    999L, 865L, 844L, 1068L, 896L, 1073L, 917L, 979L, 1033L, 
    901L, 1002L, 1007L, 1011L, 1076L, 1016L, 900L, 760L, 1029L, 
    850L, 1038L, 990L), Result = structure(c(3L, 2L, 2L, 3L, 
    4L, 4L, 4L, 4L, 4L, 3L, 4L, 4L, 3L, 2L, 4L, 3L, 3L, 3L, 3L, 
    4L, 4L, 2L, 3L, 3L, 3L, 4L, 2L, 4L, 2L, 4L, 2L, 3L, 2L, 4L, 
    3L, 4L, 4L, 2L, 2L, 3L, 4L, 2L, 4L, 3L, 3L, 4L, 2L, 3L, 3L, 
    3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", 
    "fail", "moreinfo", "pass"), class = "factor"), pastdefault = structure(c(2L, 
    1L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 
    1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 
    2L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 
    1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 1L), .Label = c("No", 
    "yes"), class = "factor"), Tranche = c(1L, 1L, 1L, 1L, 2L, 
    2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 
    5L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 9L, 
    9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 11L, 12L, 
    12L, 12L, 12L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 
    13L), Gamen = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 
    4L, 5L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 5L, 
    6L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 
    4L, 5L, 6L, 7L, 1L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 5L, 6L, 
    7L, 8L, 9L, 10L)), .Names = c("State", "score1", "score2", 
"Salaries", "history", "exp", "moody", "Result", "pastdefault", 
"Tranche", "Gamen"), class = "data.frame", row.names = c(NA, 
-60L))
989
  • 12,579
  • 5
  • 31
  • 53
JohnM
  • 11
  • 2
  • You shouldn't need to split these at all. Have you simply tried `predict(fit, newdata = out, type = "probs")`? If that doesn't work, you'll need to provide a reproducible example of these dataframes, with *more* of your data (so that an actual data can be fit). Use dput: http://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example – David Robinson Nov 17 '15 at 02:36
  • Hi David Robinson; I tried running predict against 'out' however obtained the same error: in eval(expr, envir, enclos) : object 'State' not found – JohnM Nov 17 '15 at 04:25
  • 1
    Do it on `t13` not `out`. – David Robinson Nov 17 '15 at 05:01
  • Ok I tried it on t13 and it threw back the following error message: 'Error in X %*% object$coefficients : non-conformable arguments' – JohnM Nov 18 '15 at 03:02

0 Answers0