0

I don't know how the 2 model handle factor levels, but logit won't predict and gives an error message saying new factor levels. When I predict using C5 it works fine. I have created the train and test from a single data frame and levels in both match each other.

I am seeking an explanation of this behaviour and a solution for this. I understand that the new levels in test would not be able to get their coefficient calculated, but setting them to NULL should be okay I think.

Here is a bit of the code. I used this to match the levels of hold and train. tr=dataset to be split into train and test.

tr=structure(
        list(
            production_year = c(
                2007L, 2010L, 2010L, 2008L,
                2007L, 2008L, 2008L, 2008L, 2007L, 2011L, 2009L, 2009L, 2009L,
                2008L, 2007L, 2007L, 2010L, 2009L, 2008L, 2008L, 2010L, 2010L,
                2007L, 2010L, 2009L, 2008L, 2007L, 2007L, 2008L, 2007L, 2010L,
                2011L, 2010L, 2007L, 2009L, 2009L, 2008L, 2008L, 2010L, 2011L
            ), movie_sequel = structure(
                c(
                    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
                    1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L,
                    1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
                ), .Label = c("0", "1"), class = "factor"
            ), creative_type = structure(
                c(
                    1L,
                    4L, 1L, 4L, 5L, 1L, 1L, 6L, 2L, 1L, 6L, 1L, 1L, 1L, 1L, 1L, 1L,
                    1L, 8L, 1L, 7L, 1L, 1L, 3L, 1L, 1L, 2L, 4L, 4L, 1L, 1L, 4L, 5L,
                    5L, 1L, 4L, 1L, 1L, 1L, 1L
                ), .Label = c(
                    "Contemporary Fiction",
                    "Dramatization", "Factual", "Fantasy", "Historical Fiction",
                    "Kids Fiction", "Science Fiction", "Super Hero"
                ), class = "factor"
            ),
            source = structure(
                c(
                    6L, 2L, 6L, 7L, 2L, 6L, 6L, 6L, 4L,
                    6L, 2L, 7L, 6L, 6L, 6L, 3L, 6L, 6L, 1L, 2L, 6L, 5L, 6L, 5L,
                    5L, 6L, 4L, 2L, 2L, 6L, 6L, 2L, 7L, 4L, 6L, 5L, 6L, 2L, 6L,
                    6L
                ), .Label = c(
                    "Based on Comic/Graphic Novel", "Based on Fiction Book/Short Story",
                    "Based on Folk Tale/Legend/Fairytale", "Based on Real Life Events",
                    "Based on TV", "Original Screenplay", "Remake"
                ), class = "factor"
            ),
            production_method = structure(
                c(
                    3L, 3L, 3L, 3L, 3L, 3L, 3L,
                    2L, 3L, 3L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L,
                    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
                    3L, 3L, 3L
                ), .Label = c(
                    "Animation/Live Action", "Digital Animation",
                    "Live Action", "Stop-Motion Animation"
                ), class = "factor"
            ),
            genre = structure(
                c(
                    3L, 1L, 4L, 5L, 1L, 4L, 3L, 3L, 4L, 5L,
                    2L, 7L, 6L, 5L, 7L, 3L, 3L, 7L, 1L, 7L, 7L, 3L, 4L, 3L, 3L,
                    6L, 4L, 2L, 1L, 2L, 6L, 4L, 7L, 1L, 4L, 2L, 3L, 7L, 7L, 5L
                ), .Label = c(
                    "Action", "Adventure", "Comedy", "Drama", "Horror",
                    "Romantic Comedy", "Thriller/Suspense"
                ), class = "factor"
            ),
            language = structure(
                c(
                    2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
                    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
                    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
                    2L
                ), .Label = c("Danish", "English"), class = "factor"
            ),
            movie_board_rating_display_name = structure(
                c(
                    3L, 3L, 3L,
                    2L, 2L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 3L, 3L, 2L, 3L, 3L,
                    3L, 3L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 1L, 2L, 3L, 2L, 2L, 3L,
                    2L, 3L, 1L, 2L, 3L, 3L, 2L
                ), .Label = c("PG", "PG-13", "R"), class = "factor"
            ), movie_release_pattern_display_name = structure(
                c(
                    4L,
                    4L, 3L, 4L, 4L, 3L, 4L, 4L, 3L, 4L, 4L, 4L, 4L, 4L, 3L, 4L,
                    3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 4L, 4L, 1L, 4L,
                    4L, 4L, 2L, 3L, 4L, 4L, 4L, 3L, 4L
                ), .Label = c("Exclusive",
                              "Expands Wide", "Limited", "Wide"), class = "factor"
            ), Category1 = structure(
                c(
                    1L,
                    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
                    1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
                    2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
                ), .Label = c("0", "1"), class = "factor"
            )
        ), .Names = c(
            "production_year",
            "movie_sequel", "creative_type", "source", "production_method",
            "genre", "language", "movie_board_rating_display_name", "movie_release_pattern_display_name",
            "Category1"
        ), row.names = c(
            506L, 474L, 1011L, 569L, 737L, 1124L,
            602L, 717L, 747L, 977L, 284L, 620L, 100L, 301L, 514L, 865L, 828L,
            283L, 921L, 839L, 15L, 937L, 931L, 201L, 273L, 507L, 1180L, 689L,
            276L, 649L, 603L, 22L, 555L, 974L, 552L, 500L, 216L, 312L, 796L,
            682L
        ), class = "data.frame"
    )

    train=tr[1:25,] # training data
    hold=tr[26:40,] # test data

    for(i in 1:ncol(train)){
        if(is.factor(train[,i])){
            hold[,i] <- factor(hold[,i],levels=levels(train[,i]))
            
        }
    }

m.glm=glm(Category1 ~ ., data = train, family = 'binomial')
labels=hold$Category1
hold$Category1=NULL
p=predict(m.glm, hold)

all the levels

structure(list(production_year = 2011L, movie_sequel = structure(1L, .Label = c("0", 
"1"), class = "factor"), creative_type = structure(5L, .Label = c("Contemporary Fiction", 
"Dramatization", "Factual", "Fantasy", "Historical Fiction", 
"Kids Fiction", "Multiple Creative Types", "Science Fiction", 
"Super Hero"), class = "factor"), source = structure(14L, .Label = c("Based on Comic/Graphic Novel", 
"Based on Factual Book/Article", "Based on Fiction Book/Short Story", 
"Based on Folk Tale/Legend/Fairytale", "Based on Game", "Based on Musical or Opera", 
"Based on Play", "Based on Real Life Events", "Based on Short Film", 
"Based on Theme Park Ride", "Based on Toy", "Based on TV", "Compilation", 
"Original Screenplay", "Remake", "Spin-Off"), class = "factor"), 
    production_method = structure(4L, .Label = c("Animation/Live Action", 
    "Digital Animation", "Hand Animation", "Live Action", "Multiple Production Methods", 
    "Stop-Motion Animation"), class = "factor"), genre = structure(13L, .Label = c("Action", 
    "Adventure", "Black Comedy", "Comedy", "Concert/Performance", 
    "Documentary", "Drama", "Horror", "Multiple Genres", "Musical", 
    "Romantic Comedy", "Thriller/Suspense", "Western"), class = "factor"), 
    language = structure(3L, .Label = c("Arabic", "Danish", "English", 
    "Farsi", "French", "German", "Hebrew", "Hindi", "Italian", 
    "Japanese", "Norwegian", "Polish", "Portuguese", "Silent", 
    "Spanish", "Swedish"), class = "factor"), movie_board_rating_display_name = structure(6L, .Label = c("G", 
    "NC-17", "Not Rated", "PG", "PG-13", "R"), class = "factor"), 
    movie_release_pattern_display_name = structure(7L, .Label = c("Exclusive", 
    "Expands Wide", "IMAX", "Limited", "Oscar Qualifying Run", 
    "Special Engagement", "Wide"), class = "factor"), Category1 = structure(1L, .Label = c("0", 
    "1"), class = "factor")), .Names = c("production_year", "movie_sequel", 
"creative_type", "source", "production_method", "genre", "language", 
"movie_board_rating_display_name", "movie_release_pattern_display_name", 
"Category1"), row.names = 304L, class = "data.frame")
halfer
  • 19,824
  • 17
  • 99
  • 186
Chirayu Chamoli
  • 2,076
  • 1
  • 17
  • 32
  • 1
    You might notice that if you provide a reproducible example, the culprit may show itself. Make a mock example, fit the model, create a `newdata` data.frame and reproduce the error. Copy/paste the code and data here and wait. – Roman Luštrik Feb 07 '16 at 12:42
  • @RomanLuštrik The data is too fragmented. Here is a bit of code i used with error. Let me know if you would like some data then i would update that too and no of obs. – Chirayu Chamoli Feb 07 '16 at 12:48
  • Simulated dataset will suffice. You can also use the builtin datasets. [Here are some pointers](http://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) on how to do just that. – Roman Luštrik Feb 07 '16 at 12:50
  • @RomanLuštrik: Will this suffice? – Chirayu Chamoli Feb 07 '16 at 14:01
  • Can you format it so that I will be able to copy/paste into my R session without trying to figure out what part is train/hold? – Roman Luštrik Feb 07 '16 at 15:59
  • @RomanLuštrik I hope this will do! – Chirayu Chamoli Feb 07 '16 at 17:25
  • Can revise and delete non-relevant code? What is `tr`? I would prefer the example is copy/pastable. Please check some high voted question on SO if you need help formulating and formatting a question. – Roman Luštrik Feb 08 '16 at 09:40
  • @RomanLuštrik I have mentioned the def of tr. you can paste the data and run the codes directly. – Chirayu Chamoli Feb 08 '16 at 09:45

1 Answers1

1

The way I see it, you will have to exclude the rows with levels which have not been used to fit the model.

predict(m.glm, hold[!hold$movie_release_pattern_display_name %in% c("Exclusive", "Expands Wide"), ])
Roman Luštrik
  • 69,533
  • 24
  • 154
  • 197