I am using tidymodels
to create a Random Forrest prediction. I have test data that contains a new factor level not present in the training data which results in the error:
1: Novel levels found in column 'Siblings': '4'. The levels have been removed, and values have been coerced to 'NA'.
2: There are new levels in a factor: NA
> test_predict
Fehler: Objekt 'test_predict' nicht gefunden
I tried to include a step_novel
and step_dummy
on the "Siblings" column but this does not resolve the error. How should I deal with new factors not present in training data?
library(tidyverse)
library(tidymodels)
data <-
data.frame(
Survived = as.factor(c(0,1,1,1,0,0,0,0,0,1,1,1,0,0,0,0)),
Siblings = as.factor(c(1,1,0,1,0,0,0,3,1,1,0,1,0,0,0,3)),
Class = as.factor(c(0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0)),
Embarked = as.factor(c("s","c","m","m","s","c","s","m","m","s","s","s","s","s","s","s"))
)
test <-
data.frame(
Siblings = as.factor(c(1,1,0,1,0,0,0,3,1,1,0,1,0,0,0,4)), #New factor level
Class = as.factor(c(0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0)),
Embarked = as.factor(c("s","c","m","m","s","c","s","m","m","s","s","s","s","s","s","s"))
)
#Model
rf_model <-
rand_forest() %>%
set_args(
mtry = 3,
trees = 1000,
min_n = 15
) %>%
set_engine("ranger",
importance = "impurity") %>%
set_mode("classification")
#Recipe
data_recipe <-
recipe(Survived ~Siblings + Class + Embarked, data=data) %>%
step_novel(Siblings) %>%
step_dummy(Siblings)
#Workflow
rf_workflow <-
workflow() %>%
add_recipe(data_recipe) %>%
add_model(rf_model)
final_model <- fit(rf_workflow, data)
final_model
test_predict <- predict(final_model, test)
test_predict