I trained an XGBoost model like this:
# Training set: expand every predictor into a numeric design matrix
# (the `0 +` term drops the intercept column) and attach a 0/1 label,
# where 1 marks rows whose job_change equals "Interested".
candidates_var_train <- model.matrix(
  job_change ~ 0 + .,
  data = candidates_train
)
candidates_train_xgb <- xgb.DMatrix(
  data = candidates_var_train,
  label = as.numeric(candidates_train$job_change == "Interested")
)

# Test set: same formula and same label encoding as the training set.
candidates_var_test <- model.matrix(
  job_change ~ 0 + .,
  data = candidates_test
)
candidates_test_xgb <- xgb.DMatrix(
  data = candidates_var_test,
  label = as.numeric(candidates_test$job_change == "Interested")
)
I got a decent AUC and now want to apply the model to a new data set. The new data is stored as a data frame and has the same columns as the training/test data, except for the target variable "job_change". I tried to convert it into a sparse matrix like this:
# Attempted conversion of the new, unlabelled data for prediction.
# NOTE(review): candidates_predict contains factor columns (gender,
# company_size, ...), so as.matrix() presumably yields a character matrix
# rather than the numeric dummy columns that model.matrix() produced for the
# training data -- the likely cause of the NAs and of the feature-name
# mismatch. Encoding with model.matrix(~ 0 + ., data = candidates_predict)
# should mirror the training layout; TODO confirm that its colnames() match
# those of candidates_var_train.
candidates_predict_sparse <- as(as.matrix(candidates_predict), "sparseMatrix")
# Wrap the sparse matrix for xgboost; no label is supplied because the new
# data has no job_change column.
candidates_predict_xgb <- xgb.DMatrix(data = candidates_predict_sparse)
But NAs were introduced in the sparse matrix, and when I try to make a prediction using predict(), the following error occurs:
Error in predict.xgb.Booster(xgb_model, newdata = candidates_predict_sparse, :
Feature names stored in `object` and `newdata` are different!
EDIT: Reproducible Example
minimal datasets:
candidates_predict (dataset I want to have the prediction for)
structure(list(enrollee_id = c(23427, 17605, 20912, 13948, 15205,
15140, 21736, 19800, 23755, 12148), city_development_index = c(0.698,
0.896, 0.754, 0.926, 0.92, 0.878, 0.926, 0.767, 0.689, 0.92),
gender = structure(c(4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), levels = c("Female", "Male", "Other", "keine Angabe"
), class = "factor"), enrolled_university = structure(c(4L,
2L, 1L, 2L, 1L, 3L, 3L, 2L, 2L, 2L), levels = c("Full time course",
"no_enrollment", "Part time course", "keine Angabe"), class = "factor"),
company_size = structure(c(9L, 9L, 9L, 5L, 3L, 9L, 3L, 6L,
2L, 9L), levels = c("<10", "10/49", "100-500", "1000-4999",
"10000+", "50-99", "500-999", "5000-9999", "keine Angabe"
), class = "factor"), company_type = structure(c(7L, 7L,
7L, 6L, 6L, 7L, 6L, 6L, 6L, 7L), levels = c("Early Stage Startup",
"Funded Startup", "NGO", "Other", "Public Sector", "Pvt Ltd",
"keine Angabe"), class = "factor"), last_new_job = structure(c(6L,
6L, 6L, 1L, 1L, 1L, 1L, 1L, 5L, 5L), levels = c("1", "2",
"3", "4", ">4", "never", "keine Angabe"), class = "factor"),
training_hours = c(63, 10, 46, 18, 55, 4, 324, 26, 140, 158
), education_detail = structure(c(8L, 7L, 7L, 21L, 8L, 22L,
7L, 7L, 7L, 19L), levels = c("Graduate Arts", "Graduate Business Degree",
"Graduate Humanities", "Graduate No Major", "Graduate no major discipline",
"Graduate Other", "Graduate STEM", "High School", "keine Angabe",
"Masters Arts", "Masters Business Degree", "Masters Humanities",
"Masters No Major", "Masters no major discipline", "Masters Other",
"Masters STEM", "Phd Arts", "Phd Business Degree", "Phd Humanities",
"Phd Other", "Phd STEM", "Primary School"), class = "factor"),
experience_detail = structure(c(23L, 23L, 23L, 23L, 23L,
21L, 23L, 17L, 10L, 23L), levels = c("<1", ">20", "1", "10",
"11", "12", "13", "14", "15", "16", "17", "18", "19", "2",
"20", "3", "4", "5", "6", "7", "8", "9", "no relevant experience"
), class = "factor")), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
candidates_train (dataset I trained the xgboost model with)
structure(list(enrollee_id = c(26270, 3166, 20087, 8518, 8899,
25403, 14514, 3300, 10364, 5220), city_development_index = c(0.92,
0.887, 0.698, 0.92, 0.92, 0.92, 0.624, 0.84, 0.926, 0.754), gender = structure(c(1L,
2L, 2L, 2L, 4L, 2L, 2L, 4L, 4L, 2L), levels = c("Female", "Male",
"Other", "keine Angabe"), class = "factor"), enrolled_university = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L), levels = c("Full time course",
"no_enrollment", "Part time course", "keine Angabe"), class = "factor"),
company_size = structure(c(7L, 9L, 1L, 9L, 9L, 3L, 9L, 2L,
5L, 9L), levels = c("<10", "10/49", "100-500", "1000-4999",
"10000+", "50-99", "500-999", "5000-9999", "keine Angabe"
), class = "factor"), company_type = structure(c(2L, 7L,
2L, 7L, 7L, 6L, 7L, 6L, 4L, 7L), levels = c("Early Stage Startup",
"Funded Startup", "NGO", "Other", "Public Sector", "Pvt Ltd",
"keine Angabe"), class = "factor"), last_new_job = structure(c(3L,
1L, 1L, 1L, 6L, 1L, 6L, 3L, 5L, 4L), levels = c("1", "2",
"3", "4", ">4", "never", "keine Angabe"), class = "factor"),
training_hours = c(127, 36, 7, 39, 53, 168, 111, 52, 107,
46), job_change = c("Interested", "Not interested", "Not interested",
"Not interested", "Not interested", "Not interested", "Not interested",
"Not interested", "Not interested", "Not interested"), education_detail = structure(c(3L,
7L, 16L, 22L, 22L, 3L, 8L, 7L, 8L, 6L), levels = c("Graduate Arts",
"Graduate Business Degree", "Graduate Humanities", "Graduate No Major",
"Graduate no major discipline", "Graduate Other", "Graduate STEM",
"High School", "keine Angabe", "Masters Arts", "Masters Business Degree",
"Masters Humanities", "Masters No Major", "Masters no major discipline",
"Masters Other", "Masters STEM", "Phd Arts", "Phd Business Degree",
"Phd Humanities", "Phd Other", "Phd STEM", "Primary School"
), class = "factor"), experience_detail = structure(c(17L,
5L, 18L, 23L, 23L, 14L, 23L, 8L, 5L, 2L), levels = c("<1",
">20", "1", "10", "11", "12", "13", "14", "15", "16", "17",
"18", "19", "2", "20", "3", "4", "5", "6", "7", "8", "9",
"no relevant experience"), class = "factor")), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"), na.action = structure(c(`505` = 505L,
`688` = 688L, `1355` = 1355L, `1498` = 1498L, `1594` = 1594L,
`3607` = 3607L, `4897` = 4897L, `5743` = 5743L, `5863` = 5863L,
`5908` = 5908L, `6377` = 6377L, `7449` = 7449L, `7578` = 7578L
), class = "omit"))
candidates_test (dataset I tested the xgboost model with)
structure(list(enrollee_id = c(402, 27107, 8722, 6588, 4167,
19061, 17139, 14928, 10164, 8612), city_development_index = c(0.762,
0.92, 0.624, 0.926, 0.92, 0.926, 0.624, 0.92, 0.926, 0.92), gender = structure(c(2L,
2L, 4L, 2L, 4L, 2L, 4L, 2L, 2L, 4L), levels = c("Female", "Male",
"Other", "keine Angabe"), class = "factor"), enrolled_university = structure(c(2L,
2L, 1L, 2L, 2L, 2L, 3L, 2L, 2L, 2L), levels = c("Full time course",
"no_enrollment", "Part time course", "keine Angabe"), class = "factor"),
company_size = structure(c(1L, 6L, 9L, 2L, 6L, 3L, 7L, 3L,
3L, 9L), levels = c("<10", "10/49", "100-500", "1000-4999",
"10000+", "50-99", "500-999", "5000-9999", "keine Angabe"
), class = "factor"), company_type = structure(c(6L, 6L,
7L, 6L, 6L, 6L, 6L, 6L, 6L, 7L), levels = c("Early Stage Startup",
"Funded Startup", "NGO", "Other", "Public Sector", "Pvt Ltd",
"keine Angabe"), class = "factor"), last_new_job = structure(c(5L,
1L, 6L, 5L, 6L, 2L, 1L, 3L, 4L, 4L), levels = c("1", "2",
"3", "4", ">4", "never", "keine Angabe"), class = "factor"),
training_hours = c(18, 46, 26, 18, 106, 50, 148, 40, 42,
50), job_change = c("Interested", "Interested", "Not interested",
"Not interested", "Not interested", "Not interested", "Interested",
"Not interested", "Interested", "Not interested"), education_detail = structure(c(7L,
7L, 8L, 7L, 7L, 16L, 7L, 7L, 21L, 7L), levels = c("Graduate Arts",
"Graduate Business Degree", "Graduate Humanities", "Graduate No Major",
"Graduate no major discipline", "Graduate Other", "Graduate STEM",
"High School", "keine Angabe", "Masters Arts", "Masters Business Degree",
"Masters Humanities", "Masters No Major", "Masters no major discipline",
"Masters Other", "Masters STEM", "Phd Arts", "Phd Business Degree",
"Phd Humanities", "Phd Other", "Phd STEM", "Primary School"
), class = "factor"), experience_detail = structure(c(7L,
20L, 23L, 10L, 3L, 5L, 8L, 2L, 2L, 23L), levels = c("<1",
">20", "1", "10", "11", "12", "13", "14", "15", "16", "17",
"18", "19", "2", "20", "3", "4", "5", "6", "7", "8", "9",
"no relevant experience"), class = "factor")), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"), na.action = structure(c(`531` = 531L,
`615` = 615L, `715` = 715L, `1000` = 1000L, `1148` = 1148L, `1318` = 1318L,
`1416` = 1416L), class = "omit"))
libraries used
library(Matrix)
library(xgboost)
library(dplyr)
library(readr)