I tried the answers posted in response to this question, but the error did not change. I am trying to preprocess both the training and test sets in the same way. They come from two different files and I am not sure if my instructor would be alright with me mixing up the sets so preprocessing before splitting them is not really an option. Why does the predict
line work the first time with the training set but not with the test set? the two dataframes should be identical in their structure except for the individual values in the rows and the total number of rows.
##### Load libraries #####
library(readr)
library(caret)
###### Read in data ######
training = read_csv("~/Machine Learning 2/M1/buad5132-m1-training-data.csv")
test = read_csv("~/Machine Learning 2/M1/buad5132-m1-test-data.csv")
##### Preprocessing #####
### Change column classes
#Training
training$INDEX = as.factor(training$INDEX)
training$TARGET_FLAG = as.factor(training$TARGET_FLAG)
training$PARENT1 = as.factor(training$PARENT1)
training$MSTATUS = as.factor(training$MSTATUS)
training$SEX = as.factor(training$SEX)
training$EDUCATION = as.factor(ifelse(training$EDUCATION == "<High School", "Less than High School", training$EDUCATION))
training$JOB = as.factor(training$JOB)
training$CAR_USE = as.factor(training$CAR_USE)
training$CAR_TYPE = as.factor(training$CAR_TYPE)
training$RED_CAR = as.factor(training$RED_CAR)
training$REVOKED = as.factor(training$REVOKED)
training$INCOME = suppressWarnings(as.numeric(gsub("[^0-9.]", "", training$INCOME)))
training$HOME_VAL = suppressWarnings(as.numeric(gsub("[^0-9.]", "", training$HOME_VAL)))
training$OLDCLAIM = suppressWarnings(as.numeric(gsub("[^0-9.]", "", training$HOME_VAL)))
training$BLUEBOOK = suppressWarnings(as.numeric(gsub("[^0-9.]", "", training$BLUEBOOK)))
training$URBANICITY = ifelse(grepl("Urban", training$URBANICITY), "Urban", "Rural")
training$URBANICITY = as.factor(training$URBANICITY)
#Test
test$INDEX = as.factor(test$INDEX)
test$TARGET_FLAG = as.factor(test$TARGET_FLAG)
test$PARENT1 = as.factor(test$PARENT1)
test$MSTATUS = as.factor(test$MSTATUS)
test$SEX = as.factor(test$SEX)
test$EDUCATION = as.factor(ifelse(test$EDUCATION == "<High School", "Less than High School", test$EDUCATION))
test$JOB = as.factor(test$JOB)
test$CAR_USE = as.factor(test$CAR_USE)
test$CAR_TYPE = as.factor(test$CAR_TYPE)
test$RED_CAR = as.factor(test$RED_CAR)
test$REVOKED = as.factor(test$REVOKED)
test$INCOME = suppressWarnings(as.numeric(gsub("[^0-9.]", "", test$INCOME)))
test$HOME_VAL = suppressWarnings(as.numeric(gsub("[^0-9.]", "", test$HOME_VAL)))
test$OLDCLAIM = suppressWarnings(as.numeric(gsub("[^0-9.]", "", test$HOME_VAL)))
test$BLUEBOOK = suppressWarnings(as.numeric(gsub("[^0-9.]", "", test$BLUEBOOK)))
test$URBANICITY = ifelse(grepl("Urban", test$URBANICITY), "Urban", "Rural")
test$URBANICITY = as.factor(test$URBANICITY)
### Dummy variables
#Training
trainDmyParams = dummyVars(~., training[,-c(1,2)])
training.dmy = as.data.frame(predict(trainDmyParams, training[,-c(1,2)]))
training.dmy$TARGET_FLAG = training$TARGET_FLAG
names(training.dmy) = make.names(names(training.dmy))
#Test
testDmyParams = dummyVars(~., test[,-c(1,2)])
test.dmy = as.data.frame(predict(testDmyParams, test[,-c(1,2)]))
test.dmy$TARGET_FLAG = test$TARGET_FLAG
names(test.dmy) = make.names(names(test.dmy))
### Standardization and imputation
#Training
preProcessTrain = preProcess(training.dmy, method = c("center", "scale", "bagImpute"))
training.prepped = predict(preProcessTrain, training.dmy)
#Test
preProcessTest = preProcess(test.dmy, method = c("center", "scale", "bagImpute"))
test.prepped = predict(preProcessTest, test.dmy) # <--- error occurs on this line
Error in UseMethod("predict") : no applicable method for 'predict' applied to an object of class "NULL"