I'm trying to predict price
from a series of real-estate rental variables. The goal is to get a low RMSE but whenever I apply the ranger model to newdata
as opposed to just data
I get the following error: Error in predict.ranger(model_forest5, newdata = train) : Error: Argument 'data' is required for non-quantile prediction.
Below is the minimal amount of code I could provide that would still give context (unfortunately I'm unsure how to give a reproducible sample of the data):
analysisData = read.csv(file = 'analysisData.csv', stringsAsFactors = F)
scoringData = read.csv(file = 'scoringData.csv', stringsAsFactors = F)
scoringData$zipcode <- as.character(scoringData$zipcode)
library(ggplot2)
library(tidyr)
library(dplyr)
library(caret)
library(leaps)
library(tidyverse)
library(leaps)
library(ggthemes)
library(glmnet)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(ranger)
library(randomForest)
library(xgboost)
library(vtreat)
library(fastDummies)
#################### SPLITTING AND PIPING PART ONE ################### STEP 1
set.seed(5656)
ksplit <- createDataPartition(y = analysisData$price, p=.7, list=F, groups=50)
train <- analysisData[ksplit,]
test <- analysisData[-ksplit,]
train$train_test_score <- "train"
test$train_test_score <- "test"
scoringData$train_test_score <- "score"
baseData <- bind_rows(train,test,scoringData)
#################### INITIAL FACTORING #################### STEP 3
baseData$host_response_time = factor(baseData$host_response_time)
baseData$host_is_superhost = factor(baseData$host_is_superhost)
baseData$host_has_profile_pic = factor(baseData$host_has_profile_pic)
baseData$host_identity_verified = factor(baseData$host_identity_verified)
baseData$state = factor(baseData$state)
baseData$market = factor(baseData$market)
baseData$country_code = factor(baseData$country_code)
baseData$is_location_exact = factor(baseData$is_location_exact)
baseData$property_type = factor(baseData$property_type)
baseData$room_type = factor(baseData$room_type)
baseData$bed_type = factor(baseData$bed_type)
baseData$has_availability = factor(baseData$has_availability)
baseData$requires_license = factor(baseData$requires_license)
baseData$instant_bookable = factor(baseData$instant_bookable)
baseData$is_business_travel_ready = factor(baseData$is_business_travel_ready)
baseData$require_guest_profile_picture = factor(baseData$require_guest_profile_picture)
baseData$require_guest_phone_verification = factor(baseData$require_guest_phone_verification)
baseData$cancellation_policy = factor(baseData$cancellation_policy)
#################### CLEANING DATA #################### STEP 4
# dates to years
baseData$host_since = as.numeric(substr(baseData$host_since, 1, 4))
baseData$first_review = as.numeric(substr(baseData$first_review, 1, 4))
baseData$last_review = as.numeric(substr(baseData$last_review, 1, 4))
# fix zipcode
baseData$zipcode = as.numeric(baseData$zipcode)
# analysisData$cancellation_strict <- ifelse(grepl("strict", analysisData$cancellation_policy), "yes", "no")
# length of rules
baseData$house_rules_length = nchar(baseData$house_rules)
# indicating if N/A in case trend tells a specific story
baseData$host_listings_count_exists = factor(ifelse(is.na(baseData$host_listings_count),"no","yes"))
baseData$square_feet_exists = factor(ifelse(is.na(baseData$square_feet),"no","yes"))
baseData$host_total_listings_count_exists = factor(ifelse(is.na(baseData$host_total_listings_count),"no","yes"))
baseData$beds_exists = factor(ifelse(is.na(baseData$beds),"no","yes"))
baseData$weekly_price_exists = factor(ifelse(is.na(baseData$weekly_price),"no","yes"))
baseData$monthly_price_exists = factor(ifelse(is.na(baseData$monthly_price),"no","yes"))
baseData$security_deposit_exists = factor(ifelse(is.na(baseData$security_deposit),"no","yes"))
baseData$cleaning_fee_exists = factor(ifelse(is.na(baseData$cleaning_fee),"no","yes"))
baseData$reviews_per_month_exists = factor(ifelse(is.na(baseData$reviews_per_month),"no","yes"))
baseData$host_response_rate = as.numeric(gsub("([0-9]+).*$", "\\1", baseData$host_response_rate))
# numeric version of variables for use in Ranger (trying to fix error)
baseData$num_accommodates = as.numeric(baseData$accommodates)
baseData$num_room_type = as.numeric(baseData$room_type)
baseData$num_review_scores_rating = as.numeric(baseData$review_scores_rating)
baseData$num_minimum_nights = as.numeric(baseData$minimum_nights)
baseData$num_host_response_time = as.numeric(baseData$host_response_time)
#################### IMPUTATION #################### STEP 5
sapply(baseData, function(x) sum(is.na(x)))
baseData$host_listings_count =
ifelse(is.na(baseData$host_listings_count),
mean(baseData$host_listings_count, na.rm = TRUE),
baseData$host_listings_count)
baseData$host_total_listings_count =
ifelse(is.na(baseData$host_total_listings_count),
mean(baseData$host_total_listings_count, na.rm = TRUE),
baseData$host_total_listings_count)
baseData$beds =
ifelse(is.na(baseData$beds),
mean(baseData$beds, na.rm = TRUE),
baseData$beds)
baseData$square_feet =
ifelse(is.na(baseData$square_feet),
mean(baseData$square_feet, na.rm = TRUE),
baseData$square_feet)
baseData$weekly_price =
ifelse(is.na(baseData$weekly_price),
mean(baseData$weekly_price, na.rm = TRUE),
baseData$weekly_price)
baseData$monthly_price =
ifelse(is.na(baseData$monthly_price),
mean(baseData$monthly_price, na.rm = TRUE),
baseData$monthly_price)
baseData$security_deposit =
ifelse(is.na(baseData$security_deposit),
mean(baseData$security_deposit, na.rm = TRUE),
baseData$security_deposit)
baseData$cleaning_fee =
ifelse(is.na(baseData$cleaning_fee),
mean(baseData$cleaning_fee, na.rm = TRUE),
baseData$cleaning_fee)
baseData$reviews_per_month =
ifelse(is.na(baseData$reviews_per_month),
mean(baseData$reviews_per_month, na.rm = TRUE),
baseData$reviews_per_month)
baseData$host_since =
ifelse(is.na(baseData$host_since),
mean(baseData$host_since, na.rm = TRUE),
baseData$host_since)
baseData$first_review =
ifelse(is.na(baseData$first_review),
mean(baseData$first_review, na.rm = TRUE),
baseData$first_review)
baseData$last_review =
ifelse(is.na(baseData$last_review),
mean(baseData$last_review, na.rm = TRUE),
baseData$last_review)
baseData$host_response_rate =
ifelse(is.na(baseData$host_response_rate),
mean(baseData$host_response_rate, na.rm = TRUE),
baseData$host_response_rate)
baseData$zipcode =
ifelse(is.na(baseData$zipcode),
median(baseData$zipcode, na.rm = TRUE),
baseData$zipcode)
#################### PIPING PART TWO #################### STEP 8
train <- baseData %>%
filter(train_test_score == "train")
test <- baseData %>%
filter(train_test_score == "test")
score <- baseData %>%
filter(train_test_score == "score")
nrow(analysisData); nrow(train); nrow(test); nrow(score);
#################### MODELING - RANGER #################### STEP 13
set.seed(617)
model_forest5 = ranger(price~num_minimum_nights+num_host_response_time+num_review_scores_rating+num_room_type+num_accommodates,
data = train,
num.trees = 1000)
pred_ranger = predict(model_forest5, data = test, num.trees = 1000)
rmse_ranger = sqrt(mean((pred_ranger$predictions-test$price)^2)); rmse_ranger
#################### SCORING #################### STEP 13
pred_train <- predict(model_forest5, newdata=train)
caret::postResample(pred = pred_train, train$price)
# Model Testing
pred_test <- predict(model_forest5, newdata=test)
caret::postResample(pred = pred_train, test$price)
Any help on this would be hugely appreciated. Thanks