0

I'm trying to predict price from a series of real-estate rental variables. The goal is to get a low RMSE, but whenever I call predict() on the ranger model with newdata = instead of data = I get the following error: Error in predict.ranger(model_forest5, newdata = train) : Error: Argument 'data' is required for non-quantile prediction.

Below is the minimal amount of code I could provide that would still give context (unfortunately I'm unsure how to give a reproducible sample of the data):


# Read the labelled analysis set and the scoring set from the working
# directory. Text columns are kept as character (not factor) at this point.
analysisData <- read.csv(file = "analysisData.csv", stringsAsFactors = FALSE)
scoringData <- read.csv(file = "scoringData.csv", stringsAsFactors = FALSE)

# Store the scoring zipcode as character.
# NOTE(review): presumably this matches the type read.csv() infers for
# analysisData$zipcode so the two frames can be row-bound -- confirm.
scoringData$zipcode <- as.character(scoringData$zipcode)

library(ggplot2)
library(tidyr)
library(dplyr)
library(caret)
library(leaps)
library(tidyverse)
library(leaps)
library(ggthemes)
library(glmnet)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(ranger)
library(randomForest)
library(xgboost)
library(vtreat)
library(fastDummies)








####################  SPLITTING AND PIPING PART ONE  ###################     STEP 1
# Partition the labelled data on price with p = 0.7; the selected rows become
# `train` and the remaining rows become `test`. The seed fixes the partition.
set.seed(5656)
ksplit <- createDataPartition(
  y = analysisData$price,
  p = 0.7,
  list = FALSE,
  groups = 50
)
train <- analysisData[ksplit, ]
test <- analysisData[-ksplit, ]

# Tag every row with the set it came from, then stack all three sets into one
# frame so the same transformations can be applied to all of them at once.
train$train_test_score <- "train"
test$train_test_score <- "test"
scoringData$train_test_score <- "score"
baseData <- bind_rows(train, test, scoringData)





####################  INITIAL FACTORING  ####################      STEP 3 
# Columns to convert to factors. The conversion runs on the combined
# baseData, so each factor's levels are taken from all of its rows at once.
factor_cols <- c(
  "host_response_time",
  "host_is_superhost",
  "host_has_profile_pic",
  "host_identity_verified",
  "state",
  "market",
  "country_code",
  "is_location_exact",
  "property_type",
  "room_type",
  "bed_type",
  "has_availability",
  "requires_license",
  "instant_bookable",
  "is_business_travel_ready",
  "require_guest_profile_picture",
  "require_guest_phone_verification",
  "cancellation_policy"
)

# One pass over the listed columns instead of one assignment per column.
baseData[factor_cols] <- lapply(baseData[factor_cols], factor)








####################  CLEANING DATA  ####################       STEP 4
# Dates to years: keep the first four characters of each date column and
# store them as a number.
year_cols <- c("host_since", "first_review", "last_review")
for (year_col in year_cols) {
  baseData[[year_col]] <- as.numeric(substr(baseData[[year_col]], 1, 4))
}

# Fix zipcode: store it as a number (values that do not parse become NA).
baseData$zipcode <- as.numeric(baseData$zipcode)

# analysisData$cancellation_strict <- ifelse(grepl("strict", analysisData$cancellation_policy), "yes", "no")

# Length of rules: character count of the house_rules text.
baseData$house_rules_length <- nchar(baseData$house_rules)

# Missingness indicators: for each source column add "<column>_exists", a
# factor that is "no" where the source value is NA and "yes" otherwise, in
# case the fact that a value is missing tells a story of its own.
na_flag_sources <- c(
  "host_listings_count",
  "square_feet",
  "host_total_listings_count",
  "beds",
  "weekly_price",
  "monthly_price",
  "security_deposit",
  "cleaning_fee",
  "reviews_per_month"
)
for (flag_source in na_flag_sources) {
  baseData[[paste0(flag_source, "_exists")]] <-
    factor(ifelse(is.na(baseData[[flag_source]]), "no", "yes"))
}

# Keep the first run of digits in host_response_rate (dropping everything
# after it) and store the result as a number.
baseData$host_response_rate <- as.numeric(
  gsub("([0-9]+).*$", "\\1", baseData$host_response_rate)
)

# Numeric copies ("num_<column>") of the variables used in the ranger model
# (added while trying to fix the prediction error). For the factor columns
# among them, as.numeric() yields the integer level codes.
numeric_copy_sources <- c(
  "accommodates",
  "room_type",
  "review_scores_rating",
  "minimum_nights",
  "host_response_time"
)
for (copy_source in numeric_copy_sources) {
  baseData[[paste0("num_", copy_source)]] <-
    as.numeric(baseData[[copy_source]])
}








####################  IMPUTATION  ####################       STEP 5
# Number of missing values per column, printed for inspection before imputing.
# vapply() pins the result to one integer per column.
vapply(baseData, function(x) sum(is.na(x)), integer(1))

# Replace the NA entries of `x` with a single summary of its observed values.
#   x      : vector to fill.
#   center : summary function applied to `x`; must accept `na.rm`.
# Returns `x` with every NA replaced by center(x, na.rm = TRUE).
fill_missing <- function(x, center = mean) {
  ifelse(is.na(x), center(x, na.rm = TRUE), x)
}

# Columns whose gaps are filled with the column mean. The mean is computed
# over all of baseData, i.e. train, test and scoring rows together.
# NOTE(review): that lets test/scoring rows influence the values imputed into
# the training rows -- confirm this is intended.
mean_imputed_cols <- c(
  "host_listings_count",
  "host_total_listings_count",
  "beds",
  "square_feet",
  "weekly_price",
  "monthly_price",
  "security_deposit",
  "cleaning_fee",
  "reviews_per_month",
  "host_since",
  "first_review",
  "last_review",
  "host_response_rate"
)
baseData[mean_imputed_cols] <- lapply(baseData[mean_imputed_cols], fill_missing)

# zipcode is the one column filled with the median rather than the mean.
baseData$zipcode <- fill_missing(baseData$zipcode, center = median)



####################  PIPING PART TWO  ####################        STEP 8
# Recover the three original sets from the combined frame using the
# train_test_score tag on each row.
train <- dplyr::filter(baseData, train_test_score == "train")
test <- dplyr::filter(baseData, train_test_score == "test")
score <- dplyr::filter(baseData, train_test_score == "score")

# Print the row counts of the labelled data and of each recovered set.
nrow(analysisData)
nrow(train)
nrow(test)
nrow(score)




####################  MODELING - RANGER  ####################        STEP 13
# Fit a 1000-tree ranger forest for price on the five numeric copies created
# during cleaning. The seed is set immediately before fitting.
set.seed(617)
model_forest5 <- ranger(
  price ~ num_minimum_nights + num_host_response_time +
    num_review_scores_rating + num_room_type + num_accommodates,
  data = train,
  num.trees = 1000
)

# Predict the held-out test rows, then compute and print the test RMSE.
pred_ranger <- predict(model_forest5, data = test, num.trees = 1000)
rmse_ranger <- sqrt(mean((test$price - pred_ranger$predictions)^2))
rmse_ranger


####################  SCORING  ####################        STEP 13
# predict.ranger() takes the new observations through `data`; `newdata` is the
# argument name of predict.lm() and triggers "Argument 'data' is required"
# here. The call returns a prediction object, so the numeric vector is pulled
# out of its `predictions` element before it is handed to postResample().
pred_train <- predict(model_forest5, data = train)$predictions
caret::postResample(pred = pred_train, obs = train$price)

# Model Testing: score the test-set predictions against the test-set prices
# (not the training predictions).
pred_test <- predict(model_forest5, data = test)$predictions
caret::postResample(pred = pred_test, obs = test$price)

Any help on this would be hugely appreciated. Thanks

Sebastian Hubard
  • 163
  • 1
  • 4
  • 18
  • 2
    It's easier to help you if you include a simple [reproducible example](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) with sample input and desired output that can be used to test and verify possible solutions. We don't need to see all your data clean up code, just share the parts necessary to run and test the code. – MrFlick Nov 17 '20 at 18:33
  • 2
    Unlike what you say in the question, the code is neither minimal nor reproducible. Not reproducible because the first two lines give errors, we do not have access to your disk. Not minimal because you don't use most of the packages loaded, for instance, `ggplot2` is not used, and even if you did, `tidyverse` loads `dplyr`, `tidyr`, `ggplot2`. Furthermore, we don't need the entire data cleaning code, what we need starts at `MODELING - RANGER`. Please post the output of `dput(head(train, 30))` and of `dput(head(test, 30))`. – Rui Barradas Nov 17 '20 at 18:34
  • 1
    you have a typo, the argument should be ```predict(model_forest5, data=train)``` as you have done for the others – StupidWolf Nov 17 '20 at 19:30
  • maybe time to organize the code and reduce the redundancies – StupidWolf Nov 17 '20 at 19:30
  • Thanks @StupidWolf that fixed it. I still don't really understand why I used "newdata" for linear regression but "data=" for ranger, but it worked! If you place your comment as an answer I'd be happy to select it as the answer – Sebastian Hubard Nov 17 '20 at 21:04

1 Answers1

1

I'm writing this answer to explain further how you can find the right input argument and why the argument names differ.

When you call predict(model_forest5, data=train), the method that actually runs depends on the class of the object model_forest5. Since it is of class ranger:

# Fit a ranger model on the built-in mtcars data and show the object's class.
mdl <- ranger(mpg ~ ., data = mtcars)
class(mdl)
[1] "ranger"

the function predict.ranger is called. If you check the help manual for predict.ranger :

Arguments:

  object: Ranger ‘ranger’ object.

    data: New test data of class ‘data.frame’ or ‘gwaa.data’ (GenABEL).

So the input should be data=

If you use lm() :

# Fit a linear model on the built-in mtcars data and show the object's class.
mdl <- lm(mpg ~ ., data = mtcars)
class(mdl)
[1] "lm"

The function called is predict.lm, and its argument is in fact newdata =, as you have used.

StupidWolf
  • 45,075
  • 17
  • 40
  • 72