0

Context: I am trying to predict survival using Titanic disaster dataset from Kaggle.

Dataset train4 used in the code below.

PassengerId Pclass  Sex     Age SibSp   Parch   Fare    Cabin   Sex_F   Survived
   1          3    male      22   1       0     7.25     0        0 
   2          1    female    38   1       0     71.2833  C85      1        1
   3          3    female    26   0       0     7.925    1        1 
   4          1    female    35   1       0     53.1     C123     1        1
   5          3    male      35   0       0     8.05     0        0 
   7          1    male      54   0       0     51.8625  E46      0        0

I am trying to run n times glm on each of 3 models. I have wrapped these 3 models inside one function run_models. Apparently, function gets successfully defined but when I input n in run_models(10) it doesn't run (throws error: In predict.lm(object, newdata, se.fit, scale = 1, type = ifelse(type == : prediction from a rank-deficient fit may be misleading).

BUT it runs if I run directly the for(i in 1:n) part by replacing n by 10. The code can become more flexible if I can automate it without manually adding "n" in each occurence.

Here's the complete code:

library("dplyr")
library("ggplot2")
library("scales")
train<-read.csv("train.csv")
test<-read.csv("test.csv")
attach(train)


#Filtering relevant informations
train2<-train[complete.cases(train),]
train3<-train2[,-c(4,9)]
train3<-train3[,c(1,3:9,2)]


#Dummy variables for Sex 
train4<-mutate(train3,Sex_F = (train3$Sex == "female")*(train3$Sex == "female"))
#Massaging final dataset
train4<-train4[,c(1:8,10,9)]


#Fitting logistic regression
fit1<-glm(Survived~., data = train4, family = binomial(link='logit'))
fit2<-glm(Survived~ Pclass + Age + SibSp + Parch + Fare + Sex_F, data = train4, family = binomial(link='logit'))
fit3<-glm(Survived~ Pclass + Sex_F, data = train4, family = binomial(link='logit'))


models_summary<-data.frame(Model = numeric(),Accuracy = numeric(),Deviance = numeric())

run_models<-function(n){

  for(i in 1:n) 
  {  
    #Making some dataset to check accuracy
    #Model1
    check1<-sample_n(train4,100)
    check_fit1<-data.frame(round(predict(fit1, newdata = data.frame(check1[,-c(10)]),type="response"),0))
    colnames(check_fit1)<-c("Survival_predicted")

    final1<-cbind(check1[,c(1,10)],check_fit1[,1])
    colnames(final1)<-c("Passenger ID","Survived","Survival_predicted")

    accuracy1<-1 - nrow(final1[which(final1$Survived!=final1$Survival_predicted),])/nrow(final1)


    #Model2
    check2<-sample_n(train4[,-c(3,8)],100)
    check_fit2<-data.frame(round(predict(fit2, newdata = data.frame(check2[,-c(8)]),type="response"),0))
    colnames(check_fit2)<-c("Survival_predicted")

    final2<-cbind(check2[,c(1,8)],check_fit2[,1])
    colnames(final2)<-c("Passenger ID","Survived","Survival_predicted")

    accuracy2<-1 - nrow(final2[which(final2$Survived!=final2$Survival_predicted),])/nrow(final2)


    #Model3
    check3<-sample_n(train4[,c(1,2,9,10)],100)
    check_fit3<-data.frame(round(predict(fit3, newdata = data.frame(check3[,c(1:3)]),type="response"),0))
    colnames(check_fit3)<-c("Survival_predicted")

    final3<-cbind(check3[,c(1,4)],check_fit3[,1])
    colnames(final3)<-c("Passenger ID","Survived","Survival_predicted")

    accuracy3<-1 - nrow(final3[which(final3$Survived!=final3$Survival_predicted),])/nrow(final3)


    #Summary
    models_summary_TEMP<-data.frame(c("1","2","3"),c(accuracy1,accuracy2,accuracy3),c(fit1$deviance,fit2$deviance,fit3$deviance))
    colnames(models_summary_TEMP)<-c("Model","Accuracy","Deviance")
    models_summary<-rbind(models_summary,models_summary_TEMP)
    models_summary_TEMP<-data.frame(Model = numeric(),Accuracy = numeric(),Deviance = numeric())
  }
}

run_models(10)
Scott Grammilo
  • 1,229
  • 4
  • 16
  • 37
  • 1
    Please make your example reproducible. You are calling objects from workspace and there's almost no way of telling what's going on. – Roman Luštrik May 26 '17 at 19:18
  • dataset added !! Since it can run manually, I think wrapping loop for GLMs inside a user defined function is something tricky? possible? – Scott Grammilo May 26 '17 at 19:33
  • Can you make the example so that it's easy-to-paste? See [here](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) for tips on how to do that. – Roman Luštrik May 27 '17 at 06:50

0 Answers0