
I'm trying to build a binary classification model on the built-in iris dataset using the glm() function.

First, I filtered the dataset so that only the Species versicolor and virginica are used in the binary classification. I am getting warning messages when I run the code.

Is there a way to fix the code so these warning messages go away? The warnings appear after the ROC.train call and after the ggplot() call. The code is below:

>library(dplyr)
> library(forcats)
> library(ggplot2)
> iris.small <- datasets::iris %>%
+ dplyr::filter(Species != "setosa") %>%
+ dplyr::mutate(Species = fct_drop(Species)) %>%
+ dplyr::group_by(Species) %>%
+ dplyr::summarize(avg_sl = mean(Sepal.Length),
+ avg_sw = mean(Sepal.Width),
+ avg_pl = mean(Petal.Length),
+ avg_pw = mean(Petal.Width))
> set.seed(2016-11-14)
> iris.big <- data_frame(Species = as.factor(c(rep("versicolor", 500), rep("virginica", 500))),
+ sl = c(rnorm(500, iris.small$avg_sl[1]), rnorm(500, iris.small$avg_sl[2])),
+ sw = c(rnorm(500, iris.small$avg_sw[1]), rnorm(500, iris.small$avg_sw[2])),
+ pl = c(rnorm(500, iris.small$avg_pl[1]), rnorm(500, iris.small$avg_pl[2])),
+ pw = c(rnorm(500, iris.small$avg_pw[1]), rnorm(500, iris.small$avg_pw[2])))
> train_fraction <- 0.5 #fraction of data for training purposes
> n_obs <- nrow(iris.big)
> train_size <- floor(train_fraction * nrow(iris.big))
> train_indices <- sample(n_obs, size=train_size, replace=TRUE)  # sample(x, size, replace, prob): draw 'size' items from x, with replacement when replace=TRUE
> train_data <- iris.big[train_indices, ]
> test_data <- iris.big[-train_indices, ]
> glm.out.train <- glm(Species ~ sl + sw + pl + pw, data=train_data, family = "binomial")
> test_pred <- predict(glm.out.train, test_data, type='response')
> calc_ROC <- function(probabilities, known_truth, model.name=NULL)
+ {
+ outcome <- as.numeric(factor(known_truth))-1
+ pos <- sum(outcome) # total known positives
+ neg <- sum(1-outcome) # total known negatives
+ pos_probs <- outcome*probabilities # probabilities for known positives
+ neg_probs <- (1-outcome)*probabilities # probabilities for known negatives
+ true_pos <- sapply(probabilities,
+ function(x) sum(pos_probs>=x)/pos) # true pos. rate
+ false_pos <- sapply(probabilities,
+ function(x) sum(neg_probs>=x)/neg)
+ if (is.null(model.name))
+ result <- data.frame(true_pos, false_pos)
+ else
+ result <- data.frame(true_pos, false_pos, model.name)
+ result %>% dplyr::arrange(false_pos, true_pos)
+ }
> ROC.train <- calc_ROC(probabilities=test_pred, known_truth=train_data$Species, model.name="train")
Warning messages:
1: In outcome * probabilities :
  longer object length is not a multiple of shorter object length
2: In (1 - outcome) * probabilities :
  longer object length is not a multiple of shorter object length
> ROC.test <- calc_ROC(probabilities=test_pred, known_truth=test_data$Species, model.name="test")
> ROCs <- rbind(ROC.train, ROC.test)
> ggplot(ROCs, aes(x=false_pos, y=true_pos, color=model.name)) + geom_line() + xlim(0, 0.25)
Warning message:
Removed 745 rows containing missing values (geom_path). 
> ROCs %>% dplyr::group_by(model.name) %>% dplyr::mutate(delta=false_pos-lag(false_pos)) %>% dplyr::summarize(AUC=sum(delta*true_pos, na.rm=T)) %>% dplyr::arrange(desc(AUC))
# A tibble: 2 × 2
  model.name       AUC
      <fctr>     <dbl>
1       test 0.8700770
2      train 0.7329557
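
For reference, these recycling warnings come from multiplying two vectors of different lengths inside calc_ROC(). A minimal check, assuming the objects from the run above, shows the mismatch:

```r
# Why "longer object length is not a multiple of shorter object length" appears:
# calc_ROC() computes outcome * probabilities, and the two vectors differ in length here.
length(test_pred)           # predictions were made on the test_data rows
length(train_data$Species)  # but the truth passed to ROC.train comes from train_data
# R recycles the shorter vector and warns when the longer length is not an
# exact multiple of the shorter one.
```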

Here is the code with JackStat's recommendation added:

> library(dplyr)
> library(forcats)
> library(ggplot2)
> iris.small <- datasets::iris %>%
+ dplyr::filter(Species != "setosa") %>%
+ dplyr::mutate(Species = fct_drop(Species)) %>%
+ dplyr::group_by(Species) %>%
+ dplyr::summarize(avg_sl = mean(Sepal.Length),
+ avg_sw = mean(Sepal.Width),
+ avg_pl = mean(Petal.Length),
+ avg_pw = mean(Petal.Width))
> set.seed(2016-11-14)
> iris.big <- data_frame(Species = as.factor(c(rep("versicolor", 500), rep("virginica", 500))),
+ sl = c(rnorm(500, iris.small$avg_sl[1]), rnorm(500, iris.small$avg_sl[2])),
+ sw = c(rnorm(500, iris.small$avg_sw[1]), rnorm(500, iris.small$avg_sw[2])),
+ pl = c(rnorm(500, iris.small$avg_pl[1]), rnorm(500, iris.small$avg_pl[2])),
+ pw = c(rnorm(500, iris.small$avg_pw[1]), rnorm(500, iris.small$avg_pw[2])))
> train_fraction <- 0.5 #fraction of data for training purposes
> n_obs <- nrow(iris.big)
> train_size <- floor(train_fraction * nrow(iris.big))
> train_indices <- sample(n_obs, size=train_size, replace=TRUE)  # sample(x, size, replace, prob): draw 'size' items from x, with replacement when replace=TRUE
> train_data <- iris.big[train_indices, ]
> test_data <- iris.big[-train_indices, ]
> glm.out.train <- glm(Species ~ sl + sw + pl + pw, data=train_data, family = "binomial")
> test_pred <- predict(glm.out.train, test_data, type='response')
> calc_ROC <- function(probabilities, known_truth, model.name=NULL)
+ {
+ outcome <- as.numeric(factor(known_truth))-1
+ pos <- sum(outcome) # total known positives
+ neg <- sum(1-outcome) # total known negatives
+ pos_probs <- outcome*probabilities # probabilities for known positives
+ neg_probs <- (1-outcome)*probabilities # probabilities for known negatives
+ true_pos <- sapply(probabilities,
+ function(x) sum(pos_probs>=x)/pos) # true pos. rate
+ false_pos <- sapply(probabilities,
+ function(x) sum(neg_probs>=x)/neg)
+ if (is.null(model.name))
+ result <- data.frame(true_pos, false_pos)
+ else
+ result <- data.frame(true_pos, false_pos, model.name)
+ result %>% dplyr::arrange(false_pos, true_pos) eps <- 1e-15; test_pred = pmax(pmin(test_pred, 1 - eps), eps)
Error: unexpected symbol in:
"result <- data.frame(true_pos, false_pos, model.name)
result %>% dplyr::arrange(false_pos, true_pos) eps"
> }
Error: unexpected '}' in "}"
> ROC.train <- calc_ROC(probabilities=test_pred, known_truth=train_data$Species, model.name="train")
Warning messages:
1: In outcome * probabilities :
  longer object length is not a multiple of shorter object length
2: In (1 - outcome) * probabilities :
  longer object length is not a multiple of shorter object length
> ROC.test <- calc_ROC(probabilities=test_pred, known_truth=test_data$Species, model.name="test")
> ROCs <- rbind(ROC.train, ROC.test)
> ggplot(ROCs, aes(x=false_pos, y=true_pos, color=model.name)) + geom_line() + xlim(0, 0.25)
Warning message:
Removed 745 rows containing missing values (geom_path). 
> ROCs %>% dplyr::group_by(model.name) %>% dplyr::mutate(delta=false_pos-lag(false_pos)) %>% dplyr::summarize(AUC=sum(delta*true_pos, na.rm=T)) %>% dplyr::arrange(desc(AUC))
# A tibble: 2 × 2
  model.name       AUC
      <fctr>     <dbl>
1       test 0.8700770
2      train 0.7329557

The code is still giving an error message. Am I putting this code in the right place?

eps <- 1e-15; test_pred = pmax(pmin(test_pred, 1 - eps), eps)
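
For context, here is a hedged sketch (not a definitive fix) of one way the suggested line could be placed: on its own line near the top of calc_ROC(), clamping the probabilities argument the function receives rather than test_pred. The "unexpected symbol" error above comes from pasting the clamp onto the same line as the arrange() call without a newline or semicolon between the statements.

```r
# Sketch: the clamp on its own line inside calc_ROC(), applied to the
# `probabilities` argument (an assumption about the intent of the suggestion).
calc_ROC <- function(probabilities, known_truth, model.name = NULL) {
  eps <- 1e-15
  probabilities <- pmax(pmin(probabilities, 1 - eps), eps)  # keep values strictly inside (0, 1)

  outcome <- as.numeric(factor(known_truth)) - 1
  pos <- sum(outcome)                          # total known positives
  neg <- sum(1 - outcome)                      # total known negatives
  pos_probs <- outcome * probabilities         # probabilities for known positives
  neg_probs <- (1 - outcome) * probabilities   # probabilities for known negatives
  true_pos  <- sapply(probabilities, function(x) sum(pos_probs >= x) / pos)  # true positive rate
  false_pos <- sapply(probabilities, function(x) sum(neg_probs >= x) / neg)  # false positive rate
  if (is.null(model.name)) {
    result <- data.frame(true_pos, false_pos)
  } else {
    result <- data.frame(true_pos, false_pos, model.name)
  }
  dplyr::arrange(result, false_pos, true_pos)
}
```

Note that the clamp only changes probabilities that are exactly 0 or 1; the recycling warnings come from a length mismatch between the probabilities and the known truths, so the clamp alone would not remove them.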

I edited the code to exclude 0s and 1s, but I'm still getting the same warnings. What can I do now to fix them?

> library(dplyr)
> library(forcats)
> library(ggplot2)
> iris.small <- datasets::iris %>%
+ dplyr::filter(Species != "setosa") %>%
+ dplyr::mutate(Species = fct_drop(Species)) %>%
+ dplyr::group_by(Species) %>%
+ dplyr::summarize(avg_sl = mean(Sepal.Length),
+ avg_sw = mean(Sepal.Width),
+ avg_pl = mean(Petal.Length),
+ avg_pw = mean(Petal.Width))
> set.seed(2016-11-14)
> iris.big <- data_frame(Species = as.factor(c(rep("versicolor", 500), rep("virginica", 500))),
+ sl = c(rnorm(500, iris.small$avg_sl[1]), rnorm(500, iris.small$avg_sl[2])),
+ sw = c(rnorm(500, iris.small$avg_sw[1]), rnorm(500, iris.small$avg_sw[2])),
+ pl = c(rnorm(500, iris.small$avg_pl[1]), rnorm(500, iris.small$avg_pl[2])),
+ pw = c(rnorm(500, iris.small$avg_pw[1]), rnorm(500, iris.small$avg_pw[2])))
> iris.big$sl[iris.big$sl==0] <-0.0000000001
> iris.big$sw[iris.big$sw==0] <-0.0000000001
> iris.big$pl[iris.big$pl==0] <-0.0000000001
> iris.big$pw[iris.big$pw==0] <-0.0000000001
> iris.big$sl[iris.big$sl==1] <-0.99999999
> iris.big$sw[iris.big$sw==1] <-0.99999999
> iris.big$pl[iris.big$pl==1] <-0.99999999
> iris.big$pw[iris.big$pw==1] <-0.99999999
> train_fraction <- 0.5 #fraction of data for training purposes
> n_obs <- nrow(iris.big)
> train_size <- floor(train_fraction * nrow(iris.big))
> train_indices <- sample(n_obs, size=train_size, replace=TRUE)  # sample(x, size, replace, prob): draw 'size' items from x, with replacement when replace=TRUE
> train_data <- iris.big[train_indices, ]
> test_data <- iris.big[-train_indices, ]
> glm.out.train <- glm(Species ~ sl + sw + pl + pw, data=train_data, family = "binomial")
> test_pred <- predict(glm.out.train, test_data, type='response')
> calc_ROC <- function(probabilities, known_truth, model.name=NULL)
+ {
+ outcome <- as.numeric(factor(known_truth))-1
+ pos <- sum(outcome) # total known positives
+ neg <- sum(1-outcome) # total known negatives
+ pos_probs <- outcome*probabilities # probabilities for known positives
+ neg_probs <- (1-outcome)*probabilities # probabilities for known negatives
+ true_pos <- sapply(probabilities,
+ function(x) sum(pos_probs>=x)/pos) # true pos. rate
+ false_pos <- sapply(probabilities,
+ function(x) sum(neg_probs>=x)/neg)
+ if (is.null(model.name))
+ result <- data.frame(true_pos, false_pos)
+ else
+ result <- data.frame(true_pos, false_pos, model.name)
+ result %>% dplyr::arrange(false_pos, true_pos)
+ }
> ROC.train <- calc_ROC(probabilities=test_pred, known_truth=train_data$Species, model.name="train")
Warning messages:
1: In outcome * probabilities :
  longer object length is not a multiple of shorter object length
2: In (1 - outcome) * probabilities :
  longer object length is not a multiple of shorter object length
> ROC.test <- calc_ROC(probabilities=test_pred, known_truth=test_data$Species, model.name="test")
> ROCs <- rbind(ROC.train, ROC.test)
> ggplot(ROCs, aes(x=false_pos, y=true_pos, color=model.name)) + geom_line() + xlim(0, 0.25)
Warning message:
Removed 745 rows containing missing values (geom_path). 
> ROCs %>% dplyr::group_by(model.name) %>% dplyr::mutate(delta=false_pos-lag(false_pos)) %>% dplyr::summarize(AUC=sum(delta*true_pos, na.rm=T)) %>% dplyr::arrange(desc(AUC))
# A tibble: 2 × 2
  model.name       AUC
      <fctr>     <dbl>
1       test 0.8700770
2      train 0.7329557
  • Versicolor and Virginica are linearly separable classes; logistic regression doesn't like this. See linked question. – Hong Ooi Nov 15 '16 at 16:56
  • @HongOoi This does not appear to be a convergence problem. It is a typo with `ROC.train` – Pierre L Nov 15 '16 at 17:01
  • This is not an exact duplicate – JackStat Nov 15 '16 at 17:59
  • You need to adjust your calculation to not include any 0s or 1s. They are causing the warnings. You should make them .0001 or .9999 or something – JackStat Nov 15 '16 at 18:01
  • @JackStat How can I adjust the calculation to exclude 0 and 1 values? – Coco4ML Nov 15 '16 at 18:08
  • ```eps <- 1e-15; test_pred = pmax(pmin(test_pred, 1 - eps), eps)``` – JackStat Nov 15 '16 at 18:10
  • Answering in comments is not optimal... – JackStat Nov 15 '16 at 18:11
  • @JackStat I'll put the code in the question above with the rest of it. Should I put the code somewhere in the ROC.train function or somewhere above it? – Coco4ML Nov 15 '16 at 18:26
  • You should probably put it inside the function – JackStat Nov 15 '16 at 18:30
  • @JackStat I put the code in the function. I still get a warning message. – Coco4ML Nov 15 '16 at 18:46
  • @PierreLafortune Hi Pierre, Where is the typo in ROC.train? – Coco4ML Nov 15 '16 at 19:57
  • The lengths won't match in this expression `calc_ROC(probabilities=test_pred, known_truth=train_data$Species, model.name="train")`. The length of test and train are different, so it doesn't make sense. – Pierre L Nov 15 '16 at 19:59
  • @PierreLafortune I split my data into training and testing subsets. I indicated on the code that .5 of the data is for training and 0.5 is for testing. Shouldn't that equal out the length of the test and train data? – Coco4ML Nov 15 '16 at 20:08
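
Following Pierre L's comments above, here is a hedged sketch of what length-matched calls might look like. It assumes a new object, train_pred, holding predictions on the training rows; that name does not appear in the original code.

```r
# Sketch: pass predictions and known truths from the same data set, so the
# vectors multiplied inside calc_ROC() have equal lengths.
train_pred <- predict(glm.out.train, train_data, type = "response")  # hypothetical new object

ROC.train <- calc_ROC(probabilities = train_pred,
                      known_truth   = train_data$Species,
                      model.name    = "train")
ROC.test  <- calc_ROC(probabilities = test_pred,
                      known_truth   = test_data$Species,
                      model.name    = "test")
```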
