I'm trying to make a binary classification model based on the built-in iris dataset using the glm()
function.
First, filtered the dataset so only Species Versicolor and Virginica are used in the binary classification. I am getting warning messages in the code.
Is there a way to fix the code to get rid of these warning messages? Warning messages are under the ROC.train
code and ggplot()
code. The code is below:
>library(dplyr)
> library(forcats)
> library(ggplot2)
> iris.small <- datasets::iris %>%
+ dplyr::filter(Species != "setosa") %>%
+ dplyr::mutate(Species = fct_drop(Species)) %>%
+ dplyr::group_by(Species) %>%
+ dplyr::summarize(avg_sl = mean(Sepal.Length),
+ avg_sw = mean(Sepal.Width),
+ avg_pl = mean(Petal.Length),
+ avg_pw = mean(Petal.Width))
> set.seed(2016-11-14)
> iris.big <- data_frame(Species = as.factor(c(rep("versicolor", 500), rep("virginica", 500))),
+ sl = c(rnorm(500, iris.small$avg_sl[1]), rnorm(500, iris.small$avg_sl[2])),
+ sw = c(rnorm(500, iris.small$avg_sw[1]), rnorm(500, iris.small$avg_sw[2])),
+ pl = c(rnorm(500, iris.small$avg_pl[1]), rnorm(500, iris.small$avg_pl[2])),
+ pw = c(rnorm(500, iris.small$avg_pw[1]), rnorm(500, iris.small$avg_pw[2])))
> train_fraction <- 0.5 #fraction of data for training purposes
> n_obs <- nrow(iris.big)
> train_size <- floor(train_fraction * nrow(iris.big))
> train_indices <- sample(n_obs, size=train_size, replace=TRUE) #sample(x, size, replace = FALSE, prob = NULL)x Either a (numeric, complex, character or logical) vector of more than one element from which to choose, or a positive integer.size non-negative integer giving the number of items to choose. replace Should sampling be with replacement? prob A vector of probability weights for obtaining the elements of the vector being sampled
> train_data <- iris.big[train_indices, ]
> test_data <- iris.big[-train_indices, ]
> glm.out.train <- glm(Species ~ sl + sw + pl + pw, data=train_data, family = "binomial")
> test_pred <- predict(glm.out.train, test_data, type='response')
> calc_ROC <- function(probabilities, known_truth, model.name=NULL)
+ {
+ outcome <- as.numeric(factor(known_truth))-1
+ pos <- sum(outcome) # total known positives
+ neg <- sum(1-outcome) # total known negatives
+ pos_probs <- outcome*probabilities # probabilities for known positives
+ neg_probs <- (1-outcome)*probabilities # probabilities for known negatives
+ true_pos <- sapply(probabilities,
+ function(x) sum(pos_probs>=x)/pos) # true pos. rate
+ false_pos <- sapply(probabilities,
+ function(x) sum(neg_probs>=x)/neg)
+ if (is.null(model.name))
+ result <- data.frame(true_pos, false_pos)
+ else
+ result <- data.frame(true_pos, false_pos, model.name)
+ result %>% dplyr::arrange(false_pos, true_pos)
+ }
> ROC.train <- calc_ROC(probabilities=test_pred, known_truth=train_data$Species, model.name="train")
Warning messages:
1: In outcome * probabilities :
longer object length is not a multiple of shorter object length
2: In (1 - outcome) * probabilities :
longer object length is not a multiple of shorter object length
> ROC.test <- calc_ROC(probabilities=test_pred, known_truth=test_data$Species, model.name="test")
> ROCs <- rbind(ROC.train, ROC.test)
> ggplot(ROCs, aes(x=false_pos, y=true_pos, color=model.name)) + geom_line() + xlim(0, 0.25)
Warning message:
Removed 745 rows containing missing values (geom_path).
> ROCs %>% dplyr::group_by(model.name) %>% dplyr::mutate(delta=false_pos-lag(false_pos)) %>% dplyr::summarize(AUC=sum(delta*true_pos, na.rm=T)) %>% dplyr::arrange(desc(AUC))
# A tibble: 2 × 2
model.name AUC
<fctr> <dbl>
1 test 0.8700770
2 train 0.7329557
JackStat recommendation:
> library(dplyr)
> library(forcats)
> library(ggplot2)
> iris.small <- datasets::iris %>%
+ dplyr::filter(Species != "setosa") %>%
+ dplyr::mutate(Species = fct_drop(Species)) %>%
+ dplyr::group_by(Species) %>%
+ dplyr::summarize(avg_sl = mean(Sepal.Length),
+ avg_sw = mean(Sepal.Width),
+ avg_pl = mean(Petal.Length),
+ avg_pw = mean(Petal.Width))
> set.seed(2016-11-14)
> iris.big <- data_frame(Species = as.factor(c(rep("versicolor", 500), rep("virginica", 500))),
+ sl = c(rnorm(500, iris.small$avg_sl[1]), rnorm(500, iris.small$avg_sl[2])),
+ sw = c(rnorm(500, iris.small$avg_sw[1]), rnorm(500, iris.small$avg_sw[2])),
+ pl = c(rnorm(500, iris.small$avg_pl[1]), rnorm(500, iris.small$avg_pl[2])),
+ pw = c(rnorm(500, iris.small$avg_pw[1]), rnorm(500, iris.small$avg_pw[2])))
> train_fraction <- 0.5 #fraction of data for training purposes
> n_obs <- nrow(iris.big)
> train_size <- floor(train_fraction * nrow(iris.big))
> train_indices <- sample(n_obs, size=train_size, replace=TRUE) #sample(x, size, replace = FALSE, prob = NULL)x Either a (numeric, complex, character or logical) vector of more than one element from which to choose, or a positive integer.size non-negative integer giving the number of items to choose. replace Should sampling be with replacement? prob A vector of probability weights for obtaining the elements of the vector being sampled
> train_data <- iris.big[train_indices, ]
> test_data <- iris.big[-train_indices, ]
> glm.out.train <- glm(Species ~ sl + sw + pl + pw, data=train_data, family = "binomial")
> test_pred <- predict(glm.out.train, test_data, type='response')
> calc_ROC <- function(probabilities, known_truth, model.name=NULL)
+ {
+ outcome <- as.numeric(factor(known_truth))-1
+ pos <- sum(outcome) # total known positives
+ neg <- sum(1-outcome) # total known negatives
+ pos_probs <- outcome*probabilities # probabilities for known positives
+ neg_probs <- (1-outcome)*probabilities # probabilities for known negatives
+ true_pos <- sapply(probabilities,
+ function(x) sum(pos_probs>=x)/pos) # true pos. rate
+ false_pos <- sapply(probabilities,
+ function(x) sum(neg_probs>=x)/neg)
+ if (is.null(model.name))
+ result <- data.frame(true_pos, false_pos)
+ else
+ result <- data.frame(true_pos, false_pos, model.name)
+ result %>% dplyr::arrange(false_pos, true_pos) eps <- 1e-15; test_pred = pmax(pmin(test_pred, 1 - eps), eps)
Error: unexpected symbol in:
"result <- data.frame(true_pos, false_pos, model.name)
result %>% dplyr::arrange(false_pos, true_pos) eps"
> }
Error: unexpected '}' in "}"
> ROC.train <- calc_ROC(probabilities=test_pred, known_truth=train_data$Species, model.name="train")
Warning messages:
1: In outcome * probabilities :
longer object length is not a multiple of shorter object length
2: In (1 - outcome) * probabilities :
longer object length is not a multiple of shorter object length
> ROC.test <- calc_ROC(probabilities=test_pred, known_truth=test_data$Species, model.name="test")
> ROCs <- rbind(ROC.train, ROC.test)
> ggplot(ROCs, aes(x=false_pos, y=true_pos, color=model.name)) + geom_line() + xlim(0, 0.25)
Warning message:
Removed 745 rows containing missing values (geom_path).
> ROCs %>% dplyr::group_by(model.name) %>% dplyr::mutate(delta=false_pos-lag(false_pos)) %>% dplyr::summarize(AUC=sum(delta*true_pos, na.rm=T)) %>% dplyr::arrange(desc(AUC))
# A tibble: 2 × 2
model.name AUC
<fctr> <dbl>
1 test 0.8700770
2 train 0.7329557
The code is still giving an error message. Am I putting this code in the right place?
eps <- 1e-15; test_pred = pmax(pmin(test_pred, 1 - eps), eps)
I edited the code to exclude 0s and 1s. But, I'm still getting errors. What can I do now to fix the error?
> library(dplyr)
> library(forcats)
> library(ggplot2)
> iris.small <- datasets::iris %>%
+ dplyr::filter(Species != "setosa") %>%
+ dplyr::mutate(Species = fct_drop(Species)) %>%
+ dplyr::group_by(Species) %>%
+ dplyr::summarize(avg_sl = mean(Sepal.Length),
+ avg_sw = mean(Sepal.Width),
+ avg_pl = mean(Petal.Length),
+ avg_pw = mean(Petal.Width))
> set.seed(2016-11-14)
> iris.big <- data_frame(Species = as.factor(c(rep("versicolor", 500), rep("virginica", 500))),
+ sl = c(rnorm(500, iris.small$avg_sl[1]), rnorm(500, iris.small$avg_sl[2])),
+ sw = c(rnorm(500, iris.small$avg_sw[1]), rnorm(500, iris.small$avg_sw[2])),
+ pl = c(rnorm(500, iris.small$avg_pl[1]), rnorm(500, iris.small$avg_pl[2])),
+ pw = c(rnorm(500, iris.small$avg_pw[1]), rnorm(500, iris.small$avg_pw[2])))
> iris.big$sl[iris.big$sl==0] <-0.0000000001
> iris.big$sw[iris.big$sw==0] <-0.0000000001
> iris.big$pl[iris.big$pl==0] <-0.0000000001
> iris.big$pw[iris.big$pw==0] <-0.0000000001
> iris.big$sl[iris.big$sl==1] <-0.99999999
> iris.big$sw[iris.big$sw==1] <-0.99999999
> iris.big$pl[iris.big$pl==1] <-0.99999999
> iris.big$pw[iris.big$pw==1] <-0.99999999
> train_fraction <- 0.5 #fraction of data for training purposes
> n_obs <- nrow(iris.big)
> train_size <- floor(train_fraction * nrow(iris.big))
> train_indices <- sample(n_obs, size=train_size, replace=TRUE) #sample(x, size, replace = FALSE, prob = NULL)x Either a (numeric, complex, character or logical) vector of more than one element from which to choose, or a positive integer.size non-negative integer giving the number of items to choose. replace Should sampling be with replacement? prob A vector of probability weights for obtaining the elements of the vector being sampled
> train_data <- iris.big[train_indices, ]
> test_data <- iris.big[-train_indices, ]
> glm.out.train <- glm(Species ~ sl + sw + pl + pw, data=train_data, family = "binomial")
> test_pred <- predict(glm.out.train, test_data, type='response')
> calc_ROC <- function(probabilities, known_truth, model.name=NULL)
+ {
+ outcome <- as.numeric(factor(known_truth))-1
+ pos <- sum(outcome) # total known positives
+ neg <- sum(1-outcome) # total known negatives
+ pos_probs <- outcome*probabilities # probabilities for known positives
+ neg_probs <- (1-outcome)*probabilities # probabilities for known negatives
+ true_pos <- sapply(probabilities,
+ function(x) sum(pos_probs>=x)/pos) # true pos. rate
+ false_pos <- sapply(probabilities,
+ function(x) sum(neg_probs>=x)/neg)
+ if (is.null(model.name))
+ result <- data.frame(true_pos, false_pos)
+ else
+ result <- data.frame(true_pos, false_pos, model.name)
+ result %>% dplyr::arrange(false_pos, true_pos)
+ }
> ROC.train <- calc_ROC(probabilities=test_pred, known_truth=train_data$Species, model.name="train")
Warning messages:
1: In outcome * probabilities :
longer object length is not a multiple of shorter object length
2: In (1 - outcome) * probabilities :
longer object length is not a multiple of shorter object length
> ROC.test <- calc_ROC(probabilities=test_pred, known_truth=test_data$Species, model.name="test")
> ROCs <- rbind(ROC.train, ROC.test)
> ggplot(ROCs, aes(x=false_pos, y=true_pos, color=model.name)) + geom_line() + xlim(0, 0.25)
Warning message:
Removed 745 rows containing missing values (geom_path).
> ROCs %>% dplyr::group_by(model.name) %>% dplyr::mutate(delta=false_pos-lag(false_pos)) %>% dplyr::summarize(AUC=sum(delta*true_pos, na.rm=T)) %>% dplyr::arrange(desc(AUC))
# A tibble: 2 × 2
model.name AUC
<fctr> <dbl>
1 test 0.8700770
2 train 0.7329557