1

I am new to using R. I am using a data set in which the missing values were replaced with "?" before I received the data. I am looking for a way to delete the rows that contain this. It isn't specific to just one column; it appears in all of them.

I have tried the approach from "Delete rows containing specific strings in R", but it isn't working for me. I have included my code so far below.

library(randomForest)
# Load the echocardiogram data set.
#
# The file has no header line: with read.csv's default header = TRUE the first
# patient record was consumed as column names (hence X11, X0, X71, ...), and
# that record was silently lost. Reading with header = FALSE keeps every
# record, and col.names assigns the real names in one step instead of renaming
# the auto-generated ones one at a time.
#
# Missing values are still the literal string "?" at this point.
heart <- read.csv(
  url("http://archive.ics.uci.edu/ml/machine-learning-databases/echocardiogram/echocardiogram.data"),
  header = FALSE,
  col.names = c(
    "survival", "alive", "attackAge", "pericardialEffusion",
    "fractionalShortening", "epss", "lvdd", "wallMotionScore",
    "wallMotionIndex", "mult", "patientName", "group", "aliveAfterYear"
  )
)

# Quick sanity checks on the shape and the column names.
nrow(heart)
ncol(heart)
names(heart)

3 Answers

2
library(randomForest)
# Load the echocardiogram data set with "?" treated as missing.
#
# - na.strings = "?" converts every "?" field to NA while reading, so the
#   affected columns keep their numeric type.
# - header = FALSE keeps the first patient record as data; with the default
#   header = TRUE it was consumed as column names (X11, X0, X71, ...).
# - col.names assigns the final names directly, so no renaming is needed.
heart <- read.csv(
  url("http://archive.ics.uci.edu/ml/machine-learning-databases/echocardiogram/echocardiogram.data"),
  header = FALSE,
  na.strings = "?",
  col.names = c(
    "survival", "alive", "attackAge", "pericardialEffusion",
    "fractionalShortening", "epss", "lvdd", "wallMotionScore",
    "wallMotionIndex", "mult", "patientName", "group", "aliveAfterYear"
  )
)

# Quick sanity checks on the shape and the column names.
nrow(heart)
ncol(heart)
names(heart)

# Drop every row that has at least one missing value.
heart1 <- na.omit(heart)

While importing the file you can pass `na.strings = "?"` so that every "?" is read as NA; afterwards `na.omit()` removes all rows that contain an NA.

Hunaidkhan
  • 1,411
  • 2
  • 11
  • 21
1

I think this can do what you want.

# Read the data without converting strings to factors, so that "?" can be
# compared as plain text. header = FALSE keeps the first patient record as
# data (the file has no header line), and col.names assigns the column names
# while reading.
heart <- read.csv(
  url("http://archive.ics.uci.edu/ml/machine-learning-databases/echocardiogram/echocardiogram.data"),
  header = FALSE,
  stringsAsFactors = FALSE,
  col.names = c(
    "survival", "alive", "attackAge", "pericardialEffusion",
    "fractionalShortening", "epss", "lvdd", "wallMotionScore",
    "wallMotionIndex", "mult", "patientName", "group", "aliveAfterYear"
  )
)

# Substitute "?" with NA, the default missing value in R, one whole column at
# a time instead of visiting every cell in a nested loop. %in% never returns
# NA, so cells that are already missing are left alone.
heart[] <- lapply(heart, function(column) {
  column[column %in% "?"] <- NA
  column
})

# Columns that contained "?" were read as character; now that the markers are
# gone, let R re-detect their types (as.is = TRUE keeps text as character).
heart <- type.convert(heart, as.is = TRUE)

# omit the NA rows
heart <- na.omit(heart)
nkorf
  • 59
  • 7
0

Some libraries support reading csv files and specifying strings to be read as missing values. I use the readr library most often. Then you can just use na.omit and similar functions.

library(readr)
library(dplyr)

# Read the data with readr, treating empty fields and "?" as missing.
# col_names = FALSE is required because the file has no header line; without
# it the first patient record is used as column names and lost. readr then
# names the columns X1, X2, ..., X13 by position.
heart <- read_csv(
  "http://archive.ics.uci.edu/ml/machine-learning-databases/echocardiogram/echocardiogram.data",
  col_names = FALSE,
  na = c("", "?")
)

# Map readr's positional names to descriptive ones. (readr never produces
# base-R style names such as "X11" or "X0.260", so recoding those would leave
# the columns unrenamed.)
colnames(heart) <- recode(
  colnames(heart),
  "X1" = "survival",
  "X2" = "alive",
  "X3" = "attackAge",
  "X4" = "pericardialEffusion",
  "X5" = "fractionalShortening",
  "X6" = "epss",
  "X7" = "lvdd",
  "X8" = "wallMotionScore",
  "X9" = "wallMotionIndex",
  "X10" = "mult",
  "X11" = "patientName",
  "X12" = "group",
  "X13" = "aliveAfterYear"
)

heart

# Drop every row that has at least one missing value.
heart <- na.omit(heart)

(You can also save some typing with the `recode` function from the dplyr package, but your approach to renaming the columns works just as well.)

snaut
  • 2,261
  • 18
  • 37