1

I have a huge text file. I would like to extract the blocks of rows whose row indexes are defined in another data frame, such as `sub`. I have a loop script for it, but I would like to find a more efficient way (preferably without looping) to do this task. Here is my toy example:

# Toy data: 10,000 random values with an explicit row-index column.
df <- data.frame(value = runif(10000, 0, 10^3))
df$idx <- seq_len(nrow(df))  # seq_len() is safe even for zero-row frames
# Each row of `sub` is an inclusive [start, end] range of idx values to keep.
sub <- data.frame(start = c(20, 50, 130, 2000), end = c(25, 60, 150, 2030))

# Collect each block in a list, then bind once at the end.
# (rbind() inside the loop copies the accumulator on every iteration - O(n^2).)
blocks <- lapply(seq_len(nrow(sub)), function(j) {
  df[df$idx >= sub$start[j] & df$idx <= sub$end[j], ]
})
sub_data <- do.call(rbind, blocks)

sub_data
user3354212
  • 1,048
  • 8
  • 19

4 Answers

4


Here is one solution with data.table using non equi join available since v1.9.8

# Non-equi ("range") join: for each (start, end) row of `sub`, pull every row
# of `dt` whose idx falls inside the closed interval [start, end].
library(data.table)
dt <- data.table(value=runif(10000, 0, 10^3))
# add index column
dt[, idx := seq_len(.N)]
# create subset table
sub <- data.table(start=c(20,50,130,2000),end=c(25,60,150,2030))
# use data.table non equijoin
# NOTE: in a non-equi join the join columns take the values from the i-table
# (`sub`), so in dt1 the column named `idx` actually holds `start` and `idx.1`
# holds `end` - the matched row's own index is not shown.
dt1 <- dt[sub, on = .(idx >= start, idx <= end)]
head(dt1)
#>        value idx idx.1
#> 1: 820.38637  20    25
#> 2: 262.51398  20    25
#> 3: 900.37408  20    25
#> 4:  74.91815  20    25
#> 5: 507.87825  20    25
#> 6: 547.45235  20    25
# use data.table non equi join but just keep column from dt
# x.idx refers to the idx column of the x-table (dt), recovering the true
# per-row index in the output.
dt2 <- dt[sub, .(value, idx = x.idx), on = .(idx >= start, idx <= end)]
head(dt2)
#>        value idx
#> 1: 820.38637  20
#> 2: 262.51398  21
#> 3: 900.37408  22
#> 4:  74.91815  23
#> 5: 507.87825  24
#> 6: 547.45235  25
cderv
  • 6,272
  • 1
  • 21
  • 31
2

Here is a solution that creates the sequence of all wanted ids, and then subsets df based on that sequence. df2 is the final output.

# Expand each (start, end) pair into its full run of indexes in one pass.
# Map(`:`, ...) iterates the two columns in parallel, avoiding the unsafe
# 1:nrow(sub) pattern (which yields c(1, 0) when sub has zero rows).
IDs <- unlist(Map(`:`, sub$start, sub$end), use.names = FALSE)

# Single vectorized subset - no loop over blocks.
df2 <- df[df$idx %in% IDs, ]

Or we can use functions from tidyverse.

library(tidyverse)

# Build a one-row-per-idx lookup table: map2() creates a list-column of
# start:end sequences, unnest() flattens it. Naming the column in unnest()
# avoids the deprecated bare unnest() call (tidyr >= 1.0).
sub2 <- sub %>%
  mutate(idx = map2(start, end, `:`)) %>%
  unnest(cols = idx)

# semi_join() keeps rows of df with a matching idx, without adding columns.
df2 <- df %>% semi_join(sub2, by = "idx")
www
  • 38,575
  • 12
  • 48
  • 84
2

Subset the relevant portion of df for each row of sub, so that the subgroups are collected in a list, and then rbind the subgroups together:

# One subset per (start, end) row, gathered in a list and bound in a single
# rbind call. seq_len(NROW(sub)) is safe for zero-row sub (1:NROW gives c(1, 0)).
# NOTE: this indexes by row position, so it assumes df$idx equals the row number.
output <- do.call(rbind, lapply(seq_len(NROW(sub)), function(i) with(sub, df[start[i]:end[i],])))
identical(sub_data, output)
#[1] TRUE
d.b
  • 32,245
  • 6
  • 36
  • 77
1

As you mention that you have a huge text file,

I suggest using data.table's fread and rbindlist functions, so that only the required rows are ever read from disk:

dt_div_conquer <- function(loc, id_name, subset_id){
  # Read only the blocks of rows whose ID is in `subset_id` from a large
  # delimited file, without ever loading the whole file into memory.
  #
  # loc       : file location (path)
  # id_name   : name of the ID column in the file - used for filtering
  # subset_id : vector of IDs to keep
  #
  # Returns a data.table with the filtered rows.

  ## Read just the ID column from the file (cheap relative to the whole file)
  v <- fread(loc, select = id_name)

  ## Logical mask of rows to keep; run-length encoding turns it into
  ## contiguous TRUE blocks. (`runs`, not `seq`, to avoid shadowing base::seq.)
  keep <- v[[id_name]] %in% subset_id
  runs <- rle(keep)
  starts <- c(0, cumsum(runs$lengths))[which(runs$values)] + 1

  ## One row per contiguous block: starting data-row number and block length.
  ## skip = start works because the header line offsets the data rows by one.
  indx <- data.frame(start = starts, length = runs$lengths[which(runs$values)])

  ## Column names must be re-applied: with skip > 0 fread never sees the header.
  col_names <- names(fread(loc, nrows = 0))

  ## fread each block and bind them. rbindlist() takes the list directly -
  ## do.call(rbindlist, list_of_tables) would spread the tables across
  ## rbindlist's other arguments (use.names, fill, ...) and fail.
  result <- rbindlist(lapply(seq_len(nrow(indx)), function(i) {
    fread(loc, nrows = indx$length[i], skip = indx$start[i],
          header = FALSE, col.names = col_names)
  }))
  return(result)
}
sairaamv
  • 86
  • 4