0

I have a two dataframes - one is the base dataframe and the other the query dataframe.

Base Dataframe (base_df):

    Mon   Tue   Wed   Thu   Fri   Sat
A  5.23  0.01  6.81  8.67  0.10  6.21
B  6.26  2.19  4.28  5.57  0.16  2.81
C  7.41  2.63  4.32  6.57  0.20  1.69
D  6.17  1.50  5.30  9.22  2.19  5.47
E  1.23  9.01  8.09  1.29  7.65  4.57

Query Dataframe (query_df):

Person  Start  End
     A    Tue  Thu
     C    Mon  Wed
     D    Thu  Sat
     C    Thu  Sat
     B    Wed  Fri

I want to extract all the observations for a particular person between the start and end days. The difference between start and end days is always three (inclusive of start and end days).

Hence the output wanted is:

Person  Start  End     D1    D2    D3
     A    Tue  Thu   0.01  6.81  8.67
     C    Mon  Wed   7.41  2.63  4.32
     D    Thu  Sat   9.22  2.19  5.47
     C    Thu  Sat   6.57  0.20  1.69
     B    Wed  Fri   4.28  5.57  0.16

I want to avoid a loop because the actual base_df is more than 35000 rows. Is there a data.table solution? Solutions using other data structures are good too. Thank you!

Jaap
  • 81,064
  • 34
  • 182
  • 193
phil_t
  • 851
  • 2
  • 7
  • 17

5 Answers5

3

Another base R solution, using mapply...

query_df <- cbind(query_df,
                  t(mapply(function(p,s,e) {
                     base_df[p, match(s, names(base_df)):match(e, names(base_df))]},
                           query_df$Person,
                           query_df$Start,
                           query_df$End)))
names(query_df)[4:6] <- c("D1", "D2", "D3")

query_df
  Person Start End   D1   D2   D3
1      A   Tue Thu 0.01 6.81 8.67
2      C   Mon Wed 7.41 2.63 4.32
3      D   Thu Sat 9.22 2.19 5.47
4      C   Thu Sat 6.57  0.2 1.69
5      B   Wed Fri 4.28 5.57 0.16
Andrew Gustar
  • 17,295
  • 1
  • 22
  • 32
  • 1
    To remove the second line, you could do `query_df[paste0("D", 1:3)] <- t(mappply(...`. – lmo Sep 22 '17 at 13:02
2

The data.table solution below should be working also for varying numbers of days between Start and End days (not just 3 day periods) thanks to a non-equi join and melt() / dcast() for reshaping:

library(data.table)
setDT(base_df)
setDT(query_df)

# reshape from wide to long
long <- melt(base_df, id.vars = "Person", variable.name = "Day")

# align factor levels
cols <- c("Start", "End")
query_df[, (cols) := lapply(.SD, factor, levels = levels(long$Day)), .SDcols = cols][
  # add row id because Person is not unique
  , rn := .I]

# non-equi join right join, i.e., take all rows of query_df
long[query_df, on = .(Person, Day >= Start, Day <= End), 
     .(rn, Person, Start = i.Start, End = i.End, value)][
       # reshape from long to wide
       , dcast(.SD, rn + Person + ... ~ rowid(rn, prefix = "D"))]
   rn Person Start End   D1   D2   D3
1:  1      A   Tue Thu 0.01 6.81 8.67
2:  2      C   Mon Wed 7.41 2.63 4.32
3:  3      D   Thu Sat 9.22 2.19 5.47
4:  4      C   Thu Sat 6.57 0.20 1.69
5:  5      B   Wed Fri 4.28 5.57 0.16

Note that Day is a factor with the names of weekdays as factor levels in order of appearance:

 str(long)
Classes ‘data.table’ and 'data.frame':    30 obs. of  3 variables:
 $ Person: chr  "A" "B" "C" "D" ...
 $ Day   : Factor w/ 6 levels "Mon","Tue","Wed",..: 1 1 1 1 1 2 2 2 2 2 ...
 $ value : num  5.23 6.26 7.41 6.17 1.23 0.01 2.19 2.63 1.5 9.01 ...
 - attr(*, ".internal.selfref")=<externalptr>

Aligned factor levels are crucial for the non-equi join.

Data

library(data.table)

base_df <- fread(
  "Person    Mon   Tue   Wed   Thu   Fri   Sat
A  5.23  0.01  6.81  8.67  0.10  6.21
B  6.26  2.19  4.28  5.57  0.16  2.81
C  7.41  2.63  4.32  6.57  0.20  1.69
D  6.17  1.50  5.30  9.22  2.19  5.47
E  1.23  9.01  8.09  1.29  7.65  4.57"
)

query_df <- fread(
  "Person  Start  End
  A    Tue  Thu
  C    Mon  Wed
  D    Thu  Sat
  C    Thu  Sat
  B    Wed  Fri"
)
Community
  • 1
  • 1
Uwe
  • 41,420
  • 11
  • 90
  • 134
1

data.table solution:

Here I use get to extract columns (e.g. Mon) from a data.table object.

library(data.table)
# Prepare data
base_df$Person <- rownames(base_df)
d <- merge(query_df, base_df, "Person", sort = FALSE)
setDT(d)

# Extract mid day (day between start and end)
d[, Mid := days[which(Start == days) + 1], 1:nrow(d)]
# Extract columns using get
d[, .(Person, Start, End, 
      D1 = get(Start), D2 = get(Mid), D3 = get(End)), 1:nrow(d)][, nrow := NULL][]

   Person Start End   D1   D2   D3
1:      A   Tue Thu 0.01 6.81 8.67
2:      C   Mon Wed 7.41 2.63 4.32
3:      D   Thu Sat 9.22 2.19 5.47
4:      C   Thu Sat 6.57 0.20 1.69
5:      B   Wed Fri 4.28 5.57 0.16

Base R solution:

# Order of days
days <- names(base_df)
# Order of persons
subjects <- rownames(base_df)

res <- apply(query_df, 1, function(x) {
    # Extract observation between start:end date
    foo <- base_df[x[1] == subjects, which(x[2] == days):which(x[3] == days)]
    colnames(foo) <- paste0("D", 1:3)
    foo})
# Merge with original query_df
res <- cbind(query_df, do.call("rbind", res))
rownames(res) <- NULL
res
pogibas
  • 27,303
  • 19
  • 84
  • 117
1

A tidyverse answer

I reshape base_df, then join and slice the correct days, then reshape back.

library(tidyr)
library(dplyr)

base_df <- tibble::rownames_to_column(base_df, 'Person')
days <- names(base_df)[-1]

base_df %>% 
  gather(day, value, -Person) %>% 
  right_join(mutate(query_df, i = row_number())) %>% 
  group_by(i) %>% 
  slice(which(days == Start):which(days == End)) %>% 
  mutate(col = c('D1', 'D2', 'D3')) %>% 
  select(-day, -i) %>% 
  spread(col, value) 
Axeman
  • 32,068
  • 8
  • 81
  • 94
1

A base solution using indexing with a numeric matrix:

ri <- match(query_df$Person, rownames(base_df))
ci <- match(query_df$Start, names(base_df))
cbind(query_df, `dim<-`(base_df[cbind(ri, rep(ci, 3) + rep(0:2, each = nrow(query_df)))],
                        c(nrow(query_df), 3)))

#   Person Start End    1    2    3
# 1      A   Tue Thu 0.01 6.81 8.67
# 2      C   Mon Wed 7.41 2.63 4.32
# 3      D   Thu Sat 9.22 2.19 5.47
# 4      C   Thu Sat 6.57 0.20 1.69
# 5      B   Wed Fri 4.28 5.57 0.16
Henrik
  • 65,555
  • 14
  • 143
  • 159