2

I am trying to create a table with the headings study_id, Conflict and create a third and forth column which contains data that corresponds to the value in column: Conflict

The code below achieves this aim. However it is rather long especially as I wish to expand this to cover several hundred different values in column: conflict

Thank you in advance for any pointers

df <- data.frame(study_id=c("1", "1", "4", "4", "5"), 
                 Conflict=c("WATER.START", "WATER.STOP", "OIL.START", "NA", "WATER.STOP"), 
                 Result=c("TRUE", "TRUE", "TRUE", "NA", "TRUE"))

df2 <- data.frame(study_id=c("1", "2", "3", "4", "5"), 
                  WATER.start=c(1, 1, 2, NA, 6), 
                  WATER.truestart=c(1, 1, 2, NA, 25), 
                  WATER.stop=c(33, 3, 2, NA, 8), 
                  WATER.truestop= c(34, 4, 2, NA, 8))

final <- left_join(df, df2, by ='study_id')

dd <- final %>% filter(Result == "TRUE" & Conflict == "WATER.START")
dd <- dd %>% subset(., Conflict == "WATER.START", 
                    select=c(study_id, Conflict, WATER.start, WATER.truestart))
dd <- dd %>% rename(initial=WATER.start) %>% rename(verification=WATER.truestart)

ee <- final %>% filter(Result == "TRUE" & Conflict == "WATER.STOP")
ee <- ee %>% subset(., Conflict == "WATER.STOP", 
                    select=c(study_id, Conflict, WATER.stop, WATER.truestop))
ee <- ee %>% rename(initial=WATER.stop) %>% rename(verification=WATER.truestop)

ff <- bind_rows(dd, ee)
gg <- ff %>% select(study_id, Conflict, initial, verification)

gg 
#   study_id    Conflict initial verification
# 1        1 WATER.START       1            1
# 2        1  WATER.STOP      33           34
# 3        5  WATER.STOP       8            8
jay.sf
  • 60,139
  • 8
  • 53
  • 110
GJW
  • 105
  • 6

1 Answers1

2

First, as it seems you only focus on c('WATER.START', 'WATER.STOP'), subset your first df. Next, for every MARGIN=1 (i.e. every row), we apply an anonymous function \(x) that selects the appropriate columns of df2, using tolower to match case, and cbinds together. Finally rename columns using setNames and rbind the resulting list.

subset(df, Conflict %in% c('WATER.START', 'WATER.STOP')) |>
  apply(MARGIN=1, \(x) {
    mt <- match(tolower(x[2]), tolower(names(df2)))
    cbind(t(x[1:2]), df2[df2$study_id == x[1], c(mt, mt + 1)]) |>
    setNames(c('study_id', 'Conflict', 'initial', 'verification'))
}) |> do.call(what=rbind)
#   study_id    Conflict initial verification
# 1        1 WATER.START       1            1
# 2        1  WATER.STOP      33           34
# 5        5  WATER.STOP       8            8

You could also use a dictionary a (which may be expanded to other levels you might be using).

a <- c(WATER.START='WATER.start', WATER.STOP='WATER.stop')

subset(df, Conflict %in% c('WATER.START', 'WATER.STOP')) |>
  apply(MARGIN=1, \(x) {
    mt <- match(a[match(x[2], names(a))], names(df2))
    cbind(t(x[1:2]), df2[df2$study_id == x[1], c(mt, mt + 1)]) |>
      setNames(c('study_id', 'Conflict', 'initial', 'verification'))
  }) |> do.call(what=rbind)
#   study_id    Conflict initial verification
# 1        1 WATER.START       1            1
# 2        1  WATER.STOP      33           34
# 5        5  WATER.STOP       8            8

However, I think what you really need is to reshape your data.

## basic
reshape(df2, direction='long', idvar=1, varying=list(c(2, 4), c(3, 5))) 
#     study_id time WATER.start WATER.truestart
# 1.1        1    1           1               1
# 2.1        2    1           1               1
# 3.1        3    1           2               2
# 4.1        4    1          NA              NA
# 5.1        5    1           6              25
# 1.2        1    2          33              34
# 2.2        2    2           3               4
# 3.2        3    2           2               2
# 4.2        4    2          NA              NA
# 5.2        5    2           8               8

## enhanced
reshape(df2, dir='long', idvar='study_id',
        varying=list(c("WATER.start", "WATER.stop"), c("WATER.truestart", "WATER.truestop")),
        timevar='foo', times=c('water.start', 'water.stop'), v.names=c('initial', 'verification')) 
#               study_id         foo initial verification
# 1.water.start        1 water.start       1            1
# 2.water.start        2 water.start       1            1
# 3.water.start        3 water.start       2            2
# 4.water.start        4 water.start      NA           NA
# 5.water.start        5 water.start       6           25
# 1.water.stop         1  water.stop      33           34
# 2.water.stop         2  water.stop       3            4
# 3.water.stop         3  water.stop       2            2
# 4.water.stop         4  water.stop      NA           NA
# 5.water.stop         5  water.stop       8            8
jay.sf
  • 60,139
  • 8
  • 53
  • 110
  • Cheers. The approach uses the fact that WATER.START is also in the second dataframe all be it in partial lowercase and WATER.truestart is found by using mt + 1 What would be the approach be if WATER.START in the first data frame was unable to be altered in a simple was by changing to lower case. For example WATER.START was actually called fluid.on – GJW Feb 19 '23 at 02:41
  • The slightly less elegant way was to use fct_recode df <- data.frame(study_id = (c("1","1","4","4","5")), Conflict = (c("fluid_a_on","fluid_a_off","OIL.START","NA","WATER.STOP")), Result = (c("TRUE","TRUE","TRUE","NA","TRUE"))) df <- df %>% mutate(Conflict = fct_recode(Conflict, "WATER.START" = "fluid_a_on", "WATER.STOP" = "fluid_a_off")) df If there is an alternative approach I would be interested to know. Cheers – GJW Feb 19 '23 at 04:30
  • 1
    @GJW To provide the information you could use a dictionary, see update. We could do many fancy things here, but it might be better to solve this up front in a data cleaning preprocessing where corresponding items get the same labels. – jay.sf Feb 19 '23 at 05:28
  • Brilliant pointers. When applying this code to my dataframe i get the following error "Error in data.frame(..., check.names = FALSE) : arguments imply differing number of rows: 1, 0 Called from: data.frame(..., check.names = FALSE)" When I change cbind to bind_cols I stop getting the repeating error however the code doesn't return an output for all rows. My equivalent "df" contains less rows than my equivalent "df2" and df2 contains lots of NA. Both df and df2 are dataframes – GJW Feb 20 '23 at 14:01
  • @GJW Code is working for me, not sure what causes the issue, can't reproduce. Maybe you have tibbles in stead of data frames,try change them do data frames as shown in OP. – jay.sf Feb 20 '23 at 14:14
  • Sorry I couldn't copy across "my data" due to character limitations so I have posted again - with source data and the code. The error lies somewhere in how my data is presented as opposed to your solution detailed above. – GJW Feb 20 '23 at 15:36
  • https://stackoverflow.com/questions/75534122/copy-data-from-one-column-to-another-column-if-condition-in-another-column-is-me – GJW Feb 23 '23 at 05:54