0

I have two data frames (e.g. df and df1) which have several columns. One of the columns is the ID one, that I would use to merge the dfs.

for examples:

df$ID <- c("AB/XX/1/1/364/93/DA/3", "AT/PML/1/1/364/94/DA/3", "AT/PRT/1/1/364/95/DA/3", "AK/PRT/1/1/364/115/DA/3", "AB/XX/1/1/365/116/DA/3", "AB/XX/1/1/365/117/DA/3", "AT/PML/1/1/365/118/DA/3", NA, NA, "AK/PRT/1/1/1/149/DA/3", "AB/XX/1/1/1/151/DA/3",   "AT/PML/1/1/2/152/DA/3", "AK/PRT/1/1/2/153/DA/3")

df1$ID <- c("AT/FBA/1/12/360/26/SF/96", "AT/RLMW/1/12/360/44/SF/122", "AT/ACR/1/12/362/66/SF/175", "AT/AA/1/12/363/72/SF/281", "AT/BB/1/12/364/90/SF/310", "AT/ANT/1/123/364/92/SF/338")

I want to merge some columns of df1 to df according to a code text that is contained within the ID argument.

The code text is inside the ID code, and it is composed by the 5th and 6th arguments separated by '/'. For example, "360/26", "360/44", etc. (How to extract this code has been asked and replied here:R - Extract text between symbol or delimiter '/').

I tried to make a for loop, using grepl, but with bad results:

    str1_sub <- sub("^([^/]+/){4}([^/]*/[^/]*)/.*", "\\2", df1$ID)
        
    #check if the df$ID contains str1_sub
    grepl(str1_sub, df$ID, fixed = TRUE)
         
    #create new empty columns in df that will be filled using df1's columns
    df$banana <- NA
    df$apple <- NA

    #for loop to fill the columns
    for (n in length(str1_sub)){
      for (m in length(df$ID)){
        if (grepl(str1_sub[n], df$ID[m], fixed = TRUE) == TRUE) {
            df$banana[m] <- df1$BANANA[n]
            df$apple[m] <- df1$APPLE[n]
        }
      }
    }

The if condition does not work, saying that it misses the TRUE/FALSE value I am sure there is a better way than a for loop.

Strobila
  • 317
  • 3
  • 15

1 Answers1

0

You could create an ID_sub field and use a left_join.
I modified first ID value to get at least a correspondance.

library(dplyr)

df <- data.frame(ID = c("AB/XX/1/1/364/93/DA/3", "AT/PML/1/1/364/94/DA/3", "AT/PRT/1/1/364/95/DA/3", "AK/PRT/1/1/364/115/DA/3", "AB/XX/1/1/365/116/DA/3", "AB/XX/1/1/365/117/DA/3", "AT/PML/1/1/365/118/DA/3", NA, NA, "AK/PRT/1/1/1/149/DA/3", "AB/XX/1/1/1/151/DA/3",   "AT/PML/1/1/2/152/DA/3", "AK/PRT/1/1/2/153/DA/3"))
df1 <- data.frame(ID = c("AT/FBA/1/12/364/93/SF/96", "AT/RLMW/1/12/360/44/SF/122", "AT/ACR/1/12/362/66/SF/175", "AT/AA/1/12/363/72/SF/281", "AT/BB/1/12/364/90/SF/310", "AT/ANT/1/123/364/92/SF/338"))


df1$ID_sub <- sub("^([^/]+/){4}([^/]*/[^/]*)/.*", "\\2", df1$ID)
df$ID_sub <- sub("^([^/]+/){4}([^/]*/[^/]*)/.*", "\\2", df$ID)


df$banana <- "banana"
df1$BANANA <-"banana1"



left_join(df,df1,by="ID_sub") %>% mutate(banana = if_else(!is.na(ID.y),BANANA,banana)) %>%
                                  select(ID = ID.x, banana)
#>                         ID  banana
#> 1    AB/XX/1/1/364/93/DA/3 banana1
#> 2   AT/PML/1/1/364/94/DA/3  banana
#> 3   AT/PRT/1/1/364/95/DA/3  banana
#> 4  AK/PRT/1/1/364/115/DA/3  banana
#> 5   AB/XX/1/1/365/116/DA/3  banana
#> 6   AB/XX/1/1/365/117/DA/3  banana
#> 7  AT/PML/1/1/365/118/DA/3  banana
#> 8                     <NA>  banana
#> 9                     <NA>  banana
#> 10   AK/PRT/1/1/1/149/DA/3  banana
#> 11    AB/XX/1/1/1/151/DA/3  banana
#> 12   AT/PML/1/1/2/152/DA/3  banana
#> 13   AK/PRT/1/1/2/153/DA/3  banana
Waldi
  • 39,242
  • 6
  • 30
  • 78