0

df1 chunk

 tconst    titleType primaryTitle           startYear endYear runtimeMinutes genres                   
  <chr>     <chr>     <chr>                      <dbl> <chr>   <chr>          <chr>                      
1 tt0000001 short     Carmencita                  1894 "\\N"   1              Documentary,Short         
2 tt0000002 short     Le clown et ses chiens      1892 "\\N"   5              Animation,Short              
3 tt0000003 short     Pauvre Pierrot              1892 "\\N"   4             Animation,Comedy,Romance     
4 tt0000004 short     Un bon bock                 1892 "\\N"   12             Animation,Short              
5 tt0000005 short     Blacksmith Scene            1893 "\\N"   1              Comedy,Short                 
6 tt0000006 short     Chinese Opium Den           1894 "\\N"   1              Short                 

   

df2 chunk

nconst    primaryName     birthYear deathYear primaryProfession                   t_1       t_2       t_3      t_4  
  <chr>     <chr>           <chr>     <chr>     <chr>                               <chr>     <chr>     <chr>    <chr>
1 nm0000001 Fred Astaire    1899      "1987"    soundtrack,actor,miscellaneous      tt0072308 tt0053137 tt00504… tt00…
2 nm0000002 Lauren Bacall   1924      "2014"    actress,soundtrack                  tt0037382 tt0075213 tt01170… tt00…
3 nm0000003 Brigitte Bardot 1934      "\\N"     actress,soundtrack,music_department tt0056404 tt0054452 tt00573… tt00…
4 nm0000004 John Belushi    1949      "1982"    actor,soundtrack,writer             tt0078723 tt0080455 tt00725… tt00…
5 nm0000005 Ingmar Bergman  1918      "2007"    writer,director,actor               tt0083922 tt0050976 tt00509… tt00…
6 nm0000006 Ingrid Bergman  1915      "1982"    actress,soundtrack,producer         tt0036855 tt0038787 tt00381… tt00…

How can I merge the first df with the second df based on if df1.tconst consists in any of four columns t_1,t_2,t_3,t_4 in df2??

Df1 has over 1.2m row, df2 has over 12M rows

I separated a string tconst2 in df2, it was separated by comma as i haven't found an answer how to join them if it consists in a string. But know I'm stuck anyway.

I tried

> df2 <- merge(x = movies_ratings_df,y = name_df2, 
+              by.x = c("tconst", "tconst","tconst","tconst"),
+              by.y = c("t_1","t_2", "t_3", "t_4"))

Error in merge.data.frame(x = movies_ratings_df, y = name_df2, by.x = c("tconst",  : 
  'by.x' и 'by.y'
jay.sf
  • 60,139
  • 8
  • 53
  • 110
  • Hi, I rolled back because we don't use images for code/data on Stack Overflow. Cheers! – jay.sf Mar 29 '23 at 09:15
  • @jay.sf thank you, it's my first question, I wasn't sure how to do that. – Victoria Shpetnaya Mar 29 '23 at 09:20
  • You did a good job! Maybe you want to read our (more or less) short [tutorial](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example). – jay.sf Mar 29 '23 at 09:45

2 Answers2

0

You could try several left_joins from df2 on df1:

df1 <- left_join(df1, df2, by=c("tconst" = "t_1")
df1 <- left_join(df1, df2, by=c("tconst" = "t_2")
...

Does this answer your question? :)

Simon
  • 58
  • 5
0

You could first pivot your data to long and then perform a left_join:

df1 <- tibble::tribble(~tconst, ~x, 
        "tt0000001", 1,
        "tt0000002", 1,
        "tt0000003", 1,
        "tt0000004", 1,
        "tt0000005", 1,
        "tt0000006", 1)


df2 <- tibble::tribble(
  ~primaryProfession,                   ~t_1,       ~t_2,      
    "soundtrack,actor,miscellaneous",      "tt0072308", "tt0053137",
    "actress,soundtrack",                  "tt0037382", "tt0075213",
    "actress,soundtrack,music_department", "tt0056404", "tt0000006",
    "actor,soundtrack,writer",             "tt0078723", "tt0080455",
    "writer,director,actor",               "tt0083922", "tt0050976",
    "actress,soundtrack,producer",         "tt0000003", "tt0038787")

library(dplyr)
library(tidyr)
df1 |> 
  left_join(
    df2 |> 
      pivot_longer(c(starts_with("t_")), values_to = "tconst")
  )

Output:

# A tibble: 6 × 4
  tconst        x primaryProfession                   name 
  <chr>     <dbl> <chr>                               <chr>
1 tt0000001     1 NA                                  NA   
2 tt0000002     1 NA                                  NA   
3 tt0000003     1 actress,soundtrack,producer         t_1  
4 tt0000004     1 NA                                  NA   
5 tt0000005     1 NA                                  NA   
6 tt0000006     1 actress,soundtrack,music_department t_2 
Julian
  • 6,586
  • 2
  • 9
  • 33