2
df1 <- data.frame(w = 1:4, x = c("a", "b", "b", "c"), y = NA)
df1$y[df1$x == "c"] = 1
df2 <- data.frame(x = c("a", "b"), y = 1:2, z = 3:4)

I want to update df1 with the values in df2 for matching rows, including all df2 columns.

Expected result:

df1
  w x  y  z
1 1 a  1  3
2 2 b  2  4
3 3 b  2  4
4 4 c  1 NA

Here's my attempt:

# add missing columns from df2 to df1
df1[setdiff(colnames(df2), colnames(df1))] <- NA
# update values in df1 from df2 for matching x
df1[df1$x %in% df2$x, colnames(df2[,-1])] <- df2[match(df1$x, df2$x),-1]

This doesn't work because df2[match(df1$x, df2$x),-1] also includes mismatches. What's the right way to do this? (My actual dataframes have many more rows and columns.)

EDIT1: Upon request, the example does not represent the full complexity of my actual data. In real life, I am dealing with df offices (df1) and results (df2). Column "x" is "address".

colnames(offices)
 [1] "org_order"                   "organizations.api_path"      "type"                       
 [4] "uuid"                        "name"                        "street_1"                   
 [7] "street_2"                    "postal_code_o"                 "city"                       
[10] "city_web_path"               "region"                      "region_code2"               
[13] "region_web_path"             "country_o"                     "country_code2"              
[16] "country_code3"               "country_web_path"            "latitude"                   
[19] "longitude"                   "created_at"                  "updated_at"                 
[22] "address"                     "administrative_area_level_1" "administrative_area_level_2"
[25] "country"                   "locality"                    "neighborhood"               
[28] "postal_code"               "route"                       "street_number"              
colnames(results)
 [1] "address"                     "administrative_area_level_1" "administrative_area_level_2"
 [4] "country"                     "locality"                    "neighborhood"               
 [7] "postal_code"                 "route"                       "street_number"              
[10] "subpremise"                  "postal_code_suffix"          "premise"                    
[13] "political" 

Note that this is the picture at this point in time. I expect df results to add more columns (it is updated via Google Maps API which returns variable fields according to location).

EDIT2: Here is sample data from df offices and results.

structure(list(org_order = c("1", "3", "4", "8", "9", "10", "11", 
"11", "13", "13", "14", "15", "16", "18", "21", "23", "24", "25", 
"26", "27"), organizations.api_path = c("organizations/care1st-health-plan-arizona", 
"organizations/telecommunications-and-information-administration", 
"organizations/multimatecollection-com", "organizations/ion-holdco", 
"organizations/itris-bv", "organizations/crius-energy", "organizations/uwc-mahindra-college", 
"organizations/uwc-mahindra-college", "organizations/ustech-discovery", 
"organizations/ustech-discovery", "organizations/play-park-structures", 
"organizations/wings-rotors", "organizations/attractive-world", 
"organizations/bigtoys", "organizations/intercontec-produkt-gmbh", 
"organizations/elitesingles", "organizations/rumi-x-limited", 
"organizations/gt-grandstands", "organizations/new-health-capital-partners", 
"organizations/urban-media-2"), type = c("Address", "Address", 
"Address", "Address", "Address", "Address", "Address", "Address", 
"Address", "Address", "Address", "Address", "Address", "Address", 
"Address", "Address", "Address", "Address", "Address", "Address"
), uuid = c("e28deae838eb745ab093bb3d2f66963a", "ae150251316913fd72a1ba9df18bd686", 
"d49e135e7b61d01acf6609e1c5c1a34d", "359399c4f6db344f530897caabce666e", 
"2b4a469b217e5090611a539d952412f3", "3939cfa28f2191535bcdaf4297bfa488", 
"86620bc9b675150a847496bcfbb8f6b9", "ee869cc495e84ebdbbbbd6db0ed5d3c0", 
"f31c6e197494192ec47bcb36d14bc206", "2fb8855d0bcccde661aacbbf69d92a3e", 
"724967633f0be037df09bf63e8bd932d", "8d765e1b0111298f2e295883ac0c1f52", 
"ea0cd535b2a113b9c7d1cbec466963eb", "a7d69835d7abdbee8d28c77e9c54f176", 
"243b70fedc6ff9f6b68bb1ae1cb5f5a1", "7cec7f51b87fff47acf74a0424c88d30", 
"e1c4108f8e4d902ed3524e98418cb157", "666dffa11b2de8aa35307a35d19ec117", 
"d5d56654e892d46c303deebd9b71aaaa", "deb420cf0a669897e0058ab854e928bc"
), name = c("Headquarters", "Headquarters", "Delhi", "Headquarters", 
"Headquarters", "HQ", "Israel Office", "Headquarters", "Israel Office", 
"Headquarters", "Headquarters", "Israel Office", "Headquarters", 
"Headquarters", "HQ", "Headquarters", "Headquarters & Showroom", 
"Headquarters", "Headquarters", "Israel Office"), street_1 = c("2355 E. Camelback Road", 
"1401 Constitution Ave NW", "", "80 State Street 7th Floor", 
"Spray Gaarde 46", "1055 Washington Blvd., 7th Floor", "", "Village Khubavali, PO Paud Taluka Mulshi", 
"", "231 Lagrange Street", "401 Chestnut St", "st. ha Rav Bar Shaul 6", 
"", "7721 New Market Street", "Bernrieder Str. 15", "", "UNIT 1801, 18/F WAGA COMMERCIAL CENTER", 
"2810 Sydney Road", "1350 Avenue of the Americas", ""), street_2 = c("Suite 300", 
"", "", "", "", "", "", "", "", "", "Suite 410", "", "", "", 
"", "", "99 WELLINGTON ST, CENTRAL,", "", "9th Floor", ""), postal_code_cb = c("85016", 
"20230", "", "12207", "03436", "06901", "", "412 108", "", "02132", 
"37402", "7625149", "", "98501", "94559", "", "", "33566-1173", 
"10019", ""), city = c("Phoenix", "Washington", "", "Albany", 
"Nieuwegein", "Stamford", "", "Pune", "", "Boston", "Chattanooga", 
"Rehovot", "Paris", "Olympia", "Niederwinkling", "Dover", "Hong Kong", 
"Plant City", "New York", ""), city_web_path = c("location/phoenix/e55cf011f3c746498d0d801bc399dc52", 
"location/washington/09d220d540bacd296f7a5035ff6e6bef", "location/india/44048bf7db640d7adb20fd3c1ebf47b0", 
"location/albany/b40478e7b38920051a2a0f501b6e8c65", "location/nieuwegein/93f71c3141f8b16e539839308acd911e", 
"location/stamford/01272fa9253161fb0b1e69fe0fa8a53c", "location/israel/ab89e1a9013f82613b5f60487350334d", 
"location/pune/2c69941daa1ea666db3cb74f2029ae62", "location/israel/ab89e1a9013f82613b5f60487350334d", 
"location/boston/9898e533996b05148cc5302f60454c02", "location/chattanooga/9ecb7425cc53be56a353c7a127bf2f56", 
"location/rehovot/fd22cd6db48f2c192068a1bc99853c08", "location/paris/5805284c00a4f0aab8115843398ff2bb", 
"location/olympia/b289b456f00ca6c0536b2032da46985f", "location/niederwinkling/026a83aec4e324d209cd3e1b257b55d5", 
"location/dover/796562878525d2957e1dc10a32a5e13e", "location/hong-kong/ad27df91c866cb08f3b624121637bf69", 
"location/plant-city/646eec50d4fc27131241e0bebdae3673", "location/new-york/d64b7615985cfbf44affaa89d70c4050", 
"location/israel/ab89e1a9013f82613b5f60487350334d"), region = c("Arizona", 
"District of Columbia", "", "New York", "Utrecht", "Connecticut", 
"", "Maharashtra", "", "Massachusetts", "Tennessee", "HaMerkaz", 
"Ile-de-France", "Washington", "Bayern", "Delaware", "Hong Kong Island", 
"Florida", "New York", ""), region_code2 = c("AZ", "DC", "", 
"NY", "9", "CT", "", "16", "", "MA", "TN", "2", "A8", "WA", "2", 
"DE", "", "FL", "NY", ""), region_web_path = c("", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""
), country_cb = c("United States", "United States", "India", 
"United States", "The Netherlands", "United States", "Israel", 
"India", "Israel", "United States", "United States", "Israel", 
"France", "United States", "Germany", "United States", "Hong Kong", 
"United States", "United States", "Israel"), country_code2 = c("US", 
"US", "IN", "US", "NL", "US", "IL", "IN", "IL", "US", "US", "IL", 
"FR", "US", "DE", "US", "HK", "US", "US", "IL"), country_code3 = c("USA", 
"USA", "IND", "USA", "NLD", "USA", "ISR", "IND", "ISR", "USA", 
"USA", "ISR", "FRA", "USA", "DEU", "USA", "HKG", "USA", "USA", 
"ISR"), country_web_path = c("", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", "", ""), latitude = c("", 
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", 
"", "", ""), longitude = c("", "", "", "", "", "", "", "", "", 
"", "", "", "", "", "", "", "", "", "", ""), created_at = c("1475743389", 
"1475899280", "1475743519", "1475899256", "1475742451", "1475742715", 
"1475742786", "1475742171", "1475741845", "1475741660", "1475899240", 
"1475741599", "1475741472", "1475899218", "1475741445", "1475741067", 
"1475742388", "1475899207", "1475740101", "1475740590"), updated_at = c("1475743390", 
"1475899280", "1475743519", "1475899257", "1475742451", "1475742715", 
"1475742786", "1475742171", "1475741845", "1475741660", "1475899240", 
"1475741599", "1475741472", "1475899218", "1475741445", "1475741067", 
"1475742388", "1475899207", "1475740101", "1475740590"), address = c("2355 E. Camelback Road Suite 300 85016 Phoenix Arizona", 
"1401 Constitution Ave NW  20230 Washington District of Columbia", 
"", "80 State Street 7th Floor  12207 Albany New York", "Spray Gaarde 46  03436 Nieuwegein Utrecht", 
"1055 Washington Blvd., 7th Floor  06901 Stamford Connecticut", 
"", "Village Khubavali, PO Paud Taluka Mulshi  412 108 Pune Maharashtra", 
"", "231 Lagrange Street  02132 Boston Massachusetts", "401 Chestnut St Suite 410 37402 Chattanooga Tennessee", 
"st. ha Rav Bar Shaul 6  7625149 Rehovot HaMerkaz", "Paris Ile-de-France", 
"7721 New Market Street  98501 Olympia Washington", "Bernrieder Str. 15  94559 Niederwinkling Bayern", 
"Dover Delaware", "UNIT 1801, 18/F WAGA COMMERCIAL CENTER 99 WELLINGTON ST, CENTRAL,  Hong Kong Hong Kong Island", 
"2810 Sydney Road  33566-1173 Plant City Florida", "1350 Avenue of the Americas 9th Floor 10019 New York New York", 
""), administrative_area_level_1 = c("Arizona", "District of Columbia", 
NA, "New York", NA, "Connecticut", NA, NA, NA, "Massachusetts", 
"Tennessee", "Center District", "?e-de-France", "Washington", 
"Bayern", "Delaware", NA, NA, NA, NA), administrative_area_level_2 = c("Maricopa County", 
NA, NA, "Albany County", NA, "Fairfield County", NA, NA, NA, 
"Suffolk County", "Hamilton County", "Rehovot", "Paris", "Thurston County", 
"Niederbayern", "Kent County", NA, NA, NA, NA), country = c("United States", 
"United States", NA, "United States", NA, "United States", NA, 
NA, NA, "United States", "United States", "Israel", "France", 
"United States", "Germany", "United States", NA, NA, NA, NA), 
    locality = c("Phoenix", "Washington", NA, "Albany", NA, "Stamford", 
    NA, NA, NA, "Boston", "Chattanooga", "Rehovot", "Paris", 
    "Tumwater", "Niederwinkling", "Dover", NA, NA, NA, NA), neighborhood = c("Camelback East Village", 
    "Northwest Washington", NA, "Downtown", NA, "Downtown", NA, 
    NA, NA, "West Roxbury", NA, NA, NA, NA, NA, NA, NA, NA, NA, 
    NA), postal_code = c("85016", "20230", NA, "12207", NA, "06901", 
    NA, NA, NA, "02132", "37402", NA, NA, "98501", "94559", NA, 
    NA, NA, NA, NA), route = c("East Camelback Road", "Constitution Avenue Northwest", 
    NA, "State Street", NA, "Washington Boulevard", NA, NA, NA, 
    "Lagrange Street", "Chestnut Street", "HaRav Bar Shaul Street", 
    NA, "New Market Street Southwest", "Bernrieder Stra?", NA, 
    NA, NA, NA, NA), street_number = c("2355", "1401", NA, "80", 
    NA, "1055", NA, NA, NA, "231", "401", "6", NA, "7721", "15", 
    NA, NA, NA, NA, NA), subpremise = c("300", NA, NA, NA, NA, 
    NA, NA, NA, NA, NA, "410", NA, NA, NA, NA, NA, NA, NA, NA, 
    NA)), .Names = c("org_order", "organizations.api_path", "type", 
"uuid", "name", "street_1", "street_2", "postal_code_cb", "city", 
"city_web_path", "region", "region_code2", "region_web_path", 
"country_cb", "country_code2", "country_code3", "country_web_path", 
"latitude", "longitude", "created_at", "updated_at", "address", 
"administrative_area_level_1", "administrative_area_level_2", 
"country", "locality", "neighborhood", "postal_code", "route", 
"street_number", "subpremise"), class = c("data.table", "data.frame"
), row.names = c(NA, -20L), .internal.selfref = <pointer: 0x00000000001e0788>)

structure(list(address = structure(1:18, .Label = c("2355 E. Camelback Road Suite 300 85016 Phoenix Arizona", 
    "1401 Constitution Ave NW  20230 Washington District of Columbia", 
    "80 State Street 7th Floor  12207 Albany New York", "Spray Gaarde 46  3436 Nieuwegein Utrecht", 
    "1055 Washington Blvd., 7th Floor  6901 Stamford Connecticut", 
    "Village Khubavali, PO Paud Taluka Mulshi  412 108 Pune Maharashtra", 
    "231 Lagrange Street  2132 Boston Massachusetts", "401 Chestnut St Suite 410 37402 Chattanooga Tennessee", 
    "st. ha Rav Bar Shaul 6  7625149 Rehovot HaMerkaz", "Paris Ile-de-France", 
    "7721 New Market Street  98501 Olympia Washington", "Bernrieder Str. 15  94559 Niederwinkling Bayern", 
    "Dover Delaware", "UNIT 1801, 18/F WAGA COMMERCIAL CENTER 99 WELLINGTON ST, CENTRAL,  Hong Kong Hong Kong Island", 
    "2810 Sydney Road  33566-1173 Plant City Florida", "1350 Avenue of the Americas 9th Floor 10019 New York New York", 
    "Askanischer Platz 3  10963 Berlin Berlin", "Level 2, 145 Flinders Lane  03000 Melbourne Victoria"
    ), class = "factor"), administrative_area_level_1 = c("Arizona", 
    "District of Columbia", "New York", NA, "Connecticut", NA, "Massachusetts", 
    "Tennessee", "Center District", "?e-de-France", "Washington", 
    "Bayern", "Delaware", "Hong Kong Island", "Florida", "New York", 
    "Berlin", "Victoria"), administrative_area_level_2 = c("Maricopa County", 
    NA, "Albany County", NA, "Fairfield County", NA, "Suffolk County", 
    "Hamilton County", "Rehovot", "Paris", "Thurston County", "Niederbayern", 
    "Kent County", NA, "Hillsborough County", "New York County", 
    NA, "Melbourne City"), country = c("United States", "United States", 
    "United States", NA, "United States", NA, "United States", "United States", 
    "Israel", "France", "United States", "Germany", "United States", 
    "Hong Kong", "United States", "United States", "Germany", "Australia"
    ), locality = c("Phoenix", "Washington", "Albany", NA, "Stamford", 
    NA, "Boston", "Chattanooga", "Rehovot", "Paris", "Tumwater", 
    "Niederwinkling", "Dover", NA, "Plant City", "New York", "Berlin", 
    "Melbourne"), neighborhood = c("Camelback East Village", "Northwest Washington", 
    "Downtown", NA, "Downtown", NA, "West Roxbury", NA, NA, NA, NA, 
    NA, NA, "Central", NA, NA, NA, NA), postal_code = c("85016", 
    "20230", "12207", NA, "06901", NA, "02132", "37402", NA, NA, 
    "98501", "94559", NA, NA, "33566", "10019", "10963", "3000"), 
        route = c("East Camelback Road", "Constitution Avenue Northwest", 
        "State Street", NA, "Washington Boulevard", NA, "Lagrange Street", 
        "Chestnut Street", "HaRav Bar Shaul Street", NA, "New Market Street Southwest", 
        "Bernrieder Stra?", NA, "Wellington Street", "Sydney Road", 
        "6th Avenue", "Askanischer Platz", "Flinders Lane"), street_number = c("2355", 
        "1401", "80", NA, "1055", NA, "231", "401", "6", NA, "7721", 
        "15", NA, "99", "2810", "1350", "3", "145"), subpremise = c("300", 
        NA, NA, NA, NA, NA, NA, "410", NA, NA, NA, NA, NA, "1801", 
        NA, NA, NA, "2"), postal_code_suffix = c(NA, NA, NA, NA, 
        NA, NA, "3450", NA, NA, NA, NA, NA, NA, NA, "1173", NA, NA, 
        NA), premise = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
        NA, NA, NA, "Waga Commercial Centre", NA, NA, NA, NA), political = c(NA, 
        NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Manhattan", 
        "Bezirk Friedrichshain-Kreuzberg", NA)), .Names = c("address", 
    "administrative_area_level_1", "administrative_area_level_2", 
    "country", "locality", "neighborhood", "postal_code", "route", 
    "street_number", "subpremise", "postal_code_suffix", "premise", 
    "political"), row.names = c(NA, -18L), class = c("data.table", 
    "data.frame"), .internal.selfref = <pointer: 0x00000000001e0788>)
syre
  • 902
  • 1
  • 7
  • 19
  • 1
    for the y column shouldnt the last value be NA as it for the column z. – Chirayu Chamoli Jan 03 '17 at 09:27
  • 1
    I think we only replace value from `df2` in case it is originally missing in `df1`. – Aramis7d Jan 03 '17 at 09:28
  • @Chirayu Chamoli This is on purpose, because the whole thing is actually an iterative process. df2 will be updated several times with new rows. I need to update the new rows in df1, not the old rows. That's also why I can't do the whole thing in one go with merge. – syre Jan 03 '17 at 09:31
  • @Aramis7d Only replace values for matching values of x. – syre Jan 03 '17 at 09:33

3 Answers3

7

Here's a fully atomized data.table version to update all columns that present in both data sets and adding columns from df2 that are not present in df1 simultaneously. This will update df1 in place

cols <- setdiff(colnames(df2), "x")
setDT(df1)[setDT(df2), (cols) := mget(paste0('i.', cols)), on = "x"]
df1
#    w x y  z
# 1: 1 a 1  3
# 2: 2 b 2  4
# 3: 3 b 2  4
# 4: 4 c 1 NA

The idea behind paste0('i.', cols) is to tell data.table that we want to take the columns from the data.table located in the ith location (df2) so it will know how to handle columns that present in both data sets.


Disclaimer: The idea was borrowed from this @eddi's answer

Community
  • 1
  • 1
David Arenburg
  • 91,361
  • 17
  • 137
  • 196
  • If I subset the target df "on the fly", e.g. `setDT(df1[is.na(df1$y),])[setDT(df2), (cols) := mget(paste0('i.', cols)), on = "x"]` then I get a warning _Coerced 'integer' RHS to 'double' to match the column's type_ and the command doesn't work. – syre Jan 27 '17 at 09:29
  • 1
    You can't have two different classes in the same column in R. Also, `setDT(df1[is.na(df1$y),])` is not a correct syntax. Maybe just `df1[is.na(y)]` (if `df` is already a `data.table`). Also, please take a look here: http://stackoverflow.com/questions/29658627/conditional-binary-join-and-update-by-reference-using-the-data-table-package – David Arenburg Jan 27 '17 at 09:47
  • 1
    Also, it seems like you just got a warning rather an error. I think it actually worked but you haven't saved the results anywhere. The `:=` operator doesn't update the original table when you join to a subset. Try something like `res <- df1[is.na(y)][setDT(df2), (cols) := mget(paste0('i.', cols)), on = "x"]` and then check out `res` – David Arenburg Jan 27 '17 at 12:28
2

Using dplyr , you can try:

dplyr::left_join(df1,df2,by="x") %>%
  dplyr::mutate(y = dplyr::coalesce(as.integer(y.x),as.integer(y.y))) %>%
  dplyr::select(w,x,y,z)

which gives

#  w x y  z
#1 1 a 1  3
#2 2 b 2  4
#3 3 b 2  4
#4 4 c 1 NA

p.s. looking at your attempt to use setdiff() over colnames , it may appear that the example provided does not represent the full complexity of your actual question. In that case, please update the example.

Aramis7d
  • 2,444
  • 19
  • 25
  • 2
    This is also kind of manual. You will have to eyeball the result and fix by hand all the columns present in both data sets – David Arenburg Jan 03 '17 at 09:39
  • @DavidArenburg what if while merging, i add `suffix= c("zzzz", "zzzz")` such that we know later on which all columns are common in both apart from `x`. Then we can use `grepl` to work on them – joel.wilson Jan 03 '17 at 10:05
0

One can also use merge here:

dfm = merge(df1, df2, by="x", all=T)
dfm
  x w y.x y.y  z
1 a 1  NA   1  3
2 b 2  NA   2  4
3 b 3  NA   2  4
4 c 4   1  NA NA

Required y can be obtained using:

dfm$y = ifelse(is.na(dfm$y.y), dfm$y.x, dfm$y.y)
dfm[c("w","x","y","z")]
  w x y  z
1 1 a 1  3
2 2 b 2  4
3 3 b 2  4
4 4 c 1 NA

EDIT: To apply above to multiple common columns, get names of such columns:

> inames = intersect(names(df1), names(df2))
> inames
 [1] "address"                     "administrative_area_level_1" "administrative_area_level_2" "country"                    
 [5] "locality"                    "neighborhood"                "postal_code"                 "route"                      
 [9] "street_number"               "subpremise"               

Since first column "address" has been used to merge, use remaining columns to find non-NA values:

> for(i in inames[2:10]) dfm[[i]] = ifelse(is.na(dfm[[paste0(i,".x")]]), dfm[[paste0(i,".y")]], dfm[[paste0(i,".x")]]  ) 

The required columns are added to dfm.

rnso
  • 23,686
  • 25
  • 112
  • 234