I have a dataset that has US cities present in it but the states are currently all labeled as US - the dataset is called us_data. I have found a dataset containing all US cities and states - called us - and I'm trying to create a new state column in my us_data dataset by taking the city name, finding it in us and pulling the state from us to add to the new column in us_data.
I'm using R and not sure how to go about this. I don't believe a regular join will work as there can be multiple observations in us_data with the same city so all of the rows with the same city would need to be matched. I was thinking of using mutate() from dplyr but not sure how to reference two datasets in the function call so any help would be appreciated! I've attached a glimpse of both datasets for reference.
> dput(us_data[1:10,1:7])
structure(list(name = c("Carpenter Rd.", "1515 N. Sheridan - Wilmette",
"S McCarran & E Greg St - Sparks", "Hwy 20 & Tharp - Yuba City",
"Greenmount & I-64", "Veterans Blvd & Kingman St", "Hampden & Dayton - Denver",
"50th and Kipling-Wheatridge, CO", "Higuera & Tank Farm", "Burr Ridge-I-55 & County Line Rd"
), url = c("https://www.starbucks.com/store-locator/store/6323",
"https://www.starbucks.com/store-locator/store/6325", "https://www.starbucks.com/store-locator/store/6327",
"https://www.starbucks.com/store-locator/store/6328", "https://www.starbucks.com/store-locator/store/6329",
"https://www.starbucks.com/store-locator/store/6330", "https://www.starbucks.com/store-locator/store/6334",
"https://www.starbucks.com/store-locator/store/6333", "https://www.starbucks.com/store-locator/store/6331",
"https://www.starbucks.com/store-locator/store/6340"), street_address = c("3650 Carpenter Rd.",
"1515 North Sheridan, Building 4", "1560 S. Stanford Way, Suite A",
"1615 Colusa Hwy, Ste 100", "1126 Central Park Drive", "4312 Veterans Blvd.",
"9925 East Hampden Ave", "4975 Kipling St", "3971 S. Higuera Street",
"515 Village Center Dr."), city = c("Pittsfield", "Wilmette",
"Sparks", "Yuba City", "OFallon", "Metairie", "Denver", "Wheat Ridge",
"San Luis Obispo", "Burr Ridge"), state = c("US", "US", "US",
"US", "US", "US", "US", "US", "US", "US"), zip_code = c("48104",
"600911822", "894316331", "959939437", "622691769", "70006",
"802314903", "800332340", "934011580", "605274516"), country = c("US",
"US", "US", "US", "US", "US", "US", "US", "US", "US")), row.names = c(NA,
10L), class = "data.frame")
> dput(us[1:20,])
structure(list(city = c("New York", "Los Angeles", "Chicago",
"Miami", "Dallas", "Philadelphia", "Houston", "Atlanta", "Washington",
"Boston", "Phoenix", "Seattle", "San Francisco", "Detroit", "San Diego",
"Minneapolis", "Tampa", "Denver", "Brooklyn", "Queens"), city_ascii = c("New York",
"Los Angeles", "Chicago", "Miami", "Dallas", "Philadelphia",
"Houston", "Atlanta", "Washington", "Boston", "Phoenix", "Seattle",
"San Francisco", "Detroit", "San Diego", "Minneapolis", "Tampa",
"Denver", "Brooklyn", "Queens"), state_id = c("NY", "CA", "IL",
"FL", "TX", "PA", "TX", "GA", "DC", "MA", "AZ", "WA", "CA", "MI",
"CA", "MN", "FL", "CO", "NY", "NY"), state_name = c("New York",
"California", "Illinois", "Florida", "Texas", "Pennsylvania",
"Texas", "Georgia", "District of Columbia", "Massachusetts",
"Arizona", "Washington", "California", "Michigan", "California",
"Minnesota", "Florida", "Colorado", "New York", "New York")), row.names = c(NA,
20L), class = "data.frame")