What I'm trying to do:
I have a master data frame that contains state and zip code columns. I have reference data frames that contain Metropolitan Statistical Area based on zip code and census regions based on state. I'm using merge() to append MSA and census region to the master sheet. I've written a function that can do both automatically. Because of other things I have to do with the function, I need to have merge() not rearrange the columns. I found a solution here: How to prevent merge from reordering columns
However, I'm getting an error.
sheet <- merge(sheet, msa, by = "Zip", all.x = TRUE)[, union(names(sheet), names(msa))]
Error in
[.data.frame
(merge(sheet, msa, by = "Zip", all.x = TRUE), , : undefined columns selected
When I remove the re-ording command ([, union(names(sheet), names(msa))]), I no longer get the error, but the columns are reordered.
However, the following command is virtually identical and does not give me the error:
sheet <- merge(sheet, census, by = "State", all.x = TRUE)[, union(names(sheet), names(census))]
I've double checked both sheets and both contain columns named "Zip". I what else can I do?
Would the fact that I have the following command to convert all zip codes to five digits be causing this?
sheet$Zip <- substring(sheet$Zip, 1, 5)
Any help would be greatly appreciated.
Here are the data frames I'm working with.
Master sheet
structure(list(Source.File = structure(c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = "DrBillAllergy & Immunology", class = "factor"),
Source.Date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = "8/31/2015", class = "factor"), First.Name = structure(c(5L,
3L, 4L, 1L, 2L, 9L, 6L, 8L, 7L), .Label = c("Bill", "Jack",
"Jacob", "Joan", "Jon", "Karen", "Marc", "Rich", "William"
), class = "factor"), Last.Name = structure(c(9L, 2L, 7L,
6L, 5L, 4L, 3L, 1L, 8L), .Label = c("Biden", "Brown", "Bush",
"Clinton", "Edwards", "Jackson", "Johnson", "Sanders", "Smith"
), class = "factor"), Suffix = structure(c(1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L), .Label = c("", "III"), class = "factor"),
Title = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "MD", class = "factor"),
Gender = structure(c(1L, 2L, 1L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("F",
"M"), class = "factor"), State = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "AL", class = "factor"),
Zip = structure(c(3L, 3L, 6L, 5L, 4L, 3L, 2L, 1L, 7L), .Label = c("35205-4731",
"35209-4159", "35209-6802", "35222-4330", "35235-3411", "35243-5129",
"36303-9461"), class = "factor"), County = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L), .Label = c("Houston", "Jefferson"
), class = "factor"), Phone.Dr.Bill.Allergy = c(126874384L,
192749635L, 984357952L, 579752485L, NA, NA, 584127544L, 587458745L,
532145968L), Work.Phone.Dr.Bill.Allergy = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA), Fax.Dr.Bill.Allergy = c(9874556324,
NA, NA, 987456321, NA, NA, 123654789, 123654788, 321456987
), Email.Dr.Bill.Allergy = structure(c(1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("", "jj@jj.com"), class = "factor"),
Phone.internal.subspec = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA), Work.Phone.Internal.Subspec = c(NA, NA, NA, NA, NA,
NA, NA, NA, NA), Fax.Internal.Subspec = c(NA, NA, NA, NA,
587412658L, NA, NA, NA, NA), Email.internal.subspec = structure(c(1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L), .Label = c("", "nn@nn.com"
), class = "factor"), Phone.Combined.Allergy = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA), Work.Phone.Combined.Allergy = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA), Fax.Combined.Allergy = c(NA,
NA, NA, NA, NA, 598264572L, 987138582L, NA, NA), Email.Combined.Allergy = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L), .Label = c("", "gg@gg.com"
), class = "factor"), Specialty1 = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "Allergy & Immunology", class = "factor"),
Specialty2 = structure(c(2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L,
1L), .Label = c("", "Allergy"), class = "factor"), NPI = c(1467660217L,
1851347918L, 1851634489L, 1316960271L, 1437421328L, 1982886826L,
1285606202L, 1487708012L, 1972504397L), Specialty.Code = structure(c(2L,
1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L), .Label = c("207K00000X",
"207KA0200X"), class = "factor"), MSA = c(1000L, 1000L, 1000L,
1000L, 1000L, 1000L, 1000L, 1000L, 2180L), License.Number = structure(c(3L,
5L, 4L, 1L, 7L, 9L, 2L, 6L, 8L), .Label = c("17948", "17992",
"29885", "4193", "6033", "8026", "8771", "AL16176", "MD200817"
), class = "factor"), License.State = structure(c(1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L), .Label = c("AL", "LA"), class = "factor"),
Medical.School = structure(c(1L, 6L, 3L, 4L, 5L, 1L, 1L,
1L, 2L), .Label = c("", "Hahnemann Univ Sch Of Med, Philadelphia Pa 19102",
"Temple Univ Sch Of Med, Philadelphia Pa 19140", "Univ Of Al Sch Of Med, Birmingham Al 35294",
"Univ Of Chicago, Pritzker Sch Of Med, Chicago Il 60637",
"Univ Of Mi Med Sch, Ann Arbor Mi 48109"), class = "factor"),
Residency.Training = structure(c(1L, 5L, 3L, 4L, 2L, 1L,
1L, 1L, 6L), .Label = c("", "Duke Univ Med Ctr, Internal Medicine; Kaiser Permanente Med Ctr-Sf, Flexible Or Transitional Year",
"Duke Univ Med Ctr, Pediatrics; Univ Hosp & Clinics, Pediatrics; Columbia Presbyterian Hosp, Flexible Or Transitional Year",
"Johns Hopkins Bayview M C, Internal Medicine", "Univ Of Al Hosp, Internal Medicine",
"Univ Of Pa Hlth System, Allergy And Immunology; Hahnemann Univ Hosp, Internal Medicine"
), class = "factor"), Graduation.Year = c(NA, 1971L, 1968L,
1984L, 1971L, NA, NA, NA, 1985L), Certifications = structure(c(1L,
3L, 4L, 3L, 1L, 1L, 1L, 1L, 2L), .Label = c("", "Allergy & Immunology",
"Allergy & Immunology; Internal Medicine", "Allergy & Immunology; Pediatrics"
), class = "factor")), .Names = c("Source.File", "Source.Date",
"First.Name", "Last.Name", "Suffix", "Title", "Gender", "State",
"Zip", "County", "Phone.Dr.Bill.Allergy", "Work.Phone.Dr.Bill.Allergy",
"Fax.Dr.Bill.Allergy", "Email.Dr.Bill.Allergy", "Phone.internal.subspec",
"Work.Phone.Internal.Subspec", "Fax.Internal.Subspec", "Email.internal.subspec",
"Phone.Combined.Allergy", "Work.Phone.Combined.Allergy", "Fax.Combined.Allergy",
"Email.Combined.Allergy", "Specialty1", "Specialty2", "NPI",
"Specialty.Code", "MSA", "License.Number", "License.State", "Medical.School",
"Residency.Training", "Graduation.Year", "Certifications"), class = "data.frame", row.names = c(NA,
-9L))
msa
structure(list(Zip = c("35209", "35243", "35235", "35222"), MSA = structure(c(2L,
2L, 2L, 1L), .Label = c("Houston", "New York"), class = "factor")), .Names = c("Zip",
"MSA"), row.names = c(NA, -4L), class = "data.frame")
census
structure(list(State = structure(c(22L, 31L, 47L, 35L, 39L, 32L,
20L, 7L, 40L, 8L, 21L, 9L, 46L, 50L, 28L, 41L, 11L, 10L, 2L,
26L, 43L, 18L, 3L, 37L, 44L, 19L, 29L, 42L, 30L, 17L, 24L, 13L,
25L, 49L, 15L, 16L, 36L, 23L, 27L, 51L, 6L, 33L, 4L, 45L, 14L,
34L, 5L, 38L, 48L, 1L, 12L), .Label = c("AK", "AL", "AR", "AZ",
"CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI", "IA", "ID", "IL",
"IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI", "MN", "MO", "MS",
"MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV", "NY", "OH", "OK",
"OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA",
"WI", "WV", "WY"), class = "factor"), Census.Region = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L,
3L, 3L), .Label = c("Midwest", "Northeast", "Pacific", "South",
"West"), class = "factor")), .Names = c("State", "Census.Region"
), class = "data.frame", row.names = c(NA, -51L))