I've tried a few things and haven't been able to make any noticeable changes. I've isolated it down to the for loop and hopefully someone can give me some ideas.
Quick Background: Process was in SaS, SaS is going away. We take two data-sets, merge by zip and then compare strings w/ Jaro-Winkler to identify two entities that could be the same across the two sets. E.g. Is Zip the same? Yes, move to next criteria, is X the same? No, but it's close and it continues throwing out what it doesn't want, putting the matches in a new data frame. This part takes no time at all in R.
We can't just do a straight merge since we use all 128gigs of RAM on our EC2 instance.
So, looping row by row and only keeping those that fall into our match criteria however the speed in which it loops is a huge issue. Fairly new to R so I could be missing something absurdly obvious.
DataSet1
structure(list(PPOppID = c("785041315", "829852094", "854136412",
"787141118"), BusinessName = c("HAPPY COMPANY",
"SAD COMPANY", "HORRIBLE COMPANY", "MILDLY UPSET COMPANY"
), StreetName = c("HAPPY TRAIL", "SAD TRAIL", "HORRIBLE TRAIL",
"MILDLY UPSET TRAIL"), City = c("TOWNA", "TOWNB", "TOWNC", "TOWND"
), State = structure(c(52L, 52L, 52L, 52L), .Label = c("AK",
"AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA", "HI",
"IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME", "MI",
"MN", "MO", "MS", "MT", "NC", "ND", "NE", "NH", "NJ", "NM", "NV",
"NY", "OH", "OK", "OR", "PA", "PR", "RI", "SC", "SD", "TN", "TX",
"UT", "VA", "VT", "WA", "WI", "WV", "WY"), class = "factor"),
Zip = c("12345", "12345", "12345", "12345"), street_num = c("1435",
"110", "105", "875"), street_pre_direction = c("S", "E",
"", ""), street_name = c("HAPPY TRAIL", "SAD TRAIL", "HORRIBLE TRAIL", "MILDLY UPSET TRAIL"
), suffix = c("RD", "ST", "", ""), streetdirection = c("",
"", "", ""), unit = c("", "STE", "", ""), unit = c("",
"202", "", ""), po_box = c("", "", "", ""), phone = c("1234567891",
"3231234543", "9876543219", "9087653456"), state_code = c("56029",
"56021", "56013", "56043"), name = c("", "", "",
"PERSONA"), FirstName = c("", "", "", "K"), contact_middle = c("",
"", "", ""), LastName = c("", "", "", "LAST_NAME"), contact_title = c("",
"", "", "OWNER"), year_started = c("1996", "2005",
"1993", "1981"), number_of_employees = c("3", "5",
"5", "5"), size_code = c("A", "B", "B", "B"), type = c("C",
"C", "C", "C"), industry_class = c("", "",
"A", "A"), code= c(NA, NA, "139", "119"
), code2 = c(NA, NA, "111100", "111100"), annual_amount = c("289",
"378", "0", "306"), annual_amount2 = c("A", "A",
"A", "A"), acc002 = c("3", "1", "1", "0"), brc014 = c("0",
"0", "0", "0"), t055 = c("1", "0", "1", "0"), t068 = c("2",
"2", "1", "1"), loaddate = c("2018-05-25", "2018-05-25",
"2018-05-25", "2018-05-25"), DBAName = c("HAPPY COMPANY",
"SAD COMPANY", "HORRIBLE COMPANY", "MILDLY UPSET COMPANY"
), Phone = c(NA_character_, NA_character_, NA_character_,
NA_character_), PhoneAreaCode = c(NA_character_, NA_character_,
NA_character_, NA_character_), StreetNum = c("1435", "110",
"105", "875")), class = c("data.table", "data.frame"), row.names = c(NA, -4L)
DataSet2
structure(list(rn = c("1", "2", "3", "4"), Id = c("0000000000abcG7MAI",
"000C000000abcg9MAA", "000QC000000abcG9MAI", "000C000000abcGaMAI"
), NumId = c("187639087", "237893456", "923785629", "298777656"
), BusinessName = c("HAPPY COMPANY", "K&W PHARMACY INC",
"SCOTTISH INN", "CORY SMITH STUDIOS"), DBABusinessName = c("",
"", "", ""), Phone = c("123456789", "987654321", "9999999999",
"6086892577"), PhoneAreaCode = c("999", "123", "456", "678"),
FirstName = c("SAM", "KYLE", "TONY", "MIKE"), LastName = c("SAM",
"SMITH", "TRAVIS", "JOHNSON"), StreetNum = c("7585", "170",
"2457", ""), StreetName = c("Avoderm WAY", "Blue Buffalo U", "Farmina BLVD",
"PO BOX 0"), State = c("NJ", "NY", "AK", "PR"), City = c("P",
"X", "X", ""), Zip = c("19425", "08765",
"37355", "54632")), class = c("data.table", "data.frame"), row.names = c(NA,-4L)
Code
for(row in 1:length(df1$Zip)) {
df1 <- inner_join(df1, df2, by = c('Zip'))
df1[] <- lapply(df1, as.character)
df1$MatchBizName <- pmax(1-stringdist(df1$BusinessName, df1$BusinessName, method="jw", p=0.1),
1-stringdist(df1$DBAName, df1$DBAName, method="jw", p=0.1),
1-stringdist(df1$BusinessName, df1$DBABusinessName, method="jw", p=0.1),
1-stringdist(df1$DBAName, df1$BusinessName, method="jw", p=0.1))
df1$MatchPhone <- ifelse(1-stringdist(df1$Phone, df1$Phone, method="jw", p=0.1)>=1,1,0)
df1$MatchFirstName <- 1-stringdist(df1$FirstName, df1$FirstName, method="jw", p=0.1)
df1$MatchLastName <- 1-stringdist(df1$LastName, df1$LastName, method="jw", p=0.1)
df1$MatchStreetNum <- 1-stringdist(df1$StreetNum, df1$StreetNum, method="jw", p=0.1)
df1$MatchStreetName <- 1-stringdist(df1$StreetName, df1$StreetName, method="jw", p=0.1)
df1$MatchCity <- 1-stringdist(df1$City, df1$City, method="jw", p=0.1)
df1matches <- subset(df1,(MatchBizName >= 0.9 & MatchCity == 1 & MatchStreetName >= 0.7 & MatchStreetNum >= 0.7) |
(MatchPhone == 1 & MatchFirstName == 1 & MatchLastName == 1 & MatchStreetNum == 1 & MatchStreetName == 1 & MatchCity == 1) |
(MatchBizName >= 0.9 & MatchStreetNum == 1 & MatchStreetName >= 0.9 & MatchCity == 1) |
(MatchStreetNum == 1 & MatchPhone == 1 & MatchStreetName == 1 & MatchCity == 1) |
(MatchLastName >= 0.9 & MatchPhone == 1 & MatchBizName >= 0.9 & MatchCity == 1) |
(MatchPhone == 1 & MatchCity == 1 & MatchStreetNum == 1 & MatchStreetName >= 0.9 & MatchBizName >= 0.6) )
rm(df1)
rm(df1matches)
}