I am trying to compare two sets of addresses one small (pams: ~30,000 rows) and another large (nppes: ~4.5M rows). I am looking for a close match to first and last names and then a subsequent best match on addresses (I've used subsets of zip codes for this). Below is the code I have written so far.
The issues are: - It's slow (the code below is a bit optimized from what I last ran but it's taking hours and not completing). - Limiting to matching first letters on last name. I added the limitation to try and get the code to run but I'd rather not have the additional limitation.
I'd greatly appreciate any suggestions on how to improve this.
npi_1 <- data.frame(npi = nppes$npi, first_name = tolower(nppes$first_name), last_name = tolower(nppes$last_name),
zip = nppes$b_post_code)
pams_1 <- data.frame(pams_id = pams$ID, npi = pams$NPI, first_name = tolower(pams$First.Name),
last_name = tolower(pams$Last.Name), zip = gsub("-", "", pams$Zip))
result <- data.frame(pams_id = "", npi = "", match = "", stringsAsFactors = F)
for (i in 1:27809)
{
pams_2 <- pams_1[i,]
npi_2 <- npi_1 %>% filter(substr(last_name, 1, 1) == substr(pams_2$last_name, 1, 1))
npi_2 <- npi_2 %>% mutate(match_last = stringdist(last_name, pams_2$last_name, method = "jw")) %>%
filter(match_last <= 0.1000000)
npi_2 <- npi_2 %>% mutate(match_first = stringdist(first_name, pams_2$first_name, method = "jw")) %>%
filter(match_first <= 0.1000000)
npi_2 <- npi_2 %>% mutate(match_zip3 = stringdist(substr(zip, 1, 3), substr(pams_2$zip, 1, 3), method = "jw"))
npi_2 <- npi_2 %>% mutate(match_zip5 = stringdist(substr(zip, 1, 3), substr(pams_2$zip, 1, 5), method = "jw"))
npi_2 <- npi_2 %>% mutate(match_zip = stringdist(zip, pams_2$zip, method = "jw"))
npi_2 <- npi_2 %>% mutate(match_score = ((match_last + match_first) * 1.4) + (match_zip3 * 1.2) + match_zip5 + (match_zip * 0.8))
npi_2 <- npi_2 %>% arrange(match_score)
pair_1 <- c(pams_2$pams_id[1], npi_2$npi[1], npi_2$match_score[1])
result[i, ] <- pair_1
}