8

I have downloaded the street abbreviations from USPS. Here is the data:

dput(usps_streets)
structure(list(common_abbrev = c("allee", "alley", "ally", "aly", 
"anex", "annex", "annx", "anx", "arc", "arcade", "av", "ave", 
"aven", "avenu", "avenue", "avn", "avnue", "bayoo", "bayou", 
"bch", "beach", "bend", "bnd", "blf", "bluf", "bluff", "bluffs", 
"bot", "btm", "bottm", "bottom", "blvd", "boul", "boulevard", 
"boulv", "br", "brnch", "branch", "brdge", "brg", "bridge", "brk", 
"brook", "brooks", "burg", "burgs", "byp", "bypa", "bypas", "bypass", 
"byps", "camp", "cp", "cmp", "canyn", "canyon", "cnyn", "cape", 
"cpe", "causeway", "causwa", "cswy", "cen", "cent", "center", 
"centr", "centre", "cnter", "cntr", "ctr", "centers", "cir", 
"circ", "circl", "circle", "crcl", "crcle", "circles", "clf", 
"cliff", "clfs", "cliffs", "clb", "club", "common", "commons", 
"cor", "corner", "corners", "cors", "course", "crse", "court", 
"ct", "courts", "cts", "cove", "cv", "coves", "creek", "crk", 
"crescent", "cres", "crsent", "crsnt", "crest", "crossing", "crssng", 
"xing", "crossroad", "crossroads", "curve", "dale", "dl", "dam", 
"dm", "div", "divide", "dv", "dvd", "dr", "driv", "drive", "drv", 
"drives", "est", "estate", "estates", "ests", "exp", "expr", 
"express", "expressway", "expw", "expy", "ext", "extension", 
"extn", "extnsn", "exts", "fall", "falls", "fls", "ferry", "frry", 
"fry", "field", "fld", "fields", "flds", "flat", "flt", "flats", 
"flts", "ford", "frd", "fords", "forest", "forests", "frst", 
"forg", "forge", "frg", "forges", "fork", "frk", "forks", "frks", 
"fort", "frt", "ft", "freeway", "freewy", "frway", "frwy", "fwy", 
"garden", "gardn", "grden", "grdn", "gardens", "gdns", "grdns", 
"gateway", "gatewy", "gatway", "gtway", "gtwy", "glen", "gln", 
"glens", "green", "grn", "greens", "grov", "grove", "grv", "groves", 
"harb", "harbor", "harbr", "hbr", "hrbor", "harbors", "haven", 
"hvn", "ht", "hts", "highway", "highwy", "hiway", "hiwy", "hway", 
"hwy", "hill", "hl", "hills", "hls", "hllw", "hollow", "hollows", 
"holw", "holws", "inlt", "is", "island", "islnd", "islands", 
"islnds", "iss", "isle", "isles", "jct", "jction", "jctn", "junction", 
"junctn", "juncton", "jctns", "jcts", "junctions", "key", "ky", 
"keys", "kys", "knl", "knol", "knoll", "knls", "knolls", "lk", 
"lake", "lks", "lakes", "land", "landing", "lndg", "lndng", "lane", 
"ln", "lgt", "light", "lights", "lf", "loaf", "lck", "lock", 
"lcks", "locks", "ldg", "ldge", "lodg", "lodge", "loop", "loops", 
"mall", "mnr", "manor", "manors", "mnrs", "meadow", "mdw", "mdws", 
"meadows", "medows", "mews", "mill", "mills", "missn", "mssn", 
"motorway", "mnt", "mt", "mount", "mntain", "mntn", "mountain", 
"mountin", "mtin", "mtn", "mntns", "mountains", "nck", "neck", 
"orch", "orchard", "orchrd", "oval", "ovl", "overpass", "park", 
"prk", "parks", "parkway", "parkwy", "pkway", "pkwy", "pky", 
"parkways", "pkwys", "pass", "passage", "path", "paths", "pike", 
"pikes", "pine", "pines", "pnes", "pl", "plain", "pln", "plains", 
"plns", "plaza", "plz", "plza", "point", "pt", "points", "pts", 
"port", "prt", "ports", "prts", "pr", "prairie", "prr", "rad", 
"radial", "radiel", "radl", "ramp", "ranch", "ranches", "rnch", 
"rnchs", "rapid", "rpd", "rapids", "rpds", "rest", "rst", "rdg", 
"rdge", "ridge", "rdgs", "ridges", "riv", "river", "rvr", "rivr", 
"rd", "road", "roads", "rds", "route", "row", "rue", "run", "shl", 
"shoal", "shls", "shoals", "shoar", "shore", "shr", "shoars", 
"shores", "shrs", "skyway", "spg", "spng", "spring", "sprng", 
"spgs", "spngs", "springs", "sprngs", "spur", "spurs", "sq", 
"sqr", "sqre", "squ", "square", "sqrs", "squares", "sta", "station", 
"statn", "stn", "stra", "strav", "straven", "stravenue", "stravn", 
"strvn", "strvnue", "stream", "streme", "strm", "street", "strt", 
"st", "str", "streets", "smt", "suite", "sumit", "sumitt", "summit", 
"ter", "terr", "terrace", "throughway", "trace", "traces", "trce", 
"track", "tracks", "trak", "trk", "trks", "trafficway", "trail", 
"trails", "trl", "trls", "trailer", "trlr", "trlrs", "tunel", 
"tunl", "tunls", "tunnel", "tunnels", "tunnl", "trnpk", "turnpike", 
"turnpk", "underpass", "un", "union", "unions", "valley", "vally", 
"vlly", "vly", "valleys", "vlys", "vdct", "via", "viadct", "viaduct", 
"view", "vw", "views", "vws", "vill", "villag", "village", "villg", 
"villiage", "vlg", "villages", "vlgs", "ville", "vl", "vis", 
"vist", "vista", "vst", "vsta", "walk", "walks", "wall", "wy", 
"way", "ways", "well", "wells", "wls"), usps_abbrev = c("aly", 
"aly", "aly", "aly", "anx", "anx", "anx", "anx", "arc", "arc", 
"ave", "ave", "ave", "ave", "ave", "ave", "ave", "byu", "byu", 
"bch", "bch", "bnd", "bnd", "blf", "blf", "blf", "blfs", "btm", 
"btm", "btm", "btm", "blvd", "blvd", "blvd", "blvd", "br", "br", 
"br", "brg", "brg", "brg", "brk", "brk", "brks", "bg", "bgs", 
"byp", "byp", "byp", "byp", "byp", "cp", "cp", "cp", "cyn", "cyn", 
"cyn", "cpe", "cpe", "cswy", "cswy", "cswy", "ctr", "ctr", "ctr", 
"ctr", "ctr", "ctr", "ctr", "ctr", "ctrs", "cir", "cir", "cir", 
"cir", "cir", "cir", "cirs", "clf", "clf", "clfs", "clfs", "clb", 
"clb", "cmn", "cmns", "cor", "cor", "cors", "cors", "crse", "crse", 
"ct", "ct", "cts", "cts", "cv", "cv", "cvs", "crk", "crk", "cres", 
"cres", "cres", "cres", "crst", "xing", "xing", "xing", "xrd", 
"xrds", "curv", "dl", "dl", "dm", "dm", "dv", "dv", "dv", "dv", 
"dr", "dr", "dr", "dr", "drs", "est", "est", "ests", "ests", 
"expy", "expy", "expy", "expy", "expy", "expy", "ext", "ext", 
"ext", "ext", "exts", "fall", "fls", "fls", "fry", "fry", "fry", 
"fld", "fld", "flds", "flds", "flt", "flt", "flts", "flts", "frd", 
"frd", "frds", "frst", "frst", "frst", "frg", "frg", "frg", "frgs", 
"frk", "frk", "frks", "frks", "ft", "ft", "ft", "fwy", "fwy", 
"fwy", "fwy", "fwy", "gdn", "gdn", "gdn", "gdn", "gdns", "gdns", 
"gdns", "gtwy", "gtwy", "gtwy", "gtwy", "gtwy", "gln", "gln", 
"glns", "grn", "grn", "grns", "grv", "grv", "grv", "grvs", "hbr", 
"hbr", "hbr", "hbr", "hbr", "hbrs", "hvn", "hvn", "hts", "hts", 
"hwy", "hwy", "hwy", "hwy", "hwy", "hwy", "hl", "hl", "hls", 
"hls", "holw", "holw", "holw", "holw", "holw", "inlt", "is", 
"is", "is", "iss", "iss", "iss", "isle", "isle", "jct", "jct", 
"jct", "jct", "jct", "jct", "jcts", "jcts", "jcts", "ky", "ky", 
"kys", "kys", "knl", "knl", "knl", "knls", "knls", "lk", "lk", 
"lks", "lks", "land", "lndg", "lndg", "lndg", "ln", "ln", "lgt", 
"lgt", "lgts", "lf", "lf", "lck", "lck", "lcks", "lcks", "ldg", 
"ldg", "ldg", "ldg", "loop", "loop", "mall", "mnr", "mnr", "mnrs", 
"mnrs", "mdw", "mdws", "mdws", "mdws", "mdws", "mews", "ml", 
"mls", "msn", "msn", "mtwy", "mt", "mt", "mt", "mtn", "mtn", 
"mtn", "mtn", "mtn", "mtn", "mtns", "mtns", "nck", "nck", "orch", 
"orch", "orch", "oval", "oval", "opas", "park", "park", "park", 
"pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pass", 
"psge", "path", "path", "pike", "pike", "pne", "pnes", "pnes", 
"pl", "pln", "pln", "plns", "plns", "plz", "plz", "plz", "pt", 
"pt", "pts", "pts", "prt", "prt", "prts", "prts", "pr", "pr", 
"pr", "radl", "radl", "radl", "radl", "ramp", "rnch", "rnch", 
"rnch", "rnch", "rpd", "rpd", "rpds", "rpds", "rst", "rst", "rdg", 
"rdg", "rdg", "rdgs", "rdgs", "riv", "riv", "riv", "riv", "rd", 
"rd", "rds", "rds", "rte", "row", "rue", "run", "shl", "shl", 
"shls", "shls", "shr", "shr", "shr", "shrs", "shrs", "shrs", 
"skwy", "spg", "spg", "spg", "spg", "spgs", "spgs", "spgs", "spgs", 
"spur", "spur", "sq", "sq", "sq", "sq", "sq", "sqs", "sqs", "sta", 
"sta", "sta", "sta", "stra", "stra", "stra", "stra", "stra", 
"stra", "stra", "strm", "strm", "strm", "st", "st", "st", "st", 
"sts", "smt", "ste", "smt", "smt", "smt", "ter", "ter", "ter", 
"trwy", "trce", "trce", "trce", "trak", "trak", "trak", "trak", 
"trak", "trfy", "trl", "trl", "trl", "trl", "trlr", "trlr", "trlr", 
"tunl", "tunl", "tunl", "tunl", "tunl", "tunl", "tpke", "tpke", 
"tpke", "upas", "un", "un", "uns", "vly", "vly", "vly", "vly", 
"vlys", "vlys", "via", "via", "via", "via", "vw", "vw", "vws", 
"vws", "vlg", "vlg", "vlg", "vlg", "vlg", "vlg", "vlgs", "vlgs", 
"vl", "vl", "vis", "vis", "vis", "vis", "vis", "walk", "walk", 
"wall", "way", "way", "ways", "wl", "wls", "wls")), class = "data.frame", row.names = c(NA, 
-503L))

I would like to use them to work with street addresses and states. Toy data:

a <- c("10900 harper ave", "12235 davis annex", "24 van cortland parkway")

To convert common abbreviations to the usps abbreviation (standardizing the data), I built a little function:

mr_zip <- function(x){
  x <- textclean::mgsub(usps_streets$common_abbrev, usps_streets$usps_abbrev, x,
                        fixed = T, order.pattern = T)
  return(x)
}

The problem arises when I apply my function to my data:

f <- sapply(a, mr_zip)

I get the wrong results:

 "10900 harper avee"       "1235 davis anx" "24 van cortland pkway"

Because what I should be getting is:

"10900 harper ave"       "1235 davis anx" "24 van cortland pkwy"

My questions:

  1. Why is this happening when I specified order.pattern = T and fixed = T in the mgsub function?
  2. What can I do to fix it?
  3. Is there an alternative approach to using vectors in multiple substitution patterns for text?

Thanks in advance, all suggestions are welcome.

EDIT: Thanks to @RitchieSacramento I have found that using word boundaries does help, but the function is still incredibly slow when used on a large data frame (> 400,000 rows). Using safe = TRUE in mgsub also makes the function work properly, but it is likewise very slow. Something quick would be desired, hence the bounty.
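
For reference, by "word boundary" I mean anchoring each pattern with \b so that only whole words are replaced. A minimal base R sketch of that idea (a plain loop of gsub calls, not the actual mgsub call) would be:

# sketch only: replace whole words, one table row at a time
std_street <- function(x, lookup = usps_streets) {
  for (i in seq_len(nrow(lookup))) {
    x <- gsub(paste0("\\b", lookup$common_abbrev[i], "\\b"),
              lookup$usps_abbrev[i], x)
  }
  x
}
std_street(a)
# [1] "10900 harper ave"     "12235 davis anx"      "24 van cortland pkwy"

(A sequential loop like this can in principle chain replacements, which, as I understand it, is part of what safe = TRUE guards against, and it still makes one gsub pass per table row.)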

jvalenti
  • @Tyler Rinker any thoughts on this? Since you made it? https://stackoverflow.com/questions/19424709/r-gsub-pattern-vector-and-replacement-vector – jvalenti Oct 06 '21 at 18:22
  • @RitchieSacramento thanks for this, it works well! Would you mind explaining a bit? Still learning `regex`. – jvalenti Oct 08 '21 at 15:59

5 Answers

8

So let's start having fun.

Step 1 First, we will load your data into a tibble named USPS.

library(tidyverse)
USPS = tibble(
 common_abbrev = c("allee", "alley", "ally", "aly", 
 "anex", "annex", "annx", "anx", "arc", "arcade", "av", "ave", 
 "aven", "avenu", "avenue", "avn", "avnue", "bayoo", "bayou", 
 "bch", "beach", "bend", "bnd", "blf", "bluf", "bluff", "bluffs", 
 "bot", "btm", "bottm", "bottom", "blvd", "boul", "boulevard", 
 "boulv", "br", "brnch", "branch", "brdge", "brg", "bridge", "brk", 
 "brook", "brooks", "burg", "burgs", "byp", "bypa", "bypas", "bypass", 
 "byps", "camp", "cp", "cmp", "canyn", "canyon", "cnyn", "cape", 
 "cpe", "causeway", "causwa", "cswy", "cen", "cent", "center", 
 "centr", "centre", "cnter", "cntr", "ctr", "centers", "cir", 
 "circ", "circl", "circle", "crcl", "crcle", "circles", "clf", 
 "cliff", "clfs", "cliffs", "clb", "club", "common", "commons", 
 "cor", "corner", "corners", "cors", "course", "crse", "court", 
 "ct", "courts", "cts", "cove", "cv", "coves", "creek", "crk", 
 "crescent", "cres", "crsent", "crsnt", "crest", "crossing", "crssng", 
 "xing", "crossroad", "crossroads", "curve", "dale", "dl", "dam", 
 "dm", "div", "divide", "dv", "dvd", "dr", "driv", "drive", "drv", 
 "drives", "est", "estate", "estates", "ests", "exp", "expr", 
 "express", "expressway", "expw", "expy", "ext", "extension", 
 "extn", "extnsn", "exts", "fall", "falls", "fls", "ferry", "frry", 
 "fry", "field", "fld", "fields", "flds", "flat", "flt", "flats", 
 "flts", "ford", "frd", "fords", "forest", "forests", "frst", 
 "forg", "forge", "frg", "forges", "fork", "frk", "forks", "frks", 
 "fort", "frt", "ft", "freeway", "freewy", "frway", "frwy", "fwy", 
 "garden", "gardn", "grden", "grdn", "gardens", "gdns", "grdns", 
 "gateway", "gatewy", "gatway", "gtway", "gtwy", "glen", "gln", 
 "glens", "green", "grn", "greens", "grov", "grove", "grv", "groves", 
 "harb", "harbor", "harbr", "hbr", "hrbor", "harbors", "haven", 
 "hvn", "ht", "hts", "highway", "highwy", "hiway", "hiwy", "hway", 
 "hwy", "hill", "hl", "hills", "hls", "hllw", "hollow", "hollows", 
 "holw", "holws", "inlt", "is", "island", "islnd", "islands", 
 "islnds", "iss", "isle", "isles", "jct", "jction", "jctn", "junction", 
 "junctn", "juncton", "jctns", "jcts", "junctions", "key", "ky", 
 "keys", "kys", "knl", "knol", "knoll", "knls", "knolls", "lk", 
 "lake", "lks", "lakes", "land", "landing", "lndg", "lndng", "lane", 
 "ln", "lgt", "light", "lights", "lf", "loaf", "lck", "lock", 
 "lcks", "locks", "ldg", "ldge", "lodg", "lodge", "loop", "loops", 
 "mall", "mnr", "manor", "manors", "mnrs", "meadow", "mdw", "mdws", 
 "meadows", "medows", "mews", "mill", "mills", "missn", "mssn", 
 "motorway", "mnt", "mt", "mount", "mntain", "mntn", "mountain", 
 "mountin", "mtin", "mtn", "mntns", "mountains", "nck", "neck", 
 "orch", "orchard", "orchrd", "oval", "ovl", "overpass", "park", 
 "prk", "parks", "parkway", "parkwy", "pkway", "pkwy", "pky", 
 "parkways", "pkwys", "pass", "passage", "path", "paths", "pike", 
 "pikes", "pine", "pines", "pnes", "pl", "plain", "pln", "plains", 
 "plns", "plaza", "plz", "plza", "point", "pt", "points", "pts", 
 "port", "prt", "ports", "prts", "pr", "prairie", "prr", "rad", 
 "radial", "radiel", "radl", "ramp", "ranch", "ranches", "rnch", 
 "rnchs", "rapid", "rpd", "rapids", "rpds", "rest", "rst", "rdg", 
 "rdge", "ridge", "rdgs", "ridges", "riv", "river", "rvr", "rivr", 
 "rd", "road", "roads", "rds", "route", "row", "rue", "run", "shl", 
 "shoal", "shls", "shoals", "shoar", "shore", "shr", "shoars", 
 "shores", "shrs", "skyway", "spg", "spng", "spring", "sprng", 
 "spgs", "spngs", "springs", "sprngs", "spur", "spurs", "sq", 
 "sqr", "sqre", "squ", "square", "sqrs", "squares", "sta", "station", 
 "statn", "stn", "stra", "strav", "straven", "stravenue", "stravn", 
 "strvn", "strvnue", "stream", "streme", "strm", "street", "strt", 
 "st", "str", "streets", "smt", "suite", "sumit", "sumitt", "summit", 
 "ter", "terr", "terrace", "throughway", "trace", "traces", "trce", 
 "track", "tracks", "trak", "trk", "trks", "trafficway", "trail", 
 "trails", "trl", "trls", "trailer", "trlr", "trlrs", "tunel", 
 "tunl", "tunls", "tunnel", "tunnels", "tunnl", "trnpk", "turnpike", 
 "turnpk", "underpass", "un", "union", "unions", "valley", "vally", 
 "vlly", "vly", "valleys", "vlys", "vdct", "via", "viadct", "viaduct", 
 "view", "vw", "views", "vws", "vill", "villag", "village", "villg", 
 "villiage", "vlg", "villages", "vlgs", "ville", "vl", "vis", 
 "vist", "vista", "vst", "vsta", "walk", "walks", "wall", "wy", 
 "way", "ways", "well", "wells", "wls"), 
 usps_abbrev = c("aly", 
 "aly", "aly", "aly", "anx", "anx", "anx", "anx", "arc", "arc", 
 "ave", "ave", "ave", "ave", "ave", "ave", "ave", "byu", "byu", 
 "bch", "bch", "bnd", "bnd", "blf", "blf", "blf", "blfs", "btm", 
 "btm", "btm", "btm", "blvd", "blvd", "blvd", "blvd", "br", "br", 
 "br", "brg", "brg", "brg", "brk", "brk", "brks", "bg", "bgs", 
 "byp", "byp", "byp", "byp", "byp", "cp", "cp", "cp", "cyn", "cyn", 
 "cyn", "cpe", "cpe", "cswy", "cswy", "cswy", "ctr", "ctr", "ctr", 
 "ctr", "ctr", "ctr", "ctr", "ctr", "ctrs", "cir", "cir", "cir", 
 "cir", "cir", "cir", "cirs", "clf", "clf", "clfs", "clfs", "clb", 
 "clb", "cmn", "cmns", "cor", "cor", "cors", "cors", "crse", "crse", 
 "ct", "ct", "cts", "cts", "cv", "cv", "cvs", "crk", "crk", "cres", 
 "cres", "cres", "cres", "crst", "xing", "xing", "xing", "xrd", 
 "xrds", "curv", "dl", "dl", "dm", "dm", "dv", "dv", "dv", "dv", 
 "dr", "dr", "dr", "dr", "drs", "est", "est", "ests", "ests", 
 "expy", "expy", "expy", "expy", "expy", "expy", "ext", "ext", 
 "ext", "ext", "exts", "fall", "fls", "fls", "fry", "fry", "fry", 
 "fld", "fld", "flds", "flds", "flt", "flt", "flts", "flts", "frd", 
 "frd", "frds", "frst", "frst", "frst", "frg", "frg", "frg", "frgs", 
 "frk", "frk", "frks", "frks", "ft", "ft", "ft", "fwy", "fwy", 
 "fwy", "fwy", "fwy", "gdn", "gdn", "gdn", "gdn", "gdns", "gdns", 
 "gdns", "gtwy", "gtwy", "gtwy", "gtwy", "gtwy", "gln", "gln", 
 "glns", "grn", "grn", "grns", "grv", "grv", "grv", "grvs", "hbr", 
 "hbr", "hbr", "hbr", "hbr", "hbrs", "hvn", "hvn", "hts", "hts", 
 "hwy", "hwy", "hwy", "hwy", "hwy", "hwy", "hl", "hl", "hls", 
 "hls", "holw", "holw", "holw", "holw", "holw", "inlt", "is", 
 "is", "is", "iss", "iss", "iss", "isle", "isle", "jct", "jct", 
 "jct", "jct", "jct", "jct", "jcts", "jcts", "jcts", "ky", "ky", 
 "kys", "kys", "knl", "knl", "knl", "knls", "knls", "lk", "lk", 
 "lks", "lks", "land", "lndg", "lndg", "lndg", "ln", "ln", "lgt", 
 "lgt", "lgts", "lf", "lf", "lck", "lck", "lcks", "lcks", "ldg", 
 "ldg", "ldg", "ldg", "loop", "loop", "mall", "mnr", "mnr", "mnrs", 
 "mnrs", "mdw", "mdws", "mdws", "mdws", "mdws", "mews", "ml", 
 "mls", "msn", "msn", "mtwy", "mt", "mt", "mt", "mtn", "mtn", 
 "mtn", "mtn", "mtn", "mtn", "mtns", "mtns", "nck", "nck", "orch", 
 "orch", "orch", "oval", "oval", "opas", "park", "park", "park", 
 "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pass", 
 "psge", "path", "path", "pike", "pike", "pne", "pnes", "pnes", 
 "pl", "pln", "pln", "plns", "plns", "plz", "plz", "plz", "pt", 
 "pt", "pts", "pts", "prt", "prt", "prts", "prts", "pr", "pr", 
 "pr", "radl", "radl", "radl", "radl", "ramp", "rnch", "rnch", 
 "rnch", "rnch", "rpd", "rpd", "rpds", "rpds", "rst", "rst", "rdg", 
 "rdg", "rdg", "rdgs", "rdgs", "riv", "riv", "riv", "riv", "rd", 
 "rd", "rds", "rds", "rte", "row", "rue", "run", "shl", "shl", 
 "shls", "shls", "shr", "shr", "shr", "shrs", "shrs", "shrs", 
 "skwy", "spg", "spg", "spg", "spg", "spgs", "spgs", "spgs", "spgs", 
 "spur", "spur", "sq", "sq", "sq", "sq", "sq", "sqs", "sqs", "sta", 
 "sta", "sta", "sta", "stra", "stra", "stra", "stra", "stra", 
 "stra", "stra", "strm", "strm", "strm", "st", "st", "st", "st", 
 "sts", "smt", "ste", "smt", "smt", "smt", "ter", "ter", "ter", 
 "trwy", "trce", "trce", "trce", "trak", "trak", "trak", "trak", 
 "trak", "trfy", "trl", "trl", "trl", "trl", "trlr", "trlr", "trlr", 
 "tunl", "tunl", "tunl", "tunl", "tunl", "tunl", "tpke", "tpke", 
 "tpke", "upas", "un", "un", "uns", "vly", "vly", "vly", "vly", 
 "vlys", "vlys", "via", "via", "via", "via", "vw", "vw", "vws", 
 "vws", "vlg", "vlg", "vlg", "vlg", "vlg", "vlg", "vlgs", "vlgs", 
 "vl", "vl", "vis", "vis", "vis", "vis", "vis", "walk", "walk", 
 "wall", "way", "way", "ways", "wl", "wls", "wls"))

USPS

output

# A tibble: 503 x 2
   common_abbrev usps_abbrev
   <chr>         <chr>      
 1 allee         aly        
 2 alley         aly        
 3 ally          aly        
 4 aly           aly        
 5 anex          anx        
 6 annex         anx        
 7 annx          anx        
 8 anx           anx        
 9 arc           arc        
10 arcade        arc        
# ... with 493 more rows

Step 2 Now we will convert your USPS table into a vector with named elements.

USPSv = array(data = USPS$usps_abbrev, 
              dimnames= list(USPS$common_abbrev))

Let's see what it gives us

USPSv['viadct']
# viadct 
#  "via" 

USPSv['coves'] 
# coves 
# "cvs" 

Looks promising.
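
As a side note, the same kind of lookup table can be built as a plain named vector with setNames; this sketch (USPSv2 is just an illustrative name) should behave the same way for these lookups as the array above:

USPSv2 = setNames(USPS$usps_abbrev, USPS$common_abbrev)
USPSv2["coves"]
# coves 
# "cvs" 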

Step 3 Now let's create a vectorized conversion function that uses our named vector USPSv.

USPS_conv = function(x) {
  # take the last word of the address and look up its USPS abbreviation
  comm = str_split(x, " ") %>% .[[1]] %>% .[length(.)]
  str_replace(x, comm, USPSv[comm])
}
USPS_conv = Vectorize(USPS_conv)

Let's see how our USPS_conv works.

USPS_conv("10900 harper coves")
# 10900 harper coves 
# "10900 harper cvs"

USPS_conv("10900 harper viadct")
# 10900 harper viadct 
# "10900 harper via"

Great, but will it handle the vector?

USPS_conv(c("10900 harper coves", "10900 harper viadct", "10900 harper ave"))
# 10900 harper coves 10900 harper viadct    10900 harper ave 
# "10900 harper cvs"  "10900 harper via"  "10900 harper ave"   

Everything has been going great so far.

Step 4 Now it's time to use our USPS_conv function inside mutate. However, we need some input data, which we will generate ourselves.

n=10
set.seed(1111)
df = tibble(
  addresses = paste(
    sample(10:10000, n, replace = TRUE),
    sample(c("harper", "davis", "van cortland", "marry", "von brown"), n, replace = TRUE),
    sample(USPS$common_abbrev, n, replace = TRUE)
  )
)
df

output

# A tibble: 10 x 1
   addresses          
   <chr>              
 1 8995 davis crk     
 2 8527 davis tunnl   
 3 7663 von brown wall
 4 3043 harper lake   
 5 9192 von brown grdn
 6 120 marry rvr      
 7 72 von brown locks 
 8 8752 marry gardn   
 9 7754 davis corner  
10 3745 davis jcts  

Let's perform a mutation

df %>% mutate(addresses = USPS_conv(addresses))

output

# A tibble: 10 x 1
   addresses          
   <chr>              
 1 8995 davis crk     
 2 8527 davis tunl    
 3 7663 von brown wall
 4 3043 harper lk     
 5 9192 von brown gdn 
 6 120 marry riv      
 7 72 von brown lcks  
 8 8752 marry gdn     
 9 7754 davis cor     
10 3745 davis jcts 

Does it look okay? For the most part, yes.

Step 5 So it's time for a great test of 1,000,000 addresses! We will generate the data exactly as before.

n=1000000
set.seed(1111)
df = tibble(
  addresses = paste(
    sample(10:10000, n, replace = TRUE),
    sample(c("harper", "davis", "van cortland", "marry", "von brown"), n, replace = TRUE),
    sample(USPS$common_abbrev, n, replace = TRUE)
  )
)
df

output

# A tibble: 1,000,000 x 1
   addresses              
   <chr>                  
 1 8995 marry pass        
 2 8527 davis spng        
 3 7663 marry loaf        
 4 3043 davis common      
 5 9192 marry bnd         
 6 120 von brown corner   
 7 72 van cortland plains 
 8 8752 van cortland crcle
 9 7754 von brown sqrs    
10 3745 marry key         
# ... with 999,990 more rows

So let's go, but let's also measure how long it takes.

start_time =Sys.time()
df %>% mutate(addresses = USPS_conv(addresses))
Sys.time()-start_time
#Time difference of 3.610211 mins

As you can see, it took me less than 4 minutes. I don't know whether you were expecting something faster or whether this time is acceptable. I will be waiting for your comment.

Last minute update

It turns out that USPS_conv can be sped up a little with a small change to its code.

USPS_conv2 = function(x) {
  t = str_split(x, " ")
  comm = t[[1]][length(t[[1]])]
  str_replace(x, comm, USPSv[comm])
}
USPS_conv2 = Vectorize(USPS_conv2)

The new USPS_conv2 function works slightly faster.

[benchmark plot comparing USPS_conv and USPS_conv2]

All this reduced the mutation time for a million records to 3.3 minutes.

Big update for super speed!!

I realized that the first version of my answer, although simple in structure, was a bit slow :-(. So I decided to come up with something faster. I will share the idea here, but be warned, some of the solutions will be a bit "magical".

Magic Dictionary-Environment

To speed up the operation, we need a dictionary that quickly maps a key to a value. We will build it on top of R environments.

Here is a small interface for our dictionary.

#Simple Dictionary (hash Table) Interface for R
ht.create = function() new.env()

ht.insert = function(ht, key, value)  ht[[key]] <- value
ht.insert = Vectorize(ht.insert, c("key", "value"))

ht.lookup = function(ht, key) ht[[key]]
ht.lookup = Vectorize(ht.lookup, "key")

ht.delete = function(ht, key) rm(list=key,envir=ht,inherits=FALSE)
ht.delete = Vectorize(ht.delete, "key")

How does it work? Let me show you. Below I create a new dictionary-environment with ht.create(), insert two keys, "a1" and "a2", with the values "va1" and "va2" using ht.insert, and finally query the dictionary for those keys with ht.lookup.

ht1 = ht.create()
ht.insert(ht1, "a1", "va1" )
ht1 %>% ht.insert("a2", "va2")
ht.lookup(ht1, "a1")
# a1
# "va1"
ht1 %>% ht.lookup("a2")
# a2
# "va2"

Note that the functions ht.insert and ht.lookup are vectorized, which means I can add entire vectors of key-value pairs to the dictionary and, likewise, query it with whole vectors of keys.

ht.insert(ht1, paste0("a", 1:10),paste0("va", 1:10))
ht1 %>% ht.insert( paste0("a", 11:20),paste0("va", 11:20))

ht.lookup(ht1, paste0("a", 10:1))
# a10     a9     a8     a7     a6     a5     a4     a3     a2     a1
# "va10"  "va9"  "va8"  "va7"  "va6"  "va5"  "va4"  "va3"  "va2"  "va1"
ht1 %>% ht.lookup(paste0("a", 20:11))
# a20    a19    a18    a17    a16    a15    a14    a13    a12    a11
# "va20" "va19" "va18" "va17" "va16" "va15" "va14" "va13" "va12" "va11"

Magic attribute

Now we will write a function that attaches such a dictionary-environment to a table as an extra attribute.

#Functions that add a dictionary attribute to tibble
addHashTable = function(.data, key, value){
  key = enquo(key)
  value = enquo(value)

  if (!all(c(as_label(key), as_label(value)) %in% names(.data))) {
    stop(paste0("`.data` must contain `", as_label(key),
                "` and `", as_label(value), "` columns"))
  }

  if((.data %>% distinct(!!key, !!value) %>% nrow)!=
     (.data %>% distinct(!!key) %>% nrow)){
    warning(paste0(
      "\nThe number of unique values of the ", as_label(key),
      " variable is different\n",
      " from the number of unique values of the ",
      as_label(key), " and ", as_label(value)," pairs!\n",
      "The dictionary will only return the last values for a given key!"))
  }

  ht = ht.create()
  ht %>% ht.insert(.data %>% distinct(!!key, !!value) %>% pull(!!key),
                   .data %>% distinct(!!key, !!value) %>% pull(!!value))
  attr(.data, "hashTab") = ht
  .data
}


addHashTable2 = function(.x, .y, key, value){
  key = enquo(key)
  value = enquo(value)

  if (!all(c(as_label(key), as_label(value)) %in% names(.y))) {
    stop(paste0("`.y` must contain `", as_label(key),
                "` and `", as_label(value), "` columns"))
  }

  if((.y %>% distinct(!!key, !!value) %>% nrow)!=
     (.y %>% distinct(!!key) %>% nrow)){
    warning(paste0(
      "\nThe number of unique values of the ", as_label(key),
      " variable is different\n",
      " from the number of unique values of the ",
      as_label(key), " and ", as_label(value)," pairs!\n",
      "The dictionary will only return the last values for a given key!"))
  }

  ht = ht.create()
  ht %>% ht.insert(.y %>% distinct(!!key, !!value) %>% pull(!!key),
                   .y %>% distinct(!!key, !!value) %>% pull(!!value))
  attr(.x, "hashTab") = ht
  .x
}

There are actually two functions here. The addHashTable function attaches the dictionary-environment attribute to the same table from which it takes the key-value pairs. The addHashTable2 function does the same, but takes the key-value pairs from another table.

Let's see how addHashTable works.

USPS = USPS %>% addHashTable(common_abbrev, usps_abbrev)
str(USPS)
# tibble [503 x 2] (S3: tbl_df/tbl/data.frame)
# $ common_abbrev: chr [1:503] "allee" "alley" "ally" "aly" ...
# $ usps_abbrev  : chr [1:503] "aly" "aly" "aly" "aly" ...
# - attr(*, "hashTab")=<environment: 0x000000001591bbf0>

As you can see, an attribute has been added to the USPS table which points to the 0x000000001591bbf0 environment.
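
As a quick sanity check (a small sketch reusing the dictionary interface defined above), we can query the attached dictionary directly through that attribute:

ht.lookup(attr(USPS, "hashTab"), c("viadct", "coves"))
# viadct  coves 
#  "via"  "cvs" 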

Replacement function

We need one more function that uses the attached dictionary-environment to replace, in this case, the last word of the indicated variable with the corresponding value from the dictionary. Here it is.

replaceString = function(.data, value){
  value = enquo(value)

  #Test whether the value variable is in .data
  if(!(as_label(value) %in% names(.data))){
    stop(paste("The", as_label(value),
               "variable does not exist in the .data table!"))
  }

  #Dictionary attribute presence test
  if(!("hashTab" %in% names(attributes(.data)))) {
    stop(paste0(
      "\nThere is no dictionary attribute in the .data table!\n",
      "Use addHashTable or addHashTable2 to add a dictionary attribute."))
  }

  txt = .data %>% pull(!!value)
  # position of the last space in each address
  i = sapply(strsplit(txt, ""), function(x) max(which(x==" ")))
  # keep everything up to (and including) the last space and
  # replace the final word with its dictionary value
  txt = paste0(str_sub(txt, end=i),
               ht.lookup(attr(.data, "hashTab"),
                         str_sub(txt, start=i+1)))
  .data %>% mutate(!!value := txt)
}

First test

It's time for the first test. To avoid copying code, I added a little function that returns a table of randomly generated addresses.

randomAddresses = function(n){
  tibble(
    addresses = paste(
      sample(10:10000, n, replace = TRUE),
      sample(c("harper", "davis", "van cortland", "marry", "von brown"), n, replace = TRUE),
      sample(USPS$common_abbrev, n, replace = TRUE)
    )
  )
}

set.seed(1111)
df = randomAddresses(10)
df
# # A tibble: 10 x 1
#   addresses
#   <chr>
# 1 74 marry forges
# 2 787 von brown knol
# 3 2755 van cortland summit
# 4 9405 harper plaza
# 5 5376 marry pass
# 6 1857 marry trailer
# 7 9810 von brown drv
# 8 7984 davis garden
# 9 9110 marry alley
# 10 6458 von brown row

It's time to use our magic text replacement function. However, remember to add the dictionary-environment to the table first.

df = df %>% addHashTable2(USPS, common_abbrev, usps_abbrev)
df %>% replaceString(addresses)
# A tibble: 10 x 1
#   addresses
#   <chr>
# 1 74 marry frgs
# 2 787 von brown knl
# 3 2755 van cortland smt
# 4 9405 harper plz
# 5 5376 marry pass
# 6 1857 marry trlr
# 7 9810 von brown dr
# 8 7984 davis gdn
# 9 9110 marry aly
# 10 6458 von brown row

It looks like it works!

Big test

Well, there is nothing to wait for. Now let's try it on a table with a million rows, measuring right away how long it takes to generate the addresses and attach the dictionary-environment.

start_time =Sys.time()
df = randomAddresses(1000000)
df = df %>% addHashTable2(USPS, common_abbrev, usps_abbrev)
Sys.time()-start_time
#Time difference of 1.56609 secs

output

df
# A tibble: 1,000,000 x 1
#   addresses              
#   <chr>                  
# 1 8995 marry pass        
# 2 8527 davis spng        
# 3 7663 marry loaf        
# 4 3043 davis common      
# 5 9192 marry bnd         
# 6 120 von brown corner   
# 7 72 van cortland plains 
# 8 8752 van cortland crcle
# 9 7754 von brown sqrs    
# 10 3745 marry key         
# # ... with 999,990 more rows

1.6 seconds is probably not too much. However, the big question is how long it will take to replace the abbreviations.

start_time =Sys.time()
df = df %>% replaceString(addresses)
Sys.time()-start_time
#Time difference of 8.316476 secs

output

# A tibble: 1,000,000 x 1
#   addresses            
#   <chr>                
#   1 8995 marry pass      
# 2 8527 davis spg       
# 3 7663 marry lf        
# 4 3043 davis cmn       
# 5 9192 marry bnd       
# 6 120 von brown cor    
# 7 72 van cortland plns 
# 8 8752 van cortland cir
# 9 7754 von brown sqs   
# 10 3745 marry ky        
# # ... with 999,990 more rows

BOOM!! And we have 8 seconds!!

I am convinced that a faster mechanism cannot be made in R.

Small update for @ThomasIsCoding

Below is a small benchmark. Note that I borrowed the code for the functions f_MK_conv2, f_TIC1 and f_TIC2 from @ThomasIsCoding.

set.seed(1111)
df = randomAddresses(10000)
df = df %>% addHashTable2(USPS, common_abbrev, usps_abbrev)

library(microbenchmark)
mb1 = microbenchmark(
  f_MK_conv2(df$addresses),
  f_TIC1(df$addresses),
  f_TIC2(df$addresses),
  replaceString(df, addresses),
  times = 20L
)
ggplot2::autoplot(mb1)

[autoplot of the microbenchmark results]

Marek Fiołka
  • 1
    A little note. My converter function assumes that the abbreviation is the last word in the address. Unfortunately, in my country we do not use such conventions, so I do not know if this assumption is correct. You have to verify it yourself. – Marek Fiołka Oct 31 '21 at 21:10
  • 1
    nice answer using named vector, good idea! Upvoted! By the way, maybe you can add my answer for benchmarking if you like. :) – ThomasIsCoding Nov 01 '21 at 08:56
  • 1
    superb new approach! cheers! – ThomasIsCoding Nov 02 '21 at 21:50
  • Wow @MarekFiołka! I don't really understand the 'magic', but I love it: thank you for explaining your approach in such great detail. I think my data.table answer is competitive with your `replaceString()` but I'm concerned I might not have implemented your code correctly. Are you able to please add my `dt_func()` to your benchmarking? – jared_mamrot Nov 03 '21 at 05:11
  • @MarekFiołka Thank you so much for such a thorough response! Can you explain the `USPS_conv` function? I'm confused about the `comm` part especially. – jvalenti Nov 04 '21 at 13:41
  • Comment for @jvalenti. Is it this piece of code, `comm = str_split(x," ") %>% .[[1]] %>% .[length(.)]`? If so, read it left to right, just like this comment. First we split the text on spaces with `str_split(x," ")`. The result is a list, so we take its first element, `.[[1]]`, which is a vector, and then take its last element, `.[length(.)]`. That is how I get the last word from the text. Of course, this can be done in many other ways, which you will find elsewhere in this very extensive post. – Marek Fiołka Nov 04 '21 at 14:17
  • Personally, I like this form of writing very much because for me it is very clear and legible. Although, as you can see, it is not fast at all. So if you do not care about the clarity of the code, look for faster solutions. If, on the other hand, your calculations take a fraction of a second, the readability of the code is an advantage worth much more than those few milliseconds. – Marek Fiołka Nov 04 '21 at 14:20
  • OK I get it... I like this idea, but the assumption about the abbreviation being the last word is a bit strong and is likely to be only somewhat effective (my data is a little messy). Is there a way to loosen it to simply look for `common_abbrev` and substitute with `usps_abbrev`? – jvalenti Nov 04 '21 at 14:22
  • Of course it does exist! Specify exactly what can be expected in your data and write which of the solutions you like best. We will change it together or I will change it myself for you. No problem. – Marek Fiołka Nov 04 '21 at 14:27
  • If you want a solution based on `USPS_conv`, I do warn you: as it turned out, building a dictionary on a vector with named elements is quite slow. If you want to use `replaceString`, you can too. It will be much faster, but much more difficult to understand. Or maybe you would like to use one of the solutions proposed by @ThomasIsCoding or @jared_mamrot? Those are really fast. – Marek Fiołka Nov 04 '21 at 14:32
  • @MarekFiołka the data looks like the example I give, but some entries don't have addresses in them, which is what I mean by messy. Some are of the form `apartment x`, etc. Speed is important but accuracy is the chief concern. For example, if it took a minute or two to process 1,000,000 rows that would be OK. – jvalenti Nov 04 '21 at 18:12
  • 1
    @jvalenti, see my latest answer. I got everything you need there! – Marek Fiołka Nov 04 '21 at 20:02
6

Update

Here is the benchmarking of the existing answers to the OP's question (borrowing the test data from @Marek Fiołka, but with n <- 10000):

> mb1
Unit: milliseconds
                              expr       min        lq       mean    median        uq       max neval
          f_MK_conv2(df$addresses) 1409.0643 1470.3992 1612.09037 1631.3014 1710.3459 1898.6773    20
 f_MK_replaceString(df, addresses)   50.1582   54.3035   94.53149   62.5772  116.3108  264.2616    20
              f_TIC1(df$addresses)  394.5972  420.3283  461.50675  447.6186  499.4052  626.9240    20
              f_TIC2(df$addresses) 1579.1868 1852.6873 2052.28388 1964.8845 2246.5562 2916.2253    20
              f_TIC3(df$addresses)   65.8436   71.5448   93.36210   84.9698  102.7689  183.5121    20

[autoplot of the benchmark results]

where the benchmark code is given as follows

f_MK_conv2 <- function(x) {
  USPSv <- array(
    data = USPS$usps_abbrev,
    dimnames = list(USPS$common_abbrev)
  )
  USPS_conv2 <- function(x) {
    t <- str_split(x, " ")
    comm <- t[[1]][length(t[[1]])]
    str_replace(x, comm, USPSv[comm])
  }
  Vectorize(USPS_conv2)(x)
}

f_MK_replaceString <- function(.data, value) {
  ht.create <- function() new.env()

  ht.insert <- function(ht, key, value) ht[[key]] <- value
  ht.insert <- Vectorize(ht.insert, c("key", "value"))

  ht.lookup <- function(ht, key) ht[[key]]
  ht.lookup <- Vectorize(ht.lookup, "key")

  ht.delete <- function(ht, key) rm(list = key, envir = ht, inherits = FALSE)
  ht.delete <- Vectorize(ht.delete, "key")

  addHashTable2 <- function(.x, .y, key, value) {
    key <- enquo(key)
    value <- enquo(value)

    if (!all(c(as_label(key), as_label(value)) %in% names(.y))) {
      stop(paste0(
        "`.y` must contain `", as_label(key),
        "` and `", as_label(value), "` columns"
      ))
    }

    if ((.y %>% distinct(!!key, !!value) %>% nrow()) !=
      (.y %>% distinct(!!key) %>% nrow())) {
      warning(paste0(
        "\nThe number of unique values of the ", as_label(key),
        " variable is different\n",
        " from the number of unique values of the ",
        as_label(key), " and ", as_label(value), " pairs!\n",
        "The dictionary will only return the last values for a given key!"
      ))
    }

    ht <- ht.create()
    ht %>% ht.insert(
      .y %>% distinct(!!key, !!value) %>% pull(!!key),
      .y %>% distinct(!!key, !!value) %>% pull(!!value)
    )
    attr(.x, "hashTab") <- ht
    .x
  }

  .data <- .data %>% addHashTable2(USPS, common_abbrev, usps_abbrev)

  value <- enquo(value)
  # Test whether the value variable is in .data
  if (!(as_label(value) %in% names(.data))) {
    stop(paste(
      "The", as_label(value),
      "variable does not exist in the .data table!"
    ))
  }

  # Dictionary attribute presence test
  if (!("hashTab" %in% names(attributes(.data)))) {
    stop(paste0(
      "\nThere is no dictionary attribute in the .data table!\n",
      "Use addHashTable or addHashTable2 to add a dictionary attribute."
    ))
  }

  txt <- .data %>% pull(!!value)
  i <- sapply(strsplit(txt, ""), function(x) max(which(x == " ")))
  txt <- paste0(
    str_sub(txt, end = i),
    ht.lookup(
      attr(.data, "hashTab"),
      str_sub(txt, start = i + 1)
    )
  )
  .data %>% mutate(!!value := txt)
}

f_TIC1 <- function(x) {
  sapply(
    strsplit(x, " "),
    function(x) {
      with(USPS, {
        idx <- match(x, common_abbrev)
        paste0(ifelse(is.na(idx), x, usps_abbrev[idx]),
          collapse = " "
        )
      })
    }
  )
}

f_TIC2 <- function(x) {
  res <- c()
  for (s in x) {
    v <- unlist(strsplit(s, "\\W+"))
    for (p in v) {
      k <- match(p, USPS$common_abbrev)
      if (!is.na(k)) {
        s <- with(
          USPS,
          gsub(
            sprintf("\\b%s\\b", common_abbrev[k]),
            usps_abbrev[k],
            s
          )
        )
      }
    }
    res <- append(res, s)
  }
  res
}

f_TIC3 <- function(x) {
  x.split <- strsplit(x, " ")
  lut <- with(USPS, setNames(usps_abbrev, common_abbrev))
  grp <- rep(seq_along(x.split), lengths(x.split))
  xx <- unlist(x.split)
  r <- lut[xx]
  tapply(
    replace(xx, !is.na(r), na.omit(r)),
    grp,
    function(s) paste0(s, collapse = " ")
  )
}

f_TIC4 <- function(x) {
  xb <- gsub("^.*\\s+", "", x)
  rp <- with(USPS, usps_abbrev[match(xb, common_abbrev)])
  paste0(gsub("\\w+$", "", x), replace(xb, !is.na(rp), na.omit(rp)))
}

f_JM <- function(x) {
  x$abbreviation <- gsub("^.* ", "", x$addresses)
  setDT(x)
  setDT(USPS)
  x[USPS, abbreviation := usps_abbrev, on = .(abbreviation = common_abbrev)]

  x$usps_abbreviation <- paste(str_extract(x$addresses, "^.* "), x$abbreviation, sep = "")
}

set.seed(1111)
df <- randomAddresses(10000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_conv2(df$addresses),
  f_MK_replaceString(df, addresses),
  f_JM(df),
  f_TIC1(df$addresses),
  f_TIC2(df$addresses),
  f_TIC3(df$addresses),
  f_TIC4(df$addresses),
  times = 20L
)
ggplot2::autoplot(mb1)

Possible Solutions

Perhaps one of the following base R options could help

  • solution 1
f_TIC1 <- function(x) {
  sapply(
    strsplit(x, " "),
    function(x) {
      with(USPS, {
        idx <- match(x, common_abbrev)
        paste0(ifelse(is.na(idx), x, usps_abbrev[idx]),
          collapse = " "
        )
      })
    }
  )
}
  • solution 2

f_TIC2 <- function(x) {
  res <- c()
  for (s in x) {
    v <- unlist(strsplit(s, "\\W+"))
    for (p in v) {
      k <- match(p, USPS$common_abbrev)
      if (!is.na(k)) {
        s <- with(
          USPS,
          gsub(
            sprintf("\\b%s\\b", common_abbrev[k]),
            usps_abbrev[k],
            s
          )
        )
      }
    }
    res <- append(res, s)
  }
  res
}
  • solution 3

f_TIC3 <- function(x) {
  x.split <- strsplit(x, " ")
  lut <- with(USPS, setNames(usps_abbrev, common_abbrev))
  grp <- rep(seq_along(x.split), lengths(x.split))
  xx <- unlist(x.split)
  r <- lut[xx]
  tapply(
    replace(xx, !is.na(r), na.omit(r)),
    grp,
    function(s) paste0(s, collapse = " ")
  )
}
  • solution 4 (this is for a special case, i.e., abbreviation for the last word only)
f_TIC4 <- function(x) {
  xb <- gsub("^.*\\s+", "", x)
  rp <- with(USPS, usps_abbrev[match(xb, common_abbrev)])
  paste0(gsub("\\w+$", "", x), replace(xb, !is.na(rp), na.omit(rp)))
}
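
For reference, the output below matches what, e.g., solution 1 gives for the toy vector a from the question; a usage sketch:

a <- c("10900 harper ave", "12235 davis annex", "24 van cortland parkway")
f_TIC1(a)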

output

[1] "10900 harper ave"     "12235 davis anx"      "24 van cortland pkwy"
ThomasIsCoding
  • 1
    Hi Thomas. Thanks for motivating me to learn more. Upvoted from me! Now I'm ready for benchmarking :-). So who's doing it, you or me? – Marek Fiołka Nov 02 '21 at 21:15
  • 1
    @MarekFiołka You are welcome! I have done the benchmarking :) – ThomasIsCoding Nov 02 '21 at 21:19
  • 1
    Thomas, you missed my newest `replaceString` function :-( – Marek Fiołka Nov 02 '21 at 21:32
  • 1
    @MarekFiołka Sorry, I just saw it. Then you can add my answer to your benchmarking. – ThomasIsCoding Nov 02 '21 at 21:41
  • 1
    I added benchmarking. Thanks again for motivating me to learn something new! – Marek Fiołka Nov 02 '21 at 21:49
  • 1
    @MarekFiołka I added another option for benchmarking, so you can see the update. One comment to the function `f_MK_replaceString()`: I put all of your helper functions within your function, since it should be treated as a blackbox to test the performance with given data input. I think this makes sense for fairness when benchmarking. – ThomasIsCoding Nov 02 '21 at 23:27
  • @ThomasIsCoding I don't see where you create `USPS`. Is it a data frame? – jvalenti Nov 04 '21 at 15:51
  • 1
    @jvalenti I borrowed data.frame `USPS` from Marek Fiolka's answer, which is the same as `usps_streets` in your post. – ThomasIsCoding Nov 04 '21 at 19:25
  • @ThomasIsCoding this actually worked pretty fast for my purposes, f_TIC1 was what I used. This is a pretty elegant solution and I like that it uses indexing via position--easy to understand. Thanks a lot, I really appreciate it! – jvalenti Nov 05 '21 at 14:14
  • Glad that my answer helps. You are welcome! – ThomasIsCoding Nov 05 '21 at 14:22
5

Update:

I spent some time tweaking my existing answer (below) and I believe it's the fastest method. It's also worth noting that if you add perl = TRUE to the gsub calls in f_JM and f_TIC4 you get a noticeable increase in speed with this example (it may not apply to 'real world' data). There is also a major caveat to my answer: it is predicated on the abbreviated term being the last term in the address (f_TIC1, f_TIC2 and f_TIC3, for example, don't rely on that assumption).

Huge thanks to @Marek and @TIC for the benchmarking code and for the constructive comments:

## Benchmarking with updated f_JM() and TIC4()
library(data.table)
library(tidyverse)

USPS = tibble(
  common_abbrev = c("allee", "alley", "ally", "aly",
                    "anex", "annex", "annx", "anx", "arc", "arcade", "av", "ave",
                    "aven", "avenu", "avenue", "avn", "avnue", "bayoo", "bayou",
                    "bch", "beach", "bend", "bnd", "blf", "bluf", "bluff", "bluffs",
                    "bot", "btm", "bottm", "bottom", "blvd", "boul", "boulevard",
                    "boulv", "br", "brnch", "branch", "brdge", "brg", "bridge", "brk",
                    "brook", "brooks", "burg", "burgs", "byp", "bypa", "bypas", "bypass",
                    "byps", "camp", "cp", "cmp", "canyn", "canyon", "cnyn", "cape",
                    "cpe", "causeway", "causwa", "cswy", "cen", "cent", "center",
                    "centr", "centre", "cnter", "cntr", "ctr", "centers", "cir",
                    "circ", "circl", "circle", "crcl", "crcle", "circles", "clf",
                    "cliff", "clfs", "cliffs", "clb", "club", "common", "commons",
                    "cor", "corner", "corners", "cors", "course", "crse", "court",
                    "ct", "courts", "cts", "cove", "cv", "coves", "creek", "crk",
                    "crescent", "cres", "crsent", "crsnt", "crest", "crossing", "crssng",
                    "xing", "crossroad", "crossroads", "curve", "dale", "dl", "dam",
                    "dm", "div", "divide", "dv", "dvd", "dr", "driv", "drive", "drv",
                    "drives", "est", "estate", "estates", "ests", "exp", "expr",
                    "express", "expressway", "expw", "expy", "ext", "extension",
                    "extn", "extnsn", "exts", "fall", "falls", "fls", "ferry", "frry",
                    "fry", "field", "fld", "fields", "flds", "flat", "flt", "flats",
                    "flts", "ford", "frd", "fords", "forest", "forests", "frst",
                    "forg", "forge", "frg", "forges", "fork", "frk", "forks", "frks",
                    "fort", "frt", "ft", "freeway", "freewy", "frway", "frwy", "fwy",
                    "garden", "gardn", "grden", "grdn", "gardens", "gdns", "grdns",
                    "gateway", "gatewy", "gatway", "gtway", "gtwy", "glen", "gln",
                    "glens", "green", "grn", "greens", "grov", "grove", "grv", "groves",
                    "harb", "harbor", "harbr", "hbr", "hrbor", "harbors", "haven",
                    "hvn", "ht", "hts", "highway", "highwy", "hiway", "hiwy", "hway",
                    "hwy", "hill", "hl", "hills", "hls", "hllw", "hollow", "hollows",
                    "holw", "holws", "inlt", "is", "island", "islnd", "islands",
                    "islnds", "iss", "isle", "isles", "jct", "jction", "jctn", "junction",
                    "junctn", "juncton", "jctns", "jcts", "junctions", "key", "ky",
                    "keys", "kys", "knl", "knol", "knoll", "knls", "knolls", "lk",
                    "lake", "lks", "lakes", "land", "landing", "lndg", "lndng", "lane",
                    "ln", "lgt", "light", "lights", "lf", "loaf", "lck", "lock",
                    "lcks", "locks", "ldg", "ldge", "lodg", "lodge", "loop", "loops",
                    "mall", "mnr", "manor", "manors", "mnrs", "meadow", "mdw", "mdws",
                    "meadows", "medows", "mews", "mill", "mills", "missn", "mssn",
                    "motorway", "mnt", "mt", "mount", "mntain", "mntn", "mountain",
                    "mountin", "mtin", "mtn", "mntns", "mountains", "nck", "neck",
                    "orch", "orchard", "orchrd", "oval", "ovl", "overpass", "park",
                    "prk", "parks", "parkway", "parkwy", "pkway", "pkwy", "pky",
                    "parkways", "pkwys", "pass", "passage", "path", "paths", "pike",
                    "pikes", "pine", "pines", "pnes", "pl", "plain", "pln", "plains",
                    "plns", "plaza", "plz", "plza", "point", "pt", "points", "pts",
                    "port", "prt", "ports", "prts", "pr", "prairie", "prr", "rad",
                    "radial", "radiel", "radl", "ramp", "ranch", "ranches", "rnch",
                    "rnchs", "rapid", "rpd", "rapids", "rpds", "rest", "rst", "rdg",
                    "rdge", "ridge", "rdgs", "ridges", "riv", "river", "rvr", "rivr",
                    "rd", "road", "roads", "rds", "route", "row", "rue", "run", "shl",
                    "shoal", "shls", "shoals", "shoar", "shore", "shr", "shoars",
                    "shores", "shrs", "skyway", "spg", "spng", "spring", "sprng",
                    "spgs", "spngs", "springs", "sprngs", "spur", "spurs", "sq",
                    "sqr", "sqre", "squ", "square", "sqrs", "squares", "sta", "station",
                    "statn", "stn", "stra", "strav", "straven", "stravenue", "stravn",
                    "strvn", "strvnue", "stream", "streme", "strm", "street", "strt",
                    "st", "str", "streets", "smt", "suite", "sumit", "sumitt", "summit",
                    "ter", "terr", "terrace", "throughway", "trace", "traces", "trce",
                    "track", "tracks", "trak", "trk", "trks", "trafficway", "trail",
                    "trails", "trl", "trls", "trailer", "trlr", "trlrs", "tunel",
                    "tunl", "tunls", "tunnel", "tunnels", "tunnl", "trnpk", "turnpike",
                    "turnpk", "underpass", "un", "union", "unions", "valley", "vally",
                    "vlly", "vly", "valleys", "vlys", "vdct", "via", "viadct", "viaduct",
                    "view", "vw", "views", "vws", "vill", "villag", "village", "villg",
                    "villiage", "vlg", "villages", "vlgs", "ville", "vl", "vis",
                    "vist", "vista", "vst", "vsta", "walk", "walks", "wall", "wy",
                    "way", "ways", "well", "wells", "wls"),
  usps_abbrev = c("aly",
                  "aly", "aly", "aly", "anx", "anx", "anx", "anx", "arc", "arc",
                  "ave", "ave", "ave", "ave", "ave", "ave", "ave", "byu", "byu",
                  "bch", "bch", "bnd", "bnd", "blf", "blf", "blf", "blfs", "btm",
                  "btm", "btm", "btm", "blvd", "blvd", "blvd", "blvd", "br", "br",
                  "br", "brg", "brg", "brg", "brk", "brk", "brks", "bg", "bgs",
                  "byp", "byp", "byp", "byp", "byp", "cp", "cp", "cp", "cyn", "cyn",
                  "cyn", "cpe", "cpe", "cswy", "cswy", "cswy", "ctr", "ctr", "ctr",
                  "ctr", "ctr", "ctr", "ctr", "ctr", "ctrs", "cir", "cir", "cir",
                  "cir", "cir", "cir", "cirs", "clf", "clf", "clfs", "clfs", "clb",
                  "clb", "cmn", "cmns", "cor", "cor", "cors", "cors", "crse", "crse",
                  "ct", "ct", "cts", "cts", "cv", "cv", "cvs", "crk", "crk", "cres",
                  "cres", "cres", "cres", "crst", "xing", "xing", "xing", "xrd",
                  "xrds", "curv", "dl", "dl", "dm", "dm", "dv", "dv", "dv", "dv",
                  "dr", "dr", "dr", "dr", "drs", "est", "est", "ests", "ests",
                  "expy", "expy", "expy", "expy", "expy", "expy", "ext", "ext",
                  "ext", "ext", "exts", "fall", "fls", "fls", "fry", "fry", "fry",
                  "fld", "fld", "flds", "flds", "flt", "flt", "flts", "flts", "frd",
                  "frd", "frds", "frst", "frst", "frst", "frg", "frg", "frg", "frgs",
                  "frk", "frk", "frks", "frks", "ft", "ft", "ft", "fwy", "fwy",
                  "fwy", "fwy", "fwy", "gdn", "gdn", "gdn", "gdn", "gdns", "gdns",
                  "gdns", "gtwy", "gtwy", "gtwy", "gtwy", "gtwy", "gln", "gln",
                  "glns", "grn", "grn", "grns", "grv", "grv", "grv", "grvs", "hbr",
                  "hbr", "hbr", "hbr", "hbr", "hbrs", "hvn", "hvn", "hts", "hts",
                  "hwy", "hwy", "hwy", "hwy", "hwy", "hwy", "hl", "hl", "hls",
                  "hls", "holw", "holw", "holw", "holw", "holw", "inlt", "is",
                  "is", "is", "iss", "iss", "iss", "isle", "isle", "jct", "jct",
                  "jct", "jct", "jct", "jct", "jcts", "jcts", "jcts", "ky", "ky",
                  "kys", "kys", "knl", "knl", "knl", "knls", "knls", "lk", "lk",
                  "lks", "lks", "land", "lndg", "lndg", "lndg", "ln", "ln", "lgt",
                  "lgt", "lgts", "lf", "lf", "lck", "lck", "lcks", "lcks", "ldg",
                  "ldg", "ldg", "ldg", "loop", "loop", "mall", "mnr", "mnr", "mnrs",
                  "mnrs", "mdw", "mdws", "mdws", "mdws", "mdws", "mews", "ml",
                  "mls", "msn", "msn", "mtwy", "mt", "mt", "mt", "mtn", "mtn",
                  "mtn", "mtn", "mtn", "mtn", "mtns", "mtns", "nck", "nck", "orch",
                  "orch", "orch", "oval", "oval", "opas", "park", "park", "park",
                  "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pass",
                  "psge", "path", "path", "pike", "pike", "pne", "pnes", "pnes",
                  "pl", "pln", "pln", "plns", "plns", "plz", "plz", "plz", "pt",
                  "pt", "pts", "pts", "prt", "prt", "prts", "prts", "pr", "pr",
                  "pr", "radl", "radl", "radl", "radl", "ramp", "rnch", "rnch",
                  "rnch", "rnch", "rpd", "rpd", "rpds", "rpds", "rst", "rst", "rdg",
                  "rdg", "rdg", "rdgs", "rdgs", "riv", "riv", "riv", "riv", "rd",
                  "rd", "rds", "rds", "rte", "row", "rue", "run", "shl", "shl",
                  "shls", "shls", "shr", "shr", "shr", "shrs", "shrs", "shrs",
                  "skwy", "spg", "spg", "spg", "spg", "spgs", "spgs", "spgs", "spgs",
                  "spur", "spur", "sq", "sq", "sq", "sq", "sq", "sqs", "sqs", "sta",
                  "sta", "sta", "sta", "stra", "stra", "stra", "stra", "stra",
                  "stra", "stra", "strm", "strm", "strm", "st", "st", "st", "st",
                  "sts", "smt", "ste", "smt", "smt", "smt", "ter", "ter", "ter",
                  "trwy", "trce", "trce", "trce", "trak", "trak", "trak", "trak",
                  "trak", "trfy", "trl", "trl", "trl", "trl", "trlr", "trlr", "trlr",
                  "tunl", "tunl", "tunl", "tunl", "tunl", "tunl", "tpke", "tpke",
                  "tpke", "upas", "un", "un", "uns", "vly", "vly", "vly", "vly",
                  "vlys", "vlys", "via", "via", "via", "via", "vw", "vw", "vws",
                  "vws", "vlg", "vlg", "vlg", "vlg", "vlg", "vlg", "vlgs", "vlgs",
                  "vl", "vl", "vis", "vis", "vis", "vis", "vis", "walk", "walk",
                  "wall", "way", "way", "ways", "wl", "wls", "wls"))

randomAddresses = function(n){
  tibble(
    addresses = paste(
      sample(10:10000, n, replace = TRUE),
      sample(c("harper", "davis", "van cortland", "marry", "von brown"), n, replace = TRUE),
      sample(USPS$common_abbrev, n, replace = TRUE)
    )
  )
}

set.seed(1111)
df = randomAddresses(10)

USPS_conv2 = function(x, y) {
  t = str_split(x, " ")
  comm = t[[1]][length(t[[1]])]
  str_replace(x, comm, y[comm])
}
USPS_conv2 = Vectorize(USPS_conv2, "x")

f_MK_conv2 <- function(x, y) {
  x %>% mutate(
    addresses = USPS_conv2(addresses, 
                           array(data = y$usps_abbrev, dimnames = list(y$common_abbrev))))
}
f_MK_conv2(df, USPS)
#> # A tibble: 10 × 1
#>    addresses          
#>    <chr>              
#>  1 8995 davis crk     
#>  2 8527 davis tunl    
#>  3 7663 von brown wall
#>  4 3043 harper lk     
#>  5 9192 von brown gdn 
#>  6 120 marry riv      
#>  7 72 von brown lcks  
#>  8 8752 marry gdn     
#>  9 7754 davis cor     
#> 10 3745 davis jcts


ht.create <- function() new.env()

ht.insert <- function(ht, key, value) ht[[key]] <- value
ht.insert <- Vectorize(ht.insert, c("key", "value"))

ht.lookup <- function(ht, key) ht[[key]]
ht.lookup <- Vectorize(ht.lookup, "key")

ht.delete <- function(ht, key) rm(list = key, envir = ht, inherits = FALSE)
ht.delete <- Vectorize(ht.delete, "key")


f_MK_replaceString <- function(x, y) {
  ht <- ht.create()
  ht.insert(ht, y$common_abbrev, y$usps_abbrev)
  
  txt <- x$addresses
  i <- sapply(strsplit(txt, ""), function(x) max(which(x == " ")))
  txt <- paste0(
    str_sub(txt, end = i),
    ht.lookup(ht, str_sub(txt, start = i + 1))
  )
  x %>% mutate(addresses = txt)
}
f_MK_replaceString(df, USPS)
#> # A tibble: 10 × 1
#>    addresses          
#>    <chr>              
#>  1 8995 davis crk     
#>  2 8527 davis tunl    
#>  3 7663 von brown wall
#>  4 3043 harper lk     
#>  5 9192 von brown gdn 
#>  6 120 marry riv      
#>  7 72 von brown lcks  
#>  8 8752 marry gdn     
#>  9 7754 davis cor     
#> 10 3745 davis jcts

f_TIC1 <- function(x, y) {
  x %>% mutate(addresses = sapply(
    strsplit(x$addresses, " "),
    function(x) {
      with(y, {
        idx <- match(x, common_abbrev)
        paste0(ifelse(is.na(idx), x, usps_abbrev[idx]),
               collapse = " "
        )
      })
    }
  )
  )
}
f_TIC1(df, USPS)
#> # A tibble: 10 × 1
#>    addresses          
#>    <chr>              
#>  1 8995 davis crk     
#>  2 8527 davis tunl    
#>  3 7663 von brown wall
#>  4 3043 harper lk     
#>  5 9192 von brown gdn 
#>  6 120 marry riv      
#>  7 72 von brown lcks  
#>  8 8752 marry gdn     
#>  9 7754 davis cor     
#> 10 3745 davis jcts


f_TIC2 <- function(x, y) {
  res <- c()
  for (s in x$addresses) {
    v <- unlist(strsplit(s, "\\W+"))
    for (p in v) {
      k <- match(p, y$common_abbrev)
      if (!is.na(k)) {
        s <- with(
          y,
          gsub(
            sprintf("\\b%s\\b", common_abbrev[k]),
            usps_abbrev[k],
            s
          )
        )
      }
    }
    res <- append(res, s)
  }
  x %>% mutate(addresses = res)
}
f_TIC2(df, USPS)
#> # A tibble: 10 × 1
#>    addresses          
#>    <chr>              
#>  1 8995 davis crk     
#>  2 8527 davis tunl    
#>  3 7663 von brown wall
#>  4 3043 harper lk     
#>  5 9192 von brown gdn 
#>  6 120 marry riv      
#>  7 72 von brown lcks  
#>  8 8752 marry gdn     
#>  9 7754 davis cor     
#> 10 3745 davis jcts


f_TIC3 <- function(x, y) {
  x.split <- strsplit(x$addresses, " ")                 # split every address into words
  lut <- with(y, setNames(usps_abbrev, common_abbrev))  # named lookup vector
  grp <- rep(seq_along(x.split), lengths(x.split))      # which address each word belongs to
  xx <- unlist(x.split)
  r <- lut[xx]                                          # NA where a word is not a known abbreviation
  x %>% mutate(addresses = tapply(
    replace(xx, !is.na(r), na.omit(r)),                 # swap in the USPS form where matched
    grp,
    function(s) paste0(s, collapse = " ")               # re-assemble each address
  ))
}
f_TIC3(df, USPS)
#> # A tibble: 10 × 1
#>    addresses          
#>    <chr>              
#>  1 8995 davis crk     
#>  2 8527 davis tunl    
#>  3 7663 von brown wall
#>  4 3043 harper lk     
#>  5 9192 von brown gdn 
#>  6 120 marry riv      
#>  7 72 von brown lcks  
#>  8 8752 marry gdn     
#>  9 7754 davis cor     
#> 10 3745 davis jcts

f_TIC4 <- function(x, y) {
  xb <- gsub("^.*\\s+", "", x$addresses, perl = TRUE)    # last word of each address
  rp <- with(y, usps_abbrev[match(xb, common_abbrev)])   # its USPS form (NA if not found)
  # drop the trailing word and paste back either the replacement or the original word
  x %>% mutate(addresses = paste0(gsub("\\w+$", "", x$addresses), replace(xb, !is.na(rp), na.omit(rp))))
}
f_TIC4(df, USPS)
#> # A tibble: 10 × 1
#>    addresses          
#>    <chr>              
#>  1 8995 davis crk     
#>  2 8527 davis tunl    
#>  3 7663 von brown wall
#>  4 3043 harper lk     
#>  5 9192 von brown gdn 
#>  6 120 marry riv      
#>  7 72 von brown lcks  
#>  8 8752 marry gdn     
#>  9 7754 davis cor     
#> 10 3745 davis jcts

f_JM <- function(x, y) {
  x$abbreviation <- gsub("^.* ", "", x$addresses, perl = TRUE)  # last word of each address
  setDT(x)
  setDT(y)
  # data.table update join: overwrite abbreviation with usps_abbrev where a match exists
  x[y, abbreviation := usps_abbrev, on = .(abbreviation = common_abbrev)]
  x$addresses <- paste(str_extract(x$addresses, "^.* "), x$abbreviation, sep = "")  # glue the prefix back on
  x$abbreviation <- NULL
  return(as_tibble(x))
}
f_JM(df, USPS)
#> # A tibble: 10 × 1
#>    addresses          
#>    <chr>              
#>  1 8995 davis crk     
#>  2 8527 davis tunl    
#>  3 7663 von brown wall
#>  4 3043 harper lk     
#>  5 9192 von brown gdn 
#>  6 120 marry riv      
#>  7 72 von brown lcks  
#>  8 8752 marry gdn     
#>  9 7754 davis cor     
#> 10 3745 davis jcts

set.seed(1111)
df = randomAddresses(100)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_conv2(df, USPS),
  f_MK_replaceString(df, USPS),
  f_TIC1(df, USPS),
  f_TIC2(df, USPS),
  f_TIC3(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.

set.seed(1111)
df = randomAddresses(1000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_conv2(df, USPS),
  f_MK_replaceString(df, USPS),
  f_TIC1(df, USPS),
  f_TIC2(df, USPS),
  f_TIC3(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.

set.seed(1111)
df = randomAddresses(10000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_conv2(df, USPS),
  f_MK_replaceString(df, USPS),
  f_TIC1(df, USPS),
  f_TIC2(df, USPS),
  f_TIC3(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.

set.seed(1111)
df = randomAddresses(100000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_replaceString(df, USPS),
  f_TIC3(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.

set.seed(1111)
df = randomAddresses(1000000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_replaceString(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)
#> Coordinate system already present. Adding new coordinate system, which will replace the existing one.

Created on 2021-11-04 by the reprex package (v2.0.1)

Original:

Brilliant answers @Marek and @TIC! After some tweaking and benchmarking I think this data.table 'split/lookup-replace/paste' approach might be faster:

library(tidyverse)
library(data.table)

n=1000000
set.seed(1111)
df = tibble(
  addresses = paste(
    sample(10:10000, n, replace = TRUE),
    sample(c("harper", "davis", "van cortland", "marry", "von brown"), n, replace = TRUE),
    sample(USPS$common_abbrev, n, replace = TRUE)
  )
)
df
#> # A tibble: 1,000,000 × 1
#>    addresses              
#>    <chr>                  
#>  1 8995 marry pass        
#>  2 8527 davis spng        
#>  3 7663 marry loaf        
#>  4 3043 davis common      
#>  5 9192 marry bnd         
#>  6 120 von brown corner   
#>  7 72 van cortland plains 
#>  8 8752 van cortland crcle
#>  9 7754 von brown sqrs    
#> 10 3745 marry key         
#> # … with 999,990 more rows

start_time =Sys.time()
df$abbreviation <- gsub("^.* ", "", df$addresses)
setDT(df)
setDT(USPS)
df[USPS, abbreviation:=usps_abbrev, on=.(abbreviation=common_abbrev)]

df$usps_abbreviation <- paste(str_extract(df$addresses, "^.* "), df$abbreviation, sep = "")
Sys.time()-start_time
#> Time difference of 2.804245 secs
df
#>                    addresses abbreviation usps_abbreviation
#>       1:     8995 marry pass         pass   8995 marry pass
#>       2:     8527 davis spng          spg    8527 davis spg
#>       3:     7663 marry loaf           lf     7663 marry lf
#>       4:   3043 davis common          cmn    3043 davis cmn
#>       5:      9192 marry bnd          bnd    9192 marry bnd
#>      ---                                                   
#>  999996:     1379 marry vdct          via    1379 marry via
#>  999997:    237 harper avnue          ave    237 harper ave
#>  999998:      7592 davis riv          riv    7592 davis riv
#>  999999: 4963 marry junction          jct    4963 marry jct
#> 1000000:     813 harper bluf          blf    813 harper blf

Created on 2021-11-03 by the reprex package (v2.0.1)

Edit

I changed dt_func() to produce the same output as Marek's function (fairer comparison) and it's still super quick:

set.seed(1111)
df <- randomAddresses(10000)

dt_func <- function(x) {
  x$abbreviation <- gsub("^.* ", "", x$addresses)
  setDT(x)
  setDT(USPS)
  x[USPS, abbreviation:=usps_abbrev, on=.(abbreviation=common_abbrev)]
  
  x$addresses <- paste(str_extract(x$addresses, "^.* "), x$abbreviation, sep = "")
  x$abbreviation <- NULL
  return(as_tibble(x))
}

[Benchmark plot: benchmark2.png]

Compare output:

df2 <- f_MK_replaceString(df, USPS)
df3 <- dt_func(df)
dplyr::all_equal(df2, df3)
#> [1] TRUE
jared_mamrot
  • data.table left-join is a good attempt, upvoted. However, I think you assumed only the last word might be abbreviated, whereas in the general case every word could need replacement (see the sketch after these comments). – ThomasIsCoding Nov 03 '21 at 08:22
  • Unfortunately, this test is not entirely fair. Why? Because the function `f_MK_replaceString` contains a lot of code that must have slowed it down. Note that in each `f_MK_replaceString` call, four internals `ht.create`, `ht.insert`, `ht.lookup`, `ht.delete` are created, three of which are additionally vectorized. Then the internal function `addHashTable2` is created and run to prepare the dictionary. – Marek Fiołka Nov 03 '21 at 08:27
  • That alone should disqualify this function from any tests. You only create a dictionary once! Additional checks are performed in the next section to validate the data; this, of course, should not be counted in any benchmark at all. In any case, none of the other functions being evaluated has even a single line of code to validate the input. Unfortunately, I am at work for the next 7 hours and will not have time to prepare a fully fair function suitable for comparison, but I will try to sort it out tonight. – Marek Fiołka Nov 03 '21 at 08:27
  • Thanks @ThomasIsCoding! I wasn't sure if that was a fair assumption to make. I saw a comment by Marek along the same lines ("My converter function assumes that the abbreviation is the last word in the address.") but it's a very good point that it may not be the case in 'real world' data, and then the 'left-join' approach will fail completely. Thanks for the constructive feedback – jared_mamrot Nov 03 '21 at 09:46
  • Hi @MarekFiołka, thank you for clarifying! That makes sense, and that's why I left you a comment regarding my implementation of your function in my benchmarking ("I'm concerned I might not have implemented your code correctly") - thanks again for your highly-detailed answer and for taking the time to clarify/comment :) – jared_mamrot Nov 03 '21 at 10:01
  • @jared_mamrot If we only care about the last word for the replacement, I think your data.table approach is really efficient (I personally love data.table solutions), cheers! – ThomasIsCoding Nov 03 '21 at 10:02
  • I added another base R solution that processes the abbreviation of the last word and updated the benchmark as well :) – ThomasIsCoding Nov 03 '21 at 10:28
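
To make the point in these comments concrete, here is a minimal sketch (the address "circle 123 harper" is invented) contrasting the word-by-word lookup in f_TIC1 with the last-word join in f_JM when the abbreviation is not the final word:

one_off <- tibble(addresses = "circle 123 harper")

f_TIC1(one_off, USPS)  # word-by-word lookup:  "cir 123 harper"
f_JM(one_off, USPS)    # last-word join only:  "circle 123 harper" (left unchanged)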

Especially for @jvalenti

This is a special answer where you will find modified functions and all the code needed for your task.

library(tidyverse)

USPS = tibble(
  common_abbrev = c("allee", "alley", "ally", "aly",
                    "anex", "annex", "annx", "anx", "arc", "arcade", "av", "ave",
                    "aven", "avenu", "avenue", "avn", "avnue", "bayoo", "bayou",
                    "bch", "beach", "bend", "bnd", "blf", "bluf", "bluff", "bluffs",
                    "bot", "btm", "bottm", "bottom", "blvd", "boul", "boulevard",
                    "boulv", "br", "brnch", "branch", "brdge", "brg", "bridge", "brk",
                    "brook", "brooks", "burg", "burgs", "byp", "bypa", "bypas", "bypass",
                    "byps", "camp", "cp", "cmp", "canyn", "canyon", "cnyn", "cape",
                    "cpe", "causeway", "causwa", "cswy", "cen", "cent", "center",
                    "centr", "centre", "cnter", "cntr", "ctr", "centers", "cir",
                    "circ", "circl", "circle", "crcl", "crcle", "circles", "clf",
                    "cliff", "clfs", "cliffs", "clb", "club", "common", "commons",
                    "cor", "corner", "corners", "cors", "course", "crse", "court",
                    "ct", "courts", "cts", "cove", "cv", "coves", "creek", "crk",
                    "crescent", "cres", "crsent", "crsnt", "crest", "crossing", "crssng",
                    "xing", "crossroad", "crossroads", "curve", "dale", "dl", "dam",
                    "dm", "div", "divide", "dv", "dvd", "dr", "driv", "drive", "drv",
                    "drives", "est", "estate", "estates", "ests", "exp", "expr",
                    "express", "expressway", "expw", "expy", "ext", "extension",
                    "extn", "extnsn", "exts", "fall", "falls", "fls", "ferry", "frry",
                    "fry", "field", "fld", "fields", "flds", "flat", "flt", "flats",
                    "flts", "ford", "frd", "fords", "forest", "forests", "frst",
                    "forg", "forge", "frg", "forges", "fork", "frk", "forks", "frks",
                    "fort", "frt", "ft", "freeway", "freewy", "frway", "frwy", "fwy",
                    "garden", "gardn", "grden", "grdn", "gardens", "gdns", "grdns",
                    "gateway", "gatewy", "gatway", "gtway", "gtwy", "glen", "gln",
                    "glens", "green", "grn", "greens", "grov", "grove", "grv", "groves",
                    "harb", "harbor", "harbr", "hbr", "hrbor", "harbors", "haven",
                    "hvn", "ht", "hts", "highway", "highwy", "hiway", "hiwy", "hway",
                    "hwy", "hill", "hl", "hills", "hls", "hllw", "hollow", "hollows",
                    "holw", "holws", "inlt", "is", "island", "islnd", "islands",
                    "islnds", "iss", "isle", "isles", "jct", "jction", "jctn", "junction",
                    "junctn", "juncton", "jctns", "jcts", "junctions", "key", "ky",
                    "keys", "kys", "knl", "knol", "knoll", "knls", "knolls", "lk",
                    "lake", "lks", "lakes", "land", "landing", "lndg", "lndng", "lane",
                    "ln", "lgt", "light", "lights", "lf", "loaf", "lck", "lock",
                    "lcks", "locks", "ldg", "ldge", "lodg", "lodge", "loop", "loops",
                    "mall", "mnr", "manor", "manors", "mnrs", "meadow", "mdw", "mdws",
                    "meadows", "medows", "mews", "mill", "mills", "missn", "mssn",
                    "motorway", "mnt", "mt", "mount", "mntain", "mntn", "mountain",
                    "mountin", "mtin", "mtn", "mntns", "mountains", "nck", "neck",
                    "orch", "orchard", "orchrd", "oval", "ovl", "overpass", "park",
                    "prk", "parks", "parkway", "parkwy", "pkway", "pkwy", "pky",
                    "parkways", "pkwys", "pass", "passage", "path", "paths", "pike",
                    "pikes", "pine", "pines", "pnes", "pl", "plain", "pln", "plains",
                    "plns", "plaza", "plz", "plza", "point", "pt", "points", "pts",
                    "port", "prt", "ports", "prts", "pr", "prairie", "prr", "rad",
                    "radial", "radiel", "radl", "ramp", "ranch", "ranches", "rnch",
                    "rnchs", "rapid", "rpd", "rapids", "rpds", "rest", "rst", "rdg",
                    "rdge", "ridge", "rdgs", "ridges", "riv", "river", "rvr", "rivr",
                    "rd", "road", "roads", "rds", "route", "row", "rue", "run", "shl",
                    "shoal", "shls", "shoals", "shoar", "shore", "shr", "shoars",
                    "shores", "shrs", "skyway", "spg", "spng", "spring", "sprng",
                    "spgs", "spngs", "springs", "sprngs", "spur", "spurs", "sq",
                    "sqr", "sqre", "squ", "square", "sqrs", "squares", "sta", "station",
                    "statn", "stn", "stra", "strav", "straven", "stravenue", "stravn",
                    "strvn", "strvnue", "stream", "streme", "strm", "street", "strt",
                    "st", "str", "streets", "smt", "suite", "sumit", "sumitt", "summit",
                    "ter", "terr", "terrace", "throughway", "trace", "traces", "trce",
                    "track", "tracks", "trak", "trk", "trks", "trafficway", "trail",
                    "trails", "trl", "trls", "trailer", "trlr", "trlrs", "tunel",
                    "tunl", "tunls", "tunnel", "tunnels", "tunnl", "trnpk", "turnpike",
                    "turnpk", "underpass", "un", "union", "unions", "valley", "vally",
                    "vlly", "vly", "valleys", "vlys", "vdct", "via", "viadct", "viaduct",
                    "view", "vw", "views", "vws", "vill", "villag", "village", "villg",
                    "villiage", "vlg", "villages", "vlgs", "ville", "vl", "vis",
                    "vist", "vista", "vst", "vsta", "walk", "walks", "wall", "wy",
                    "way", "ways", "well", "wells", "wls"),
  usps_abbrev = c("aly",
                  "aly", "aly", "aly", "anx", "anx", "anx", "anx", "arc", "arc",
                  "ave", "ave", "ave", "ave", "ave", "ave", "ave", "byu", "byu",
                  "bch", "bch", "bnd", "bnd", "blf", "blf", "blf", "blfs", "btm",
                  "btm", "btm", "btm", "blvd", "blvd", "blvd", "blvd", "br", "br",
                  "br", "brg", "brg", "brg", "brk", "brk", "brks", "bg", "bgs",
                  "byp", "byp", "byp", "byp", "byp", "cp", "cp", "cp", "cyn", "cyn",
                  "cyn", "cpe", "cpe", "cswy", "cswy", "cswy", "ctr", "ctr", "ctr",
                  "ctr", "ctr", "ctr", "ctr", "ctr", "ctrs", "cir", "cir", "cir",
                  "cir", "cir", "cir", "cirs", "clf", "clf", "clfs", "clfs", "clb",
                  "clb", "cmn", "cmns", "cor", "cor", "cors", "cors", "crse", "crse",
                  "ct", "ct", "cts", "cts", "cv", "cv", "cvs", "crk", "crk", "cres",
                  "cres", "cres", "cres", "crst", "xing", "xing", "xing", "xrd",
                  "xrds", "curv", "dl", "dl", "dm", "dm", "dv", "dv", "dv", "dv",
                  "dr", "dr", "dr", "dr", "drs", "est", "est", "ests", "ests",
                  "expy", "expy", "expy", "expy", "expy", "expy", "ext", "ext",
                  "ext", "ext", "exts", "fall", "fls", "fls", "fry", "fry", "fry",
                  "fld", "fld", "flds", "flds", "flt", "flt", "flts", "flts", "frd",
                  "frd", "frds", "frst", "frst", "frst", "frg", "frg", "frg", "frgs",
                  "frk", "frk", "frks", "frks", "ft", "ft", "ft", "fwy", "fwy",
                  "fwy", "fwy", "fwy", "gdn", "gdn", "gdn", "gdn", "gdns", "gdns",
                  "gdns", "gtwy", "gtwy", "gtwy", "gtwy", "gtwy", "gln", "gln",
                  "glns", "grn", "grn", "grns", "grv", "grv", "grv", "grvs", "hbr",
                  "hbr", "hbr", "hbr", "hbr", "hbrs", "hvn", "hvn", "hts", "hts",
                  "hwy", "hwy", "hwy", "hwy", "hwy", "hwy", "hl", "hl", "hls",
                  "hls", "holw", "holw", "holw", "holw", "holw", "inlt", "is",
                  "is", "is", "iss", "iss", "iss", "isle", "isle", "jct", "jct",
                  "jct", "jct", "jct", "jct", "jcts", "jcts", "jcts", "ky", "ky",
                  "kys", "kys", "knl", "knl", "knl", "knls", "knls", "lk", "lk",
                  "lks", "lks", "land", "lndg", "lndg", "lndg", "ln", "ln", "lgt",
                  "lgt", "lgts", "lf", "lf", "lck", "lck", "lcks", "lcks", "ldg",
                  "ldg", "ldg", "ldg", "loop", "loop", "mall", "mnr", "mnr", "mnrs",
                  "mnrs", "mdw", "mdws", "mdws", "mdws", "mdws", "mews", "ml",
                  "mls", "msn", "msn", "mtwy", "mt", "mt", "mt", "mtn", "mtn",
                  "mtn", "mtn", "mtn", "mtn", "mtns", "mtns", "nck", "nck", "orch",
                  "orch", "orch", "oval", "oval", "opas", "park", "park", "park",
                  "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pass",
                  "psge", "path", "path", "pike", "pike", "pne", "pnes", "pnes",
                  "pl", "pln", "pln", "plns", "plns", "plz", "plz", "plz", "pt",
                  "pt", "pts", "pts", "prt", "prt", "prts", "prts", "pr", "pr",
                  "pr", "radl", "radl", "radl", "radl", "ramp", "rnch", "rnch",
                  "rnch", "rnch", "rpd", "rpd", "rpds", "rpds", "rst", "rst", "rdg",
                  "rdg", "rdg", "rdgs", "rdgs", "riv", "riv", "riv", "riv", "rd",
                  "rd", "rds", "rds", "rte", "row", "rue", "run", "shl", "shl",
                  "shls", "shls", "shr", "shr", "shr", "shrs", "shrs", "shrs",
                  "skwy", "spg", "spg", "spg", "spg", "spgs", "spgs", "spgs", "spgs",
                  "spur", "spur", "sq", "sq", "sq", "sq", "sq", "sqs", "sqs", "sta",
                  "sta", "sta", "sta", "stra", "stra", "stra", "stra", "stra",
                  "stra", "stra", "strm", "strm", "strm", "st", "st", "st", "st",
                  "sts", "smt", "ste", "smt", "smt", "smt", "ter", "ter", "ter",
                  "trwy", "trce", "trce", "trce", "trak", "trak", "trak", "trak",
                  "trak", "trfy", "trl", "trl", "trl", "trl", "trlr", "trlr", "trlr",
                  "tunl", "tunl", "tunl", "tunl", "tunl", "tunl", "tpke", "tpke",
                  "tpke", "upas", "un", "un", "uns", "vly", "vly", "vly", "vly",
                  "vlys", "vlys", "via", "via", "via", "via", "vw", "vw", "vws",
                  "vws", "vlg", "vlg", "vlg", "vlg", "vlg", "vlg", "vlgs", "vlgs",
                  "vl", "vl", "vis", "vis", "vis", "vis", "vis", "walk", "walk",
                  "wall", "way", "way", "ways", "wl", "wls", "wls"))


randomAddresses = function(n){
  tibble(
    addresses = 
      replicate(
        n, 
        sample(c(sample(10:10000, 1, replace = TRUE) %>% paste0,
                 sample(c("harper", "davis", "van cortland", "marry", "von brown"), 1),
                 sample(USPS$common_abbrev, 1)), 3) %>% paste(collapse = " ")
      )
  )
}

ht.create <- function() new.env()

ht.insert <- function(ht, key, value) ht[[key]] <- value
ht.insert <- Vectorize(ht.insert, c("key", "value"))

ht.lookup <- function(ht, key) ht[[key]]
ht.lookup <- Vectorize(ht.lookup, "key")

addHashTable2 = function(.x, .y, key, value){
  key = enquo(key)
  value = enquo(value)
  
  if (!all(c(as_label(key), as_label(value)) %in% names(.y))) {
    stop(paste0("`.y` must contain `", as_label(key),
                "` and `", as_label(value), "` columns"))
  }
  
  if((.y %>% distinct(!!key, !!value) %>% nrow)!=
     (.y %>% distinct(!!key) %>% nrow)){
    warning(paste0(
      "\nThe number of unique values of the ", as_label(key),
      " variable is different\n",
      " from the number of unique values of the ",
      as_label(key), " and ", as_label(value)," pairs!\n",
      "The dictionary will only return the last values for a given key!"))
  }
  
  ht = ht.create()
  ht %>% ht.insert(.y %>% distinct(!!key, !!value) %>% pull(!!key),
                   .y %>% distinct(!!key, !!value) %>% pull(!!value))
  attr(.x, "hashTab") = ht
  .x
}

replaceString = function(.data, value){
  value = enquo(value)
  
  #Test whether the value variable is in .data
  if(!(as_label(value) %in% names(.data))){
    stop(paste("The", as_label(value),
               "variable does not exist in the .data table!"))
  }
  
  #Dictionary attribute presence test
  if(!("hashTab" %in% names(attributes(.data)))) {
    stop(paste0(
      "\nThere is no dictionary attribute in the .data table!\n",
      "Use addHashTable or addHashTable2 to add a dictionary attribute."))
  }
  
  ht = attr(.data, "hashTab")
  txtRep = function(txt){
    txt = str_split(txt, " ")[[1]]
    httxt = ht.lookup(ht, txt)
    txt[httxt!="NULL"] = httxt[httxt!="NULL"]
    paste(txt, collapse = " ")
  }
  .data %>% rowwise(!!value) %>%  
    mutate(!!value := txtRep(!!value))
}

The replaceString function has been modified so that it replaces abbreviations regardless of where they appear in the address. See below for how to use it.

set.seed(1111)
df=randomAddresses(10)
df

output

# A tibble: 10 x 1
   addresses             
   <chr>                 
 1 marry wall 8995       
 2 cen 9192 marry        
 3 bayoo 3745 davis      
 4 marry hollows 4104    
 5 grdn 7162 marry       
 6 lck harper 1211       
 7 9405 van cortland knol
 8 7984 von brown viadct 
 9 4365 von brown rue    
10 6399 von brown mssn 

Now we're going to modify this tibble.

df %>% addHashTable2(USPS, common_abbrev, usps_abbrev) %>% 
  replaceString(addresses)

output

# A tibble: 10 x 1
# Rowwise:  addresses
   addresses            
   <chr>                
 1 marry wall 8995      
 2 ctr 9192 marry       
 3 byu 3745 davis       
 4 marry holw 4104      
 5 gdn 7162 marry       
 6 lck harper 1211      
 7 9405 van cortland knl
 8 7984 von brown via   
 9 4365 von brown rue   
10 6399 von brown msn  

Good luck and fast mutations of big data!!

Marek Fiołka

Latest update for all interested

I am writing an additional answer because my original answer could no longer hold this much text and code.

Dear colleagues, below I have collected all the functions created here into one collective code block, so that anyone who wants to try it out does not have to piece it together from several answers.

First of all, I unified all the functions so that each accepts two arguments and returns a modified tibble. I also moved all internal helper functions outside of the processing functions.

Finally, I performed benchmarks for tables with 100, 1,000, 10,000, 100,000 and 1,000,000 rows.

Here is all the code

library(tidyverse)
library(data.table)

USPS = tibble(
 common_abbrev = c("allee", "alley", "ally", "aly",
 "anex", "annex", "annx", "anx", "arc", "arcade", "av", "ave",
 "aven", "avenu", "avenue", "avn", "avnue", "bayoo", "bayou",
 "bch", "beach", "bend", "bnd", "blf", "bluf", "bluff", "bluffs",
 "bot", "btm", "bottm", "bottom", "blvd", "boul", "boulevard",
 "boulv", "br", "brnch", "branch", "brdge", "brg", "bridge", "brk",
 "brook", "brooks", "burg", "burgs", "byp", "bypa", "bypas", "bypass",
 "byps", "camp", "cp", "cmp", "canyn", "canyon", "cnyn", "cape",
 "cpe", "causeway", "causwa", "cswy", "cen", "cent", "center",
 "centr", "centre", "cnter", "cntr", "ctr", "centers", "cir",
 "circ", "circl", "circle", "crcl", "crcle", "circles", "clf",
 "cliff", "clfs", "cliffs", "clb", "club", "common", "commons",
 "cor", "corner", "corners", "cors", "course", "crse", "court",
 "ct", "courts", "cts", "cove", "cv", "coves", "creek", "crk",
 "crescent", "cres", "crsent", "crsnt", "crest", "crossing", "crssng",
 "xing", "crossroad", "crossroads", "curve", "dale", "dl", "dam",
 "dm", "div", "divide", "dv", "dvd", "dr", "driv", "drive", "drv",
 "drives", "est", "estate", "estates", "ests", "exp", "expr",
 "express", "expressway", "expw", "expy", "ext", "extension",
 "extn", "extnsn", "exts", "fall", "falls", "fls", "ferry", "frry",
 "fry", "field", "fld", "fields", "flds", "flat", "flt", "flats",
 "flts", "ford", "frd", "fords", "forest", "forests", "frst",
 "forg", "forge", "frg", "forges", "fork", "frk", "forks", "frks",
 "fort", "frt", "ft", "freeway", "freewy", "frway", "frwy", "fwy",
 "garden", "gardn", "grden", "grdn", "gardens", "gdns", "grdns",
 "gateway", "gatewy", "gatway", "gtway", "gtwy", "glen", "gln",
 "glens", "green", "grn", "greens", "grov", "grove", "grv", "groves",
 "harb", "harbor", "harbr", "hbr", "hrbor", "harbors", "haven",
 "hvn", "ht", "hts", "highway", "highwy", "hiway", "hiwy", "hway",
 "hwy", "hill", "hl", "hills", "hls", "hllw", "hollow", "hollows",
 "holw", "holws", "inlt", "is", "island", "islnd", "islands",
 "islnds", "iss", "isle", "isles", "jct", "jction", "jctn", "junction",
 "junctn", "juncton", "jctns", "jcts", "junctions", "key", "ky",
 "keys", "kys", "knl", "knol", "knoll", "knls", "knolls", "lk",
 "lake", "lks", "lakes", "land", "landing", "lndg", "lndng", "lane",
 "ln", "lgt", "light", "lights", "lf", "loaf", "lck", "lock",
 "lcks", "locks", "ldg", "ldge", "lodg", "lodge", "loop", "loops",
 "mall", "mnr", "manor", "manors", "mnrs", "meadow", "mdw", "mdws",
 "meadows", "medows", "mews", "mill", "mills", "missn", "mssn",
 "motorway", "mnt", "mt", "mount", "mntain", "mntn", "mountain",
 "mountin", "mtin", "mtn", "mntns", "mountains", "nck", "neck",
 "orch", "orchard", "orchrd", "oval", "ovl", "overpass", "park",
 "prk", "parks", "parkway", "parkwy", "pkway", "pkwy", "pky",
 "parkways", "pkwys", "pass", "passage", "path", "paths", "pike",
 "pikes", "pine", "pines", "pnes", "pl", "plain", "pln", "plains",
 "plns", "plaza", "plz", "plza", "point", "pt", "points", "pts",
 "port", "prt", "ports", "prts", "pr", "prairie", "prr", "rad",
 "radial", "radiel", "radl", "ramp", "ranch", "ranches", "rnch",
 "rnchs", "rapid", "rpd", "rapids", "rpds", "rest", "rst", "rdg",
 "rdge", "ridge", "rdgs", "ridges", "riv", "river", "rvr", "rivr",
 "rd", "road", "roads", "rds", "route", "row", "rue", "run", "shl",
 "shoal", "shls", "shoals", "shoar", "shore", "shr", "shoars",
 "shores", "shrs", "skyway", "spg", "spng", "spring", "sprng",
 "spgs", "spngs", "springs", "sprngs", "spur", "spurs", "sq",
 "sqr", "sqre", "squ", "square", "sqrs", "squares", "sta", "station",
 "statn", "stn", "stra", "strav", "straven", "stravenue", "stravn",
 "strvn", "strvnue", "stream", "streme", "strm", "street", "strt",
 "st", "str", "streets", "smt", "suite", "sumit", "sumitt", "summit",
 "ter", "terr", "terrace", "throughway", "trace", "traces", "trce",
 "track", "tracks", "trak", "trk", "trks", "trafficway", "trail",
 "trails", "trl", "trls", "trailer", "trlr", "trlrs", "tunel",
 "tunl", "tunls", "tunnel", "tunnels", "tunnl", "trnpk", "turnpike",
 "turnpk", "underpass", "un", "union", "unions", "valley", "vally",
 "vlly", "vly", "valleys", "vlys", "vdct", "via", "viadct", "viaduct",
 "view", "vw", "views", "vws", "vill", "villag", "village", "villg",
 "villiage", "vlg", "villages", "vlgs", "ville", "vl", "vis",
 "vist", "vista", "vst", "vsta", "walk", "walks", "wall", "wy",
 "way", "ways", "well", "wells", "wls"),
 usps_abbrev = c("aly",
 "aly", "aly", "aly", "anx", "anx", "anx", "anx", "arc", "arc",
 "ave", "ave", "ave", "ave", "ave", "ave", "ave", "byu", "byu",
 "bch", "bch", "bnd", "bnd", "blf", "blf", "blf", "blfs", "btm",
 "btm", "btm", "btm", "blvd", "blvd", "blvd", "blvd", "br", "br",
 "br", "brg", "brg", "brg", "brk", "brk", "brks", "bg", "bgs",
 "byp", "byp", "byp", "byp", "byp", "cp", "cp", "cp", "cyn", "cyn",
 "cyn", "cpe", "cpe", "cswy", "cswy", "cswy", "ctr", "ctr", "ctr",
 "ctr", "ctr", "ctr", "ctr", "ctr", "ctrs", "cir", "cir", "cir",
 "cir", "cir", "cir", "cirs", "clf", "clf", "clfs", "clfs", "clb",
 "clb", "cmn", "cmns", "cor", "cor", "cors", "cors", "crse", "crse",
 "ct", "ct", "cts", "cts", "cv", "cv", "cvs", "crk", "crk", "cres",
 "cres", "cres", "cres", "crst", "xing", "xing", "xing", "xrd",
 "xrds", "curv", "dl", "dl", "dm", "dm", "dv", "dv", "dv", "dv",
 "dr", "dr", "dr", "dr", "drs", "est", "est", "ests", "ests",
 "expy", "expy", "expy", "expy", "expy", "expy", "ext", "ext",
 "ext", "ext", "exts", "fall", "fls", "fls", "fry", "fry", "fry",
 "fld", "fld", "flds", "flds", "flt", "flt", "flts", "flts", "frd",
 "frd", "frds", "frst", "frst", "frst", "frg", "frg", "frg", "frgs",
 "frk", "frk", "frks", "frks", "ft", "ft", "ft", "fwy", "fwy",
 "fwy", "fwy", "fwy", "gdn", "gdn", "gdn", "gdn", "gdns", "gdns",
 "gdns", "gtwy", "gtwy", "gtwy", "gtwy", "gtwy", "gln", "gln",
 "glns", "grn", "grn", "grns", "grv", "grv", "grv", "grvs", "hbr",
 "hbr", "hbr", "hbr", "hbr", "hbrs", "hvn", "hvn", "hts", "hts",
 "hwy", "hwy", "hwy", "hwy", "hwy", "hwy", "hl", "hl", "hls",
 "hls", "holw", "holw", "holw", "holw", "holw", "inlt", "is",
 "is", "is", "iss", "iss", "iss", "isle", "isle", "jct", "jct",
 "jct", "jct", "jct", "jct", "jcts", "jcts", "jcts", "ky", "ky",
 "kys", "kys", "knl", "knl", "knl", "knls", "knls", "lk", "lk",
 "lks", "lks", "land", "lndg", "lndg", "lndg", "ln", "ln", "lgt",
 "lgt", "lgts", "lf", "lf", "lck", "lck", "lcks", "lcks", "ldg",
 "ldg", "ldg", "ldg", "loop", "loop", "mall", "mnr", "mnr", "mnrs",
 "mnrs", "mdw", "mdws", "mdws", "mdws", "mdws", "mews", "ml",
 "mls", "msn", "msn", "mtwy", "mt", "mt", "mt", "mtn", "mtn",
 "mtn", "mtn", "mtn", "mtn", "mtns", "mtns", "nck", "nck", "orch",
 "orch", "orch", "oval", "oval", "opas", "park", "park", "park",
 "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pkwy", "pass",
 "psge", "path", "path", "pike", "pike", "pne", "pnes", "pnes",
 "pl", "pln", "pln", "plns", "plns", "plz", "plz", "plz", "pt",
 "pt", "pts", "pts", "prt", "prt", "prts", "prts", "pr", "pr",
 "pr", "radl", "radl", "radl", "radl", "ramp", "rnch", "rnch",
 "rnch", "rnch", "rpd", "rpd", "rpds", "rpds", "rst", "rst", "rdg",
 "rdg", "rdg", "rdgs", "rdgs", "riv", "riv", "riv", "riv", "rd",
 "rd", "rds", "rds", "rte", "row", "rue", "run", "shl", "shl",
 "shls", "shls", "shr", "shr", "shr", "shrs", "shrs", "shrs",
 "skwy", "spg", "spg", "spg", "spg", "spgs", "spgs", "spgs", "spgs",
 "spur", "spur", "sq", "sq", "sq", "sq", "sq", "sqs", "sqs", "sta",
 "sta", "sta", "sta", "stra", "stra", "stra", "stra", "stra",
 "stra", "stra", "strm", "strm", "strm", "st", "st", "st", "st",
 "sts", "smt", "ste", "smt", "smt", "smt", "ter", "ter", "ter",
 "trwy", "trce", "trce", "trce", "trak", "trak", "trak", "trak",
 "trak", "trfy", "trl", "trl", "trl", "trl", "trlr", "trlr", "trlr",
 "tunl", "tunl", "tunl", "tunl", "tunl", "tunl", "tpke", "tpke",
 "tpke", "upas", "un", "un", "uns", "vly", "vly", "vly", "vly",
 "vlys", "vlys", "via", "via", "via", "via", "vw", "vw", "vws",
 "vws", "vlg", "vlg", "vlg", "vlg", "vlg", "vlg", "vlgs", "vlgs",
 "vl", "vl", "vis", "vis", "vis", "vis", "vis", "walk", "walk",
 "wall", "way", "way", "ways", "wl", "wls", "wls"))

randomAddresses = function(n){
  tibble(
    addresses = paste(
      sample(10:10000, n, replace = TRUE),
      sample(c("harper", "davis", "van cortland", "marry", "von brown"), n, replace = TRUE),
      sample(USPS$common_abbrev, n, replace = TRUE)
    )
  )
}

set.seed(1111)
df = randomAddresses(10)

USPS_conv2 = function(x, y) {
  t = str_split(x, " ")
  comm = t[[1]][length(t[[1]])]
  str_replace(x, comm, y[comm])
}
USPS_conv2 = Vectorize(USPS_conv2, "x")

f_MK_conv2 <- function(x, y) {
  x %>% mutate(
    addresses = USPS_conv2(addresses, 
      array(data = y$usps_abbrev, dimnames = list(y$common_abbrev))))
}
f_MK_conv2(df, USPS)


ht.create <- function() new.env()

ht.insert <- function(ht, key, value) ht[[key]] <- value
ht.insert <- Vectorize(ht.insert, c("key", "value"))

ht.lookup <- function(ht, key) ht[[key]]
ht.lookup <- Vectorize(ht.lookup, "key")

ht.delete <- function(ht, key) rm(list = key, envir = ht, inherits = FALSE)
ht.delete <- Vectorize(ht.delete, "key")


f_MK_replaceString <- function(x, y) {
  ht <- ht.create()
  ht.insert(ht, y$common_abbrev, y$usps_abbrev)

  txt <- x$addresses
  i <- sapply(strsplit(txt, ""), function(x) max(which(x == " ")))
  txt <- paste0(
    str_sub(txt, end = i),
    ht.lookup(ht, str_sub(txt, start = i + 1))
  )
  x %>% mutate(addresses = txt)
}
f_MK_replaceString(df, USPS)

f_TIC1 <- function(x, y) {
  x %>% mutate(addresses = sapply(
    strsplit(x$addresses, " "),
    function(x) {
      with(y, {
        idx <- match(x, common_abbrev)
        paste0(ifelse(is.na(idx), x, usps_abbrev[idx]),
               collapse = " "
        )
      })
    }
  )
  )
}
f_TIC1(df, USPS)


f_TIC2 <- function(x, y) {
  res <- c()
  for (s in x$addresses) {
    v <- unlist(strsplit(s, "\\W+"))
    for (p in v) {
      k <- match(p, y$common_abbrev)
      if (!is.na(k)) {
        s <- with(
          y,
          gsub(
            sprintf("\\b%s\\b", common_abbrev[k]),
            usps_abbrev[k],
            s
          )
        )
      }
    }
    res <- append(res, s)
  }
  x %>% mutate(addresses = res)
}
f_TIC2(df, USPS)


f_TIC3 <- function(x, y) {
  x.split <- strsplit(x$addresses, " ")
  lut <- with(y, setNames(usps_abbrev, common_abbrev))
  grp <- rep(seq_along(x.split), lengths(x.split))
  xx <- unlist(x.split)
  r <- lut[xx]
  x %>% mutate(addresses = tapply(
    replace(xx, !is.na(r), na.omit(r)),
    grp,
    function(s) paste0(s, collapse = " ")
  ))
}
f_TIC3(df, USPS)

f_TIC4 <- function(x, y) {
  xb <- gsub("^.*\\s+", "", x$addresses)
  rp <- with(y, usps_abbrev[match(xb, common_abbrev)])
  x %>% mutate(addresses = paste0(gsub("\\w+$", "", x$addresses), replace(xb, !is.na(rp), na.omit(rp))))
}
f_TIC4(df, USPS)

f_JM <- function(x, y) {
  x$abbreviation <- gsub("^.* ", "", x$addresses)
  setDT(x)
  setDT(y)
  x[y, abbreviation := usps_abbrev, on = .(abbreviation = common_abbrev)]
  
  x$addresses <- paste(str_extract(x$addresses, "^.* "), x$abbreviation, sep = "")
  x$abbreviation <- NULL
  return(as_tibble(x))
}
f_JM(df, USPS)

set.seed(1111)
df = randomAddresses(100)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_conv2(df, USPS),
  f_MK_replaceString(df, USPS),
  f_TIC1(df, USPS),
  f_TIC2(df, USPS),
  f_TIC3(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)

set.seed(1111)
df = randomAddresses(1000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_conv2(df, USPS),
  f_MK_replaceString(df, USPS),
  f_TIC1(df, USPS),
  f_TIC2(df, USPS),
  f_TIC3(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)

set.seed(1111)
df = randomAddresses(10000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_conv2(df, USPS),
  f_MK_replaceString(df, USPS),
  f_TIC1(df, USPS),
  f_TIC2(df, USPS),
  f_TIC3(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)

set.seed(1111)
df = randomAddresses(100000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_replaceString(df, USPS),
  f_TIC3(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)

set.seed(1111)
df = randomAddresses(1000000)

library(microbenchmark)
mb1 <- microbenchmark(
  f_MK_replaceString(df, USPS),
  f_TIC4(df, USPS),
  f_JM(df, USPS),
  times = 20L
)
ggplot2::autoplot(mb1)

And now the results in the form of charts:

[Benchmark plot: n = 100]

[Benchmark plot: n = 1,000]

[Benchmark plot: n = 10,000]

[Benchmark plot: n = 100,000]

[Benchmark plot: n = 1,000,000]

Time for conclusions and summary

@jared_mamrot - you are absolutely right. data.table is amazing!!

@ThomasIsCoding - bravo for f_TIC4. Its simplicity is beautiful!!

@AnyoneWhoComesBy - congratulations if you've read this to the end. I believe that you, too, could learn a lot here!!

Marek Fiołka
  • Thanks @Marek! There is a lot to learn here! I don't understand your 'magic' but I'm going to come back to your answer and work my way through it when I get some spare time. Also, for @AnyoneWhoComesBy, if you change the line `x$abbreviation <- gsub("^.* ", "", x$addresses)` in my function to `x$abbreviation <- gsub("^.* ", "", x$addresses, perl = TRUE)` you get a ~8% speed up with n=100 addresses and a ~25% speed up when n=1000000 addresses. I don't think you can get any faster in R, but I'd very much like to be proven wrong :) – jared_mamrot Nov 03 '21 at 22:59
  • Comment for @jared_mamrot. Of course, 'magic' was just a colloquialism. My point is that when writing the function `replaceString`, I used a few unclear and probably little-known mechanisms. I will describe them for posterity in the next few comments. – Marek Fiołka Nov 04 '21 at 14:58
  • Comment for @jared_mamrot. **1.** Perhaps the most unclear thing was building the dictionary on top of environments. Of course your idea with `data.table` turned out to be faster, but note that a hash-table dictionary built on environments can be useful for many other tasks, especially when you have a large number of objects that are larger and more complex than plain text. This 'magic' solution is perfect for that. – Marek Fiołka Nov 04 '21 at 14:59
  • Comment for @jared_mamrot. **2.** The second trick was to hide this environment in a `tibble` attribute. Although I think you know well how attributes work, some R users may be a bit surprised by this solution. – Marek Fiołka Nov 04 '21 at 14:59
  • Comment for @jared_mamrot. **3.** A third element that I think might be useful to you is how to write functions like `addHashTable` and `replaceString` so that they are user-friendly and follow `dplyr` semantics, letting you write variable names directly, as in `USPS = USPS %>% addHashTable(common_abbrev, usps_abbrev)`. This can be a very useful pattern for writing various other custom functions with the same semantics (a minimal sketch follows after these comments). – Marek Fiołka Nov 04 '21 at 14:59
  • Thank you for clarifying these points @Marek; I really appreciate it. And thank you for contributing to StackOverflow. You are awesome :) – jared_mamrot Nov 04 '21 at 21:39
  • Dear Jared, you are a very nice guy! And it was awesome when you donated a reward of 500 reputation points for answering a question asked by another user. Thank you very much for that!! – Marek Fiołka Nov 05 '21 at 06:49
  • You're welcome. It was an exceptional answer and it deserved recognition. Your answer to this question is also exceptional. I'm very glad you didn't "give up" on StackOverflow :) – jared_mamrot Nov 05 '21 at 08:52
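
As a minimal illustration of the dplyr-semantics pattern mentioned in Marek's third comment above (the function shout() and its use of toupper() are invented for this example, not part of any answer):

library(dplyr)
library(rlang)

# A custom verb that accepts an unquoted column name, dplyr-style
shout <- function(.data, col) {
  col <- enquo(col)                          # capture the unquoted column name
  .data %>% mutate(!!col := toupper(!!col))  # unquote it where a column is expected
}

tibble(addresses = c("120 marry riv", "72 von brown lcks")) %>%
  shout(addresses)
# addresses should become "120 MARRY RIV" and "72 VON BROWN LCKS"

These are the same enquo()/!! mechanics that addHashTable2 and replaceString use above.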