0

I'm mapping a large number of lat/long coordinate pairs to associated zip codes. The number of records is too large to use an API like Google Maps or geonames because of call limits.

I have a lookup table which contains zip codes and the lat/long centroid of each zip code. You can get the lookup table here:

# zipcode data with lat/lon coordinates
url <- "http://www.boutell.com/zipcodes/zipcode.zip"
fil <- "ziplatlong.zip"

# download an unzip
if (!file.exists(fil)) { download.file(url, fil) }
unzip(fil, exdir="zips")

library(readr)
ziplkp<-read_csv("zips/zipcode.csv")

For each lat/long pair in my data, I'd like to match it to the nearest zip code centroid by finding the minimum absolute difference between that lat/long pair and each centroid in my lookup table.

What is the most efficient way to apply such a a "lookup" function row-wise to a large number of records?

Sample Data: A list of lat/long coordinates:

latlongdata <- 
  structure(list(dropoff_longitude = c(-73.981705, -73.993553, 
-73.973305, -73.988823, -73.938484, -74.015503, -73.95472, -73.9571, 
-73.971298, -73.99794), dropoff_latitude = c(40.760559, 40.756348, 
40.762646, 40.777504, 40.684692, 40.709881, 40.783371, 40.776657, 
40.752148, 40.720535)), row.names = c(8807218L, 9760613L, 3175671L, 
10878727L, 2025038L, 5345659L, 14474481L, 1650223L, 684883L, 
9129975L), class = "data.frame", .Names = c("dropoff_longitude", 
"dropoff_latitude"))

    print(latlongdata)
         dropoff_longitude dropoff_latitude
8807218          -73.98171         40.76056
9760613          -73.99355         40.75635
3175671          -73.97330         40.76265
10878727         -73.98882         40.77750
2025038          -73.93848         40.68469
5345659          -74.01550         40.70988
14474481         -73.95472         40.78337
1650223          -73.95710         40.77666
684883           -73.97130         40.75215
9129975          -73.99794         40.72053

**ZipLooker function: Finds the minimum absolute distance from an input coordinate pair to the nearest zip code centroid and returns that zip

library(dplyr)
ZipLooker<-function(dropoff_longitude,dropoff_latitude){
  if(is.na(dropoff_longitude)|is.na(dropoff_latitude)){
    z<-NA_character_
  } else {

    tryCatch({
      x<-ziplkp1 
      x$latdiff=abs(dropoff_latitude-x$Latitude)
      x$londiff=abs(dropoff_longitude-x$Longitude)
      x$totdiff=x$latdiff+x$londiff
      z<-head(top_n(x,1,-totdiff),n=1)$Postal
      return(z)
    }, error=function(e) NA)
  }
}

Apply the Ziplooker function using dplyr's rowwsie() function

  latlongdata %>% 
  rowwise() %>% 
  mutate(zipcode=ZipLooker(dropoff_longitude,dropoff_latitude)
         )
Tim_K
  • 659
  • 10
  • 24
  • If you're a student, I recommend Smarty Streets. They give free accounts to students/academics without a call limit. – MichaelChirico Aug 11 '15 at 17:14
  • Thanks, Michael. Not a student, and $1,000/month is slightly outside the budget right now! ;) Also it looks like Smarty Streets is useful for looking up lat/long coordinates given a zipcode, but I'm trying to go the other way – Tim_K Aug 11 '15 at 17:34
  • 1
    In terms of the problem you are having with `ZipLooker` and `mutate`: in this case having an `if` without the `else` will return `character(0)` if you have a missing value in one of your `dropoff` variables, so wrap the whole `TryCatch` with an `else`. Also, because you end up returning a character variable here, it will help to use `"NA"` or `NA_character_` in your `if` statement. Don't forget to define the `ziplkp` argument when using `ZipLooker` or you'll get all `NA` values back. – aosmith Aug 11 '15 at 19:00
  • Thanks aosmith, all great suggestions and corrections. ZipLooker now seems to work, so I altered the question to be more specific about efficiency. – Tim_K Aug 12 '15 at 00:23

3 Answers3

2

Here's a complete solution for you:

library(sp)
library(maptools)
library(zipcode)

# grab the zip code boundaries
url <- "http://www2.census.gov/geo/tiger/GENZ2014/shp/cb_2014_us_zcta510_500k.zip"
fil <- "ztca.zip"

# don't waste bandwidth
if (!file.exists(fil)) { download.file(url, fil) }
unzip(fil, exdir="ztca")

# read them in (this takes a bit)
ztca <- readShapePoly("ztca/cb_2014_us_zcta510_500k.shp", verbose=TRUE)

# extract NY
ny <- ztca[as.character(ztca$ZCTA5CE10) %in% as.character(zipcode[zipcode$state=="NY",]$zip),]

# your points
latlongdata <- 
  structure(list(dropoff_longitude = c(-73.981705, -73.993553, 
-73.973305, -73.988823, -73.938484, -74.015503, -73.95472, -73.9571, 
-73.971298, -73.99794), dropoff_latitude = c(40.760559, 40.756348, 
40.762646, 40.777504, 40.684692, 40.709881, 40.783371, 40.776657, 
40.752148, 40.720535)), row.names = c(8807218L, 9760613L, 3175671L, 
10878727L, 2025038L, 5345659L, 14474481L, 1650223L, 684883L, 
9129975L), class = "data.frame", .Names = c("dropoff_longitude", 
"dropoff_latitude"))

# make them all super spatial-like (must be in lon,lat format)
pts <- SpatialPoints(as.matrix(latlongdata[,1:2]))

# figure out where they are (this can take a bit)
dat <- pts %over% ny

# merge your data back in (there are many ways to do this)
dat$lon <- latlongdata$dropoff_longitude
dat$lat <- latlongdata$dropoff_latitude
rownames(dat) <- rownames(latlongdata)

# boom
dat
##          ZCTA5CE10     AFFGEOID10 GEOID10 ALAND10 AWATER10       lon      lat
## 8807218      10019 8600000US10019   10019 1780742        0 -73.98171 40.76056
## 9760613      10018 8600000US10018   10018  836253        0 -73.99355 40.75635
## 3175671      10022 8600000US10022   10022 1107169        0 -73.97330 40.76265
## 10878727     10069 8600000US10069   10069  249044        0 -73.98882 40.77750
## 2025038      11221 8600000US11221   11221 3582803        0 -73.93848 40.68469
## 5345659      10280 8600000US10280   10280  300652    38759 -74.01550 40.70988
## 14474481     10128 8600000US10128   10128 1206195        0 -73.95472 40.78337
## 1650223      10028 8600000US10028   10028  811363        0 -73.95710 40.77666
## 684883       10017 8600000US10017   10017  820953        0 -73.97130 40.75215
## 9129975      10013 8600000US10013   10013 1425085        0 -73.99794 40.72053
hrbrmstr
  • 77,368
  • 11
  • 139
  • 205
  • This is perfect, thanks! I know that the lat/long pairs are specific to the NY region, any way to cut down on the size of ztca to speed this up? – Tim_K Aug 11 '15 at 22:32
0

I use something along these lines to convert lon-lat to containing polygons:

library(maptools)
points.file<-readShapePoints("path.to.pts.shp")
poly.file<-read.ShapePoy("path.to.poly.shp")
points.file %over% poly.file
MichaelChirico
  • 33,841
  • 14
  • 113
  • 198
  • Thanks. Any advice on converting lat/long pairs to a shape file? – Tim_K Aug 11 '15 at 17:35
  • I'm sure there's a way to do this in R, but I don't know off the top of my head. What I do is use [QGIS](http://www.qgis.org/en/site/) (open-source GIS program), which has the ability to read .csv files with lat-lon specifications, which you can then save as a shapefile. – MichaelChirico Aug 11 '15 at 17:36
  • Also, not sure if `readShapePoints` can handle this format, but [this](http://stackoverflow.com/questions/30127293/how-to-convert-csv-to-shp-in-r) question suggests `R` can readily turn your lon-lat into an `rgdal` file, whatever that is... – MichaelChirico Aug 11 '15 at 17:37
0

Comparing the speed of the dplyr rowwise option vs. hrbrmstr's maptools solution, looks like dplyr wins out (at least on the smaller dataset)

# Test lat/long data
latlongdata <- 
structure(list(dropoff_longitude = c(-73.981705, -73.993553, 
-73.973305, -73.988823, -73.938484, -74.015503, -73.95472, -73.9571, 
-73.971298, -73.99794), dropoff_latitude = c(40.760559, 40.756348, 
40.762646, 40.777504, 40.684692, 40.709881, 40.783371, 40.776657, 
40.752148, 40.720535)), row.names = c(8807218L, 9760613L, 3175671L, 
10878727L, 2025038L, 5345659L, 14474481L, 1650223L, 684883L, 
9129975L), class = "data.frame", .Names = c("dropoff_longitude", 
"dropoff_latitude"))


# zipcode data with lat/lon coordinates
url <- "http://www.boutell.com/zipcodes/zipcode.zip"
fil <- "ziplatlong.zip"

# download an unzip
if (!file.exists(fil)) { download.file(url, fil) }
unzip(fil, exdir="zips")

library(readr)
ziplkp<-read_csv("zips/zipcode.csv")


# Method 1: dplyr + ZipLooker function

ZipLooker<-function(dropoff_longitude,dropoff_latitude){
  if(is.na(dropoff_longitude)|is.na(dropoff_latitude)){
    z<-NA_character_
  } else {

    tryCatch({
      x<-ziplkp
      x$latdiff=abs(dropoff_latitude-x$latitude)
      x$londiff=abs(dropoff_longitude-x$longitude)
      x$totdiff=x$latdiff+x$londiff
      z<-head(top_n(x,1,-totdiff),n=1)$zip
      return(z)
    }, error=function(e) NA_character_)
  }
}


latlongdata %>% 
  rowwise() %>% 
  mutate(zipcode=ZipLooker(dropoff_longitude,dropoff_latitude)
  )




# Method 2: maptools + sp
library(sp)
library(maptools)

# grab the zip code boundaries
url <- "http://www2.census.gov/geo/tiger/GENZ2014/shp/cb_2014_us_zcta510_500k.zip"
fil <- "ztca.zip"

# don't waste bandwidth
if (!file.exists(fil)) { download.file(url, fil) }
unzip(fil, exdir="ztca")

# read them in (this takes a bit)
ztca <- readShapePoly("ztca/cb_2014_us_zcta510_500k.shp", verbose=TRUE)


# extract NY
ny <- ztca[as.character(ztca$ZCTA5CE10) %in% as.character(ziplkp[ziplkp$state=="NY",]$zip),]

# make them all super spatial-like (must be in lon,lat format)
pts <- SpatialPoints(as.matrix(latlongdata[,1:2]))

# figure out where they are (this can take a bit)
dat <- pts %over% ny

# merge your data back in (there are many ways to do this)
dat$lon <- latlongdata$dropoff_longitude
dat$lat <- latlongdata$dropoff_latitude
rownames(dat) <- rownames(latlongdata)



# comparing the two (only the bulkiest parts)

library(microbenchmark)
microbenchmark(
dat <- pts %over% ny
,
latlongdata %>% 
  rowwise() %>% 
  mutate(zipcode=ZipLooker(dropoff_longitude,dropoff_latitude)
  )
,times = 10)

Output:

Unit: milliseconds

expr
dat <- pts %over% ny
latlongdata %>% rowwise() %>% mutate(zipcode = ZipLooker(dropoff_longitude,      dropoff_latitude))

       min        lq   median       uq      max neval
 275.89494 286.38187 297.9254 421.8727 445.7165    10
  95.18166  97.09873 101.8102 108.8677 122.0515    10
Tim_K
  • 659
  • 10
  • 24
  • I should not that after a few spot checks, the maptools method is more accurate. Some of the zip codes chosen using the dplyr method are off by a bit; sometimes it picks an adjacent zip. – Tim_K Aug 12 '15 at 00:57