7

say I have two sets of shape files that cover the same region and often, but not always share borders, e.g. US counties and PUMAs. I'd like to define a new scale of polygon that uses both PUMAs and counties as atomic building blocks, i.e. neither can ever be split, but I'd still like as many units as possible. Here is a toy example:

library(sp)
# make fake data
# 1) counties:
Cty <- SpatialPolygons(list(
    Polygons(list(Polygon(cbind(x=c(0,2,2,1,0,0),y=c(0,0,2,2,1,0)), hole=FALSE)),"county1"),
    Polygons(list(Polygon(cbind(x=c(2,4,4,3,3,2,2),y=c(0,0,2,2,1,1,0)),hole=FALSE)),"county2"),
    Polygons(list(Polygon(cbind(x=c(4,5,5,4,4),y=c(0,0,3,2,0)),hole=FALSE)),"county3"),
    Polygons(list(Polygon(cbind(x=c(0,1,2,2,0,0),y=c(1,2,2,3,3,1)),hole=FALSE)),"county4"),
    Polygons(list(Polygon(cbind(x=c(2,3,3,4,4,3,3,2,2),y=c(1,1,2,2,3,3,4,4,1)),hole=FALSE)),"county5"),
    Polygons(list(Polygon(cbind(x=c(0,2,2,1,0,0),y=c(3,3,4,5,5,3)),hole=FALSE)),"county6"),
    Polygons(list(Polygon(cbind(x=c(1,2,3,4,1),y=c(5,4,4,5,5)),hole=FALSE)),"county7"),
    Polygons(list(Polygon(cbind(x=c(3,4,4,5,5,4,3,3),y=c(3,3,2,3,5,5,4,3)),hole=FALSE)),"county8")
))

counties <- SpatialPolygonsDataFrame(Cty, data = data.frame(ID=paste0("county",1:8),
            row.names=paste0("county",1:8),
            stringsAsFactors=FALSE)
)
# 2) PUMAs:
Pum <- SpatialPolygons(list(
            Polygons(list(Polygon(cbind(x=c(0,4,4,3,3,2,2,1,0,0),y=c(0,0,2,2,1,1,2,2,1,0)), hole=FALSE)),"puma1"),
            Polygons(list(Polygon(cbind(x=c(4,5,5,4,3,3,4,4),y=c(0,0,5,5,4,3,3,0)),hole=FALSE)),"puma2"),
            Polygons(list(Polygon(cbind(x=c(0,1,2,2,3,3,2,0,0),y=c(1,2,2,1,1,2,3,3,1)),hole=FALSE)),"puma3"),
            Polygons(list(Polygon(cbind(x=c(2,3,4,4,3,3,2,2),y=c(3,2,2,3,3,4,4,3)),hole=FALSE)),"puma4"),
            Polygons(list(Polygon(cbind(x=c(0,1,1,3,4,0,0),y=c(3,3,4,4,5,5,3)),hole=FALSE)),"puma5"),
            Polygons(list(Polygon(cbind(x=c(1,2,2,1,1),y=c(3,3,4,4,3)),hole=FALSE)),"puma6")
    ))
Pumas <- SpatialPolygonsDataFrame(Pum, data = data.frame(ID=paste0("puma",1:6),
            row.names=paste0("puma",1:6),
            stringsAsFactors=FALSE)
)

# desired result:
Cclust <- SpatialPolygons(list(
            Polygons(list(Polygon(cbind(x=c(0,4,4,3,3,2,2,1,0,0),y=c(0,0,2,2,1,1,2,2,1,0)), hole=FALSE)),"ctyclust1"),
            Polygons(list(Polygon(cbind(x=c(4,5,5,4,3,3,4,4),y=c(0,0,5,5,4,3,3,0)),hole=FALSE)),"ctyclust2"),
            Polygons(list(Polygon(cbind(x=c(0,1,2,2,3,3,4,4,3,3,2,2,0,0),y=c(1,2,2,1,1,2,2,3,3,4,4,3,3,1)),hole=FALSE)),"ctyclust3"),
            Polygons(list(Polygon(cbind(x=c(0,2,2,3,4,0,0),y=c(3,3,4,4,5,5,3)),hole=FALSE)),"ctyclust4")
    ))
CtyClusters <- SpatialPolygonsDataFrame(Cclust, data = data.frame(ID = paste0("ctyclust", 1:4),
            row.names = paste0("ctyclust", 1:4),
            stringsAsFactors=FALSE)
)

# take a look
par(mfrow = c(1, 2))
plot(counties, border = gray(.3), lwd = 4)
plot(Pumas, add = TRUE, border = "#EEBB00", lty = 2, lwd = 2)
legend(-.5, -.3, lty = c(1, 2), lwd = c(4, 2), col = c(gray(.3), "#EEBB00"),
    legend = c("county line", "puma line"), xpd = TRUE, bty = "n")
text(coordinates(counties), counties@data$ID,col = gray(.3))
text(coordinates(Pumas), Pumas@data$ID, col = "#EEBB00",cex=1.5)
title("building blocks")
#desired result:
plot(CtyClusters)
title("desired result")
text(-.5, -.5, "maximum units possible,\nwithout breaking either PUMAs or counties",
    xpd = TRUE, pos = 4)

enter image description here I've naively tried many of the g* functions in the rgeos package to achieve this and have not made headway. Does anyone know of a nice function or awesome trick for this task? Thank you!

[I'm also open to suggestions on a better title]

Community
  • 1
  • 1
tim riffe
  • 5,651
  • 1
  • 26
  • 40

3 Answers3

3

I think you could do this with a smart set of tests for containment. This gets two of your parts, the simple paired case where puma1 contains county1 and county2, and puma2 contains county8 and county3.

library(rgeos)

## pumas by counties
pbyc <- gContains(Pumas, counties, byid = TRUE)

## row / col pairs of where contains test is TRUE for Pumas
pbyc.pairs <-  cbind(row(pbyc)[pbyc], col(pbyc)[pbyc])

par(mfrow = c(nrow(pbyc.pairs), 1))

for (i in 1:nrow(pbyc.pairs)) {
plot(counties, col = "white")

plot(gUnion(counties[pbyc.pairs[i,1], ], Pumas[pbyc.pairs[i,2], ]), col = "red", add = TRUE)

}

The plotting there is dumbly redundant, but I think it shows a start. You need to find which contains tests accumulate the most smaller parts, and then union those. Sorry I haven't put the effort in to finish but I think this would work.

Josh O'Brien
  • 159,210
  • 26
  • 366
  • 455
mdsumner
  • 29,099
  • 6
  • 83
  • 91
  • thanks @mdsummer, I was able to keep moving in the direction you suggest and arrive at a full, albeit ugly, solution. – tim riffe Sep 06 '12 at 17:01
3

Here's a relatively succinct solution which:

  • Uses rgeos::gRelate() to identify Pumas that overlap but don't completely encompass/cover each county.To understand what it does, run example(gRelate) and see this Wikipedia page. (h.t. to Tim Riffe)

  • Uses RBGL::connectedComp() to identify groups of Pumas that should thus be merged. (For pointers on installing the RBGL package, see my answer to this SO question.)

  • Uses rgeos::gUnionCascaded() to merge the indicated Pumas.

    library(rgeos)
    library(RBGL)
    
    ## Identify groups of Pumas that should be merged
    x <- gRelate(counties, Pumas, byid=TRUE)
    x <- matrix(grepl("2.2......", x), ncol=ncol(x), dimnames=dimnames(x))
    k <- x %*% t(x)
    l <- connectedComp(as(k, "graphNEL"))
    
    ## Extend gUnionCascaded so that each SpatialPolygon gets its own ID.
    gMerge <- function(ii) {
        x <- gUnionCascaded(Pumas[ii,])
        spChFIDs(x, paste(ii, collapse="_"))
    }
    
    ## Merge Pumas as needed
    res <- do.call(rbind, sapply(l, gMerge))
    
    plot(res)
    

enter image description here

Community
  • 1
  • 1
Josh O'Brien
  • 159,210
  • 26
  • 366
  • 455
  • OK, now I'm off to try this foo on my real polygons. Looking much better so far than what I was going on, and I'm appreciative of your help – tim riffe Sep 08 '12 at 06:01
  • say what?! it works brilliantly even on the datasets that were being problematic! well done sir, hats off! – tim riffe Sep 08 '12 at 06:25
  • @timriffe -- Great to hear! I've had mixed luck using **rgeos** on my own messy real-world GIS data, so your success will encourage me to keep trying with it. – Josh O'Brien Sep 09 '12 at 22:27
  • it appears to only work as I expect when the shapefiles being used are clean and perfect. Luckily that's (mostly) the case with Tiger files. Will stick with this tool until something more is needed. – tim riffe Sep 10 '12 at 02:37
1

After much trial and error, I've come up with the following inelegant solution, rather in keeping with the tip by @mdsummer, but adding more checks, removing redundant merged polygons, and checking. Yikes. If someone can take what I've done and make it cleaner, then I'll accept that answer rather this, which I'd like to avoid if at all possible:

library(rgeos)
pbyc <- gCovers(Pumas, counties, byid = TRUE) | 
        gContains(Pumas, counties, byid = TRUE) | 
        gOverlaps(Pumas, counties, byid = TRUE) |

        t(gCovers(counties, Pumas, byid = TRUE) | 
            gContains(counties, Pumas, byid = TRUE) |  
            gOverlaps(counties, Pumas, byid = TRUE))


## row / col pairs of where test is TRUE for Pumas or counties
pbyc.pairs <-  cbind(row(pbyc)[pbyc], col(pbyc)[pbyc])

Potentials <- apply(pbyc.pairs, 1, function(x,counties,Pumas){
     gUnion(counties[x[1], ], Pumas[x[2], ])
    }, counties = counties, Pumas= Pumas)
for (i in 1:length(Potentials)){
  Potentials[[i]]@polygons[[1]]@ID <- paste0("p",i)
}
Potentials <- do.call("rbind",Potentials)

# remove redundant polygons:
Redundants <- gEquals(Potentials, byid = TRUE)
Redundants <- row(Redundants)[Redundants & lower.tri(Redundants)]
Potentials <- Potentials[-c(Redundants),]

# for each Potential summary polygon, see which counties and Pumas are contained:
keep.i <- vector(length=length(Potentials))

for (i in 1:length(Potentials)){
  ctyblocki <- gUnionCascaded(counties[c(gCovers(Potentials[i, ], counties, byid = TRUE)), ])
  Pumablocki <- gUnionCascaded(Pumas[c(gCovers(Potentials[i, ], Pumas, byid = TRUE)), ]) 
  keep.i[i] <- gEquals(ctyblocki, Potentials[i, ]) & gEquals(Pumablocki, Potentials[i, ])    
}
# what do we have left?
NewUnits <- Potentials[keep.i, ]

plot(NewUnits)

enter image description here

tim riffe
  • 5,651
  • 1
  • 26
  • 40
  • ha! It turns out that this solution works for this problem, but when I tried it on real PUMA and county shape files there were more strange border situations that I didn't foresee. Got them figured out and my final (for now) function is a beast, but yes, gets the job done :-) – tim riffe Sep 07 '12 at 22:14