1

Or: The search for a faster and more accurate way to rasterize small OpenStreetMap extracts into population-weighted rasters.

I'd like to turn a small .pbf file into a GeoTiff which will be easier to do further spatial analysis on. For the purpose of this question I will constrain the requirements to dealing with polygon geometry since I already found a solution that works very well for lines. It works so well that I am considering converting all my polygons into lines.

To give an example of the type of data that I'd like to convert:

wget https://download.geofabrik.de/europe/liechtenstein-latest.osm.pbf
osmium tags-filter liechtenstein-latest.osm.pbf landuse=grass -o liechtenstein_grass.pbf
ogr2ogr -t_srs EPSG:3857 liechtenstein_grass.sqlite -dsco SPATIALITE=YES -nln multipolygons -nlt POLYGON -skipfailures liechtenstein_grass.pbf

I found a zonal statistics script here which we might be able to build from to solve this problem: http://pcjericks.github.io/py-gdalogr-cookbook/raster_layers.html#calculate-zonal-statistics

The above script takes a vector layer and a raster layer, iterates on the vector features by clipping the raster and doing some statistics on that.

Instead of normal zonal statistics I would like to count the number of vector features which intersect with each raster pixel. I have a global raster grid Int32 with a unique value for each pixel.

{qgis_process} run native:creategrid -- TYPE=2 EXTENT="-20037760, -8399416, 20037760, 18454624 [EPSG:3857]" HSPACING=1912 VSPACING=1912 HOVERLAY=0 VOVERLAY=0 CRS="EPSG:3857" OUTPUT="grid.gpkg"

sqlite3 land.gpkg
SELECT load_extension("mod_spatialite");
alter table output add column ogcfod int;
update output set ogcfod = fid;

gdal_rasterize -l output -a ogcfod -tap -tr 1912.0 1912.0 -a_nodata 0.0 -ot Int32 -of GTiff -co COMPRESS=DEFLATE -co PREDICTOR=2 grid.gpkg grid.tif -te -20037760 -8399416 20037760 18454624

So I'm thinking if we could still iterate on the vector features (as there are far, far fewer of those and there are 88m+ zones in the raster grid) it will probably be much more performant.

We want a script script which takes a vector layer and a raster layer, iterates on the vector features looks up the values of all the pixels the feature covers and then adds one to a dictionary: {px_id: qty}

However, when trying to make this script work it keeps giving me the same geometry... It only shows me 1 of the pixel IDs over and over

import sys
import gdal
import numpy
import ogr
import osr
from rich import inspect, print


def zonal_stats(feat, lyr, raster):
    # Get raster georeference info
    transform = raster.GetGeoTransform()
    xOrigin = transform[0]
    yOrigin = transform[3]
    pixelWidth = transform[1]
    pixelHeight = transform[5]

    # Reproject vector geometry to same projection as raster
    sourceSR = lyr.GetSpatialRef()
    targetSR = osr.SpatialReference()
    targetSR.ImportFromWkt(raster.GetProjectionRef())
    coordTrans = osr.CoordinateTransformation(sourceSR, targetSR)
    feat = lyr.GetNextFeature()
    geom = feat.GetGeometryRef()
    geom.Transform(coordTrans)

    # Get extent of feat
    geom = feat.GetGeometryRef()
    if geom.GetGeometryName() == "MULTIPOLYGON":
        count = 0
        pointsX = []
        pointsY = []
        for polygon in geom:
            geomInner = geom.GetGeometryRef(count)
            ring = geomInner.GetGeometryRef(0)
            numpoints = ring.GetPointCount()
            for p in range(numpoints):
                lon, lat, z = ring.GetPoint(p)
                pointsX.append(lon)
                pointsY.append(lat)
            count += 1
    elif geom.GetGeometryName() == "POLYGON":
        ring = geom.GetGeometryRef(0)
        numpoints = ring.GetPointCount()
        pointsX = []
        pointsY = []
        for p in range(numpoints):
            lon, lat, z = ring.GetPoint(p)
            pointsX.append(lon)
            pointsY.append(lat)

    else:
        sys.exit("ERROR: Geometry needs to be either Polygon or Multipolygon")

    xmin = min(pointsX)
    xmax = max(pointsX)
    ymin = min(pointsY)
    ymax = max(pointsY)

    print(xmin, xmax)
    print(ymin, ymax)

    # Specify offset and rows and columns to read
    xoff = int((xmin - xOrigin) / pixelWidth)
    yoff = int((yOrigin - ymax) / pixelWidth)
    xcount = int((xmax - xmin) / pixelWidth) + 1
    ycount = int((ymax - ymin) / pixelWidth) + 1

    # Create memory target raster
    target_ds = gdal.GetDriverByName("MEM").Create(
        "", xcount, ycount, 1, gdal.GDT_Int32
    )
    target_ds.SetGeoTransform(
        (
            xmin,
            pixelWidth,
            0,
            ymax,
            0,
            pixelHeight,
        )
    )

    # Create for target raster the same projection as for the value raster
    raster_srs = osr.SpatialReference()
    raster_srs.ImportFromWkt(raster.GetProjectionRef())
    target_ds.SetProjection(raster_srs.ExportToWkt())

    # Rasterize zone polygon to raster
    gdal.RasterizeLayer(target_ds, [1], lyr, burn_values=[1])

    # Read raster as arrays
    banddataraster = raster.GetRasterBand(1)
    dataraster = banddataraster.ReadAsArray(xoff, yoff, xcount, ycount)

    bandmask = target_ds.GetRasterBand(1)
    datamask = bandmask.ReadAsArray(0, 0, xcount, ycount)

    print(dataraster)

    # Mask zone of raster
    # zoneraster = numpy.ma.masked_array(dataraster, numpy.logical_not(datamask))

    # print(zoneraster)

    # exit()


def loop_zonal_stats(lyr, raster):
    featList = range(lyr.GetFeatureCount())
    statDict = {}

    for FID in featList:
        feat = lyr.GetFeature(FID)
        meanValue = zonal_stats(feat, lyr, raster)
        statDict[FID] = meanValue
    return statDict


def main(input_zonal_raster, input_value_polygon):
    raster = gdal.Open(input_zonal_raster)
    shp = ogr.Open(input_value_polygon)
    lyr = shp.GetLayer()

    return loop_zonal_stats(lyr, raster)


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print(
            "[ ERROR ] you must supply two arguments: input-zone-raster-name.tif input-value-shapefile-name.shp "
        )
        sys.exit(1)

    main(sys.argv[1], sys.argv[2])

Prior research:

{qgis_process} run qgis:heatmapkerneldensityestimation -- INPUT="{basen}.json.geojson" RADIUS=2868 RADIUS_FIELD=None PIXEL_SIZE=1912 WEIGHT_FIELD=None KERNEL=4 DECAY=0 OUTPUT_VALUE=0 OUTPUT="{basen}.tif
jaksco
  • 423
  • 7
  • 18
  • 1
    This seems suspect: "def zonal_stats(feat, lyr, raster): (...) feat = lyr.GetNextFeature() " maybe it does not cause problems in your code, or maybe it even does, but in any case perhaps not the clearest way .. to have a parameter, and then later a local variable by the same name? I was trying to track how your looping code goes, maybe there is a simple bug. – antont May 30 '21 at 02:25
  • 1
    hmmm yeah that is weird. `for FID in featList: if FID: ` seems to have fixed it. I guess sometimes FID is None? weird – jaksco May 30 '21 at 03:35
  • 1
    cool!! It is working. That was a lot easier than I thought it would be. I'll post an Answer shortly. Thanks for the help antont! – jaksco May 30 '21 at 03:52

1 Answers1

0

This seems to work. Output is CSV columns=["px_id", "qty"]

"""
python rasterlayerzonalvectorcounts.py grid.tif liechtenstein_grass.sqlite

MIT License
Based on https://github.com/pcjericks/py-gdalogr-cookbook/blob/master/raster_layers.rst#calculate-zonal-statistics
"""

import sys
import osr
import os
import ogr
import numpy
import gdal
import pandas
from joblib import Parallel, delayed
from collections import Counter


def zonal_stats(FID, lyr, raster):
    feat = lyr.GetFeature(FID)

    # Get raster georeference info
    transform = raster.GetGeoTransform()
    xOrigin = transform[0]
    yOrigin = transform[3]
    pixelWidth = transform[1]
    pixelHeight = transform[5]

    # Reproject vector geometry to same projection as raster
    sourceSR = lyr.GetSpatialRef()
    targetSR = osr.SpatialReference()
    targetSR.ImportFromWkt(raster.GetProjectionRef())
    coordTrans = osr.CoordinateTransformation(sourceSR, targetSR)
    geom = feat.GetGeometryRef()
    geom.Transform(coordTrans)

    # Get extent of feat
    geom = feat.GetGeometryRef()
    xmin, xmax, ymin, ymax = geom.GetEnvelope()

    # Specify offset and rows and columns to read
    xoff = int((xmin - xOrigin) / pixelWidth)
    yoff = int((yOrigin - ymax) / pixelWidth)
    xcount = int((xmax - xmin) / pixelWidth) + 1
    ycount = int((ymax - ymin) / pixelWidth) + 1

    # Create memory target raster
    target_ds = gdal.GetDriverByName("MEM").Create(
        "", xcount, ycount, 1, gdal.GDT_Int32
    )
    target_ds.SetGeoTransform(
        (
            xmin,
            pixelWidth,
            0,
            ymax,
            0,
            pixelHeight,
        )
    )

    # Create for target raster the same projection as for the value raster
    raster_srs = osr.SpatialReference()
    raster_srs.ImportFromWkt(raster.GetProjectionRef())
    target_ds.SetProjection(raster_srs.ExportToWkt())

    # Rasterize zone polygon to raster
    gdal.RasterizeLayer(target_ds, [1], lyr, burn_values=[1])

    # Read raster as arrays
    banddataraster = raster.GetRasterBand(1)
    dataraster = banddataraster.ReadAsArray(xoff, yoff, xcount, ycount)

    bandmask = target_ds.GetRasterBand(1)
    datamask = bandmask.ReadAsArray(0, 0, xcount, ycount)

    # Mask zone of raster
    zoneraster = numpy.ma.masked_array(dataraster, numpy.logical_not(datamask))

    return numpy.array(zoneraster).tolist()


def loop_zonal_stats(input_value_polygon, input_zonal_raster):
    shp = ogr.Open(input_value_polygon)
    lyr = shp.GetLayer()
    print("Processing", lyr.GetFeatureCount(), "features")
    featList = range(lyr.GetFeatureCount())

    def processFID(input_value_polygon, input_zonal_raster, FID):
        shp = ogr.Open(input_value_polygon)
        raster = gdal.Open(input_zonal_raster)
        lyr = shp.GetLayer()
        if FID:
            px_ids = zonal_stats(FID, lyr, raster)
            # print(px_ids)
            px_ids = [item for sublist in px_ids for item in sublist]
            return px_ids

    return Parallel(n_jobs=8)(
        delayed(processFID)(input_value_polygon, input_zonal_raster, FID)
        for FID in featList
    )


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print(
            "[ ERROR ] you must supply two arguments: input-zone-raster-name.tif input-value-shapefile-name.shp "
        )
        sys.exit(1)

    input_zonal_raster = sys.argv[1]
    input_value_polygon = sys.argv[2]

    counts = list(
        filter(None, loop_zonal_stats(input_value_polygon, input_zonal_raster))
    )
    counts = Counter([item for sublist in counts for item in sublist])

    pandas.DataFrame.from_dict(data=counts, orient="index").to_csv(
        os.path.splitext(input_value_polygon)[0] + ".csv", header=False
    )

This one will create an output GeoTiff with the same grid system as the source zonal GeoTiff

I wonder if it could be sped up by using What is the purpose of meshgrid in Python / NumPy?

"""
python rasterlayerzonalvectorcounts.py grid.tif liechtenstein_grass.sqlite

MIT License
Based on https://github.com/pcjericks/py-gdalogr-cookbook/blob/master/raster_layers.rst#calculate-zonal-statistics
"""

import sys
import osr
import os
import ogr
import numpy
import gdal
import pandas
from joblib import Parallel, delayed
from collections import Counter
from rich import print, inspect


def zonal_stats(FID, lyr, raster):
    feat = lyr.GetFeature(FID)

    # Get raster georeference info
    transform = raster.GetGeoTransform()
    xOrigin = transform[0]
    yOrigin = transform[3]
    pixelWidth = transform[1]
    pixelHeight = transform[5]

    # Get extent of feat
    geom = feat.GetGeometryRef()
    xmin, xmax, ymin, ymax = geom.GetEnvelope()

    # Specify offset and rows and columns to read
    xoff = int((xmin - xOrigin) / pixelWidth)
    yoff = int((yOrigin - ymax) / pixelWidth)
    xcount = int((xmax - xmin) / pixelWidth) + 1
    ycount = int((ymax - ymin) / pixelWidth) + 1

    feat_arr = []
    # if xcount != 1 or ycount != 1:
    # print(xoff, yoff, xcount, ycount)
    for x in range(xcount):
        for y in range(ycount):
            # print(xoff + x, yoff + y)
            feat_arr.append((xoff + x, yoff + y))

    return feat_arr


def loop_zonal_stats(input_value_polygon, input_zonal_raster):
    shp = ogr.Open(input_value_polygon)
    lyr = shp.GetLayer()
    print("Processing", lyr.GetFeatureCount(), "features")
    featList = range(lyr.GetFeatureCount())

    def processFID(input_value_polygon, input_zonal_raster, FID):
        shp = ogr.Open(input_value_polygon)
        raster = gdal.Open(input_zonal_raster)
        lyr = shp.GetLayer()
        if FID:
            px_ids = zonal_stats(FID, lyr, raster)
            return px_ids

    return Parallel(n_jobs=1)(
        delayed(processFID)(input_value_polygon, input_zonal_raster, FID)
        for FID in featList
    )


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print(
            "[ ERROR ] you must supply two arguments: input-zone-raster-name.tif input-value-shapefile-name.shp "
        )
        sys.exit(1)

    input_zonal_raster = sys.argv[1]
    input_value_polygon = sys.argv[2]

    counts = list(
        filter(None, loop_zonal_stats(input_value_polygon, input_zonal_raster))
    )
    counts = Counter([item for sublist in counts for item in sublist])

    raster_srs = osr.SpatialReference()
    raster_srs.ImportFromWkt(gdal.Open(input_zonal_raster).GetProjectionRef())

    raster_arr = numpy.empty((14045, 20960), numpy.int32)
    for px in counts.items():
        # print(px)
        raster_arr[px[0][1]][px[0][0]] = px[1]

    target_ds = gdal.GetDriverByName("GTiff").Create(
        os.path.splitext(input_value_polygon)[0] + ".tif",
        20960,
        14045,
        1,
        gdal.GDT_Int32,
        options=["COMPRESS=LZW"],
    )
    target_ds.SetGeoTransform(gdal.Open(input_zonal_raster).GetGeoTransform())
    target_ds.SetProjection(raster_srs.ExportToWkt())
    target_ds.GetRasterBand(1).WriteArray(raster_arr)
    target_ds.GetRasterBand(1).SetNoDataValue(0)
    target_ds.GetRasterBand(1).GetStatistics(0, 1)
    target_ds.FlushCache()
    target_ds = None

jaksco
  • 423
  • 7
  • 18