OSGP: Measuring Geographic Distributions – Median Center

(Open Source Geospatial Python)

The ‘What is it?’

Also known as the Center of Minimum Distance, the Median Center is a location that is the shortest total distance to all features in the study area (not to be confused with the Central Feature, which is the feature that is the shortest distance to all others). It can be used to find a suitable location for something that needs to be centrally located. The Median Center will gravitate towards an area with the most features.

The Median Center is good for finding the most accessible location.

The Formula!

The is no single formula or equation for calculating an exact Median Center, according to Andy Mitchell it is an iterative process involving calculating the Mean Center, summing the distances from it to each feature, offsetting the center slightly and summing the distances again until it eventually zones in on the optimum location that has the lowest sum.

The code below implements the Yehuda Vardi and Cun-Hui Zhang algorithm or the Weiszfeld algorithm.

The Code…

import math, sys
import numpy as np
from osgeo import ogr
from scipy.spatial.distance import cdist

## "W" for Weiszfield
## "YC" for Yehuda Vardi and Cun-Hui Zhang
algorithm = "YC"

## Weiszfield
## https://gist.github.com/endolith/2837160
def numersum(test_median,dataPoint):
    ## Provides the denominator of the weiszfeld algorithm depending on whether
    ## you are adjusting the candidate x or y
    return 1/math.sqrt((test_median[0]-dataPoint[0])**2 + (test_median[1]-dataPoint[1])**2)

def denomsum(test_median, xy_arr):
    ## Provides the denominator of the weiszfeld algorithm
    temp = 0.0
    for i in range(0,len(xy_arr)):
        temp += 1/math.sqrt((test_median[0] - xy_arr[i][0])**2 + (test_median[1] - xy_arr[i][1])**2)
    return temp

## Yehuda Vardi and Cun-Hui Zhang
## http://stackoverflow.com/questions/30299267/geometric-median-of-multidimensional-points
## user: orlp
def geometric_median(X, eps=1e-5):
    y = np.mean(X, 0)

    while True:
        D = cdist(X, [y])
        nonzeros = (D != 0)[:, 0]
        Dinv = 1 / D[nonzeros]
        Dinvs = np.sum(Dinv)
        W = Dinv / Dinvs
        T = np.sum(W * X[nonzeros], 0)
        num_zeros = len(X) - np.sum(nonzeros)
        if num_zeros == 0:
            y1 = T
        elif num_zeros == len(X):
            return y
        else:
            R = (T - y) * Dinvs
            r = np.linalg.norm(R)
            rinv = 0 if r == 0 else num_zeros/r
            y1 = max(0, 1-rinv)*T + min(1, rinv)*y
        if np.linalg.norm(y - y1) < eps:
            return y1
        y = y1

## set the driver for the data
driver = ogr.GetDriverByName("FileGDB")

## path to the FileGDB
gdb = r"C:\Users\Glen B\Documents\my_geodata.gdb"

## ope the GDB in write mode (1)
ds = driver.Open(gdb, 1)

## input feature class
input_lyr_name = "Birmingham_Secondary_Schools"

## name of output feature class
output_fc = "{0}_median_center".format(input_lyr_name)

## reference the layer using the layers name
if input_lyr_name in [ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(ds.GetLayerCount())]:
    lyr = ds.GetLayerByName(input_lyr_name)
    print "{0} found in {1}".format(input_lyr_name, gdb)

## if the output layer already exists then delete it
if output_fc in [ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(ds.GetLayerCount())]:
    ds.DeleteLayer(output_fc)
    print "Deleting: {0}".format(output_fc)

## create an array with coordinates of each feature
try:
    first_feat = lyr.GetFeature(1)
    ## centroid for points and polygons
    if first_feat.geometry().GetGeometryName() in ["POINT", "MULTIPOINT", "POLYGON", "MULTIPOLYGON"]:
        xy_arr = np.ndarray((len(lyr), 2), dtype=np.float)
        for i, pt in enumerate(lyr):
            ft_geom = pt.geometry()
            xy_arr[i] = (ft_geom.Centroid().GetX(), ft_geom.Centroid().GetY())

    ## for lines we get the midpoint of a line
    elif first_feat.geometry().GetGeometryName() in ["LINESTRING", "MULTILINESTRING"]:
        xy_arr = np.ndarray((len(lyr), 2), dtype=np.float)
        for i, ln in enumerate(lyr):
            line_geom = ln.geometry().ExportToWkt()
            shapely_line = MultiLineString(wkt.loads(line_geom))
            midpoint = shapely_line.interpolate(shapely_line.length/2)
            xy_arr[i] = (midpoint.x, midpoint.y)

except Exception:
    print "Unknown geometry for {}".format(input_lyr_name)
    sys.exit()

## if using Weiszfield
if algorithm == "W":
    ## https://gist.github.com/endolith/2837160
    avg_x, avg_y = np.mean(xy_arr, axis=0)
    test_median = [avg_x, avg_y]
    numIter = 50

## minimise the objective function
for x in range(0,numIter):
    denom = denomsum(test_median,xy_arr)
    nextx = 0.0
    nexty = 0.0

    for y in range(0,len(xy_arr)):
        nextx += (xy_arr[y][0] * numersum(test_median,xy_arr[y]))/denom
        nexty += (xy_arr[y][1] * numersum(test_median,xy_arr[y]))/denom

    test_median = [nextx,nexty]

## if using Yehuda Vardi and Cun-Hui Zhang
elif algorithm == "YC":
    test_median = geometric_median(xy_arr)

print "Median Center: {0}, {1}".format(test_median[0], test_median[1])

## create a new point layer with the same spatial ref as lyr
out_lyr = ds.CreateLayer(output_fc, lyr.GetSpatialRef(), ogr.wkbPoint)

## define and create new fields
x_fld = ogr.FieldDefn("X", ogr.OFTReal)
y_fld = ogr.FieldDefn("Y", ogr.OFTReal)
out_lyr.CreateField(x_fld)
out_lyr.CreateField(y_fld)

## create a new point for the mean center
pnt = ogr.Geometry(ogr.wkbPoint)
pnt.AddPoint(test_median[0], test_median[1])

## add the mean center to the new layer
feat_dfn = out_lyr.GetLayerDefn()
feat = ogr.Feature(feat_dfn)
feat.SetGeometry(pnt)
feat.SetField("X", test_median[0])
feat.SetField("Y", test_median[1])
out_lyr.CreateFeature(feat)

print "Created {0}".format(output_fc)

## free up resources
del ds, lyr, first_feat, feat, out_lyr

I’d like to give credit to…
Logan Byers from GIS StackExchange who aided in speeding up the computational time using NumPy and for forcing me to begin learning the wonders of NumPy.
orlp from Stack Overflow for their implementation of Yehuda Vardi and Cun-Hui Zhang’s algorithm for the geometric median.
Daniel J Lewis (I think) for the implementation of the Weiszfeld algorithm.

The Example:

I downloaded vector data that contains polygons for schools (and other features) from OS Open Map – Local that covered the West Midlands. I also downloaded OS Boundary Line data. Using Python and GDAL/OGR I extracted secondary schools from the data for Birmingham only. Everything was now in place to find the Median Center of all Secondary Schools for Birmingham. (see The Other Scripts section at the bottom of this post for processing the data)

birmingham_secondary_schools

Running the script from The Code section above calculates the coordinates of the Median Center for Secondary Schools in Birmingham and creates a point feature class in the File GDB.

birmingham_secondary_schools_median_center

OSGP Median Center (W):        407658.278755, 286696.905759
OSGP Median Center (YC):      407658.278752, 286696.905769
ArcGIS Median Center:             407658.009375, 286697.53996

What’s Next?

Weighted Mean Center (link will be updated once post is ready)

Also See…

Mean Center
Central Feature

The Resources:

ESRI Guide to GIS Volume 2: Chapter 2
see book review here.

Geoprocessing with Python

Python GDAL/OGR Cookbook

Setting up GDAL/OGR with FileGDB Driver for Python on Windows

< The Other Scripts >

Birmingham Secondary Schools

from osgeo import ogr
import os

## necessary drivers
shp_driver = ogr.GetDriverByName("ESRI Shapefile")
gdb_driver = ogr.GetDriverByName("FileGDB")

## input boundary shapefile and file reference file gdb
shapefile = r"C:\Users\Glen B\Documents\Schools\Data\GB\district_borough_unitary_region.shp"
gdb = r"C:\Users\Glen B\Documents\my_geodata.gdb"

shp_ds = shp_driver.Open(shapefile, 0)
gdb_ds = gdb_driver.Open(gdb, 1)

## filter boundary to just Birmingham
shp_layer = shp_ds.GetLayer(0)
shp_layer.SetAttributeFilter("NAME = 'Birmingham District (B)'")

## name the output
output_fc = "Birmingham_Secondary_Schools"

## if the output feature class already exists then delete it
if output_fc in [gdb_ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(gdb_ds.GetLayerCount())]:
    gdb_ds.DeleteLayer(output_fc)
    print "Deleting: {0}".format(output_fc)

## create the output feature class
out_lyr = gdb_ds.CreateLayer(output_fc, shp_layer.GetSpatialRef(), ogr.wkbPolygon)

## the folder that contains the data to extract Secondary Schools from
root_folder = r"C:\Users\Glen B\Documents\Schools\Vector\data"

## traverse through the folders and find ImportantBuildings files
## copy only those that intersect the Birmingham region
## transfer across attributes
count = 1
for root,dirs,files in os.walk(root_folder):
    for filename in files:
        if filename.endswith("ImportantBuilding.shp") and filename[0:2] in ["SP", "SO", "SJ", "SK"]:
            shp_path = "{0}\\{1}".format(root, filename)
            schools_ds = shp_driver.Open(shp_path, 0)
            schools_lyr = schools_ds.GetLayer(0)
            schools_lyr.SetAttributeFilter("CLASSIFICA = 'Secondary Education'")
            lyr_def = schools_lyr.GetLayerDefn()
            if count == 1:
                for i in range(lyr_def.GetFieldCount()):
                    out_lyr.CreateField(lyr_def.GetFieldDefn(i))
                count += 1
            shp_layer.ResetReading()
            for shp_feat in shp_layer:
                birm_geom = shp_feat.GetGeometryRef()

                for school_feat in schools_lyr:
                    school_geom = school_feat.GetGeometryRef()

                    if school_geom.Intersects(birm_geom):
                        feat_dfn = out_lyr.GetLayerDefn()
                        feat = ogr.Feature(feat_dfn)
                        feat.SetGeometry(school_geom)
                        for i in range(lyr_def.GetFieldCount()):
                            feat.SetField(lyr_def.GetFieldDefn(i).GetNameRef(), school_feat.GetField(i))

                        out_lyr.CreateFeature(feat)
                        feat.Destroy()

del shp_ds, shp_layer, gdb_ds

The Usual 🙂

As always please feel free to comment to help make the code more efficient, highlight errors, or let me know if this was of any use to you.