OSGP: Create Chainage ticks along a Line at Specified Distance Intervals

Posted on September 18, 2017 by Glen Bambrick

Interested in learning ArcPy? check out this course.

This builds on from the previous post creating points at specified distances along a line. Here, we create perpendicular chainage ticks that traverse the main line.

from osgeo import ogr
from shapely.geometry import MultiLineString, LineString, Point
from shapely import wkt
import sys, math

## http://wikicode.wikidot.com/get-angle-of-line-between-two-points
## angle between two points
def getAngle(pt1, pt2):
    x_diff = pt2.x - pt1.x
    y_diff = pt2.y - pt1.y
    return math.degrees(math.atan2(y_diff, x_diff))

## start and end points of chainage tick
## get the first end point of a tick
def getPoint1(pt, bearing, dist):
    angle = bearing + 90
    bearing = math.radians(angle)
    x = pt.x + dist * math.cos(bearing)
    y = pt.y + dist * math.sin(bearing)
    return Point(x, y)
## get the second end point of a tick
def getPoint2(pt, bearing, dist):
    bearing = math.radians(bearing)
    x = pt.x + dist * math.cos(bearing)
    y = pt.y + dist * math.sin(bearing)
    return Point(x, y)

## set the driver for the data
driver = ogr.GetDriverByName("FileGDB")
## path to the FileGDB
gdb = r"C:\Users\******\Documents\ArcGIS\Default.gdb"
## open the GDB in write mode (1)
ds = driver.Open(gdb, 1)

## linear feature class
input_lyr_name = "input_line"

## distance between each points
distance = 10
## the length of each tick
tick_length = 20

## output tick line fc name
output_lns = "{0}_{1}m_lines".format(input_lyr_name, distance)

## list to hold all the point coords
list_points = []

## reference the layer using the layers name
if input_lyr_name in [ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(ds.GetLayerCount())]:
    lyr = ds.GetLayerByName(input_lyr_name)
    print "{0} found in {1}".format(input_lyr_name, gdb)

## if the output already exists then delete it
if output_lns in [ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(ds.GetLayerCount())]:
    ds.DeleteLayer(output_lns)
    print "Deleting: {0}".format(output_lns)

## create a new line layer with the same spatial ref as lyr
out_ln_lyr = ds.CreateLayer(output_lns, lyr.GetSpatialRef(), ogr.wkbLineString)

## distance/chainage attribute
chainage_fld = ogr.FieldDefn("CHAINAGE", ogr.OFTReal)
out_ln_lyr.CreateField(chainage_fld)
## check the geometry is a line
first_feat = lyr.GetFeature(1)

## accessing linear feature classes using FileGDB driver always returns a MultiLinestring
if first_feat.geometry().GetGeometryName() in ["LINESTRING", "MULTILINESTRING"]:
    for ln in lyr:
        ## list to hold all the point coords
        list_points = []
        ## set the current distance to place the point
        current_dist = distance
        ## get the geometry of the line as wkt
        line_geom = ln.geometry().ExportToWkt()
        ## make shapely MultiLineString object
        shapely_line = MultiLineString(wkt.loads(line_geom))
        ## get the total length of the line
        line_length = shapely_line.length
        ## append the starting coordinate to the list
        list_points.append(Point(list(shapely_line[0].coords)[0]))
        ## https://nathanw.net/2012/08/05/generating-chainage-distance-nodes-in-qgis/
        ## while the current cumulative distance is less than the total length of the line
        while current_dist < line_length:
            ## use interpolate and increase the current distance
            list_points.append(shapely_line.interpolate(current_dist))
            current_dist += distance
        ## append end coordinate to the list
        list_points.append(Point(list(shapely_line[0].coords)[-1]))

        ## add lines to the layer
        ## this can probably be cleaned up better
        ## but it works and is fast!
        for num, pt in enumerate(list_points, 1):
            ## start chainage 0
            if num == 1:
                angle = getAngle(pt, list_points[num])
                line_end_1 = getPoint1(pt, angle, tick_length/2)
                angle = getAngle(line_end_1, pt)
                line_end_2 = getPoint2(line_end_1, angle, tick_length)
                tick = LineString([(line_end_1.x, line_end_1.y), (line_end_2.x, line_end_2.y)])
                feat_dfn_ln = out_ln_lyr.GetLayerDefn()
                feat_ln = ogr.Feature(feat_dfn_ln)
                feat_ln.SetGeometry(ogr.CreateGeometryFromWkt(tick.wkt))
                feat_ln.SetField("CHAINAGE", 0)
                out_ln_lyr.CreateFeature(feat_ln)

            ## everything in between
            if num < len(list_points) - 1:
                angle = getAngle(pt, list_points[num])
                line_end_1 = getPoint1(list_points[num], angle, tick_length/2)
                angle = getAngle(line_end_1, list_points[num])
                line_end_2 = getPoint2(line_end_1, angle, tick_length)
                tick = LineString([(line_end_1.x, line_end_1.y), (line_end_2.x, line_end_2.y)])
                feat_dfn_ln = out_ln_lyr.GetLayerDefn()
                feat_ln = ogr.Feature(feat_dfn_ln)
                feat_ln.SetGeometry(ogr.CreateGeometryFromWkt(tick.wkt))
                feat_ln.SetField("CHAINAGE", distance * num)
                out_ln_lyr.CreateFeature(feat_ln)

            ## end chainage
            if num == len(list_points):
                angle = getAngle(list_points[num - 2], pt)
                line_end_1 = getPoint1(pt, angle, tick_length/2)
                angle = getAngle(line_end_1, pt)
                line_end_2 = getPoint2(line_end_1, angle, tick_length)
                tick = LineString([(line_end_1.x, line_end_1.y), (line_end_2.x, line_end_2.y)])
                feat_dfn_ln = out_ln_lyr.GetLayerDefn()
                feat_ln = ogr.Feature(feat_dfn_ln)
                feat_ln.SetGeometry(ogr.CreateGeometryFromWkt(tick.wkt))
                feat_ln.SetField("CHAINAGE", int(line_length))
                out_ln_lyr.CreateFeature(feat_ln)

del ds

The output is a linear feature class containing chainage ticks and distance attribute.

Please leave comments if this can be improved or if you found it useful.

OSGP: Measuring Geographic Distributions – Standard Distance

Posted on April 26, 2017 by Glen Bambrick

Interested in learning ArcPy? check out this course.

(Open Source Geospatial Python)

The ‘What is it?’

The Standard Distance, also know as the Standard Distance Deviation, is the average distance all features vary from the Mean Center and measures the compactness of a distribution. The Standard Distance is a value representing the distance in units from the Mean Center and is usually plotted on a map as a circle for a visual indication of dispersion, the Standard Distance is the radius.

The Standard Distance works best in the absence of a strong directional trend. According to Andy Mitchell, if a directional trend is present you are better off using the Standard Deviational Ellipse.

You can use the Standard Distance to compare territories between species, which has the wider/broader territory, or to compare changes over time such as the dispersion of burglaries for each calendar month.

In a Normal Distribution you would expect around 68% of all points to fall within the Standard Distance.

The Formula!

The mean x-coordinate is subtracted from the x-coordinate value for each point and the difference is squared. The sum of all the squared values for x minus the x-mean is divided by the number of points. This is also performed for y-coordinates. The resulting values for x and y are summed and then we take the square root of this value to return the value to original distance units.

First we get the mean X and Y…

…and then the Standard Distance

For Point features the X and Y coordinates of each feature is used, for Polygons the centroid of each feature represents the X and Y coordinate to use, and for Linear features the mid-point of each line is used for the X and Y coordinate.

The Code…

from osgeo import ogr
from shapely.geometry import MultiLineString
from shapely import wkt
import numpy as np
import sys, math

## set the driver for the data
driver = ogr.GetDriverByName("FileGDB")
## path to the FileGDB
gdb = r"C:\Users\Glen B\Documents\ArcGIS\Default.gdb"
## ope the GDB in write mode (1)
ds = driver.Open(gdb, 1)

input_lyr_name = "Birmingham_Burglaries_2016"

output_fc = "{0}_standard_distance".format(input_lyr_name)

## reference the layer using the layers name
if input_lyr_name in [ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(ds.GetLayerCount())]:
    lyr = ds.GetLayerByName(input_lyr_name)
    print "{0} found in {1}".format(input_lyr_name, gdb)

if output_fc in [ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(ds.GetLayerCount())]:
    ds.DeleteLayer(output_fc)
    print "Deleting: {0}".format(output_fc)

try:
    ## for points and polygons we use the centroid
    first_feat = lyr.GetFeature(1)
    if first_feat.geometry().GetGeometryName() in ["POINT", "MULTIPOINT", "POLYGON", "MULTIPOLYGON"]:
        xy_arr = np.ndarray((len(lyr), 2), dtype=np.float)
        for i, pt in enumerate(lyr):
            ft_geom = pt.geometry()
            xy_arr[i] = (ft_geom.Centroid().GetX(), ft_geom.Centroid().GetY())

    ## for lines we get the midpoint of a line
    elif first_feat.geometry().GetGeometryName() in ["LINESTRING", "MULTILINESTRING"]:
        xy_arr = np.ndarray((len(lyr), 2), dtype=np.float)
        for i, ln in enumerate(lyr):
            line_geom = ln.geometry().ExportToWkt()
            shapely_line = MultiLineString(wkt.loads(line_geom))
            midpoint = shapely_line.interpolate(shapely_line.length/2)
            xy_arr[i] = (midpoint.x, midpoint.y)

except Exception:
    print "Unknown geometry for {}".format(input_lyr_name)
    sys.exit()

avg_x, avg_y = np.mean(xy_arr, axis=0)

print "Mean Center: {0}, {1}".format(avg_x, avg_y)

sum_of_sq_diff_x = 0.0
sum_of_sq_diff_y = 0.0

for x, y in xy_arr:
    diff_x = math.pow(x - avg_x, 2)
    diff_y = math.pow(y - avg_y, 2)
    sum_of_sq_diff_x += diff_x
    sum_of_sq_diff_y += diff_y

sum_of_results = (sum_of_sq_diff_x/lyr.GetFeatureCount()) + (sum_of_sq_diff_y/lyr.GetFeatureCount())
standard_distance = math.sqrt(sum_of_results)
print "Standard Distance: {0}". format(standard_distance)

## create a point with the mean center
## and buffer by the standard distance
pnt = ogr.Geometry(ogr.wkbPoint)
pnt.AddPoint(avg_x, avg_y)
polygon = pnt.Buffer(standard_distance, 90)

## create a new polygon layer with the same spatial ref as lyr
out_lyr = ds.CreateLayer(output_fc, lyr.GetSpatialRef(), ogr.wkbPolygon)

## define and create new fields
x_fld = ogr.FieldDefn("X", ogr.OFTReal)
y_fld = ogr.FieldDefn("Y", ogr.OFTReal)
stnd_dst = ogr.FieldDefn("Standard_Distance", ogr.OFTReal)
out_lyr.CreateField(x_fld)
out_lyr.CreateField(y_fld)
out_lyr.CreateField(stnd_dst)

## add the standard distance buffer to the layer
feat_dfn = out_lyr.GetLayerDefn()
feat = ogr.Feature(feat_dfn)
feat.SetGeometry(polygon)
feat.SetField("X", avg_x)
feat.SetField("Y", avg_y)
feat.SetField("Standard_Distance", standard_distance)
out_lyr.CreateFeature(feat)

print "Created {0}".format(output_fc)

## free up resources
del feat, ds, lyr, out_lyr

I’d like to give credit to Logan Byers from GIS StackExchange who aided in speeding up the computational time using NumPy and for forcing me to begin learning the wonders of NumPy (getting there bit by bit).

The Example:

I downloaded crime data from DATA.POLICE.UK for the West Midlands Police from January 2016 to December 2016. I used some Python to extract just the Burglary data and made this into a feature class in the File GDB. Next, I downloaded OS Boundary Line data and clipped the Burglary data to just Birmingham. Everything was now in place to find the Standard Distance of all burglaries for Birmingham in 2016. (see The Other Scripts section at the bottom of this post for processing the data)

Running the script from The Code section above calculates the Standard Distance for burglaries in Birmingham for 2016 and creates a polygon feature class in the File GDB.

OSGP Mean Center: 407926.695396, 286615.428507
ArcGIS Mean Center: 407926.695396, 286615.428507

OSGP Standard Distance: 6416.076596
ArcGIS Standard Distance: 6416.076596

Also See…

Mean Center
Central Feature
Median Center
Initial Data Assessment

The Resources:

ESRI Guide to GIS Volume 2: Chapter 2 (I highly recommend this book)
see book review here.

Geoprocessing with Python

Python GDAL/OGR Cookbook

Setting up GDAL/OGR with FileGDB Driver for Python on Windows

< The Other Scripts >

1. Extract Burglary Data for West Midlands

import csv, os
from osgeo import ogr, osr

## set the driver for the data
driver = ogr.GetDriverByName("FileGDB")

## path to the FileGDB
gdb = r"C:\Users\Glen B\Documents\my_geodata.gdb"

## ope the GDB in write mode (1)
ds = driver.Open(gdb, 1)

## the coordinates in the csv files are lat/long
source = osr.SpatialReference()
source.ImportFromEPSG(4326)

## we need the data in British National Grid
target = osr.SpatialReference()
target.ImportFromEPSG(27700)

transform = osr.CoordinateTransformation(source, target)

## set the output fc name
output_fc = "WM_Burglaries_2016"

## if the output fc already exists delete it
if output_fc in [ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(ds.GetLayerCount())]:
    ds.DeleteLayer(output_fc)
    print "Deleting: {0}".format(output_fc)

out_lyr = ds.CreateLayer(output_fc, target, ogr.wkbPoint)

## define and create new fields
mnth_fld = ogr.FieldDefn("Month", ogr.OFTString)
rep_by_fld = ogr.FieldDefn("Reported_by", ogr.OFTString)
fls_wthn_fld = ogr.FieldDefn("Falls_within", ogr.OFTString)
loc_fld = ogr.FieldDefn("Location", ogr.OFTString)
lsoa_c_fld = ogr.FieldDefn("LSOA_code", ogr.OFTString)
lsoa_n_fld = ogr.FieldDefn("LSOA_name", ogr.OFTString)
crime_fld = ogr.FieldDefn("Crime_type", ogr.OFTString)
outcome_fld = ogr.FieldDefn("Last_outcome", ogr.OFTString)

out_lyr.CreateField(mnth_fld)
out_lyr.CreateField(rep_by_fld)
out_lyr.CreateField(fls_wthn_fld)
out_lyr.CreateField(loc_fld)
out_lyr.CreateField(lsoa_c_fld)
out_lyr.CreateField(lsoa_n_fld)
out_lyr.CreateField(crime_fld)
out_lyr.CreateField(outcome_fld)

## where the downloaded csv files reside
root_folder = r"C:\Users\Glen B\Documents\Crime"

## for each csv
for root,dirs,files in os.walk(root_folder):
    for filename in files:
        if filename.endswith(".csv"):
            csv_path = "{0}\\{1}".format(root, filename)
            with open(csv_path, "rb") as csvfile:
                reader = csv.reader(csvfile, delimiter=",")
                next(reader,None)
                ## create a point with attributes for each burglary
                for row in reader:
                    if row[9] == "Burglary":
                        pnt = ogr.Geometry(ogr.wkbPoint)
                        pnt.AddPoint(float(row[4]), float(row[5]))
                        pnt.Transform(transform)
                        feat_dfn = out_lyr.GetLayerDefn()
                        feat = ogr.Feature(feat_dfn)
                        feat.SetGeometry(pnt)
                        feat.SetField("Month", row[1])
                        feat.SetField("Reported_by", row[2])
                        feat.SetField("Falls_within", row[3])
                        feat.SetField("Location", row[6])
                        feat.SetField("LSOA_code", row[7])
                        feat.SetField("LSOA_name", row[8])
                        feat.SetField("Crime_type", row[9])
                        feat.SetField("Last_outcome", row[10])
                        out_lyr.CreateFeature(feat)

del ds, feat, out_lyr

2. Birmingham Burglaries Only

from osgeo import ogr

## required drivers
shp_driver = ogr.GetDriverByName("ESRI Shapefile")
gdb_driver = ogr.GetDriverByName("FileGDB")

## input boundary shapefile and file gdb
shapefile = r"C:\Users\Glen B\Documents\Crime\Data\GB\district_borough_unitary_region.shp"
gdb = r"C:\Users\Glen B\Documents\my_geodata.gdb"

## open the shapefile in read mode and gdb in write mode
shp_ds = shp_driver.Open(shapefile, 0)
gdb_ds = gdb_driver.Open(gdb, 1)

## reference the necessary layers
shp_layer = shp_ds.GetLayer(0)
gdb_layer = gdb_ds.GetLayerByName("WM_Burglaries_2016")

## filter the shapefile
shp_layer.SetAttributeFilter("NAME = 'Birmingham District (B)'")

## set the name for the output feature class
output_fc = "Birmingham_Burglaries_2016"

## if the output already exists then delete it
if output_fc in [gdb_ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(gdb_ds.GetLayerCount())]:
    gdb_ds.DeleteLayer(output_fc)
    print "Deleting: {0}".format(output_fc)

## create an output layer
out_lyr = gdb_ds.CreateLayer(output_fc, shp_layer.GetSpatialRef(), ogr.wkbPoint)

## copy the schema from the West Midlands burglaries
## and use it for the Birmingham burglaries
lyr_def = gdb_layer.GetLayerDefn()
for i in range(lyr_def.GetFieldCount()):
    out_lyr.CreateField (lyr_def.GetFieldDefn(i))

## only get burglaries that intersect the Birmingham region
for shp_feat in shp_layer:
    print shp_feat.GetField("NAME")
    birm_geom = shp_feat.GetGeometryRef()
    for gdb_feat in gdb_layer:
        burglary_geom = gdb_feat.GetGeometryRef()
        if burglary_geom.Intersects(birm_geom):
            feat_dfn = out_lyr.GetLayerDefn()
            feat = ogr.Feature(feat_dfn)
            feat.SetGeometry(burglary_geom)

            ## populate the attribute table
            for i in range(lyr_def.GetFieldCount()):
                feat.SetField(lyr_def.GetFieldDefn(i).GetNameRef(), gdb_feat.GetField(i))
            ## create the feature
            out_lyr.CreateFeature(feat)
            feat.Destroy()

del shp_ds, shp_layer, gdb_ds, gdb_layer

The Usual 🙂

As always please feel free to comment to help make the code more efficient, highlight errors, or let me know if this was of any use to you.

OSGP: Measuring Geographic Distributions – Weighted Mean Center

Posted on April 25, 2017 by Glen Bambrick

Interested in learning ArcPy? check out this course.

(Open Source Geospatial Python)

The ‘What is it?’

See Mean Center.

The unweighted center is mainly used for events that occur at a place and time such as burglaries. The weighted center, however, is predominantly used for stationary features such as retail outlets or delineated areas for example (such as Census tracts). The Weighted Mean Center does not take into account distance between features in the dataset.

The weight needs to be a numerical attribute. The greater the value, the higher the weight for that feature.

The Formula!

The Weighted Mean Center is calculated by multiplying the x and y coordinate by the weight for that feature and summing all for both x and y individually, and then dividing this by the sum of all the weights.

The Code…

from osgeo import ogr
from shapely.geometry import MultiLineString
from shapely import wkt
import numpy as np
import sys

## set the driver for the data
driver = ogr.GetDriverByName("ESRI Shapefile")
## folder where the shapefile resides
folder = r"C:\Users\glen.bambrick\Documents\GDAL\shp\\"
## name of the shapefile concatenated with folder
shp = "{0}Census2011_Small_Areas_generalised20m.shp".format(folder)
## open the shapefile
ds = driver.Open(shp, 0)
## reference the only layer in the shapefile
lyr = ds.GetLayer(0)

## create an output data source
out_ds = driver.CreateDataSource("{0}{1}_wgt_mean_center.shp".format(folder,lyr.GetName()))

## output mean center weighted filename
output_fc = "{0}{1}_wgt_mean_center".format(folder,lyr.GetName())

## field that has numerical weight
weight_fld = "TOTAL2011"

try:
    first_feat = lyr.GetFeature(1)
    xy_arr = np.ndarray((len(lyr), 2), dtype=np.float)
    wgt_arr = np.ndarray((len(lyr), 1), dtype=np.float)
    ## use the centroid for points and polygons
    if first_feat.geometry().GetGeometryName() in ["POINT", "MULTIPOINT", "POLYGON", "MULTIPOLYGON"]:
        for i, pt in enumerate(lyr):
            ft_geom = pt.geometry()
            weight = pt.GetField(weight_fld)
            xy_arr[i] = (ft_geom.Centroid().GetX() * weight, ft_geom.Centroid().GetY() * weight)
            wgt_arr[i] = weight
    ## midpoint of lines
    elif first_feat.geometry().GetGeometryName() in ["LINESTRING", "MULTILINESTRING"]:
        for i, ln in enumerate(lyr):
            line_geom = ln.geometry().ExportToWkt()
            weight = ln.GetField(weight_fld)
            shapely_line = MultiLineString(wkt.loads(line_geom))
            midpoint = shapely_line.interpolate(shapely_line.length/2)
            xy_arr[i] = (midpoint.x * weight, midpoint.y * weight)
            wgt_arr[i] = weight

except Exception:
    print "Unknown geometry or Incorrect field name for {}".format(input_lyr_name)
    sys.exit()

## do the maths
sum_x, sum_y = np.sum(xy_arr, axis=0)
sum_wgt = np.sum(wgt_arr)
weighted_x, weighted_y = sum_x/sum_wgt, sum_y/sum_wgt

print "Weighted Mean Center: {0}, {1}".format(weighted_x, weighted_y)

## create a new point layer with the same spatial ref as lyr
out_lyr = out_ds.CreateLayer(output_fc, lyr.GetSpatialRef(), ogr.wkbPoint)

## define and create new fields
x_fld = ogr.FieldDefn("X", ogr.OFTReal)
y_fld = ogr.FieldDefn("Y", ogr.OFTReal)
out_lyr.CreateField(x_fld)
out_lyr.CreateField(y_fld)

## create a new point for the mean center weighted
pnt = ogr.Geometry(ogr.wkbPoint)
pnt.AddPoint(weighted_x, weighted_y)

## add the mean center weighted to the new layer
feat_dfn = out_lyr.GetLayerDefn()
feat = ogr.Feature(feat_dfn)
feat.SetGeometry(pnt)
feat.SetField("X", weighted_x)
feat.SetField("Y", weighted_y)
out_lyr.CreateFeature(feat)

print "Created: {0}.shp".format(output_fc)

## free up resources
del ds, out_ds, lyr, first_feat, feat, out_lyr

The Example:

I downloaded the Small Areas of Ireland from the CSO. You will have to acknowledge a disclaimer. The data contains population information for the 2011 Census. Once downloaded unzip Census2011_Small_Areas_generalised20m.zip to working folder.

Running the script from The Code section above calculates the Weighted Mean Center of all Small Areas based on the population count for each for 2011 and creates a point Shapefile as the output.

OSGP Weighted Mean Center: 238557.427484, 208347.116116
ArcGIS Weighted Mean Center: 238557.427484, 208347.116116

Also See…

Mean Center
Central Feature
Median Center
Initial Data Assessment

The Resources:

ESRI Guide to GIS Volume 2: Chapter 2 (I highly recommend this book)
see book review here.

Geoprocessing with Python

Python GDAL/OGR Cookbook

The Usual 🙂

As always please feel free to comment to help make the code more efficient, highlight errors, or let me know if this was of any use to you.

OSGP: Standard GIS Tools – Initial Data Assessment

Posted on April 24, 2017 by Glen Bambrick

(Open Source Geospatial Python)

Here we will look at the general makeup of a downloaded spatial dataset – a Shapefile from the Central Statistics Office in Ireland containing census data from 2011. We will look at getting the spatial reference of the file along with a breakdown of the field names, type, width and precision. We can print the top ten records or the entire attribute table and get a list of unique values for a field and the count of each.

Download the Small Areas of Ireland from the CSO. You will have to acknowledge a disclaimer. Once downloaded unzip Census2011_Small_Areas_generalised20m.zip to working folder. We will now begin to interrogate this Shapefile.

First we import the necessary modules…

# import modules
from osgeo import ogr
from tabulate import tabulate
from operator import itemgetter

tabulate will allow us to print out formatted tables. Using ogr we can access the inner workings of the downloaded Shapefile. Please note that osgeo and tabulate are not standard Python libraries and will need to be installed.

Using the ESRI Shapefile driver we open the Shapefile in read mode (0) and access the data (lyr).

# use Shapefile driver
driver = ogr.GetDriverByName("ESRI Shapefile")
# reference Shapefile
shp = r"C:\Users\Glen B\Documents\GDAL\shp\Census2011_Small_Areas_generalised20m.shp"
# open the file
ds = driver.Open(shp, 0)
# reference the only layer in a Shapefile
lyr = ds.GetLayer(0)

Spatial Reference Information

Straight away we cant print the spatial reference information associated with the Shapefile (contained in the .prj file)

print lyr.GetSpatialRef()

This will print out…

PROJCS["TM65_Irish_Grid",
    GEOGCS["GCS_TM65",
        DATUM["TM65",
            SPHEROID["Airy_Modified",6377340.189,299.3249646]],
        PRIMEM["Greenwich",0.0],
        UNIT["Degree",0.0174532925199433]],
    PROJECTION["Transverse_Mercator"],
    PARAMETER["False_Easting",200000.0],
    PARAMETER["False_Northing",250000.0],
    PARAMETER["Central_Meridian",-8.0],
    PARAMETER["Scale_Factor",1.000035],
    PARAMETER["Latitude_Of_Origin",53.5],
    UNIT["Meter",1.0]]

You can also access this information individually…

# projected coordinate system
proj_string = lyr.GetSpatialRef().GetAttrValue("PROJCS", 0)
# geographic coordinate system
geog_string = lyr.GetSpatialRef().GetAttrValue("GEOGCS", 0)
# EPSG Code if available
epsg = lyr.GetSpatialRef().GetAttrValue("AUTHORITY", 1)
# datum
datum = lyr.GetSpatialRef().GetAttrValue("DATUM", 0)

print "\nFile: {0}\n\nProjected: {1}\nEPSG: {2}\n".format(lyr.GetName(),proj_string, epsg)
print "Geographic: {0}\nDatum: {1}\n".format(geog_string, datum)

The output…

File: Census2011_Small_Areas_generalised20m

Projected: TM65_Irish_Grid
EPSG: None

Geographic: GCS_TM65
Datum: TM65

If there is an EPSG code in the .prj file it will be printed instead of None.

Geometry Type

If we reference the first feature we can get the geometry of the Shapefile…

first_feat = lyr.GetFeature(1)
print "Geometry Type: {0}\n".format(first_feat.geometry().GetGeometryName())

In this instance it is a polygon Shapefile.

Geometry Type: POLYGON

Field Information

Let’s get some information on the data through the Layer Definition.

# https://pcjericks.github.io/py-gdalogr-cookbook/vector_layers.html
lyr_def = lyr.GetLayerDefn()

But before we do we need to create a few list structures. These will be used to hold the accessed information and enable us to neatly print them to screen.

# list to hold headers for filed information
header_list = ["FIELD NAME", "TYPE", "WIDTH", "PRECISION"]
# list will be populated with field information
output_list = []
# list will be populated with field names and used for attribute headers
fld_names = []

Cycle through each field and populate the necessary lists…

# for each field
for i in range(lyr_def.GetFieldCount()):
    # reference the field name
    fld_name = lyr_def.GetFieldDefn(i).GetName()
    # reference the field type
    fld_type = lyr_def.GetFieldDefn(i).GetFieldTypeName(lyr_def.GetFieldDefn(i).GetType())
    # reference the field width
    fld_width = lyr_def.GetFieldDefn(i).GetWidth()
    # reference the field precision
    fld_precision = lyr_def.GetFieldDefn(i).GetPrecision()
    # append these as a list to the output_list
    output_list.append([fld_name, fld_type, str(fld_width), str(fld_precision)])
    # append field name to fld_name
    fld_names.append(fld_name)

The output_list is a list of lists containing information for each field, the field name, data type, width and precision, this is matched in the header_list. The fld_names will be used further down to print out attributes, this list hold the field names as headers. Let’s print the field information…

print "{0}\n".format(tabulate(output_list, header_list))

Here’s the output…

FIELD NAME   TYPE     WIDTH   PRECISION
------------ ------ ------- -----------
NUTS1        String       3           0
NUTS1NAME    String       7           0
NUTS2        String       4           0
NUTS2NAME    String      26           0
NUTS3        String       5           0
NUTS3NAME    String      15           0
COUNTY       String       2           0
COUNTYNAME   String      25           0
CSOED        String      11           0
OSIED        String      13           0
EDNAME       String      45           0
SMALL_AREA   String      61           0
GEOGID       String      65           0
MALE2011     Real        20          10
FEMALE2011   Real        20          10
TOTAL2011    Real        20          10
PPOCC2011    Real        20          10
UNOCC2011    Real        20          10
VACANT2011   Real        20          10
HS2011       Real        20          10
PCVAC2011    Real        20          10
CREATEDATE   String      10           0

Attribute Table

Next we print out some attributes for a set of features, the first ten.

# number of features from the first to print attributes for
num_to_return = 10
#num_to_return = lyr.GetFeatureCount()

Use the commented out line if you want to print attributes for all features. Create an empty list to hold the attributes. Some fields contain characters from the Irish language so we account for this so that the attributes are printed correctly.

# list will be populated with attribute data
att_table = []

# for each feature in the Shapefile
for count, feature in enumerate(lyr):
    # up to the number of set features to print
    if count < num_to_return:
        # count will beacome the Feature ID
        atts = [count]
        # for each field append the data to atts list
        for name in fld_names:
            try:
                # if the attribute is a string then decode with Celtic Languages
                atts.append(feature.GetField(name).decode("iso8859_14"))
            except Exception:
                atts.append(feature.GetField(name))
        # append the data for the feature to the att_table list
        att_table.append(atts)

The count becomes the Feature ID but we have no field for this so we will create one…

# add a FID header (count)
fld_names.insert(0, "FID")

So let’s print out the attributes…

print tabulate(att_table, fld_names)
print "{0} out of {1} features".format(num_to_return, lyr.GetFeatureCount())

Here’s the output…

  FID NUTS1   NUTS1NAME   NUTS2   NUTS2NAME            NUTS3   NUTS3NAME         COUNTY COUNTYNAME       CSOED   OSIED EDNAME                           SMALL_AREA GEOGID       MALE2011   FEMALE2011   TOTAL2011   PPOCC2011   UNOCC2011   VACANT2011   HS2011   PCVAC2011 CREATEDATE
----- ------- ----------- ------- -------------------- ------- --------------- -------- -------------- ------- ------- ------------------------------ ------------ ---------- ---------- ------------ ----------- ----------- ----------- ------------ -------- ----------- ------------
    0 IE0     Ireland     IE02    Southern and Eastern IE022   Mid-East              15 Wicklow County   15039  257005 Aughrim                           257005002 A257005002        137          138         275          84          18           15      102        14.7 27-03-2012
    1 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01054  017049 Tinnahinch                        017049001 A017049001        186          176         362         111          25           24      136        17.6 27-03-2012
    2 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01053  017032 Marley                            017032001 A017032001        194          173         367         121           8            5      129         3.9 27-03-2012
    3 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01054  017049 Tinnahinch                        017049002 A017049002         75           75         150          67          29           29       96        30.2 27-03-2012
    4 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01054  017049 Tinnahinch                        017049003 A017049003         84           81         165          64          16           14       80        17.5 27-03-2012
    5 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01015  017005 Ballyellin                        017005002 A017005002        105           99         204          71           6            5       77         6.5 27-03-2012
    6 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01015  017005 Ballyellin                        017005001 A017005001        115          108         223          70           9            8       79        10.1 27-03-2012
    7 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01033  017033 Muinebeag (Bagenalstown) Rural    017033001 A017033001        201          205         406         143          15           14      158         8.9 27-03-2012
    8 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01034  017034 Muinebeag (Bagenalstown) Urban    017034002 A017034002        142          116         258          89           9            9       98         9.2 27-03-2012
    9 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01034  017034 Muinebeag (Bagenalstown) Urban    017034003 A017034003        174          169         343         107           6            4      113         3.5 27-03-2012
10 out of 18488 features

Unique Values and Counts

Next we’ll get a list of the unique COUNTYNAME entries and a count to see how many small areas are in each. (The below works for text fields only)

# rest to first feature
lyr.ResetReading()

# field to return unique list and count of
field = "COUNTYNAME"

# create empty dictionary
values_dict = {}

# for each feature
for feature in lyr:
    attribute = feature.GetField(field).decode("iso8859_14")
    # if the COUNTYNAME is not already in the dictionary add it and assign a value of 1
    if attribute not in values_dict:
        values_dict[attribute] = 1
    # otherwise do not add it and increase the existing value by 1
    else:
        values_dict[attribute] = values_dict[attribute] + 1

## convert dictionary to list for use with tabulate
key_value_list = [[key, value] for key, value in values_dict.items()]

## print results
print "\nTotal Feature Count: {0}\n".format(lyr.GetFeatureCount())
print tabulate(sorted(key_value_list), [field, "Count"])

And here’s the output…

Total Feature Count: 18488

COUNTYNAME               Count
---------------------- -------
Carlow County              210
Cavan County               322
Clare County               511
Cork City                  519
Cork County               1650
Donegal County             761
Dublin City               2202
Dún Laoghaire-Rathdown     760
Fingal                     938
Galway City                307
Galway County              741
Kerry County               701
Kildare County             731
Kilkenny County            372
Laois County               305
Leitrim County             173
Limerick City              258
Limerick County            514
Longford County            179
Louth County               462
Mayo County                643
Meath County               636
Monaghan County            244
North Tipperary            283
Offaly County              286
Roscommon County           303
Sligo County               307
South Dublin               906
South Tipperary            350
Waterford City             198
Waterford County           275
Westmeath County           338
Wexford County             615
Wicklow County             488

Alternatively we could print out based on the highest count descending by replacing the last print statement with…

# http://stackoverflow.com/questions/17555218/python-how-to-sort-a-list-of-lists-by-the-fourth-element-in-each-list
print tabulate(sorted(key_value_list, key = itemgetter(1), reverse = True), [field, "Count"])

…to get…

COUNTYNAME               Count
---------------------- -------
Dublin City               2202
Cork County               1650
Fingal                     938
South Dublin               906
Donegal County             761
...

I will add to these as I come across something useful. If you know of any neat things to add please comment below. Please also comment if anything is unclear or if this was useful to you.

OSGP: Measuring Geographic Distributions – Median Center

Posted on March 22, 2017 by Glen Bambrick

This blog post has a new home at Final Draft Mapping.

Measuring Geographic Distributions with GeoPandas: Median Center

(Open Source Geospatial Python)

The ‘What is it?’

Also known as the Center of Minimum Distance, the Median Center is a location that is the shortest total distance to all features in the study area (not to be confused with the Central Feature, which is the feature that is the shortest distance to all others). It can be used to find a suitable location for something that needs to be centrally located. The Median Center will gravitate towards an area with the most features.

The Median Center is good for finding the most accessible location.

The Formula!

The is no single formula or equation for calculating an exact Median Center, according to Andy Mitchell it is an iterative process involving calculating the Mean Center, summing the distances from it to each feature, offsetting the center slightly and summing the distances again until it eventually zones in on the optimum location that has the lowest sum.

The code below implements the Yehuda Vardi and Cun-Hui Zhang algorithm or the Weiszfeld algorithm.

The Code…

import math, sys
import numpy as np
from osgeo import ogr
from scipy.spatial.distance import cdist

## "W" for Weiszfield
## "YC" for Yehuda Vardi and Cun-Hui Zhang
algorithm = "YC"

## Weiszfield
## https://gist.github.com/endolith/2837160
def numersum(test_median,dataPoint):
    ## Provides the denominator of the weiszfeld algorithm depending on whether
    ## you are adjusting the candidate x or y
    return 1/math.sqrt((test_median[0]-dataPoint[0])**2 + (test_median[1]-dataPoint[1])**2)

def denomsum(test_median, xy_arr):
    ## Provides the denominator of the weiszfeld algorithm
    temp = 0.0
    for i in range(0,len(xy_arr)):
        temp += 1/math.sqrt((test_median[0] - xy_arr[i][0])**2 + (test_median[1] - xy_arr[i][1])**2)
    return temp

## Yehuda Vardi and Cun-Hui Zhang
## http://stackoverflow.com/questions/30299267/geometric-median-of-multidimensional-points
## user: orlp
def geometric_median(X, eps=1e-5):
    y = np.mean(X, 0)

    while True:
        D = cdist(X, [y])
        nonzeros = (D != 0)[:, 0]
        Dinv = 1 / D[nonzeros]
        Dinvs = np.sum(Dinv)
        W = Dinv / Dinvs
        T = np.sum(W * X[nonzeros], 0)
        num_zeros = len(X) - np.sum(nonzeros)
        if num_zeros == 0:
            y1 = T
        elif num_zeros == len(X):
            return y
        else:
            R = (T - y) * Dinvs
            r = np.linalg.norm(R)
            rinv = 0 if r == 0 else num_zeros/r
            y1 = max(0, 1-rinv)*T + min(1, rinv)*y
        if np.linalg.norm(y - y1) < eps:
            return y1
        y = y1

## set the driver for the data
driver = ogr.GetDriverByName("FileGDB")

## path to the FileGDB
gdb = r"C:\Users\Glen B\Documents\my_geodata.gdb"

## ope the GDB in write mode (1)
ds = driver.Open(gdb, 1)

## input feature class
input_lyr_name = "Birmingham_Secondary_Schools"

## name of output feature class
output_fc = "{0}_median_center".format(input_lyr_name)

## reference the layer using the layers name
if input_lyr_name in [ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(ds.GetLayerCount())]:
    lyr = ds.GetLayerByName(input_lyr_name)
    print "{0} found in {1}".format(input_lyr_name, gdb)

## if the output layer already exists then delete it
if output_fc in [ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(ds.GetLayerCount())]:
    ds.DeleteLayer(output_fc)
    print "Deleting: {0}".format(output_fc)

## create an array with coordinates of each feature
try:
    first_feat = lyr.GetFeature(1)
    ## centroid for points and polygons
    if first_feat.geometry().GetGeometryName() in ["POINT", "MULTIPOINT", "POLYGON", "MULTIPOLYGON"]:
        xy_arr = np.ndarray((len(lyr), 2), dtype=np.float)
        for i, pt in enumerate(lyr):
            ft_geom = pt.geometry()
            xy_arr[i] = (ft_geom.Centroid().GetX(), ft_geom.Centroid().GetY())

    ## for lines we get the midpoint of a line
    elif first_feat.geometry().GetGeometryName() in ["LINESTRING", "MULTILINESTRING"]:
        xy_arr = np.ndarray((len(lyr), 2), dtype=np.float)
        for i, ln in enumerate(lyr):
            line_geom = ln.geometry().ExportToWkt()
            shapely_line = MultiLineString(wkt.loads(line_geom))
            midpoint = shapely_line.interpolate(shapely_line.length/2)
            xy_arr[i] = (midpoint.x, midpoint.y)

except Exception:
    print "Unknown geometry for {}".format(input_lyr_name)
    sys.exit()

## if using Weiszfield
if algorithm == "W":
    ## https://gist.github.com/endolith/2837160
    avg_x, avg_y = np.mean(xy_arr, axis=0)
    test_median = [avg_x, avg_y]
    numIter = 50

## minimise the objective function
for x in range(0,numIter):
    denom = denomsum(test_median,xy_arr)
    nextx = 0.0
    nexty = 0.0

    for y in range(0,len(xy_arr)):
        nextx += (xy_arr[y][0] * numersum(test_median,xy_arr[y]))/denom
        nexty += (xy_arr[y][1] * numersum(test_median,xy_arr[y]))/denom

    test_median = [nextx,nexty]

## if using Yehuda Vardi and Cun-Hui Zhang
elif algorithm == "YC":
    test_median = geometric_median(xy_arr)

print "Median Center: {0}, {1}".format(test_median[0], test_median[1])

## create a new point layer with the same spatial ref as lyr
out_lyr = ds.CreateLayer(output_fc, lyr.GetSpatialRef(), ogr.wkbPoint)

## define and create new fields
x_fld = ogr.FieldDefn("X", ogr.OFTReal)
y_fld = ogr.FieldDefn("Y", ogr.OFTReal)
out_lyr.CreateField(x_fld)
out_lyr.CreateField(y_fld)

## create a new point for the mean center
pnt = ogr.Geometry(ogr.wkbPoint)
pnt.AddPoint(test_median[0], test_median[1])

## add the mean center to the new layer
feat_dfn = out_lyr.GetLayerDefn()
feat = ogr.Feature(feat_dfn)
feat.SetGeometry(pnt)
feat.SetField("X", test_median[0])
feat.SetField("Y", test_median[1])
out_lyr.CreateFeature(feat)

print "Created {0}".format(output_fc)

## free up resources
del ds, lyr, first_feat, feat, out_lyr

I’d like to give credit to…
Logan Byers from GIS StackExchange who aided in speeding up the computational time using NumPy and for forcing me to begin learning the wonders of NumPy.
orlp from Stack Overflow for their implementation of Yehuda Vardi and Cun-Hui Zhang’s algorithm for the geometric median.
Daniel J Lewis (I think) for the implementation of the Weiszfeld algorithm.

The Example:

I downloaded vector data that contains polygons for schools (and other features) from OS Open Map – Local that covered the West Midlands. I also downloaded OS Boundary Line data. Using Python and GDAL/OGR I extracted secondary schools from the data for Birmingham only. Everything was now in place to find the Median Center of all Secondary Schools for Birmingham. (see The Other Scripts section at the bottom of this post for processing the data)

Running the script from The Code section above calculates the coordinates of the Median Center for Secondary Schools in Birmingham and creates a point feature class in the File GDB.

OSGP Median Center (W): 407658.278755, 286696.905759
OSGP Median Center (YC): 407658.278752, 286696.905769
ArcGIS Median Center: 407658.009375, 286697.53996

What’s Next?

Weighted Mean Center (link will be updated once post is ready)

Also See…

Mean Center
Central Feature

The Resources:

ESRI Guide to GIS Volume 2: Chapter 2
see book review here.

Geoprocessing with Python

Python GDAL/OGR Cookbook

Setting up GDAL/OGR with FileGDB Driver for Python on Windows

< The Other Scripts >

Birmingham Secondary Schools

from osgeo import ogr
import os

## necessary drivers
shp_driver = ogr.GetDriverByName("ESRI Shapefile")
gdb_driver = ogr.GetDriverByName("FileGDB")

## input boundary shapefile and file reference file gdb
shapefile = r"C:\Users\Glen B\Documents\Schools\Data\GB\district_borough_unitary_region.shp"
gdb = r"C:\Users\Glen B\Documents\my_geodata.gdb"

shp_ds = shp_driver.Open(shapefile, 0)
gdb_ds = gdb_driver.Open(gdb, 1)

## filter boundary to just Birmingham
shp_layer = shp_ds.GetLayer(0)
shp_layer.SetAttributeFilter("NAME = 'Birmingham District (B)'")

## name the output
output_fc = "Birmingham_Secondary_Schools"

## if the output feature class already exists then delete it
if output_fc in [gdb_ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(gdb_ds.GetLayerCount())]:
    gdb_ds.DeleteLayer(output_fc)
    print "Deleting: {0}".format(output_fc)

## create the output feature class
out_lyr = gdb_ds.CreateLayer(output_fc, shp_layer.GetSpatialRef(), ogr.wkbPolygon)

## the folder that contains the data to extract Secondary Schools from
root_folder = r"C:\Users\Glen B\Documents\Schools\Vector\data"

## traverse through the folders and find ImportantBuildings files
## copy only those that intersect the Birmingham region
## transfer across attributes
count = 1
for root,dirs,files in os.walk(root_folder):
    for filename in files:
        if filename.endswith("ImportantBuilding.shp") and filename[0:2] in ["SP", "SO", "SJ", "SK"]:
            shp_path = "{0}\\{1}".format(root, filename)
            schools_ds = shp_driver.Open(shp_path, 0)
            schools_lyr = schools_ds.GetLayer(0)
            schools_lyr.SetAttributeFilter("CLASSIFICA = 'Secondary Education'")
            lyr_def = schools_lyr.GetLayerDefn()
            if count == 1:
                for i in range(lyr_def.GetFieldCount()):
                    out_lyr.CreateField(lyr_def.GetFieldDefn(i))
                count += 1
            shp_layer.ResetReading()
            for shp_feat in shp_layer:
                birm_geom = shp_feat.GetGeometryRef()

                for school_feat in schools_lyr:
                    school_geom = school_feat.GetGeometryRef()

                    if school_geom.Intersects(birm_geom):
                        feat_dfn = out_lyr.GetLayerDefn()
                        feat = ogr.Feature(feat_dfn)
                        feat.SetGeometry(school_geom)
                        for i in range(lyr_def.GetFieldCount()):
                            feat.SetField(lyr_def.GetFieldDefn(i).GetNameRef(), school_feat.GetField(i))

                        out_lyr.CreateFeature(feat)
                        feat.Destroy()

del shp_ds, shp_layer, gdb_ds

The Usual 🙂

As always please feel free to comment to help make the code more efficient, highlight errors, or let me know if this was of any use to you.

Geospatiality

make geospatial your speciality

Category Archives: GIS

OSGP: Create Chainage ticks along a Line at Specified Distance Intervals

OSGP: Measuring Geographic Distributions – Standard Distance

OSGP: Measuring Geographic Distributions – Weighted Mean Center

OSGP: Standard GIS Tools – Initial Data Assessment

OSGP: Measuring Geographic Distributions – Median Center