OSGP: Create Chainage ticks along a Line at Specified Distance Intervals

This builds on from the previous post creating points at specified distances along a line. Here, we create perpendicular chainage ticks that traverse the main line.

from osgeo import ogr
from shapely.geometry import MultiLineString, LineString, Point
from shapely import wkt
import sys, math

## http://wikicode.wikidot.com/get-angle-of-line-between-two-points
## angle between two points
def getAngle(pt1, pt2):
    x_diff = pt2.x - pt1.x
    y_diff = pt2.y - pt1.y
    return math.degrees(math.atan2(y_diff, x_diff))

## start and end points of chainage tick
## get the first end point of a tick
def getPoint1(pt, bearing, dist):
    angle = bearing + 90
    bearing = math.radians(angle)
    x = pt.x + dist * math.cos(bearing)
    y = pt.y + dist * math.sin(bearing)
    return Point(x, y)
## get the second end point of a tick
def getPoint2(pt, bearing, dist):
    bearing = math.radians(bearing)
    x = pt.x + dist * math.cos(bearing)
    y = pt.y + dist * math.sin(bearing)
    return Point(x, y)

## set the driver for the data
driver = ogr.GetDriverByName("FileGDB")
## path to the FileGDB
gdb = r"C:\Users\******\Documents\ArcGIS\Default.gdb"
## open the GDB in write mode (1)
ds = driver.Open(gdb, 1)

## linear feature class
input_lyr_name = "input_line"

## distance between each points
distance = 10
## the length of each tick
tick_length = 20

## output tick line fc name
output_lns = "{0}_{1}m_lines".format(input_lyr_name, distance)

## list to hold all the point coords
list_points = []

## reference the layer using the layers name
if input_lyr_name in [ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(ds.GetLayerCount())]:
    lyr = ds.GetLayerByName(input_lyr_name)
    print "{0} found in {1}".format(input_lyr_name, gdb)

## if the output already exists then delete it
if output_lns in [ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(ds.GetLayerCount())]:
    ds.DeleteLayer(output_lns)
    print "Deleting: {0}".format(output_lns)

## create a new line layer with the same spatial ref as lyr
out_ln_lyr = ds.CreateLayer(output_lns, lyr.GetSpatialRef(), ogr.wkbLineString)

## distance/chainage attribute
chainage_fld = ogr.FieldDefn("CHAINAGE", ogr.OFTReal)
out_ln_lyr.CreateField(chainage_fld)
## check the geometry is a line
first_feat = lyr.GetFeature(1)

## accessing linear feature classes using FileGDB driver always returns a MultiLinestring
if first_feat.geometry().GetGeometryName() in ["LINESTRING", "MULTILINESTRING"]:
    for ln in lyr:
        ## list to hold all the point coords
        list_points = []
        ## set the current distance to place the point
        current_dist = distance
        ## get the geometry of the line as wkt
        line_geom = ln.geometry().ExportToWkt()
        ## make shapely MultiLineString object
        shapely_line = MultiLineString(wkt.loads(line_geom))
        ## get the total length of the line
        line_length = shapely_line.length
        ## append the starting coordinate to the list
        list_points.append(Point(list(shapely_line[0].coords)[0]))
        ## https://nathanw.net/2012/08/05/generating-chainage-distance-nodes-in-qgis/
        ## while the current cumulative distance is less than the total length of the line
        while current_dist < line_length:
            ## use interpolate and increase the current distance
            list_points.append(shapely_line.interpolate(current_dist))
            current_dist += distance
        ## append end coordinate to the list
        list_points.append(Point(list(shapely_line[0].coords)[-1]))

        ## add lines to the layer
        ## this can probably be cleaned up better
        ## but it works and is fast!
        for num, pt in enumerate(list_points, 1):
            ## start chainage 0
            if num == 1:
                angle = getAngle(pt, list_points[num])
                line_end_1 = getPoint1(pt, angle, tick_length/2)
                angle = getAngle(line_end_1, pt)
                line_end_2 = getPoint2(line_end_1, angle, tick_length)
                tick = LineString([(line_end_1.x, line_end_1.y), (line_end_2.x, line_end_2.y)])
                feat_dfn_ln = out_ln_lyr.GetLayerDefn()
                feat_ln = ogr.Feature(feat_dfn_ln)
                feat_ln.SetGeometry(ogr.CreateGeometryFromWkt(tick.wkt))
                feat_ln.SetField("CHAINAGE", 0)
                out_ln_lyr.CreateFeature(feat_ln)

            ## everything in between
            if num < len(list_points) - 1:
                angle = getAngle(pt, list_points[num])
                line_end_1 = getPoint1(list_points[num], angle, tick_length/2)
                angle = getAngle(line_end_1, list_points[num])
                line_end_2 = getPoint2(line_end_1, angle, tick_length)
                tick = LineString([(line_end_1.x, line_end_1.y), (line_end_2.x, line_end_2.y)])
                feat_dfn_ln = out_ln_lyr.GetLayerDefn()
                feat_ln = ogr.Feature(feat_dfn_ln)
                feat_ln.SetGeometry(ogr.CreateGeometryFromWkt(tick.wkt))
                feat_ln.SetField("CHAINAGE", distance * num)
                out_ln_lyr.CreateFeature(feat_ln)

            ## end chainage
            if num == len(list_points):
                angle = getAngle(list_points[num - 2], pt)
                line_end_1 = getPoint1(pt, angle, tick_length/2)
                angle = getAngle(line_end_1, pt)
                line_end_2 = getPoint2(line_end_1, angle, tick_length)
                tick = LineString([(line_end_1.x, line_end_1.y), (line_end_2.x, line_end_2.y)])
                feat_dfn_ln = out_ln_lyr.GetLayerDefn()
                feat_ln = ogr.Feature(feat_dfn_ln)
                feat_ln.SetGeometry(ogr.CreateGeometryFromWkt(tick.wkt))
                feat_ln.SetField("CHAINAGE", int(line_length))
                out_ln_lyr.CreateFeature(feat_ln)

del ds

The output is a linear feature class containing chainage ticks and distance attribute.

chainage_ticks_along_linechainage_ticks_along_line_attributes

Please leave comments if this can be improved or if you found it useful.

OSGP: Create Points at Specified Distance Interval Along a Line

This workflow with Python using OGR and Shapely creates points along a line at a specified distance interval. I use the FileGDB driver here to read from and write data to but you can change these to suit your requirements. The code is commented…

from osgeo import ogr
from shapely.geometry import MultiLineString, Point
from shapely import wkt
import sys

## set the driver for the data
driver = ogr.GetDriverByName("FileGDB")
################################################################################
## CHANGE gdb, input_lyr_name, distance, output_pts (optional)

## path to the FileGDB
gdb = r"C:\Users\******\Documents\ArcGIS\Default.gdb"
## open the GDB in write mode (1)
ds = driver.Open(gdb, 1)

## single linear feature
input_lyr_name = "input_line"

## distance between each points
distance = 10

## output point fc name
output_pts = "{0}_{1}m_points".format(input_lyr_name, distance)
################################################################################

## reference the layer using the layers name
if input_lyr_name in [ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(ds.GetLayerCount())]:
    lyr = ds.GetLayerByName(input_lyr_name)
    print "{0} found in {1}".format(input_lyr_name, gdb)
## if the feature class cannot be found exit gracefully
else:
    print "{0} NOT found in {1}".format(input_lyr_name, gdb)
    sys.exit()

## if the output already exists then delete it
if output_pts in [ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(ds.GetLayerCount())]:
    ds.DeleteLayer(output_pts)
    print "Deleting: {0}".format(output_pts)

## create a new point layer with the same spatial ref as lyr
out_lyr = ds.CreateLayer(output_pts, lyr.GetSpatialRef(), ogr.wkbPoint)

## create a field to hold the distance values
dist_fld = ogr.FieldDefn("DISTANCE", ogr.OFTReal)
out_lyr.CreateField(dist_fld)
## check the geometry is a line
first_feat = lyr.GetFeature(1)

## accessing linear feature classes using FileGDB driver always returns a MultiLinestring
if first_feat.geometry().GetGeometryName() in ["LINESTRING", "MULTILINESTRING"]:
    for ln in lyr:
        ## list to hold all the point coords
        list_points = []
        ## set the current distance to place the point
        current_dist = distance
        ## get the geometry of the line as wkt
        line_geom = ln.geometry().ExportToWkt()
        ## make shapely MultiLineString object
        shapely_line = MultiLineString(wkt.loads(line_geom))
        ## get the total length of the line
        line_length = shapely_line.length
        ## append the starting coordinate to the list
        list_points.append(Point(list(shapely_line[0].coords)[0]))
        ## https://nathanw.net/2012/08/05/generating-chainage-distance-nodes-in-qgis/
        ## while the current cumulative distance is less than the total length of the line
        while current_dist < line_length:
            ## use interpolate and increase the current distance
            list_points.append(shapely_line.interpolate(current_dist))
            current_dist += distance
        ## append end coordinate to the list
        list_points.append(Point(list(shapely_line[0].coords)[-1]))

        ## add points to the layer
        ## for each point in the list
        for num, pt in enumerate(list_points, 1):
            ## create a point object
            pnt = ogr.Geometry(ogr.wkbPoint)
            pnt.AddPoint(pt.x, pt.y)
            feat_dfn = out_lyr.GetLayerDefn()
            feat = ogr.Feature(feat_dfn)
            feat.SetGeometry(pnt)
            ## populate the distance values for each point.
            ## start point
            if num == 1:
                feat.SetField("DISTANCE", 0)
            elif num < len(list_points):
                feat.SetField("DISTANCE", distance * (num - 1))
            ## end point
            elif num == len(list_points):
                feat.SetField("DISTANCE", int(line_length))
            ## add the point feature to the output.
            out_lyr.CreateFeature(feat)

else:
    print "Error: make sure {0} is a linear feature class with at least one feature".format(input_lyr_name)
    sys.exit()

del ds, out_lyr

This will create a point feature class with a single attribute containing the distance value from the start of the line.

distance_points_along_linedistance_points_along_line_attributesPlease leave any constructive feedback if you think this can be improved or if it worked for you.

Also see Create Chainage Ticks for creating perpendicular lines traversing the main line at specified distances.

OSGP: Measuring Geographic Distributions – Standard Distance

(Open Source Geospatial Python)

The ‘What is it?’

The Standard Distance, also know as the Standard Distance Deviation, is the average distance all features vary from the Mean Center and measures the compactness of a distribution. The Standard Distance is a value representing the distance in units from the Mean Center and is usually plotted on a map as a circle for a visual indication of dispersion, the Standard Distance is the radius.

The Standard Distance works best in the absence of a strong directional trend. According to Andy Mitchell, if a directional trend is present you are better off using the Standard Deviational Ellipse.

You can use the Standard Distance to compare territories between species, which has the wider/broader territory, or to compare changes over time such as the dispersion of burglaries for each calendar month.

In a Normal Distribution you would expect around 68% of all points to fall within the Standard Distance.

The Formula!

The mean x-coordinate is subtracted from the x-coordinate value for each point and the difference is squared. The sum of all the squared values for x minus the x-mean is divided by the number of points. This is also performed for y-coordinates. The resulting values for x and y are summed and then we take the square root of this value to return the value to original distance units.

First we get the mean X and Y…

Mean Center Formula

…and then the Standard Distance

Standard Distance Formula

For Point features the X and Y coordinates of each feature is used, for Polygons the centroid of each feature represents the X and Y coordinate to use, and for Linear features the mid-point of each line is used for the X and Y coordinate.

The Code…

from osgeo import ogr
from shapely.geometry import MultiLineString
from shapely import wkt
import numpy as np
import sys, math

## set the driver for the data
driver = ogr.GetDriverByName("FileGDB")
## path to the FileGDB
gdb = r"C:\Users\Glen B\Documents\ArcGIS\Default.gdb"
## ope the GDB in write mode (1)
ds = driver.Open(gdb, 1)

input_lyr_name = "Birmingham_Burglaries_2016"

output_fc = "{0}_standard_distance".format(input_lyr_name)

## reference the layer using the layers name
if input_lyr_name in [ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(ds.GetLayerCount())]:
    lyr = ds.GetLayerByName(input_lyr_name)
    print "{0} found in {1}".format(input_lyr_name, gdb)

if output_fc in [ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(ds.GetLayerCount())]:
    ds.DeleteLayer(output_fc)
    print "Deleting: {0}".format(output_fc)

try:
    ## for points and polygons we use the centroid
    first_feat = lyr.GetFeature(1)
    if first_feat.geometry().GetGeometryName() in ["POINT", "MULTIPOINT", "POLYGON", "MULTIPOLYGON"]:
        xy_arr = np.ndarray((len(lyr), 2), dtype=np.float)
        for i, pt in enumerate(lyr):
            ft_geom = pt.geometry()
            xy_arr[i] = (ft_geom.Centroid().GetX(), ft_geom.Centroid().GetY())

    ## for lines we get the midpoint of a line
    elif first_feat.geometry().GetGeometryName() in ["LINESTRING", "MULTILINESTRING"]:
        xy_arr = np.ndarray((len(lyr), 2), dtype=np.float)
        for i, ln in enumerate(lyr):
            line_geom = ln.geometry().ExportToWkt()
            shapely_line = MultiLineString(wkt.loads(line_geom))
            midpoint = shapely_line.interpolate(shapely_line.length/2)
            xy_arr[i] = (midpoint.x, midpoint.y)

except Exception:
    print "Unknown geometry for {}".format(input_lyr_name)
    sys.exit()

avg_x, avg_y = np.mean(xy_arr, axis=0)

print "Mean Center: {0}, {1}".format(avg_x, avg_y)

sum_of_sq_diff_x = 0.0
sum_of_sq_diff_y = 0.0

for x, y in xy_arr:
    diff_x = math.pow(x - avg_x, 2)
    diff_y = math.pow(y - avg_y, 2)
    sum_of_sq_diff_x += diff_x
    sum_of_sq_diff_y += diff_y

sum_of_results = (sum_of_sq_diff_x/lyr.GetFeatureCount()) + (sum_of_sq_diff_y/lyr.GetFeatureCount())
standard_distance = math.sqrt(sum_of_results)
print "Standard Distance: {0}". format(standard_distance)

## create a point with the mean center
## and buffer by the standard distance
pnt = ogr.Geometry(ogr.wkbPoint)
pnt.AddPoint(avg_x, avg_y)
polygon = pnt.Buffer(standard_distance, 90)

## create a new polygon layer with the same spatial ref as lyr
out_lyr = ds.CreateLayer(output_fc, lyr.GetSpatialRef(), ogr.wkbPolygon)

## define and create new fields
x_fld = ogr.FieldDefn("X", ogr.OFTReal)
y_fld = ogr.FieldDefn("Y", ogr.OFTReal)
stnd_dst = ogr.FieldDefn("Standard_Distance", ogr.OFTReal)
out_lyr.CreateField(x_fld)
out_lyr.CreateField(y_fld)
out_lyr.CreateField(stnd_dst)

## add the standard distance buffer to the layer
feat_dfn = out_lyr.GetLayerDefn()
feat = ogr.Feature(feat_dfn)
feat.SetGeometry(polygon)
feat.SetField("X", avg_x)
feat.SetField("Y", avg_y)
feat.SetField("Standard_Distance", standard_distance)
out_lyr.CreateFeature(feat)

print "Created {0}".format(output_fc)

## free up resources
del feat, ds, lyr, out_lyr

I’d like to give credit to Logan Byers from GIS StackExchange who aided in speeding up the computational time using NumPy and for forcing me to begin learning the wonders of NumPy (getting there bit by bit).

The Example:

I downloaded crime data from DATA.POLICE.UK for the West Midlands Police from January 2016 to December 2016. I used some Python to extract just the Burglary data and made this into a feature class in the File GDB. Next, I downloaded OS Boundary Line data and clipped the Burglary data to just Birmingham. Everything was now in place to find the Standard Distance of all burglaries for Birmingham in 2016. (see The Other Scripts section at the bottom of this post for processing the data)

birmingham_burgalries_2016

Running the script from The Code section above calculates the Standard Distance for burglaries in Birmingham for 2016 and creates a polygon feature class in the File GDB.

Standard Distance Circle

OSGP Mean Center:     407926.695396, 286615.428507
ArcGIS Mean Center:    407926.695396, 286615.428507

OSGP Standard Distance:      6416.076596
ArcGIS Standard Distance:    6416.076596

Also See…

Mean Center
Central Feature
Median Center
Initial Data Assessment

The Resources:

ESRI Guide to GIS Volume 2: Chapter 2 (I highly recommend this book)
see book review here.

Geoprocessing with Python

Python GDAL/OGR Cookbook

Setting up GDAL/OGR with FileGDB Driver for Python on Windows

< The Other Scripts >

1. Extract Burglary Data for West Midlands

import csv, os
from osgeo import ogr, osr

## set the driver for the data
driver = ogr.GetDriverByName("FileGDB")

## path to the FileGDB
gdb = r"C:\Users\Glen B\Documents\my_geodata.gdb"

## ope the GDB in write mode (1)
ds = driver.Open(gdb, 1)

## the coordinates in the csv files are lat/long
source = osr.SpatialReference()
source.ImportFromEPSG(4326)

## we need the data in British National Grid
target = osr.SpatialReference()
target.ImportFromEPSG(27700)

transform = osr.CoordinateTransformation(source, target)

## set the output fc name
output_fc = "WM_Burglaries_2016"

## if the output fc already exists delete it
if output_fc in [ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(ds.GetLayerCount())]:
    ds.DeleteLayer(output_fc)
    print "Deleting: {0}".format(output_fc)

out_lyr = ds.CreateLayer(output_fc, target, ogr.wkbPoint)

## define and create new fields
mnth_fld = ogr.FieldDefn("Month", ogr.OFTString)
rep_by_fld = ogr.FieldDefn("Reported_by", ogr.OFTString)
fls_wthn_fld = ogr.FieldDefn("Falls_within", ogr.OFTString)
loc_fld = ogr.FieldDefn("Location", ogr.OFTString)
lsoa_c_fld = ogr.FieldDefn("LSOA_code", ogr.OFTString)
lsoa_n_fld = ogr.FieldDefn("LSOA_name", ogr.OFTString)
crime_fld = ogr.FieldDefn("Crime_type", ogr.OFTString)
outcome_fld = ogr.FieldDefn("Last_outcome", ogr.OFTString)

out_lyr.CreateField(mnth_fld)
out_lyr.CreateField(rep_by_fld)
out_lyr.CreateField(fls_wthn_fld)
out_lyr.CreateField(loc_fld)
out_lyr.CreateField(lsoa_c_fld)
out_lyr.CreateField(lsoa_n_fld)
out_lyr.CreateField(crime_fld)
out_lyr.CreateField(outcome_fld)

## where the downloaded csv files reside
root_folder = r"C:\Users\Glen B\Documents\Crime"

## for each csv
for root,dirs,files in os.walk(root_folder):
    for filename in files:
        if filename.endswith(".csv"):
            csv_path = "{0}\\{1}".format(root, filename)
            with open(csv_path, "rb") as csvfile:
                reader = csv.reader(csvfile, delimiter=",")
                next(reader,None)
                ## create a point with attributes for each burglary
                for row in reader:
                    if row[9] == "Burglary":
                        pnt = ogr.Geometry(ogr.wkbPoint)
                        pnt.AddPoint(float(row[4]), float(row[5]))
                        pnt.Transform(transform)
                        feat_dfn = out_lyr.GetLayerDefn()
                        feat = ogr.Feature(feat_dfn)
                        feat.SetGeometry(pnt)
                        feat.SetField("Month", row[1])
                        feat.SetField("Reported_by", row[2])
                        feat.SetField("Falls_within", row[3])
                        feat.SetField("Location", row[6])
                        feat.SetField("LSOA_code", row[7])
                        feat.SetField("LSOA_name", row[8])
                        feat.SetField("Crime_type", row[9])
                        feat.SetField("Last_outcome", row[10])
                        out_lyr.CreateFeature(feat)

del ds, feat, out_lyr

2. Birmingham Burglaries Only

from osgeo import ogr

## required drivers
shp_driver = ogr.GetDriverByName("ESRI Shapefile")
gdb_driver = ogr.GetDriverByName("FileGDB")

## input boundary shapefile and file gdb
shapefile = r"C:\Users\Glen B\Documents\Crime\Data\GB\district_borough_unitary_region.shp"
gdb = r"C:\Users\Glen B\Documents\my_geodata.gdb"

## open the shapefile in read mode and gdb in write mode
shp_ds = shp_driver.Open(shapefile, 0)
gdb_ds = gdb_driver.Open(gdb, 1)

## reference the necessary layers
shp_layer = shp_ds.GetLayer(0)
gdb_layer = gdb_ds.GetLayerByName("WM_Burglaries_2016")

## filter the shapefile
shp_layer.SetAttributeFilter("NAME = 'Birmingham District (B)'")

## set the name for the output feature class
output_fc = "Birmingham_Burglaries_2016"

## if the output already exists then delete it
if output_fc in [gdb_ds.GetLayerByIndex(lyr_name).GetName() for lyr_name in range(gdb_ds.GetLayerCount())]:
    gdb_ds.DeleteLayer(output_fc)
    print "Deleting: {0}".format(output_fc)

## create an output layer
out_lyr = gdb_ds.CreateLayer(output_fc, shp_layer.GetSpatialRef(), ogr.wkbPoint)

## copy the schema from the West Midlands burglaries
## and use it for the Birmingham burglaries
lyr_def = gdb_layer.GetLayerDefn()
for i in range(lyr_def.GetFieldCount()):
    out_lyr.CreateField (lyr_def.GetFieldDefn(i))

## only get burglaries that intersect the Birmingham region
for shp_feat in shp_layer:
    print shp_feat.GetField("NAME")
    birm_geom = shp_feat.GetGeometryRef()
    for gdb_feat in gdb_layer:
        burglary_geom = gdb_feat.GetGeometryRef()
        if burglary_geom.Intersects(birm_geom):
            feat_dfn = out_lyr.GetLayerDefn()
            feat = ogr.Feature(feat_dfn)
            feat.SetGeometry(burglary_geom)

            ## populate the attribute table
            for i in range(lyr_def.GetFieldCount()):
                feat.SetField(lyr_def.GetFieldDefn(i).GetNameRef(), gdb_feat.GetField(i))
            ## create the feature
            out_lyr.CreateFeature(feat)
            feat.Destroy()

del shp_ds, shp_layer, gdb_ds, gdb_layer

The Usual 🙂

As always please feel free to comment to help make the code more efficient, highlight errors, or let me know if this was of any use to you.

OSGP: Measuring Geographic Distributions – Weighted Mean Center

(Open Source Geospatial Python)

The ‘What is it?’

See Mean Center.

The unweighted center is mainly used for events that occur at a place and time such as burglaries. The weighted center, however, is predominantly used for stationary features such as retail outlets or delineated areas for example (such as Census tracts). The Weighted Mean Center does not take into account distance between features in the dataset.

The weight needs to be a numerical attribute. The greater the value, the higher the weight for that feature.

The Formula!

The Weighted Mean Center is calculated by multiplying the x and y coordinate by the weight for that feature and summing all for both x and y individually, and then dividing this by the sum of all the weights.

Weighted Mean Center FormulaFor Point features the X and Y coordinates of each feature is used, for Polygons the centroid of each feature represents the X and Y coordinate to use, and for Linear features the mid-point of each line is used for the X and Y coordinate.

The Code…

from osgeo import ogr
from shapely.geometry import MultiLineString
from shapely import wkt
import numpy as np
import sys

## set the driver for the data
driver = ogr.GetDriverByName("ESRI Shapefile")
## folder where the shapefile resides
folder = r"C:\Users\glen.bambrick\Documents\GDAL\shp\\"
## name of the shapefile concatenated with folder
shp = "{0}Census2011_Small_Areas_generalised20m.shp".format(folder)
## open the shapefile
ds = driver.Open(shp, 0)
## reference the only layer in the shapefile
lyr = ds.GetLayer(0)

## create an output data source
out_ds = driver.CreateDataSource("{0}{1}_wgt_mean_center.shp".format(folder,lyr.GetName()))

## output mean center weighted filename
output_fc = "{0}{1}_wgt_mean_center".format(folder,lyr.GetName())

## field that has numerical weight
weight_fld = "TOTAL2011"

try:
    first_feat = lyr.GetFeature(1)
    xy_arr = np.ndarray((len(lyr), 2), dtype=np.float)
    wgt_arr = np.ndarray((len(lyr), 1), dtype=np.float)
    ## use the centroid for points and polygons
    if first_feat.geometry().GetGeometryName() in ["POINT", "MULTIPOINT", "POLYGON", "MULTIPOLYGON"]:
        for i, pt in enumerate(lyr):
            ft_geom = pt.geometry()
            weight = pt.GetField(weight_fld)
            xy_arr[i] = (ft_geom.Centroid().GetX() * weight, ft_geom.Centroid().GetY() * weight)
            wgt_arr[i] = weight
    ## midpoint of lines
    elif first_feat.geometry().GetGeometryName() in ["LINESTRING", "MULTILINESTRING"]:
        for i, ln in enumerate(lyr):
            line_geom = ln.geometry().ExportToWkt()
            weight = ln.GetField(weight_fld)
            shapely_line = MultiLineString(wkt.loads(line_geom))
            midpoint = shapely_line.interpolate(shapely_line.length/2)
            xy_arr[i] = (midpoint.x * weight, midpoint.y * weight)
            wgt_arr[i] = weight

except Exception:
    print "Unknown geometry or Incorrect field name for {}".format(input_lyr_name)
    sys.exit()

## do the maths
sum_x, sum_y = np.sum(xy_arr, axis=0)
sum_wgt = np.sum(wgt_arr)
weighted_x, weighted_y = sum_x/sum_wgt, sum_y/sum_wgt

print "Weighted Mean Center: {0}, {1}".format(weighted_x, weighted_y)

## create a new point layer with the same spatial ref as lyr
out_lyr = out_ds.CreateLayer(output_fc, lyr.GetSpatialRef(), ogr.wkbPoint)

## define and create new fields
x_fld = ogr.FieldDefn("X", ogr.OFTReal)
y_fld = ogr.FieldDefn("Y", ogr.OFTReal)
out_lyr.CreateField(x_fld)
out_lyr.CreateField(y_fld)

## create a new point for the mean center weighted
pnt = ogr.Geometry(ogr.wkbPoint)
pnt.AddPoint(weighted_x, weighted_y)

## add the mean center weighted to the new layer
feat_dfn = out_lyr.GetLayerDefn()
feat = ogr.Feature(feat_dfn)
feat.SetGeometry(pnt)
feat.SetField("X", weighted_x)
feat.SetField("Y", weighted_y)
out_lyr.CreateFeature(feat)

print "Created: {0}.shp".format(output_fc)

## free up resources
del ds, out_ds, lyr, first_feat, feat, out_lyr

I’d like to give credit to Logan Byers from GIS StackExchange who aided in speeding up the computational time using NumPy and for forcing me to begin learning the wonders of NumPy (which is still a work in progress)

The Example:

I downloaded the Small Areas of Ireland from the CSO. You will have to acknowledge a disclaimer. The data contains population information for the 2011 Census. Once downloaded unzip Census2011_Small_Areas_generalised20m.zip to working folder.

Small Areas

Running the script from The Code section above calculates the Weighted Mean Center of all Small Areas based on the population count for each for 2011 and creates a point Shapefile as the output.

Small Areas Weighted Mean Center

OSGP Weighted Mean Center:      238557.427484, 208347.116116
ArcGIS Weighted Mean Center:    238557.427484, 208347.116116

Also See…

Mean Center
Central Feature
Median Center
Initial Data Assessment

The Resources:

ESRI Guide to GIS Volume 2: Chapter 2 (I highly recommend this book)
see book review here.

Geoprocessing with Python

Python GDAL/OGR Cookbook

The Usual 🙂

As always please feel free to comment to help make the code more efficient, highlight errors, or let me know if this was of any use to you.

OSGP: Standard GIS Tools – Initial Data Assessment

(Open Source Geospatial Python)

Here we will look at the general makeup of a downloaded spatial dataset – a Shapefile from the Central Statistics Office in Ireland containing census data from 2011. We will look at getting the spatial reference of the file along with a breakdown of the field names, type, width and precision. We can print the top ten records or the entire attribute table and get a list of unique values for a field and the count of each.

Download the Small Areas of Ireland from the CSO. You will have to acknowledge a disclaimer. Once downloaded unzip Census2011_Small_Areas_generalised20m.zip to working folder. We will now begin to interrogate this Shapefile.

Small Areas

First we import the necessary modules…

# import modules
from osgeo import ogr
from tabulate import tabulate
from operator import itemgetter

tabulate will allow us to print out formatted tables. Using ogr we can access the inner workings of the downloaded Shapefile. Please note that osgeo and tabulate are not standard Python libraries and will need to be installed.

Using the ESRI Shapefile driver we open the Shapefile in read mode (0) and access the data (lyr).

# use Shapefile driver
driver = ogr.GetDriverByName("ESRI Shapefile")
# reference Shapefile
shp = r"C:\Users\Glen B\Documents\GDAL\shp\Census2011_Small_Areas_generalised20m.shp"
# open the file
ds = driver.Open(shp, 0)
# reference the only layer in a Shapefile
lyr = ds.GetLayer(0)

Spatial Reference Information

Straight away we cant print the spatial reference information associated with the Shapefile (contained in the .prj file)

print lyr.GetSpatialRef()

This will print out…

PROJCS["TM65_Irish_Grid",
    GEOGCS["GCS_TM65",
        DATUM["TM65",
            SPHEROID["Airy_Modified",6377340.189,299.3249646]],
        PRIMEM["Greenwich",0.0],
        UNIT["Degree",0.0174532925199433]],
    PROJECTION["Transverse_Mercator"],
    PARAMETER["False_Easting",200000.0],
    PARAMETER["False_Northing",250000.0],
    PARAMETER["Central_Meridian",-8.0],
    PARAMETER["Scale_Factor",1.000035],
    PARAMETER["Latitude_Of_Origin",53.5],
    UNIT["Meter",1.0]]

You can also access this information individually…

# projected coordinate system
proj_string = lyr.GetSpatialRef().GetAttrValue("PROJCS", 0)
# geographic coordinate system
geog_string = lyr.GetSpatialRef().GetAttrValue("GEOGCS", 0)
# EPSG Code if available
epsg = lyr.GetSpatialRef().GetAttrValue("AUTHORITY", 1)
# datum
datum = lyr.GetSpatialRef().GetAttrValue("DATUM", 0)

print "\nFile: {0}\n\nProjected: {1}\nEPSG: {2}\n".format(lyr.GetName(),proj_string, epsg)
print "Geographic: {0}\nDatum: {1}\n".format(geog_string, datum)

The output…

File: Census2011_Small_Areas_generalised20m

Projected: TM65_Irish_Grid
EPSG: None

Geographic: GCS_TM65
Datum: TM65

If there is an EPSG code in the .prj file it will be printed instead of None.

Geometry Type

If we reference the first feature we can get the geometry of the Shapefile

first_feat = lyr.GetFeature(1)
print "Geometry Type: {0}\n".format(first_feat.geometry().GetGeometryName())

In this instance it is a polygon Shapefile.

Geometry Type: POLYGON

Field Information

Let’s get some information on the data through the Layer Definition.

# https://pcjericks.github.io/py-gdalogr-cookbook/vector_layers.html
lyr_def = lyr.GetLayerDefn()

But before we do we need to create a few list structures. These will be used to hold the accessed information and enable us to neatly print them to screen.

# list to hold headers for filed information
header_list = ["FIELD NAME", "TYPE", "WIDTH", "PRECISION"]
# list will be populated with field information
output_list = []
# list will be populated with field names and used for attribute headers
fld_names = []

Cycle through each field and populate the necessary lists…

# for each field
for i in range(lyr_def.GetFieldCount()):
    # reference the field name
    fld_name = lyr_def.GetFieldDefn(i).GetName()
    # reference the field type
    fld_type = lyr_def.GetFieldDefn(i).GetFieldTypeName(lyr_def.GetFieldDefn(i).GetType())
    # reference the field width
    fld_width = lyr_def.GetFieldDefn(i).GetWidth()
    # reference the field precision
    fld_precision = lyr_def.GetFieldDefn(i).GetPrecision()
    # append these as a list to the output_list
    output_list.append([fld_name, fld_type, str(fld_width), str(fld_precision)])
    # append field name to fld_name
    fld_names.append(fld_name)

The output_list is a list of lists containing information for each field, the field name, data type, width and precision, this is matched in the header_list. The fld_names will be used further down to print out attributes, this list hold the field names as headers. Let’s print the field information…

print "{0}\n".format(tabulate(output_list, header_list))

Here’s the output…

FIELD NAME   TYPE     WIDTH   PRECISION
------------ ------ ------- -----------
NUTS1        String       3           0
NUTS1NAME    String       7           0
NUTS2        String       4           0
NUTS2NAME    String      26           0
NUTS3        String       5           0
NUTS3NAME    String      15           0
COUNTY       String       2           0
COUNTYNAME   String      25           0
CSOED        String      11           0
OSIED        String      13           0
EDNAME       String      45           0
SMALL_AREA   String      61           0
GEOGID       String      65           0
MALE2011     Real        20          10
FEMALE2011   Real        20          10
TOTAL2011    Real        20          10
PPOCC2011    Real        20          10
UNOCC2011    Real        20          10
VACANT2011   Real        20          10
HS2011       Real        20          10
PCVAC2011    Real        20          10
CREATEDATE   String      10           0

Attribute Table

Next we print out some attributes for a set of features, the first ten.

# number of features from the first to print attributes for
num_to_return = 10
#num_to_return = lyr.GetFeatureCount()

Use the commented out line if you want to print attributes for all features. Create an empty list to hold the attributes. Some fields contain characters from the Irish language so we account for this so that the attributes are printed correctly.

# list will be populated with attribute data
att_table = []

# for each feature in the Shapefile
for count, feature in enumerate(lyr):
    # up to the number of set features to print
    if count < num_to_return:
        # count will beacome the Feature ID
        atts = [count]
        # for each field append the data to atts list
        for name in fld_names:
            try:
                # if the attribute is a string then decode with Celtic Languages
                atts.append(feature.GetField(name).decode("iso8859_14"))
            except Exception:
                atts.append(feature.GetField(name))
        # append the data for the feature to the att_table list
        att_table.append(atts)

The count becomes the Feature ID but we have no field for this so we will create one…

# add a FID header (count)
fld_names.insert(0, "FID")

So let’s print out the attributes…

print tabulate(att_table, fld_names)
print "{0} out of {1} features".format(num_to_return, lyr.GetFeatureCount())

Here’s the output…

  FID NUTS1   NUTS1NAME   NUTS2   NUTS2NAME            NUTS3   NUTS3NAME         COUNTY COUNTYNAME       CSOED   OSIED EDNAME                           SMALL_AREA GEOGID       MALE2011   FEMALE2011   TOTAL2011   PPOCC2011   UNOCC2011   VACANT2011   HS2011   PCVAC2011 CREATEDATE
----- ------- ----------- ------- -------------------- ------- --------------- -------- -------------- ------- ------- ------------------------------ ------------ ---------- ---------- ------------ ----------- ----------- ----------- ------------ -------- ----------- ------------
    0 IE0     Ireland     IE02    Southern and Eastern IE022   Mid-East              15 Wicklow County   15039  257005 Aughrim                           257005002 A257005002        137          138         275          84          18           15      102        14.7 27-03-2012
    1 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01054  017049 Tinnahinch                        017049001 A017049001        186          176         362         111          25           24      136        17.6 27-03-2012
    2 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01053  017032 Marley                            017032001 A017032001        194          173         367         121           8            5      129         3.9 27-03-2012
    3 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01054  017049 Tinnahinch                        017049002 A017049002         75           75         150          67          29           29       96        30.2 27-03-2012
    4 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01054  017049 Tinnahinch                        017049003 A017049003         84           81         165          64          16           14       80        17.5 27-03-2012
    5 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01015  017005 Ballyellin                        017005002 A017005002        105           99         204          71           6            5       77         6.5 27-03-2012
    6 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01015  017005 Ballyellin                        017005001 A017005001        115          108         223          70           9            8       79        10.1 27-03-2012
    7 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01033  017033 Muinebeag (Bagenalstown) Rural    017033001 A017033001        201          205         406         143          15           14      158         8.9 27-03-2012
    8 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01034  017034 Muinebeag (Bagenalstown) Urban    017034002 A017034002        142          116         258          89           9            9       98         9.2 27-03-2012
    9 IE0     Ireland     IE02    Southern and Eastern IE024   South-East (IE)       01 Carlow County    01034  017034 Muinebeag (Bagenalstown) Urban    017034003 A017034003        174          169         343         107           6            4      113         3.5 27-03-2012
10 out of 18488 features

Unique Values and Counts

Next we’ll get a list of the unique COUNTYNAME entries and a count to see how many small areas are in each. (The below works for text fields only)

# rest to first feature
lyr.ResetReading()

# field to return unique list and count of
field = "COUNTYNAME"

# create empty dictionary
values_dict = {}

# for each feature
for feature in lyr:
    attribute = feature.GetField(field).decode("iso8859_14")
    # if the COUNTYNAME is not already in the dictionary add it and assign a value of 1
    if attribute not in values_dict:
        values_dict[attribute] = 1
    # otherwise do not add it and increase the existing value by 1
    else:
        values_dict[attribute] = values_dict[attribute] + 1

## convert dictionary to list for use with tabulate
key_value_list = [[key, value] for key, value in values_dict.items()]

## print results
print "\nTotal Feature Count: {0}\n".format(lyr.GetFeatureCount())
print tabulate(sorted(key_value_list), [field, "Count"])

And here’s the output…

Total Feature Count: 18488

COUNTYNAME               Count
---------------------- -------
Carlow County              210
Cavan County               322
Clare County               511
Cork City                  519
Cork County               1650
Donegal County             761
Dublin City               2202
Dún Laoghaire-Rathdown     760
Fingal                     938
Galway City                307
Galway County              741
Kerry County               701
Kildare County             731
Kilkenny County            372
Laois County               305
Leitrim County             173
Limerick City              258
Limerick County            514
Longford County            179
Louth County               462
Mayo County                643
Meath County               636
Monaghan County            244
North Tipperary            283
Offaly County              286
Roscommon County           303
Sligo County               307
South Dublin               906
South Tipperary            350
Waterford City             198
Waterford County           275
Westmeath County           338
Wexford County             615
Wicklow County             488

Alternatively we could print out based on the highest count descending by replacing the last print statement with…

# http://stackoverflow.com/questions/17555218/python-how-to-sort-a-list-of-lists-by-the-fourth-element-in-each-list
print tabulate(sorted(key_value_list, key = itemgetter(1), reverse = True), [field, "Count"])

…to get…

COUNTYNAME               Count
---------------------- -------
Dublin City               2202
Cork County               1650
Fingal                     938
South Dublin               906
Donegal County             761
...

I will add to these as I come across something useful. If you know of any neat things to add please comment below. Please also comment if anything is unclear or if this was useful to you.

See Also…

Setting up GDAL/OGR with FileGDB Driver for Python on Windows
Measuring Geographic Distributions #1.1 – Mean Center
Measuring Geographic Distributions #2.1 – Central Feature
Measuring Geographic Distributions #3.1 – Median Center