perfect-postcode/pipeline/transform/crime_spatial.py
2026-06-02 20:14:32 +01:00

475 lines
19 KiB
Python

"""Aggregate police.uk street crime to postcodes by spatial proximity.

Instead of attributing each incident to its published LSOA code, this transform
counts the anonymised incident *points* that fall within ``buffer_m`` (default
100m) of each postcode's boundary polygon (the polygon buffered outward). A point
inside several overlapping buffers counts for each postcode -- the same
multiplicity the tree-density filter uses for features near more than one
postcode. The wide 100m buffer deliberately smooths police.uk's snap-to-grid
coordinates, which would otherwise make the count hypersensitive to which side of
a narrow line a shared "map point" anchor happened to land on.

Counts are **area-normalised**: each postcode's count is divided by its buffered
catchment area and rescaled by the median catchment area, so the metric reflects
crime *density* rather than how much ground the buffer sweeps (a median-sized
catchment is left unchanged; a large rural postcode is no longer inflated simply
for covering more of the map). Normalising by the buffered area -- the region
that actually collects points -- rather than the raw polygon keeps tiny unit
postcodes from being over-inflated by the fixed buffer-ring floor. The headline
``"{type} (avg/yr)"`` is the simple mean of the per-year annualised counts, so it
equals the average of the by-year chart bars.

Outputs mirror the old LSOA transform's shape but are keyed on ``postcode``:

* ``crime_by_postcode.parquet`` -- ``postcode`` + ``"{type} (avg/yr)"`` columns.
* ``crime_by_postcode_by_year.parquet`` -- ``postcode`` + ``"{type} (by year)"``
  nested ``list[struct{year, count}]`` columns, with Serious/Minor rollups.

Caveat: police.uk coordinates are snapped to a fixed set of anonymous "map
points", not true locations, and a share of rows have no coordinate at all
(dropped here). Spatial totals are therefore fuzzier than the old LSOA-tagged
counts -- by design, not a regression.
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
import numpy as np
import polars as pl
import shapely
from pyproj import Transformer
from pipeline.transform.crime import (
LEGACY_CRIME_TYPE_ALIASES,
MINOR_CRIME_TYPES,
SERIOUS_CRIME_TYPES,
find_street_crime_csvs,
)
from pipeline.transform.postcode_boundaries.loader import load_postcode_polygons
# Serious types first so column order is stable and self-documenting.
# A type's position in this tuple is also its index along the crime-type axis
# of the counts array and the order of the "{type} (avg/yr)" output columns.
ALL_CRIME_TYPES: tuple[str, ...] = SERIOUS_CRIME_TYPES + MINOR_CRIME_TYPES
# Default outward buffer, in metres, applied to every postcode polygon; used as
# the default for both ``transform_crime_spatial`` and the ``--buffer-m`` flag.
DEFAULT_BUFFER_M = 100.0
# Matches a ``YYYY-MM`` directory name; applied with ``fullmatch`` to the parent
# directory of each crime CSV to discover which months were observed.
MONTH_DIR_RE = re.compile(r"^\d{4}-\d{2}$")
# Generous GB bounds; points outside fall in no English postcode anyway, but
# filtering first keeps the WGS84->BNG transform out of its undefined region.
# Each pair is (lower, upper) and is unpacked straight into ``is_between``.
LON_BOUNDS = (-9.5, 2.5)
LAT_BOUNDS = (49.0, 61.5)
# Read CSVs in chunks of files to bound peak memory while keeping the STRtree
# query vectorised over a useful number of points.
_CSV_BATCH = 64
def _month_calendar(csvs: list[Path]) -> tuple[list[int], dict[int, int], int]:
    """Work out which months and years the crime CSVs cover.

    A CSV's month is taken from the name of its parent directory, which must
    look like ``YYYY-MM``; files under any other directory name contribute
    nothing to the calendar.

    Returns:
        A 3-tuple of (sorted distinct years, mapping of year -> number of
        distinct months seen in that year, total number of distinct months).

    Raises:
        ValueError: if no CSV sits under a ``YYYY-MM`` directory.
    """
    observed: set[str] = set()
    for csv_path in csvs:
        dir_name = csv_path.parent.name
        if MONTH_DIR_RE.fullmatch(dir_name) is not None:
            observed.add(dir_name)

    if len(observed) == 0:
        raise ValueError("No valid YYYY-MM month directories found among crime CSVs")

    # Walk the months in sorted order so the per-year dict is filled (and hence
    # iterates) in ascending year order.
    per_year: dict[int, int] = {}
    for label in sorted(observed):
        calendar_year = int(label[:4])
        per_year.setdefault(calendar_year, 0)
        per_year[calendar_year] += 1

    return sorted(per_year), per_year, len(observed)
def _build_tree(
    polygons: np.ndarray, buffer_m: float
) -> tuple[np.ndarray, shapely.STRtree]:
    """Grow every postcode polygon by ``buffer_m`` and build a spatial index.

    Position ``i`` of the returned buffer array (and of the tree) corresponds
    to position ``i`` of ``polygons``. Any buffer that comes back missing or
    invalid is swapped for an empty polygon, which keeps the positions aligned
    while guaranteeing that slot matches no point.

    Returns:
        ``(buffers, tree)`` -- the buffered geometries and an ``STRtree`` over
        them.
    """
    buffered = shapely.buffer(polygons, buffer_m, quad_segs=8)

    unusable = np.logical_or(
        shapely.is_missing(buffered),
        np.logical_not(shapely.is_valid(buffered)),
    )
    unusable_count = int(unusable.sum())
    if unusable_count:
        print(f" {unusable_count:,} postcode buffers unusable; left empty")
        buffered[unusable] = shapely.from_wkt("POLYGON EMPTY")

    spatial_index = shapely.STRtree(buffered)
    return buffered, spatial_index
def _accumulate_counts(
    csvs: list[Path],
    tree: shapely.STRtree,
    type_to_idx: dict[str, int],
    year_to_idx: dict[int, int],
    transformer: Transformer,
    counts: np.ndarray,
) -> None:
    """Tally crime points into ``counts`` per (postcode, crime type, year).

    The CSVs are read ``_CSV_BATCH`` files at a time. Each batch is filtered to
    rows with in-bounds coordinates, a non-empty crime type and a known year,
    projected with ``transformer``, and matched against ``tree``; every
    (point, buffer) intersection adds one to ``counts``.

    ``counts`` is modified in place and indexed ``[postcode, type, year]``;
    nothing is returned. Progress is printed per batch, and incidents whose
    crime type is missing from ``type_to_idx`` are summarised on stderr at the
    end rather than being dropped silently.
    """
    column_types = {
        "Longitude": pl.Float64,
        "Latitude": pl.Float64,
        "Month": pl.Utf8,
        "Crime type": pl.Utf8,
    }
    known_years = list(year_to_idx)
    n_files = len(csvs)

    longitude = pl.col("Longitude")
    latitude = pl.col("Latitude")
    crime_type = pl.col("Crime type")

    located_total = 0
    match_total = 0
    non_finite_total = 0
    unknown_tally: dict[str, int] = {}

    for offset in range(0, n_files, _CSV_BATCH):
        chunk = csvs[offset : offset + _CSV_BATCH]

        scanned = pl.scan_csv(
            chunk,
            schema_overrides=column_types,
            ignore_errors=True,
        ).select("Longitude", "Latitude", "Month", "Crime type")

        # Non-strict cast: an unparseable Month yields a null year, which the
        # year-membership test below discards, so one bad row cannot abort
        # the whole build.
        dated = scanned.with_columns(
            pl.col("Month").str.slice(0, 4).cast(pl.Int32, strict=False).alias("year")
        )

        usable = dated.filter(
            longitude.is_not_null()
            & latitude.is_not_null()
            & longitude.is_between(*LON_BOUNDS)
            & latitude.is_between(*LAT_BOUNDS)
            & crime_type.is_not_null()
            & (crime_type != "")
            & pl.col("year").is_in(known_years)
        )

        # Rewrite legacy crime-type names to their current equivalents before
        # indexing; names without an alias pass through untouched.
        canonical = usable.with_columns(crime_type.replace(LEGACY_CRIME_TYPE_ALIASES))

        # default=None turns an unrecognised crime type into a null index so it
        # can be reported below. Years need no default: the filter above has
        # already restricted them to the keys of year_to_idx.
        indexed = canonical.with_columns(
            crime_type.replace_strict(
                type_to_idx, default=None, return_dtype=pl.Int32
            ).alias("tidx"),
            pl.col("year")
            .replace_strict(year_to_idx, return_dtype=pl.Int32)
            .alias("yidx"),
        )

        rows = indexed.select(
            "Longitude", "Latitude", "Crime type", "tidx", "yidx"
        ).collect(engine="streaming")
        if rows.height == 0:
            continue

        unrecognised = rows.filter(pl.col("tidx").is_null())
        if unrecognised.height:
            per_type = unrecognised.group_by("Crime type").len()
            for label, occurrences in per_type.iter_rows():
                unknown_tally[label] = unknown_tally.get(label, 0) + occurrences
            rows = rows.filter(pl.col("tidx").is_not_null())
            if rows.height == 0:
                continue

        type_idx = rows["tidx"].to_numpy()
        year_idx = rows["yidx"].to_numpy()
        easting, northing = transformer.transform(
            rows["Longitude"].to_numpy(), rows["Latitude"].to_numpy()
        )

        # Discard points the projection could not place (non-finite output).
        in_domain = np.logical_and(np.isfinite(easting), np.isfinite(northing))
        non_finite_total += int(np.logical_not(in_domain).sum())
        if not in_domain.any():
            continue

        easting = easting[in_domain]
        northing = northing[in_domain]
        type_idx = type_idx[in_domain]
        year_idx = year_idx[in_domain]
        located_total += easting.size

        hit_point, hit_postcode = tree.query(
            shapely.points(easting, northing), predicate="intersects"
        )
        if hit_point.size:
            # np.add.at accumulates repeated index triples correctly, unlike
            # fancy-indexed ``+=``, which would apply each triple only once.
            np.add.at(
                counts,
                (hit_postcode, type_idx[hit_point], year_idx[hit_point]),
                1,
            )
            match_total += hit_point.size

        print(
            f" files {offset + len(chunk):,}/{n_files:,}: "
            f"{located_total:,} located points, {match_total:,} postcode matches"
        )

    if non_finite_total:
        print(f"Dropped {non_finite_total:,} points outside the BNG transform domain")

    if unknown_tally:
        unknown_total = sum(unknown_tally.values())
        ranked = sorted(unknown_tally.items(), key=lambda entry: entry[1], reverse=True)
        summary = ", ".join(f"{label!r} ({occurrences:,})" for label, occurrences in ranked)
        print(
            f"WARNING: dropped {unknown_total:,} incidents with crime types not in "
            f"ALL_CRIME_TYPES (taxonomy is stale -- update SERIOUS/MINOR_CRIME_TYPES): "
            f"{summary}",
            file=sys.stderr,
        )
def _rollup_long(
    long: pl.DataFrame, types: tuple[str, ...], rollup_name: str
) -> pl.DataFrame:
    """Collapse the rows for ``types`` into one rollup series per postcode/year.

    Rows of ``long`` whose ``Crime type`` is in ``types`` are summed by
    (postcode, year), rounded to one decimal, and relabelled with
    ``rollup_name``. The result has the same four columns as ``long``:
    ``postcode``, ``Crime type``, ``year``, ``count``.
    """
    members = long.filter(pl.col("Crime type").is_in(list(types)))
    yearly_totals = members.group_by("postcode", "year").agg(
        pl.col("count").sum().round(1).alias("count")
    )
    labelled = yearly_totals.with_columns(pl.lit(rollup_name).alias("Crime type"))
    return labelled.select("postcode", "Crime type", "year", "count")
def _write_avg_yr(
    postcodes: np.ndarray,
    counts: np.ndarray,
    years: list[int],
    months_in_year: dict[int, int],
    norm: np.ndarray,
    output_path: Path,
) -> None:
    """Write the ``postcode`` + ``"{type} (avg/yr)"`` parquet.

    Every year's raw count is first scaled to a 12-month equivalent using the
    number of months observed in that year. The headline for a (postcode, type)
    is the mean of those annualised values over the years in which that
    postcode recorded at least one incident of that type, multiplied by the
    postcode's ``norm`` factor and rounded to one decimal.

    "Serious crime" and "Minor crime" headlines are derived from the per-year
    sums of their member types (not by adding the per-type headlines), with the
    denominator being the number of years in which any member type occurred.

    ``counts`` is indexed ``[postcode, type, year]``; ``norm`` has one entry per
    postcode. Parent directories of ``output_path`` are created if needed.
    """
    month_counts = np.array(
        [months_in_year[year] for year in years], dtype=np.float64
    )
    annualised = counts.astype(np.float64) * 12.0 / month_counts[None, None, :]

    # Denominator: years with at least one incident for this postcode and
    # type, floored at 1 so all-zero rows divide cleanly to 0.
    active_years = np.maximum((counts > 0).sum(axis=2), 1).astype(np.float64)
    per_type_mean = annualised.sum(axis=2) / active_years  # (n_postcodes, n_types)
    per_type_avg = np.round(per_type_mean * norm[:, None], 1).astype(np.float32)

    columns: dict[str, np.ndarray] = {"postcode": postcodes}
    for position, crime_name in enumerate(ALL_CRIME_TYPES):
        columns[f"{crime_name} (avg/yr)"] = per_type_avg[:, position]

    # Rollups sum their member types within each year first and only then
    # average, with "active" meaning any member type occurred that year.
    rollups = (
        ("Serious crime", SERIOUS_CRIME_TYPES),
        ("Minor crime", MINOR_CRIME_TYPES),
    )
    for label, member_types in rollups:
        member_idx = [ALL_CRIME_TYPES.index(member) for member in member_types]
        yearly_raw = counts[:, member_idx, :].sum(axis=1)  # (n_postcodes, n_years)
        yearly_annualised = annualised[:, member_idx, :].sum(axis=1)
        rollup_active_years = np.maximum((yearly_raw > 0).sum(axis=1), 1).astype(
            np.float64
        )
        rollup_mean = yearly_annualised.sum(axis=1) / rollup_active_years
        columns[f"{label} (avg/yr)"] = np.round(rollup_mean * norm, 1).astype(
            np.float32
        )

    output_path.parent.mkdir(parents=True, exist_ok=True)
    pl.DataFrame(columns).write_parquet(output_path, compression="zstd")
    print(f"Wrote postcode crime averages: {output_path}")
def _write_by_year(
    postcodes: np.ndarray,
    counts: np.ndarray,
    years: list[int],
    months_in_year: dict[int, int],
    norm: np.ndarray,
    output_path: Path,
) -> None:
    """Write the per-postcode ``"{type} (by year)"`` nested series parquet.

    Each non-zero cell of ``counts`` (indexed ``[postcode, type, year]``)
    becomes one ``{year, count}`` entry, where ``count`` is the raw count scaled
    to a 12-month equivalent, multiplied by the postcode's ``norm`` factor and
    rounded to one decimal. "Serious crime" and "Minor crime" series are added
    by summing their member types per postcode and year. The output has one row
    per postcode and one list-of-struct column per crime type or rollup.

    Raises:
        ValueError: if ``counts`` contains no non-zero cell.
    """
    month_counts = np.array(
        [months_in_year[year] for year in years], dtype=np.float64
    )
    scaled = (
        counts.astype(np.float64)
        * 12.0
        / month_counts[None, None, :]
        * norm[:, None, None]
    )
    annual = np.round(scaled, 1)

    postcode_pos, type_pos, year_pos = np.nonzero(counts)
    if postcode_pos.size == 0:
        raise ValueError("No crime points matched any postcode buffer")

    type_labels = np.array(ALL_CRIME_TYPES, dtype=object)
    year_labels = np.array(years, dtype=np.int32)

    # Long format: one row per (postcode, type, year) cell that has incidents.
    long = pl.DataFrame(
        {
            "postcode": postcodes[postcode_pos],
            "Crime type": type_labels[type_pos],
            "year": year_labels[year_pos],
            "count": annual[postcode_pos, type_pos, year_pos].astype(np.float32),
        }
    )

    frames = [
        long,
        _rollup_long(long, SERIOUS_CRIME_TYPES, "Serious crime"),
        _rollup_long(long, MINOR_CRIME_TYPES, "Minor crime"),
    ]
    stacked = pl.concat(frames)

    # Sort by year before grouping so each collected series is chronological.
    series_per_type = (
        stacked.sort("year")
        .group_by("postcode", "Crime type")
        .agg(pl.struct("year", "count").alias("series"))
    )
    wide = series_per_type.pivot(on="Crime type", index="postcode", values="series")

    renames = {
        column: f"{column} (by year)"
        for column in wide.columns
        if column != "postcode"
    }
    wide = wide.rename(renames)

    output_path.parent.mkdir(parents=True, exist_ok=True)
    wide.write_parquet(output_path, compression="zstd")
    print(f"Wrote postcode crime by-year series: {output_path} {wide.shape}")
def transform_crime_spatial(
    crime_dir: Path,
    boundaries_dir: Path,
    output_path: Path,
    by_year_output_path: Path,
    buffer_m: float = DEFAULT_BUFFER_M,
    max_postcodes: int | None = None,
    max_files: int | None = None,
) -> None:
    """Run the whole spatial crime transform and write both parquet outputs.

    Steps: locate the street-crime CSVs under ``crime_dir``, derive the month
    calendar, load and buffer the postcode polygons from ``boundaries_dir``,
    compute each postcode's area-normalisation factor, count crime points per
    (postcode, type, year), then write the avg/yr table to ``output_path`` and
    the by-year series to ``by_year_output_path``.

    Args:
        crime_dir: directory searched for street-crime CSVs.
        boundaries_dir: directory the postcode polygons are loaded from.
        output_path: destination of the avg/yr parquet.
        by_year_output_path: destination of the by-year parquet.
        buffer_m: outward buffer applied to every postcode polygon.
        max_postcodes: forwarded to the polygon loader to limit postcodes.
        max_files: if given, only the first ``max_files`` CSVs are used.

    Raises:
        FileNotFoundError: if no street-crime CSV is found.
        ValueError: if no postcode buffer has a finite, positive area.
    """
    csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
    if not csvs:
        raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}")
    if max_files is not None:
        csvs = csvs[:max_files]

    years, months_in_year, valid_month_count = _month_calendar(csvs)
    ignored_note = (
        f" (ignored {ignored_csv_count} non-street CSVs)" if ignored_csv_count else ""
    )
    print(
        f"Found {len(csvs):,} street crime CSVs across {valid_month_count} months "
        f"({years[0]}-{years[-1]})" + ignored_note
    )

    postcodes, polygons = load_postcode_polygons(boundaries_dir, max_postcodes)
    n_postcodes = len(postcodes)
    print(f"Buffering {n_postcodes:,} postcode polygons by {buffer_m:g}m...")
    buffers, tree = _build_tree(polygons, buffer_m)

    # Normalisation factor = median catchment area / this catchment's area, so
    # counts express density rather than how much ground a buffer covers. The
    # buffered area is used because that is the region that collects points.
    # Postcodes without a finite, positive area keep a factor of 0.
    catchment_area = shapely.area(buffers).astype(np.float64)
    has_area = np.isfinite(catchment_area) & (catchment_area > 0)
    if not has_area.any():
        raise ValueError("No postcode buffers have a positive area to normalise by")
    median_area = float(np.median(catchment_area[has_area]))
    norm = np.zeros(n_postcodes, dtype=np.float64)
    norm[has_area] = median_area / catchment_area[has_area]
    print(
        f"Area-normalising to median catchment area {median_area:,.0f} m^2 "
        f"({int(has_area.sum()):,}/{len(catchment_area):,} postcodes have usable area)"
    )

    # Axis lookups for the (postcode, type, year) count cube.
    type_to_idx = dict(zip(ALL_CRIME_TYPES, range(len(ALL_CRIME_TYPES))))
    year_to_idx = dict(zip(years, range(len(years))))
    cube_shape = (n_postcodes, len(ALL_CRIME_TYPES), len(years))
    counts = np.zeros(cube_shape, dtype=np.int32)

    transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
    _accumulate_counts(csvs, tree, type_to_idx, year_to_idx, transformer, counts)

    _write_avg_yr(postcodes, counts, years, months_in_year, norm, output_path)
    _write_by_year(postcodes, counts, years, months_in_year, norm, by_year_output_path)
def main() -> None:
    """Parse command-line arguments and run ``transform_crime_spatial``.

    ``--output`` and ``--output-by-year`` are required; the input directories,
    the buffer distance and the two testing limits all have defaults.

    Raises:
        SystemExit: if ``--buffer-m`` is not strictly positive (or argparse
            rejects the arguments).
    """
    parser = argparse.ArgumentParser(
        # Build the distance from DEFAULT_BUFFER_M so the help text cannot
        # drift from the real default (it previously claimed a fixed 50m).
        description=(
            "Count police.uk crime points within a buffer "
            f"(default {DEFAULT_BUFFER_M:g}m) of each postcode boundary"
        )
    )
    parser.add_argument(
        "--input",
        type=Path,
        default=Path("property-data/crime"),
        help="Directory containing police.uk street crime CSVs",
    )
    parser.add_argument(
        "--boundaries",
        type=Path,
        default=Path("property-data/postcode_boundaries/units"),
        help="Directory of per-district postcode boundary GeoJSONs",
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet: postcode + '{type} (avg/yr)' columns",
    )
    parser.add_argument(
        "--output-by-year",
        type=Path,
        required=True,
        help="Output parquet: postcode + nested '{type} (by year)' columns",
    )
    parser.add_argument(
        "--buffer-m",
        type=float,
        default=DEFAULT_BUFFER_M,
        help=(
            "Outward buffer (metres) added to each postcode boundary "
            "(default: %(default)s)"
        ),
    )
    parser.add_argument(
        "--max-postcodes",
        type=int,
        default=None,
        help="Testing only: process the first N postcodes",
    )
    parser.add_argument(
        "--max-files",
        type=int,
        default=None,
        help="Testing only: process the first N monthly CSV files",
    )
    args = parser.parse_args()

    if args.buffer_m <= 0:
        raise SystemExit("--buffer-m must be greater than zero")

    transform_crime_spatial(
        crime_dir=args.input,
        boundaries_dir=args.boundaries,
        output_path=args.output,
        by_year_output_path=args.output_by_year,
        buffer_m=args.buffer_m,
        max_postcodes=args.max_postcodes,
        max_files=args.max_files,
    )


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()