# perfect-postcode/pipeline/transform/crime_spatial.py
# (file-viewer header: 358 lines, 12 KiB, Python)

"""Aggregate police.uk street crime to postcodes by 50m spatial proximity.
Instead of attributing each incident to its published LSOA code, this transform
counts the anonymised incident *points* that fall within 50m of each postcode's
boundary polygon (the polygon buffered outward by 50m). A point inside several
overlapping buffers counts for each postcode -- the same multiplicity the
tree-density filter uses for features near more than one postcode.
The metric is a raw annualised count ("incidents/year within 50m"); there is no
per-capita denominator. Outputs mirror the old LSOA transform's shape but are
keyed on ``postcode`` instead of ``LSOA code``:
* ``crime_by_postcode.parquet`` -- ``postcode`` + ``"{type} (avg/yr)"`` columns.
* ``crime_by_postcode_by_year.parquet`` -- ``postcode`` + ``"{type} (by year)"``
nested ``list[struct{year, count}]`` columns, with Serious/Minor rollups.
Caveat: police.uk coordinates are snapped to a fixed set of anonymous "map
points", not true locations, and a share of rows have no coordinate at all
(dropped here). Spatial totals are therefore lower than, and fuzzier than, the
old LSOA-tagged counts -- by design, not a regression.
"""
from __future__ import annotations
import argparse
import re
from pathlib import Path
import numpy as np
import polars as pl
import shapely
from pyproj import Transformer
from pipeline.transform.crime import (
MINOR_CRIME_TYPES,
SERIOUS_CRIME_TYPES,
find_street_crime_csvs,
)
from pipeline.transform.postcode_boundaries.loader import load_postcode_polygons
# Serious types first so column order is stable and self-documenting.
ALL_CRIME_TYPES: tuple[str, ...] = SERIOUS_CRIME_TYPES + MINOR_CRIME_TYPES
# Default outward buffer, in metres, applied to each postcode polygon; it is
# the default for both ``transform_crime_spatial`` and the ``--buffer-m`` flag.
DEFAULT_BUFFER_M = 50.0
# A CSV's parent directory must be named ``YYYY-MM`` (four digits, a hyphen,
# two digits) to be counted as an observed month.
MONTH_DIR_RE = re.compile(r"^\d{4}-\d{2}$")
# Generous GB bounds; points outside fall in no English postcode anyway, but
# filtering first keeps the WGS84->BNG transform out of its undefined region.
# Each pair is (lower, upper) and is unpacked straight into ``is_between``.
LON_BOUNDS = (-9.5, 2.5)
LAT_BOUNDS = (49.0, 61.5)
# Read CSVs in chunks of files to bound peak memory while keeping the STRtree
# query vectorised over a useful number of points.
_CSV_BATCH = 64
def _month_calendar(csvs: list[Path]) -> tuple[list[int], dict[int, int], int]:
    """Work out the annualisation denominators from the month directory names.

    Each CSV's parent directory name is tested against ``MONTH_DIR_RE``
    (``YYYY-MM``). The distinct names that match are taken as the observed
    months; names that do not match are ignored.

    Returns:
        ``(years, months_in_year, month_count)``: the distinct years in
        ascending order, a mapping from year to the number of distinct months
        seen for it, and the total number of distinct months (the avg/yr
        denominator).

    Raises:
        ValueError: If no parent directory name matches ``YYYY-MM``.
    """
    observed: set[str] = set()
    for csv_path in csvs:
        dir_name = csv_path.parent.name
        if MONTH_DIR_RE.fullmatch(dir_name):
            observed.add(dir_name)
    if not observed:
        raise ValueError("No valid YYYY-MM month directories found among crime CSVs")

    # One entry per distinct month, reduced to its four-digit year.
    month_years = sorted(int(label[:4]) for label in observed)
    years = sorted(set(month_years))
    months_in_year = {year: month_years.count(year) for year in years}
    return years, months_in_year, len(observed)
def _build_tree(
    polygons: np.ndarray, buffer_m: float
) -> tuple[np.ndarray, shapely.STRtree]:
    """Grow every postcode polygon by ``buffer_m`` and build a spatial index.

    Position ``i`` in the returned buffer array (and therefore in the tree)
    corresponds to position ``i`` in ``polygons``. A buffer that comes back
    missing or invalid is swapped for an empty polygon instead of being
    removed, so the positions stay aligned.

    Returns:
        ``(buffers, tree)``: the buffered geometries and an ``STRtree`` over them.
    """
    grown = shapely.buffer(polygons, buffer_m, quad_segs=8)
    unusable = shapely.is_missing(grown) | ~shapely.is_valid(grown)
    n_unusable = int(unusable.sum())
    if n_unusable:
        print(f" {n_unusable:,} postcode buffers unusable; left empty")
        grown[unusable] = shapely.from_wkt("POLYGON EMPTY")
    index = shapely.STRtree(grown)
    return grown, index
def _accumulate_counts(
    csvs: list[Path],
    tree: shapely.STRtree,
    type_to_idx: dict[str, int],
    year_to_idx: dict[int, int],
    transformer: Transformer,
    counts: np.ndarray,
) -> None:
    """Stream the crime CSVs, counting points-in-buffer per (postcode, type, year).

    Files are read ``_CSV_BATCH`` at a time. A row is kept only when it has a
    coordinate inside ``LON_BOUNDS``/``LAT_BOUNDS``, a crime type present in
    ``type_to_idx`` and a ``Month`` whose first four characters parse to a year
    present in ``year_to_idx``. The surviving points are projected with
    ``transformer`` and queried against ``tree``; every (point, geometry) hit
    adds one to ``counts`` in place. Progress is printed after each batch.

    Args:
        csvs: Crime CSV paths to read.
        tree: Spatial index over the buffered postcode polygons; a geometry's
            position in the tree is used as the index on axis 0 of ``counts``.
        type_to_idx: Crime type name -> index on axis 1 of ``counts``.
        year_to_idx: Calendar year -> index on axis 2 of ``counts``.
        transformer: Projects (longitude, latitude) into the tree's coordinates.
        counts: Integer array indexed ``[postcode, type, year]``; mutated in place.
    """
    schema = {
        "Longitude": pl.Float64,
        "Latitude": pl.Float64,
        "Month": pl.Utf8,
        "Crime type": pl.Utf8,
    }
    known_types = list(type_to_idx)
    known_years = list(year_to_idx)
    total_points = 0
    total_matches = 0
    total_dropped = 0
    for start in range(0, len(csvs), _CSV_BATCH):
        batch = csvs[start : start + _CSV_BATCH]
        frame = (
            pl.scan_csv(
                batch,
                schema_overrides=schema,
                ignore_errors=True,
            )
            .select("Longitude", "Latitude", "Month", "Crime type")
            .with_columns(
                # ``ignore_errors`` only covers CSV parsing; a strict cast would
                # still abort the whole run on a single malformed ``Month``.
                # A non-strict cast turns such values into nulls, which the
                # year filter below then drops.
                pl.col("Month")
                .str.slice(0, 4)
                .cast(pl.Int32, strict=False)
                .alias("year")
            )
            .filter(
                pl.col("Longitude").is_not_null()
                & pl.col("Latitude").is_not_null()
                & pl.col("Longitude").is_between(*LON_BOUNDS)
                & pl.col("Latitude").is_between(*LAT_BOUNDS)
                & pl.col("Crime type").is_in(known_types)
                & pl.col("year").is_in(known_years)
            )
            .with_columns(
                pl.col("Crime type")
                .replace_strict(type_to_idx, return_dtype=pl.Int32)
                .alias("tidx"),
                pl.col("year")
                .replace_strict(year_to_idx, return_dtype=pl.Int32)
                .alias("yidx"),
            )
            .select("Longitude", "Latitude", "tidx", "yidx")
            .collect(engine="streaming")
        )
        rows_in = frame.height
        if rows_in == 0:
            continue
        lon = frame["Longitude"].to_numpy()
        lat = frame["Latitude"].to_numpy()
        tidx = frame["tidx"].to_numpy()
        yidx = frame["yidx"].to_numpy()
        x, y = transformer.transform(lon, lat)
        # Points the projection could not place come back non-finite; tally and
        # discard them before building geometries.
        finite = np.isfinite(x) & np.isfinite(y)
        total_dropped += int((~finite).sum())
        if not finite.any():
            continue
        x, y, tidx, yidx = x[finite], y[finite], tidx[finite], yidx[finite]
        total_points += x.size
        points = shapely.points(x, y)
        point_index, postcode_index = tree.query(points, predicate="intersects")
        if point_index.size:
            # ``np.add.at`` accumulates repeated index triples, unlike ``+=``
            # with fancy indexing, which would count each distinct triple once.
            np.add.at(
                counts,
                (postcode_index, tidx[point_index], yidx[point_index]),
                1,
            )
            total_matches += point_index.size
        print(
            f" files {start + len(batch):,}/{len(csvs):,}: "
            f"{total_points:,} located points, {total_matches:,} postcode matches"
        )
    if total_dropped:
        print(f"Dropped {total_dropped:,} points outside the BNG transform domain")
def _rollup_long(
    long: pl.DataFrame, types: tuple[str, ...], rollup_name: str
) -> pl.DataFrame:
    """Collapse the rows for ``types`` into one ``rollup_name`` row per postcode-year.

    ``long`` is expected to carry ``postcode``, ``Crime type``, ``year`` and
    ``count`` columns. Rows whose ``Crime type`` is in ``types`` are summed per
    (postcode, year), the sum is rounded to one decimal place, and the result
    is relabelled with ``rollup_name`` and returned in the column order
    ``postcode``, ``Crime type``, ``year``, ``count``.
    """
    members = long.filter(pl.col("Crime type").is_in(list(types)))
    summed = members.group_by("postcode", "year").agg(
        pl.col("count").sum().round(1).alias("count")
    )
    labelled = summed.with_columns(pl.lit(rollup_name).alias("Crime type"))
    return labelled.select("postcode", "Crime type", "year", "count")
def _write_avg_yr(
    postcodes: np.ndarray,
    counts: np.ndarray,
    valid_month_count: int,
    output_path: Path,
) -> None:
    """Write one ``"{type} (avg/yr)"`` column per crime type, keyed on ``postcode``.

    ``counts`` is summed over its last axis, scaled to a twelve-month rate using
    ``valid_month_count`` as the number of observed months, rounded to one
    decimal place and stored as float32. Columns follow ``ALL_CRIME_TYPES``
    order. The parent directory of ``output_path`` is created if needed and the
    frame is written as zstd-compressed parquet.
    """
    per_type_totals = counts.sum(axis=2)  # -> (n_postcodes, n_types)
    annualised = np.round(per_type_totals / valid_month_count * 12.0, 1).astype(
        np.float32
    )
    columns: dict[str, np.ndarray] = {"postcode": postcodes}
    columns.update(
        (f"{name} (avg/yr)", annualised[:, position])
        for position, name in enumerate(ALL_CRIME_TYPES)
    )
    output_path.parent.mkdir(parents=True, exist_ok=True)
    pl.DataFrame(columns).write_parquet(output_path, compression="zstd")
    print(f"Wrote postcode crime averages: {output_path}")
def _write_by_year(
    postcodes: np.ndarray,
    counts: np.ndarray,
    years: list[int],
    months_in_year: dict[int, int],
    output_path: Path,
) -> None:
    """Write nested ``"{type} (by year)"`` series plus Serious/Minor rollups.

    Each non-zero ``counts[postcode, type, year]`` cell is scaled to a
    twelve-month rate using that year's observed month count and rounded to one
    decimal place. The cells are grouped into one year-sorted
    ``list[struct{year, count}]`` per (postcode, type), and the types are
    pivoted into columns. "Serious crime" and "Minor crime" rollups are added
    alongside the individual types.

    The output schema is deterministic: columns follow ``ALL_CRIME_TYPES``
    order, then the two rollups. A type with no matches at all is still emitted
    as an all-null column, and rows are sorted by ``postcode``. Postcodes with
    no matching incident of any type have no row.

    Args:
        postcodes: Postcode labels aligned with axis 0 of ``counts``.
        counts: Integer array indexed ``[postcode, type, year]``.
        years: Calendar years aligned with axis 2 of ``counts``.
        months_in_year: Year -> number of observed months in that year.
        output_path: Destination parquet file; parent directories are created.

    Raises:
        ValueError: If ``counts`` has no non-zero cell.
    """
    months = np.array([months_in_year[year] for year in years], dtype=np.float64)
    pc_i, ty_i, yr_i = np.nonzero(counts)
    if pc_i.size == 0:
        raise ValueError("No crime points matched any postcode buffer")

    # Annualise only the non-zero cells. The arithmetic per cell is unchanged,
    # but this avoids materialising a dense float64 copy of the whole cube.
    annual = np.round(
        counts[pc_i, ty_i, yr_i].astype(np.float64) * 12.0 / months[yr_i], 1
    )
    type_names = np.array(ALL_CRIME_TYPES, dtype=object)
    year_values = np.array(years, dtype=np.int32)
    long = pl.DataFrame(
        {
            "postcode": postcodes[pc_i],
            "Crime type": type_names[ty_i],
            "year": year_values[yr_i],
            "count": annual.astype(np.float32),
        }
    )
    serious = _rollup_long(long, SERIOUS_CRIME_TYPES, "Serious crime")
    minor = _rollup_long(long, MINOR_CRIME_TYPES, "Minor crime")
    combined = pl.concat([long, serious, minor])
    by_type = (
        combined.sort("year")
        .group_by("postcode", "Crime type")
        .agg(pl.struct("year", "count").alias("series"))
    )
    wide = by_type.pivot(on="Crime type", index="postcode", values="series")

    # Pivot emits columns in discovery order, which follows the unordered
    # group_by above, and omits any type that never matched. Select in the
    # canonical order instead and fill the gaps with typed nulls so the file's
    # schema does not depend on the data or on the run.
    series_dtype = by_type.schema["series"]
    present = set(wide.columns)
    ordered = []
    for name in (*ALL_CRIME_TYPES, "Serious crime", "Minor crime"):
        if name in present:
            column = pl.col(name)
        else:
            column = pl.lit(None, dtype=series_dtype)
        ordered.append(column.alias(f"{name} (by year)"))
    wide = wide.select("postcode", *ordered).sort("postcode")

    output_path.parent.mkdir(parents=True, exist_ok=True)
    wide.write_parquet(output_path, compression="zstd")
    print(f"Wrote postcode crime by-year series: {output_path} {wide.shape}")
def transform_crime_spatial(
    crime_dir: Path,
    boundaries_dir: Path,
    output_path: Path,
    by_year_output_path: Path,
    buffer_m: float = DEFAULT_BUFFER_M,
    max_postcodes: int | None = None,
    max_files: int | None = None,
) -> None:
    """Run the spatial crime aggregation end to end and write both parquet files.

    Steps: find the street crime CSVs, derive the month calendar from their
    directories, load and buffer the postcode polygons, count crime points per
    (postcode, type, year), then write the avg/yr and by-year outputs.

    Args:
        crime_dir: Directory searched for street crime CSVs.
        boundaries_dir: Directory passed to the postcode polygon loader.
        output_path: Destination for the ``(avg/yr)`` parquet.
        by_year_output_path: Destination for the ``(by year)`` parquet.
        buffer_m: Outward buffer applied to each postcode polygon.
        max_postcodes: Optional cap forwarded to the polygon loader.
        max_files: Optional cap; only the first ``max_files`` CSVs are used.

    Raises:
        FileNotFoundError: If no street crime CSV is found in ``crime_dir``.
    """
    csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
    if not csvs:
        raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}")
    if max_files is not None:
        csvs = csvs[:max_files]
    years, months_in_year, valid_month_count = _month_calendar(csvs)

    ignored_note = ""
    if ignored_csv_count:
        ignored_note = f" (ignored {ignored_csv_count} non-street CSVs)"
    print(
        f"Found {len(csvs):,} street crime CSVs across {valid_month_count} months "
        f"({years[0]}-{years[-1]}){ignored_note}"
    )

    postcodes, polygons = load_postcode_polygons(boundaries_dir, max_postcodes)
    print(f"Buffering {len(postcodes):,} postcode polygons by {buffer_m:g}m...")
    _, tree = _build_tree(polygons, buffer_m)

    # Lookup tables from label to position along the counts axes.
    type_to_idx = dict(zip(ALL_CRIME_TYPES, range(len(ALL_CRIME_TYPES))))
    year_to_idx = dict(zip(years, range(len(years))))
    cube_shape = (len(postcodes), len(ALL_CRIME_TYPES), len(years))
    counts = np.zeros(cube_shape, dtype=np.int32)

    transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
    _accumulate_counts(csvs, tree, type_to_idx, year_to_idx, transformer, counts)
    _write_avg_yr(postcodes, counts, valid_month_count, output_path)
    _write_by_year(postcodes, counts, years, months_in_year, by_year_output_path)
def main() -> None:
    """Parse command-line arguments and run ``transform_crime_spatial``.

    Raises:
        SystemExit: If ``--buffer-m`` is not a finite number greater than zero.
            argparse also exits on missing or malformed arguments.
    """
    parser = argparse.ArgumentParser(
        description="Count police.uk crime points within 50m of each postcode boundary"
    )
    parser.add_argument(
        "--input",
        type=Path,
        default=Path("property-data/crime"),
        help="Directory containing police.uk street crime CSVs",
    )
    parser.add_argument(
        "--boundaries",
        type=Path,
        default=Path("property-data/postcode_boundaries/units"),
        help="Directory of per-district postcode boundary GeoJSONs",
    )
    parser.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output parquet: postcode + '{type} (avg/yr)' columns",
    )
    parser.add_argument(
        "--output-by-year",
        type=Path,
        required=True,
        help="Output parquet: postcode + nested '{type} (by year)' columns",
    )
    parser.add_argument(
        "--buffer-m",
        type=float,
        default=DEFAULT_BUFFER_M,
        help="Outward buffer (metres) added to each postcode boundary",
    )
    parser.add_argument(
        "--max-postcodes",
        type=int,
        default=None,
        help="Testing only: process the first N postcodes",
    )
    parser.add_argument(
        "--max-files",
        type=int,
        default=None,
        help="Testing only: process the first N monthly CSV files",
    )
    args = parser.parse_args()
    # ``type=float`` accepts "nan" and "inf", and ``nan <= 0`` is False, so
    # test for the valid range rather than the invalid one. This rejects a bad
    # value up front instead of after the whole CSV pass.
    if not 0 < args.buffer_m < float("inf"):
        raise SystemExit("--buffer-m must be a finite number greater than zero")
    transform_crime_spatial(
        crime_dir=args.input,
        boundaries_dir=args.boundaries,
        output_path=args.output,
        by_year_output_path=args.output_by_year,
        buffer_m=args.buffer_m,
        max_postcodes=args.max_postcodes,
        max_files=args.max_files,
    )


if __name__ == "__main__":
    main()