"""Aggregate police.uk street crime to postcodes by spatial proximity. Instead of attributing each incident to its published LSOA code, this transform counts the anonymised incident *points* that fall within ``buffer_m`` (default 100m) of each postcode's boundary polygon (the polygon buffered outward). A point inside several overlapping buffers counts for each postcode -- the same multiplicity the tree-density filter uses for features near more than one postcode. The wide 100m buffer deliberately smooths police.uk's snap-to-grid coordinates, which would otherwise make the count hypersensitive to which side of a narrow line a shared "map point" anchor happened to land on. Counts are **area-normalised**: each postcode's count is divided by its buffered catchment area and rescaled by the median catchment area, so the metric reflects crime *density* rather than how much ground the buffer sweeps (a median-sized catchment is left unchanged; a large rural postcode is no longer inflated simply for covering more of the map). Normalising by the buffered area -- the region that actually collects points -- rather than the raw polygon keeps tiny unit postcodes from being over-inflated by the fixed buffer-ring floor. The headline ``"{type} (avg/yr)"`` is the simple mean of the per-year annualised counts, so it equals the average of the by-year chart bars. Outputs mirror the old LSOA transform's shape but are keyed on ``postcode``: * ``crime_by_postcode.parquet`` -- ``postcode`` + ``"{type} (avg/yr)"`` columns. * ``crime_by_postcode_by_year.parquet`` -- ``postcode`` + ``"{type} (by year)"`` nested ``list[struct{year, count}]`` columns, with Serious/Minor rollups. Caveat: police.uk coordinates are snapped to a fixed set of anonymous "map points", not true locations, and a share of rows have no coordinate at all (dropped here). Spatial totals are therefore fuzzier than the old LSOA-tagged counts -- by design, not a regression. """ from __future__ import annotations import argparse import re import sys from pathlib import Path import numpy as np import polars as pl import shapely from pyproj import Transformer from pipeline.transform.crime import ( LEGACY_CRIME_TYPE_ALIASES, MINOR_CRIME_TYPES, SERIOUS_CRIME_TYPES, find_street_crime_csvs, ) from pipeline.transform.postcode_boundaries.loader import load_postcode_polygons # Serious types first so column order is stable and self-documenting. ALL_CRIME_TYPES: tuple[str, ...] = SERIOUS_CRIME_TYPES + MINOR_CRIME_TYPES DEFAULT_BUFFER_M = 100.0 MONTH_DIR_RE = re.compile(r"^\d{4}-\d{2}$") # Generous GB bounds; points outside fall in no English postcode anyway, but # filtering first keeps the WGS84->BNG transform out of its undefined region. LON_BOUNDS = (-9.5, 2.5) LAT_BOUNDS = (49.0, 61.5) # Read CSVs in chunks of files to bound peak memory while keeping the STRtree # query vectorised over a useful number of points. _CSV_BATCH = 64 def _month_calendar(csvs: list[Path]) -> tuple[list[int], dict[int, int], int]: """Derive annualisation denominators from the monthly directory names. Each police.uk file lives under ``{crime_dir}/{YYYY-MM}/...`` and holds that month's incidents, so the set of month directories is the set of observed months. Returns the sorted distinct years, months-observed-per-year, and the total month count (the avg/yr denominator). """ months = sorted( {path.parent.name for path in csvs if MONTH_DIR_RE.fullmatch(path.parent.name)} ) if not months: raise ValueError("No valid YYYY-MM month directories found among crime CSVs") months_in_year: dict[int, int] = {} for month in months: year = int(month[:4]) months_in_year[year] = months_in_year.get(year, 0) + 1 years = sorted(months_in_year) return years, months_in_year, len(months) def _build_tree( polygons: np.ndarray, buffer_m: float ) -> tuple[np.ndarray, shapely.STRtree]: """Buffer postcode polygons outward by ``buffer_m`` and index them. Buffer index == postcode index. Geometries that fail to buffer are replaced with an empty polygon so the index stays aligned; they simply never match. """ buffers = shapely.buffer(polygons, buffer_m, quad_segs=8) broken = shapely.is_missing(buffers) | ~shapely.is_valid(buffers) if broken.any(): print(f" {int(broken.sum()):,} postcode buffers unusable; left empty") buffers[broken] = shapely.from_wkt("POLYGON EMPTY") return buffers, shapely.STRtree(buffers) def _accumulate_counts( csvs: list[Path], tree: shapely.STRtree, type_to_idx: dict[str, int], year_to_idx: dict[int, int], transformer: Transformer, counts: np.ndarray, ) -> None: """Stream the crime CSVs, counting points-in-buffer per (postcode, type, year).""" schema = { "Longitude": pl.Float64, "Latitude": pl.Float64, "Month": pl.Utf8, "Crime type": pl.Utf8, } years = list(year_to_idx) total_points = 0 total_matches = 0 total_dropped = 0 unknown_type_counts: dict[str, int] = {} for start in range(0, len(csvs), _CSV_BATCH): batch = csvs[start : start + _CSV_BATCH] frame = ( pl.scan_csv( batch, schema_overrides=schema, ignore_errors=True, ) .select("Longitude", "Latitude", "Month", "Crime type") # strict=False: a single malformed Month drops only that row instead # of aborting the whole build (a non-numeric year becomes null and is # filtered out by the year membership check below). .with_columns( pl.col("Month").str.slice(0, 4).cast(pl.Int32, strict=False).alias("year") ) .filter( pl.col("Longitude").is_not_null() & pl.col("Latitude").is_not_null() & pl.col("Longitude").is_between(*LON_BOUNDS) & pl.col("Latitude").is_between(*LAT_BOUNDS) & pl.col("Crime type").is_not_null() & (pl.col("Crime type") != "") & pl.col("year").is_in(years) ) # Canonicalise legacy pre-2014 crime-type names ("Violent crime", # "Public disorder and weapons") to their current equivalents before # indexing, so ~1.9M historical incidents are counted instead of # dropped. `.replace` leaves current types unchanged. .with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES)) # Map crime types to indices with default=None so an unrecognised # type yields a null index we can *report* rather than silently drop # (the legacy LSOA path surfaced unknown types via its dynamic pivot). .with_columns( pl.col("Crime type") .replace_strict(type_to_idx, default=None, return_dtype=pl.Int32) .alias("tidx"), pl.col("year") .replace_strict(year_to_idx, return_dtype=pl.Int32) .alias("yidx"), ) .select("Longitude", "Latitude", "Crime type", "tidx", "yidx") .collect(engine="streaming") ) if frame.height == 0: continue unknown = frame.filter(pl.col("tidx").is_null()) if unknown.height: for name, cnt in unknown.group_by("Crime type").len().iter_rows(): unknown_type_counts[name] = unknown_type_counts.get(name, 0) + cnt frame = frame.filter(pl.col("tidx").is_not_null()) if frame.height == 0: continue lon = frame["Longitude"].to_numpy() lat = frame["Latitude"].to_numpy() tidx = frame["tidx"].to_numpy() yidx = frame["yidx"].to_numpy() x, y = transformer.transform(lon, lat) finite = np.isfinite(x) & np.isfinite(y) total_dropped += int((~finite).sum()) if not finite.any(): continue x, y, tidx, yidx = x[finite], y[finite], tidx[finite], yidx[finite] total_points += x.size points = shapely.points(x, y) point_index, postcode_index = tree.query(points, predicate="intersects") if point_index.size: np.add.at( counts, (postcode_index, tidx[point_index], yidx[point_index]), 1, ) total_matches += point_index.size print( f" files {start + len(batch):,}/{len(csvs):,}: " f"{total_points:,} located points, {total_matches:,} postcode matches" ) if total_dropped: print(f"Dropped {total_dropped:,} points outside the BNG transform domain") if unknown_type_counts: total_unknown = sum(unknown_type_counts.values()) listed = ", ".join( f"{name!r} ({cnt:,})" for name, cnt in sorted( unknown_type_counts.items(), key=lambda kv: kv[1], reverse=True ) ) print( f"WARNING: dropped {total_unknown:,} incidents with crime types not in " f"ALL_CRIME_TYPES (taxonomy is stale -- update SERIOUS/MINOR_CRIME_TYPES): " f"{listed}", file=sys.stderr, ) def _rollup_long( long: pl.DataFrame, types: tuple[str, ...], rollup_name: str ) -> pl.DataFrame: """Sum per-year annualised counts across ``types`` into a single rollup.""" return ( long.filter(pl.col("Crime type").is_in(list(types))) .group_by("postcode", "year") .agg(pl.col("count").sum().round(1).alias("count")) .with_columns(pl.lit(rollup_name).alias("Crime type")) .select("postcode", "Crime type", "year", "count") ) def _write_avg_yr( postcodes: np.ndarray, counts: np.ndarray, years: list[int], months_in_year: dict[int, int], norm: np.ndarray, output_path: Path, ) -> None: """Write ``postcode`` + ``"{type} (avg/yr)"`` density-normalised averages. The headline figure is the **simple mean of the per-year annualised counts** (each year scaled to a 12-month equivalent), so it equals the average of the by-year chart bars instead of a month-weighted pooled rate. Each postcode's value is then multiplied by ``norm`` (median_area / buffered catchment area) so the metric is a density rather than a footprint-inflated raw count. """ months = np.array([months_in_year[year] for year in years], dtype=np.float64) per_year = counts.astype(np.float64) * 12.0 / months[None, None, :] # Average over the years *this postcode* actually has incidents of *this # type* -- the same per-(postcode, type) x-span the by-year chart plots # (server-rs/.../crime_by_year.rs), so the headline equals the mean of the # by-year bars. Dividing by a global years-present count (years a type # appeared anywhere in England) would deflate postcodes whose incidents # cluster in only a few years of the ~13-year window. years_present = np.clip((counts > 0).sum(axis=2), 1, None).astype(np.float64) avg = per_year.sum(axis=2) / years_present # (n_postcodes, n_types) avg = np.round(avg * norm[:, None], 1).astype(np.float32) data: dict[str, np.ndarray] = {"postcode": postcodes} for type_idx, name in enumerate(ALL_CRIME_TYPES): data[f"{name} (avg/yr)"] = avg[:, type_idx] output_path.parent.mkdir(parents=True, exist_ok=True) pl.DataFrame(data).write_parquet(output_path, compression="zstd") print(f"Wrote postcode crime averages: {output_path}") def _write_by_year( postcodes: np.ndarray, counts: np.ndarray, years: list[int], months_in_year: dict[int, int], norm: np.ndarray, output_path: Path, ) -> None: """Write nested ``"{type} (by year)"`` series plus Serious/Minor rollups. Per-year counts are area-normalised by the same ``norm`` (median_area / buffered catchment area) factor applied to the avg/yr headline, so the chart bars and the headline figure remain mutually consistent. """ months = np.array([months_in_year[year] for year in years], dtype=np.float64) annual = np.round( counts.astype(np.float64) * 12.0 / months[None, None, :] * norm[:, None, None], 1, ) pc_i, ty_i, yr_i = np.nonzero(counts) if pc_i.size == 0: raise ValueError("No crime points matched any postcode buffer") type_names = np.array(ALL_CRIME_TYPES, dtype=object) year_values = np.array(years, dtype=np.int32) long = pl.DataFrame( { "postcode": postcodes[pc_i], "Crime type": type_names[ty_i], "year": year_values[yr_i], "count": annual[pc_i, ty_i, yr_i].astype(np.float32), } ) serious = _rollup_long(long, SERIOUS_CRIME_TYPES, "Serious crime") minor = _rollup_long(long, MINOR_CRIME_TYPES, "Minor crime") combined = pl.concat([long, serious, minor]) by_type = ( combined.sort("year") .group_by("postcode", "Crime type") .agg(pl.struct("year", "count").alias("series")) ) wide = by_type.pivot(on="Crime type", index="postcode", values="series") type_cols = [c for c in wide.columns if c != "postcode"] wide = wide.rename({col: f"{col} (by year)" for col in type_cols}) output_path.parent.mkdir(parents=True, exist_ok=True) wide.write_parquet(output_path, compression="zstd") print(f"Wrote postcode crime by-year series: {output_path} {wide.shape}") def transform_crime_spatial( crime_dir: Path, boundaries_dir: Path, output_path: Path, by_year_output_path: Path, buffer_m: float = DEFAULT_BUFFER_M, max_postcodes: int | None = None, max_files: int | None = None, ) -> None: csvs, ignored_csv_count = find_street_crime_csvs(crime_dir) if not csvs: raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}") if max_files is not None: csvs = csvs[:max_files] years, months_in_year, valid_month_count = _month_calendar(csvs) print( f"Found {len(csvs):,} street crime CSVs across {valid_month_count} months " f"({years[0]}-{years[-1]})" + (f" (ignored {ignored_csv_count} non-street CSVs)" if ignored_csv_count else "") ) postcodes, polygons = load_postcode_polygons(boundaries_dir, max_postcodes) print(f"Buffering {len(postcodes):,} postcode polygons by {buffer_m:g}m...") buffers, tree = _build_tree(polygons, buffer_m) # Area-normalisation factor (median_area / catchment_area): divides out the # size of each postcode's catchment so the count measures crime density, not # how much ground the buffer sweeps. We normalise by the *buffered* area -- # the region that actually collects points -- rather than the raw polygon, so # a tiny unit postcode isn't over-inflated by the fixed buffer-ring floor. # Buffers are in EPSG:27700, so shapely.area is in m^2. areas = shapely.area(buffers).astype(np.float64) usable_area = np.isfinite(areas) & (areas > 0) if not usable_area.any(): raise ValueError("No postcode buffers have a positive area to normalise by") median_area = float(np.median(areas[usable_area])) norm = np.zeros(len(postcodes), dtype=np.float64) norm[usable_area] = median_area / areas[usable_area] print( f"Area-normalising to median catchment area {median_area:,.0f} m^2 " f"({int(usable_area.sum()):,}/{len(areas):,} postcodes have usable area)" ) type_to_idx = {name: idx for idx, name in enumerate(ALL_CRIME_TYPES)} year_to_idx = {year: idx for idx, year in enumerate(years)} counts = np.zeros((len(postcodes), len(ALL_CRIME_TYPES), len(years)), dtype=np.int32) transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True) _accumulate_counts(csvs, tree, type_to_idx, year_to_idx, transformer, counts) _write_avg_yr(postcodes, counts, years, months_in_year, norm, output_path) _write_by_year(postcodes, counts, years, months_in_year, norm, by_year_output_path) def main() -> None: parser = argparse.ArgumentParser( description="Count police.uk crime points within 50m of each postcode boundary" ) parser.add_argument( "--input", type=Path, default=Path("property-data/crime"), help="Directory containing police.uk street crime CSVs", ) parser.add_argument( "--boundaries", type=Path, default=Path("property-data/postcode_boundaries/units"), help="Directory of per-district postcode boundary GeoJSONs", ) parser.add_argument( "--output", type=Path, required=True, help="Output parquet: postcode + '{type} (avg/yr)' columns", ) parser.add_argument( "--output-by-year", type=Path, required=True, help="Output parquet: postcode + nested '{type} (by year)' columns", ) parser.add_argument( "--buffer-m", type=float, default=DEFAULT_BUFFER_M, help="Outward buffer (metres) added to each postcode boundary", ) parser.add_argument( "--max-postcodes", type=int, default=None, help="Testing only: process the first N postcodes", ) parser.add_argument( "--max-files", type=int, default=None, help="Testing only: process the first N monthly CSV files", ) args = parser.parse_args() if args.buffer_m <= 0: raise SystemExit("--buffer-m must be greater than zero") transform_crime_spatial( crime_dir=args.input, boundaries_dir=args.boundaries, output_path=args.output, by_year_output_path=args.output_by_year, buffer_m=args.buffer_m, max_postcodes=args.max_postcodes, max_files=args.max_files, ) if __name__ == "__main__": main()