# perfect-postcode/pipeline/transform/crime_spatial.py

"""Aggregate police.uk street crime to postcodes by 50m spatial proximity.

Instead of attributing each incident to its published LSOA code, this transform
counts the anonymised incident *points* that fall within 50m of each postcode's
boundary polygon (the polygon buffered outward by 50m). A point inside several
overlapping buffers counts for each postcode -- the same multiplicity the
tree-density filter uses for features near more than one postcode.

The metric is a raw annualised count ("incidents/year within 50m"); there is no
per-capita denominator. Outputs mirror the old LSOA transform's shape but are
keyed on ``postcode`` instead of ``LSOA code``:

* ``crime_by_postcode.parquet`` -- ``postcode`` + ``"{type} (avg/yr)"`` columns.
* ``crime_by_postcode_by_year.parquet`` -- ``postcode`` + ``"{type} (by year)"``
  nested ``list[struct{year, count}]`` columns, with Serious/Minor rollups.

Caveat: police.uk coordinates are snapped to a fixed set of anonymous "map
points", not true locations, and a share of rows have no coordinate at all
(dropped here). Spatial totals are therefore lower than, and fuzzier than, the
old LSOA-tagged counts -- by design, not a regression.
"""

from __future__ import annotations

import argparse
import re
from pathlib import Path

import numpy as np
import polars as pl
import shapely
from pyproj import Transformer

from pipeline.transform.crime import (
    MINOR_CRIME_TYPES,
    SERIOUS_CRIME_TYPES,
    find_street_crime_csvs,
)
from pipeline.transform.postcode_boundaries.loader import load_postcode_polygons

# Canonical crime-type order: serious types first, then minor. The position of
# a type in this sequence is its index on the type axis of the counts array and
# the order of the "(avg/yr)" output columns, so it is stable and
# self-documenting.
ALL_CRIME_TYPES: tuple[str, ...] = SERIOUS_CRIME_TYPES + MINOR_CRIME_TYPES

# Default outward buffer, in metres, applied to each postcode polygon.
DEFAULT_BUFFER_M = 50.0
# Matches a monthly directory name of the form ``YYYY-MM`` (e.g. ``2023-07``).
MONTH_DIR_RE = re.compile(r"^\d{4}-\d{2}$")

# Generous GB bounds (WGS84 degrees); points outside fall in no English
# postcode anyway, but filtering first keeps the WGS84->BNG transform out of
# its undefined region.
LON_BOUNDS = (-9.5, 2.5)
LAT_BOUNDS = (49.0, 61.5)

# Number of CSV files read per batch. Reading in chunks of files bounds peak
# memory while keeping the STRtree query vectorised over a useful number of
# points.
_CSV_BATCH = 64


def _month_calendar(csvs: list[Path]) -> tuple[list[int], dict[int, int], int]:
    """Work out how many months of data were observed, overall and per year.

    The name of each CSV's parent directory is taken as its month label.
    Labels that are not of the ``YYYY-MM`` form are ignored, and a month shared
    by several files is counted once.

    Returns:
        A ``(years, months_in_year, month_count)`` tuple: the distinct years in
        ascending order, a mapping from year to the number of distinct months
        seen in that year, and the total number of distinct months (the avg/yr
        denominator).

    Raises:
        ValueError: If no CSV sits in a ``YYYY-MM`` directory.
    """
    observed: set[str] = set()
    for csv_path in csvs:
        label = csv_path.parent.name
        if MONTH_DIR_RE.fullmatch(label):
            observed.add(label)

    if not observed:
        raise ValueError("No valid YYYY-MM month directories found among crime CSVs")

    # Walk the labels in sorted order so the mapping is filled year-ascending.
    per_year: dict[int, int] = {}
    for label in sorted(observed):
        calendar_year = int(label[:4])
        per_year[calendar_year] = per_year.get(calendar_year, 0) + 1

    return sorted(per_year), per_year, len(observed)


def _build_tree(
    polygons: np.ndarray, buffer_m: float
) -> tuple[np.ndarray, shapely.STRtree]:
    """Grow every postcode polygon by ``buffer_m`` and build a spatial index.

    Position ``i`` in the returned buffer array (and in the tree) corresponds
    to position ``i`` in ``polygons``. A buffer that comes back missing or
    invalid is swapped for an empty polygon rather than removed, which keeps
    that alignment intact.

    Returns:
        ``(buffers, tree)`` -- the buffered geometries and an ``STRtree`` built
        over them.
    """
    buffered = shapely.buffer(polygons, buffer_m, quad_segs=8)

    unusable = ~shapely.is_valid(buffered) | shapely.is_missing(buffered)
    if unusable.any():
        unusable_count = int(unusable.sum())
        print(f"  {unusable_count:,} postcode buffers unusable; left empty")
        # Overwrite in place so indices keep lining up with the postcodes.
        buffered[unusable] = shapely.from_wkt("POLYGON EMPTY")

    index = shapely.STRtree(buffered)
    return buffered, index


def _accumulate_counts(
    csvs: list[Path],
    tree: shapely.STRtree,
    type_to_idx: dict[str, int],
    year_to_idx: dict[int, int],
    transformer: Transformer,
    counts: np.ndarray,
) -> None:
    """Tally crime points per (postcode, type, year) into ``counts`` in place.

    The CSVs are read ``_CSV_BATCH`` files at a time. A row is kept only if it
    has a longitude/latitude inside ``LON_BOUNDS``/``LAT_BOUNDS``, a crime type
    present in ``type_to_idx``, and a year (the first four characters of its
    ``Month`` value) present in ``year_to_idx``. Kept points are projected with
    ``transformer``; any whose projected coordinates are not finite are dropped
    and tallied. Every (point, buffer) intersection reported by ``tree`` adds
    one to the matching cell, so a point in several buffers counts for each.

    A progress line is printed after each batch that yielded located points,
    and a final line reports the number of non-finite points, if any.
    """
    column_types = {
        "Longitude": pl.Float64,
        "Latitude": pl.Float64,
        "Month": pl.Utf8,
        "Crime type": pl.Utf8,
    }
    crime_names = list(type_to_idx)
    year_values = list(year_to_idx)

    # The expressions do not depend on the batch, so build them once.
    year_of_month = pl.col("Month").str.slice(0, 4).cast(pl.Int32).alias("year")
    usable_row = (
        pl.col("Longitude").is_not_null()
        & pl.col("Latitude").is_not_null()
        & pl.col("Longitude").is_between(*LON_BOUNDS)
        & pl.col("Latitude").is_between(*LAT_BOUNDS)
        & pl.col("Crime type").is_in(crime_names)
        & pl.col("year").is_in(year_values)
    )
    type_code = (
        pl.col("Crime type")
        .replace_strict(type_to_idx, return_dtype=pl.Int32)
        .alias("tidx")
    )
    year_code = (
        pl.col("year")
        .replace_strict(year_to_idx, return_dtype=pl.Int32)
        .alias("yidx")
    )

    file_total = len(csvs)
    located = 0
    matched = 0
    non_finite = 0

    for offset in range(0, file_total, _CSV_BATCH):
        chunk = csvs[offset : offset + _CSV_BATCH]

        lazy = pl.scan_csv(chunk, schema_overrides=column_types, ignore_errors=True)
        lazy = lazy.select("Longitude", "Latitude", "Month", "Crime type")
        lazy = lazy.with_columns(year_of_month)
        lazy = lazy.filter(usable_row)
        lazy = lazy.with_columns(type_code, year_code)
        table = lazy.select("Longitude", "Latitude", "tidx", "yidx").collect(
            engine="streaming"
        )

        if table.height == 0:
            continue

        lon, lat, tidx, yidx = (
            table[name].to_numpy()
            for name in ("Longitude", "Latitude", "tidx", "yidx")
        )

        easting, northing = transformer.transform(lon, lat)
        ok = np.isfinite(easting) & np.isfinite(northing)
        non_finite += int((~ok).sum())
        if not ok.any():
            continue

        easting = easting[ok]
        northing = northing[ok]
        tidx = tidx[ok]
        yidx = yidx[ok]
        located += easting.size

        hit_point, hit_postcode = tree.query(
            shapely.points(easting, northing), predicate="intersects"
        )
        if hit_point.size:
            # Unbuffered add so repeated (postcode, type, year) cells all count.
            np.add.at(counts, (hit_postcode, tidx[hit_point], yidx[hit_point]), 1)
            matched += hit_point.size

        print(
            f"  files {offset + len(chunk):,}/{file_total:,}: "
            f"{located:,} located points, {matched:,} postcode matches"
        )

    if non_finite:
        print(f"Dropped {non_finite:,} points outside the BNG transform domain")


def _rollup_long(
    long: pl.DataFrame, types: tuple[str, ...], rollup_name: str
) -> pl.DataFrame:
    """Collapse the rows for ``types`` into one rollup series per postcode/year.

    Rows of ``long`` whose ``Crime type`` is in ``types`` are grouped by
    ``postcode`` and ``year``; their ``count`` values are summed and rounded to
    one decimal place. The result carries ``rollup_name`` as its ``Crime type``
    and has the columns ``postcode``, ``Crime type``, ``year``, ``count``.
    """
    members = long.filter(pl.col("Crime type").is_in(list(types)))
    yearly_sum = pl.col("count").sum().round(1).alias("count")
    summed = members.group_by("postcode", "year").agg(yearly_sum)
    labelled = summed.with_columns(pl.lit(rollup_name).alias("Crime type"))
    return labelled.select("postcode", "Crime type", "year", "count")


def _write_avg_yr(
    postcodes: np.ndarray,
    counts: np.ndarray,
    valid_month_count: int,
    output_path: Path,
) -> None:
    """Write one annualised ``"{type} (avg/yr)"`` column per crime type.

    ``counts`` is summed over its last (year) axis, scaled by
    ``12 / valid_month_count`` and rounded to one decimal place. The frame has
    a ``postcode`` column followed by one float32 column per entry of
    ``ALL_CRIME_TYPES``, in that order, and is written to ``output_path`` as
    zstd-compressed parquet (parent directories are created if missing).
    """
    per_type_totals = counts.sum(axis=2)  # (n_postcodes, n_types)
    annualised = np.round(per_type_totals / valid_month_count * 12.0, 1).astype(
        np.float32
    )

    columns: dict[str, np.ndarray] = {"postcode": postcodes}
    columns.update(
        (f"{name} (avg/yr)", annualised[:, position])
        for position, name in enumerate(ALL_CRIME_TYPES)
    )

    output_path.parent.mkdir(parents=True, exist_ok=True)
    frame = pl.DataFrame(columns)
    frame.write_parquet(output_path, compression="zstd")
    print(f"Wrote postcode crime averages: {output_path}")


def _write_by_year(
    postcodes: np.ndarray,
    counts: np.ndarray,
    years: list[int],
    months_in_year: dict[int, int],
    output_path: Path,
) -> None:
    """Write nested ``"{type} (by year)"`` series plus Serious/Minor rollups.

    Every non-zero ``counts[postcode, type, year]`` cell is annualised as
    ``count * 12 / months_in_year[year]`` and rounded to one decimal place.
    The per-type rows are extended with "Serious crime" and "Minor crime"
    rollups, gathered into a year-ordered list of ``{year, count}`` structs per
    (postcode, type), and pivoted to one ``"{type} (by year)"`` column per
    type.

    Only non-zero cells are emitted, so a postcode with no matches has no row
    and a crime type with no matches anywhere has no column.

    The output layout is deterministic: columns follow ``ALL_CRIME_TYPES``
    order with the two rollups last, and rows are sorted by postcode.

    Args:
        postcodes: Postcode labels, indexed like the first axis of ``counts``.
        counts: Integer incident counts, indexed (postcode, type, year).
        years: Years, indexed like the last axis of ``counts``.
        months_in_year: Number of observed months for each entry of ``years``.
        output_path: Destination parquet file; parent directories are created.

    Raises:
        ValueError: If ``counts`` has no non-zero cell.
    """
    months = np.array([months_in_year[year] for year in years], dtype=np.float64)

    pc_i, ty_i, yr_i = np.nonzero(counts)
    if pc_i.size == 0:
        raise ValueError("No crime points matched any postcode buffer")

    # Annualise only the non-zero cells. This gives the same values as scaling
    # the whole cube, without allocating a dense float64 copy of it.
    annual = np.round(
        counts[pc_i, ty_i, yr_i].astype(np.float64) * 12.0 / months[yr_i], 1
    )

    type_names = np.array(ALL_CRIME_TYPES, dtype=object)
    year_values = np.array(years, dtype=np.int32)
    long = pl.DataFrame(
        {
            "postcode": postcodes[pc_i],
            "Crime type": type_names[ty_i],
            "year": year_values[yr_i],
            "count": annual.astype(np.float32),
        }
    )

    serious = _rollup_long(long, SERIOUS_CRIME_TYPES, "Serious crime")
    minor = _rollup_long(long, MINOR_CRIME_TYPES, "Minor crime")
    combined = pl.concat([long, serious, minor])

    # Sorting by year first makes each aggregated series year-ascending: row
    # order within a group is preserved by group_by.
    by_type = (
        combined.sort("year")
        .group_by("postcode", "Crime type")
        .agg(pl.struct("year", "count").alias("series"))
    )
    wide = by_type.pivot(on="Crime type", index="postcode", values="series")

    # group_by returns groups in arbitrary order and pivot orders columns by
    # discovery, so impose a canonical column order and sort the rows to make
    # the written file reproducible between runs.
    canonical = [*ALL_CRIME_TYPES, "Serious crime", "Minor crime"]
    present = set(wide.columns)
    type_cols = [name for name in canonical if name in present]
    type_cols += [
        col for col in wide.columns if col != "postcode" and col not in canonical
    ]
    wide = (
        wide.select("postcode", *type_cols)
        .sort("postcode")
        .rename({col: f"{col} (by year)" for col in type_cols})
    )

    output_path.parent.mkdir(parents=True, exist_ok=True)
    wide.write_parquet(output_path, compression="zstd")
    print(f"Wrote postcode crime by-year series: {output_path}  {wide.shape}")


def transform_crime_spatial(
    crime_dir: Path,
    boundaries_dir: Path,
    output_path: Path,
    by_year_output_path: Path,
    buffer_m: float = DEFAULT_BUFFER_M,
    max_postcodes: int | None = None,
    max_files: int | None = None,
) -> None:
    """Run the spatial crime transform end to end and write both outputs.

    Steps: discover the street crime CSVs under ``crime_dir`` (optionally
    keeping only the first ``max_files``), derive the month calendar from their
    directories, load and buffer the postcode polygons from ``boundaries_dir``,
    count crime points per (postcode, type, year), then write the avg/yr table
    to ``output_path`` and the by-year table to ``by_year_output_path``.

    Raises:
        FileNotFoundError: If no street crime CSV is found in ``crime_dir``.
    """
    csvs, ignored_csv_count = find_street_crime_csvs(crime_dir)
    if not csvs:
        raise FileNotFoundError(f"No street crime CSV files found in {crime_dir}")
    if max_files is not None:
        csvs = csvs[:max_files]

    years, months_in_year, valid_month_count = _month_calendar(csvs)
    ignored_note = (
        f" (ignored {ignored_csv_count} non-street CSVs)" if ignored_csv_count else ""
    )
    print(
        f"Found {len(csvs):,} street crime CSVs across {valid_month_count} months "
        f"({years[0]}-{years[-1]}){ignored_note}"
    )

    postcodes, polygons = load_postcode_polygons(boundaries_dir, max_postcodes)
    print(f"Buffering {len(postcodes):,} postcode polygons by {buffer_m:g}m...")
    _buffered, tree = _build_tree(polygons, buffer_m)

    # Axis positions for the (postcode, type, year) count cube.
    type_to_idx = dict(zip(ALL_CRIME_TYPES, range(len(ALL_CRIME_TYPES))))
    year_to_idx = dict(zip(years, range(len(years))))
    cube_shape = (len(postcodes), len(ALL_CRIME_TYPES), len(years))
    counts = np.zeros(cube_shape, dtype=np.int32)

    # always_xy=True: inputs are (lon, lat), outputs are (easting, northing).
    transformer = Transformer.from_crs("EPSG:4326", "EPSG:27700", always_xy=True)
    _accumulate_counts(csvs, tree, type_to_idx, year_to_idx, transformer, counts)

    _write_avg_yr(postcodes, counts, valid_month_count, output_path)
    _write_by_year(postcodes, counts, years, months_in_year, by_year_output_path)


def main() -> None:
    """Parse the command line and run :func:`transform_crime_spatial`.

    Exits with an error message if ``--buffer-m`` is not greater than zero.
    """
    cli = argparse.ArgumentParser(
        description="Count police.uk crime points within 50m of each postcode boundary"
    )

    # (flag, add_argument keyword arguments), registered in this order.
    option_specs: tuple[tuple[str, dict], ...] = (
        (
            "--input",
            {
                "type": Path,
                "default": Path("property-data/crime"),
                "help": "Directory containing police.uk street crime CSVs",
            },
        ),
        (
            "--boundaries",
            {
                "type": Path,
                "default": Path("property-data/postcode_boundaries/units"),
                "help": "Directory of per-district postcode boundary GeoJSONs",
            },
        ),
        (
            "--output",
            {
                "type": Path,
                "required": True,
                "help": "Output parquet: postcode + '{type} (avg/yr)' columns",
            },
        ),
        (
            "--output-by-year",
            {
                "type": Path,
                "required": True,
                "help": "Output parquet: postcode + nested '{type} (by year)' columns",
            },
        ),
        (
            "--buffer-m",
            {
                "type": float,
                "default": DEFAULT_BUFFER_M,
                "help": "Outward buffer (metres) added to each postcode boundary",
            },
        ),
        (
            "--max-postcodes",
            {
                "type": int,
                "default": None,
                "help": "Testing only: process the first N postcodes",
            },
        ),
        (
            "--max-files",
            {
                "type": int,
                "default": None,
                "help": "Testing only: process the first N monthly CSV files",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    opts = cli.parse_args()

    if opts.buffer_m <= 0:
        raise SystemExit("--buffer-m must be greater than zero")

    transform_crime_spatial(
        crime_dir=opts.input,
        boundaries_dir=opts.boundaries,
        output_path=opts.output,
        by_year_output_path=opts.output_by_year,
        buffer_m=opts.buffer_m,
        max_postcodes=opts.max_postcodes,
        max_files=opts.max_files,
    )


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()