# perfect-postcode/pipeline/transform/crime_hotspot_tiles.py

"""Build PMTiles point tiles for the crime heatmap overlay.

The output intentionally keeps point features rather than H3/grid aggregates so
MapLibre can render a true client-side heatmap. Police.uk coordinates are
published anonymous map points, not exact offence locations.
"""

from __future__ import annotations

import argparse
import json
import shutil
import subprocess
import tempfile
from pathlib import Path

import polars as pl

from pipeline.local_temp import local_tmp_dir
from pipeline.transform.crime import LEGACY_CRIME_TYPE_ALIASES, find_street_crime_csvs


def _latest_months(crime_dir: Path, month_count: int) -> list[str]:
    csvs, _ignored = find_street_crime_csvs(crime_dir)
    months = sorted({path.parent.name for path in csvs})
    if not months:
        raise FileNotFoundError(f"No street crime CSVs found in {crime_dir}")
    return months[-month_count:]


def _street_csvs_for_months(crime_dir: Path, months: set[str]) -> list[Path]:
    """Return the street crime CSVs whose parent folder is one of ``months``.

    Args:
        crime_dir: Directory searched via ``find_street_crime_csvs``.
        months: Month folder names to keep.

    Returns:
        The matching CSV paths, in the order the finder reported them.

    Raises:
        FileNotFoundError: If none of the discovered CSVs belongs to ``months``.
    """
    discovered, _skipped = find_street_crime_csvs(crime_dir)
    matching = [
        csv_path for csv_path in discovered if csv_path.parent.name in months
    ]
    if matching:
        return matching
    raise FileNotFoundError(f"No street crime CSVs found for {sorted(months)}")


def _require_tippecanoe() -> str:
    executable = shutil.which("tippecanoe")
    if executable is None:
        raise RuntimeError(
            "tippecanoe is required to build crime hotspot PMTiles. "
            "Install tippecanoe and rerun this target."
        )
    return executable


def _write_geojsonseq(csvs: list[Path], output_path: Path) -> tuple[int, int]:
    """Aggregate incidents and write them as newline-delimited GeoJSON points.

    Rows are grouped by ``(lon, lat, month, crime_type)`` and each group
    becomes one ``Point`` feature whose ``count`` and ``weight`` properties
    both hold the number of incidents in the group.

    Why aggregate: police.uk snaps every incident to a shared "map point"
    anchor, so many incidents share the exact same coordinate. tippecanoe's
    ``--drop-densest-as-needed`` thins *feature density*, not weight; with one
    feature per incident the busiest streets were silently deleted. With one
    weighted feature per anchor the hotspots survive and only redundant
    duplicates are lost, while the per-month and per-crime-type filters keep
    working. The heatmap reads ``count`` as its weight.

    Args:
        csvs: Street crime CSV files to read.
        output_path: Destination file; one compact JSON feature per line.

    Returns:
        ``(feature_count, incident_count)``: the number of features written
        and the total number of incidents they represent.
    """
    # Pin the dtypes of the four columns we use; unparsable values become
    # nulls (``ignore_errors=True``) instead of aborting the scan.
    points = pl.scan_csv(
        csvs,
        schema_overrides={
            "Longitude": pl.Float64,
            "Latitude": pl.Float64,
            "Month": pl.Utf8,
            "Crime type": pl.Utf8,
        },
        ignore_errors=True,
    ).select(
        pl.col("Longitude").alias("lon"),
        pl.col("Latitude").alias("lat"),
        pl.col("Month").alias("month"),
        pl.col("Crime type").alias("crime_type"),
    )

    # Keep only rows with coordinates inside the accepted bounding box.
    inside_box = pl.col("lon").is_between(-9.5, 5.0) & pl.col("lat").is_between(
        49.0, 57.0
    )
    located = points.drop_nulls(["lon", "lat"]).filter(inside_box)

    # Canonicalise any legacy pre-2014 type names so crime_type values always
    # match the frontend's canonical filter list (a no-op for the recent
    # months this overlay normally covers).
    canonical = located.with_columns(
        pl.col("crime_type").replace(LEGACY_CRIME_TYPE_ALIASES)
    )

    weighted = (
        canonical.group_by("lon", "lat", "month", "crime_type")
        .agg(pl.len().alias("count"))
        .collect(engine="streaming")
    )
    total_incidents = int(weighted["count"].sum())

    def _feature_line(point: dict) -> str:
        # One compact GeoJSON Feature per line (GeoJSON text sequence).
        n_incidents = point["count"]
        feature = {
            "type": "Feature",
            "geometry": {
                "type": "Point",
                "coordinates": [point["lon"], point["lat"]],
            },
            "properties": {
                "count": n_incidents,
                "weight": n_incidents,
                "month": point["month"],
                "crime_type": point["crime_type"],
            },
        }
        return json.dumps(feature, separators=(",", ":")) + "\n"

    with output_path.open("w") as sink:
        sink.writelines(
            _feature_line(point) for point in weighted.iter_rows(named=True)
        )

    return weighted.height, total_incidents


def build_crime_hotspot_tiles(
    crime_dir: Path,
    output_path: Path,
    months: int,
    min_zoom: int,
    max_zoom: int,
) -> None:
    """Build the crime heatmap tileset from the latest months of street crime.

    The selected CSVs are aggregated into a temporary GeoJSON sequence file,
    which tippecanoe then turns into the ``crime_hotspots`` layer written to
    ``output_path`` (overwriting any existing file via ``--force``).

    Args:
        crime_dir: Directory containing the street crime CSVs.
        output_path: Tileset file to create; parent directories are created.
        months: Number of most recent months to include.
        min_zoom: Value passed to tippecanoe's ``--minimum-zoom``.
        max_zoom: Value passed to tippecanoe's ``--maximum-zoom``.

    Raises:
        RuntimeError: If tippecanoe is not installed.
        FileNotFoundError: If no matching street crime CSVs exist.
        subprocess.CalledProcessError: If tippecanoe exits with an error.
    """
    # Fail fast on a missing binary before doing any expensive CSV work.
    tippecanoe_bin = _require_tippecanoe()
    wanted_months = set(_latest_months(crime_dir, months))
    source_csvs = _street_csvs_for_months(crime_dir, wanted_months)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory(dir=local_tmp_dir()) as scratch:
        points_path = Path(scratch) / "crime_hotspots.geojsonseq"
        n_features, n_incidents = _write_geojsonseq(source_csvs, points_path)

        first_month = min(wanted_months)
        last_month = max(wanted_months)
        print(
            f"Writing {n_features:,} weighted crime heatmap points "
            f"({n_incidents:,} incidents) "
            f"from {first_month} to {last_month}"
        )

        # tippecanoe's own scratch files share the same temporary directory,
        # so everything is cleaned up together when the context exits.
        command = [
            tippecanoe_bin,
            "--force",
            "--output", str(output_path),
            "--layer", "crime_hotspots",
            "--minimum-zoom", str(min_zoom),
            "--maximum-zoom", str(max_zoom),
            "--drop-densest-as-needed",
            "--extend-zooms-if-still-dropping",
            "--temporary-directory", scratch,
            str(points_path),
        ]
        subprocess.run(command, check=True)


def main() -> None:
    """Parse command-line options and build the crime hotspot tileset."""
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--input",
        type=Path,
        required=True,
        help="Crime CSV directory",
    )
    cli.add_argument(
        "--output",
        type=Path,
        required=True,
        help="Output .pmtiles path",
    )
    cli.add_argument(
        "--months",
        type=int,
        default=12,
        help="Latest complete months to include in the heatmap",
    )
    cli.add_argument("--min-zoom", type=int, default=12)
    cli.add_argument("--max-zoom", type=int, default=16)
    options = cli.parse_args()

    build_crime_hotspot_tiles(
        crime_dir=options.input,
        output_path=options.output,
        months=options.months,
        min_zoom=options.min_zoom,
        max_zoom=options.max_zoom,
    )


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()