Improve data pipeline

2026-06-01 20:10:03 +01:00 · 2026-06-01 20:10:03 +01:00 · f99bd4e5c9
commit f99bd4e5c9
parent e8345cbdc1
36 changed files with 966 additions and 129 deletions
--- a/pipeline/transform/crime_spatial.py
+++ b/pipeline/transform/crime_spatial.py
@@ -44,6 +44,7 @@ import shapely
 from pyproj import Transformer

 from pipeline.transform.crime import (
+    LEGACY_CRIME_TYPE_ALIASES,
    MINOR_CRIME_TYPES,
    SERIOUS_CRIME_TYPES,
    find_street_crime_csvs,
@@ -150,6 +151,11 @@ def _accumulate_counts(
                & (pl.col("Crime type") != "")
                & pl.col("year").is_in(years)
            )
+            # Canonicalise legacy pre-2014 crime-type names ("Violent crime",
+            # "Public disorder and weapons") to their current equivalents before
+            # indexing, so ~1.9M historical incidents are counted instead of
+            # dropped. `.replace` leaves current types unchanged.
+            .with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
            # Map crime types to indices with default=None so an unrecognised
            # type yields a null index we can *report* rather than silently drop
            # (the legacy LSOA path surfaced unknown types via its dynamic pivot).