Improve data pipeline

This commit is contained in:
Andras Schmelczer 2026-06-01 20:10:03 +01:00
parent e8345cbdc1
commit f99bd4e5c9
36 changed files with 966 additions and 129 deletions

View file

@@ -44,6 +44,7 @@ import shapely
from pyproj import Transformer
from pipeline.transform.crime import (
LEGACY_CRIME_TYPE_ALIASES,
MINOR_CRIME_TYPES,
SERIOUS_CRIME_TYPES,
find_street_crime_csvs,
@@ -150,6 +151,11 @@ def _accumulate_counts(
& (pl.col("Crime type") != "")
& pl.col("year").is_in(years)
)
# Canonicalise legacy pre-2014 crime-type names ("Violent crime",
# "Public disorder and weapons") to their current equivalents before
# indexing, so ~1.9M historical incidents are counted instead of
# dropped. `.replace` leaves current types unchanged.
.with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
# Map crime types to indices with default=None so an unrecognised
# type yields a null index we can *report* rather than silently drop
# (the legacy LSOA path surfaced unknown types via its dynamic pivot).