Improve data pipeline
This commit is contained in:
parent
e8345cbdc1
commit
f99bd4e5c9
36 changed files with 966 additions and 129 deletions
|
|
@@ -44,6 +44,7 @@ import shapely
|
|||
from pyproj import Transformer
|
||||
|
||||
from pipeline.transform.crime import (
|
||||
LEGACY_CRIME_TYPE_ALIASES,
|
||||
MINOR_CRIME_TYPES,
|
||||
SERIOUS_CRIME_TYPES,
|
||||
find_street_crime_csvs,
|
||||
|
|
@@ -150,6 +151,11 @@ def _accumulate_counts(
|
|||
& (pl.col("Crime type") != "")
|
||||
& pl.col("year").is_in(years)
|
||||
)
|
||||
# Canonicalise legacy pre-2014 crime-type names ("Violent crime",
|
||||
# "Public disorder and weapons") to their current equivalents before
|
||||
# indexing, so ~1.9M historical incidents are counted instead of
|
||||
# dropped. `.replace` leaves current types unchanged.
|
||||
.with_columns(pl.col("Crime type").replace(LEGACY_CRIME_TYPE_ALIASES))
|
||||
# Map crime types to indices with default=None so an unrecognised
|
||||
# type yields a null index we can *report* rather than silently drop
|
||||
# (the legacy LSOA path surfaced unknown types via its dynamic pivot).
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue