Improve data pipeline

This commit is contained in:
Andras Schmelczer 2026-06-01 20:10:03 +01:00
parent e8345cbdc1
commit f99bd4e5c9
36 changed files with 966 additions and 129 deletions

View file

@ -17,7 +17,7 @@ from pathlib import Path
import polars as pl
from pipeline.local_temp import local_tmp_dir
from pipeline.transform.crime import find_street_crime_csvs
from pipeline.transform.crime import LEGACY_CRIME_TYPE_ALIASES, find_street_crime_csvs
def _latest_months(crime_dir: Path, month_count: int) -> list[str]:
@ -80,6 +80,10 @@ def _write_geojsonseq(csvs: list[Path], output_path: Path) -> tuple[int, int]:
.drop_nulls(["lon", "lat"])
.filter(pl.col("lon").is_between(-9.5, 5.0))
.filter(pl.col("lat").is_between(49.0, 57.0))
# Canonicalise any legacy pre-2014 crime type names so that the heatmap's
# crime_type values always match the frontend's canonical filter list.
# For the recent months this overlay normally covers, this is a no-op.
.with_columns(pl.col("crime_type").replace(LEGACY_CRIME_TYPE_ALIASES))
.group_by("lon", "lat", "month", "crime_type")
.len()
.rename({"len": "count"})