Improve data pipeline

2026-06-01 20:10:03 +01:00 · 2026-06-01 20:10:03 +01:00 · f99bd4e5c9
commit f99bd4e5c9
parent e8345cbdc1
36 changed files with 966 additions and 129 deletions
--- a/pipeline/transform/test_crime_spatial.py
+++ b/pipeline/transform/test_crime_spatial.py
@@ -279,3 +279,37 @@ def test_unknown_crime_type_is_dropped_with_warning(tmp_path, capsys):
    err = capsys.readouterr().err
    assert "Cyber fraud" in err
    assert "WARNING" in err
+
+
+def test_legacy_crime_types_are_mapped(tmp_path):
+    """Pre-2014 crime-type names ("Violent crime", "Public disorder and weapons")
+    are aliased to current equivalents instead of being dropped as unknown."""
+    units = tmp_path / "units"
+    _write_boundaries(
+        units, {"AB1": [_square_feature("AB1 1AA", 1000, 1000, 1010, 1010)]}
+    )
+
+    crime = tmp_path / "crime"
+    _write_month(
+        crime,
+        "2013-01",
+        [  # both rows carry a legacy type name, one row per legacy name
+            _crime_row("2013-01", 1005, 1005, "Violent crime"),
+            _crime_row("2013-01", 1005, 1005, "Public disorder and weapons"),
+        ],
+    )
+
+    output = tmp_path / "crime_by_postcode.parquet"
+    by_year = tmp_path / "crime_by_postcode_by_year.parquet"
+    transform_crime_spatial(crime, units, output, by_year, buffer_m=50.0)
+
+    row = pl.read_parquet(output).to_dicts()[0]
+    # Expect 12.0: single postcode -> area-norm factor 1.0; one month/year -> x12.
+    assert row["Violence and sexual offences (avg/yr)"] == 12.0
+    assert row["Public order (avg/yr)"] == 12.0
+
+    by_year_row = pl.read_parquet(by_year).row(0, named=True)
+    assert by_year_row["Violence and sexual offences (by year)"] == [
+        {"year": 2013, "count": 12.0}
+    ]
+    assert by_year_row["Public order (by year)"] == [{"year": 2013, "count": 12.0}]